cuda-cccl 0.3.3__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/__init__.py +27 -0
- cuda/cccl/_cuda_version_utils.py +24 -0
- cuda/cccl/cooperative/__init__.py +9 -0
- cuda/cccl/cooperative/experimental/__init__.py +24 -0
- cuda/cccl/headers/__init__.py +7 -0
- cuda/cccl/headers/include/__init__.py +1 -0
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +259 -0
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1182 -0
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +81 -0
- cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +709 -0
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +234 -0
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +748 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +786 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +286 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +703 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +555 -0
- cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +619 -0
- cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +806 -0
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +1124 -0
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +589 -0
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +474 -0
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +289 -0
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1117 -0
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +346 -0
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +606 -0
- cuda/cccl/headers/include/cub/agent/agent_topk.cuh +764 -0
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +631 -0
- cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1424 -0
- cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +963 -0
- cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1227 -0
- cuda/cccl/headers/include/cub/block/block_exchange.cuh +1313 -0
- cuda/cccl/headers/include/cub/block/block_histogram.cuh +424 -0
- cuda/cccl/headers/include/cub/block/block_load.cuh +1264 -0
- cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
- cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +800 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1225 -0
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2196 -0
- cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +150 -0
- cuda/cccl/headers/include/cub/block/block_reduce.cuh +667 -0
- cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +434 -0
- cuda/cccl/headers/include/cub/block/block_scan.cuh +2315 -0
- cuda/cccl/headers/include/cub/block/block_shuffle.cuh +346 -0
- cuda/cccl/headers/include/cub/block/block_store.cuh +1247 -0
- cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +624 -0
- cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +86 -0
- cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +240 -0
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +252 -0
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +238 -0
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +281 -0
- cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +790 -0
- cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +538 -0
- cuda/cccl/headers/include/cub/config.cuh +53 -0
- cuda/cccl/headers/include/cub/cub.cuh +120 -0
- cuda/cccl/headers/include/cub/detail/array_utils.cuh +78 -0
- cuda/cccl/headers/include/cub/detail/choose_offset.cuh +161 -0
- cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +74 -0
- cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +96 -0
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +62 -0
- cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +253 -0
- cuda/cccl/headers/include/cub/detail/integer_utils.cuh +88 -0
- cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +142 -0
- cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +100 -0
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +114 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/README.md +71 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/array.h +68 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/json.h +62 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/object.h +100 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/string.h +53 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/value.h +95 -0
- cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +63 -0
- cuda/cccl/headers/include/cub/detail/rfa.cuh +731 -0
- cuda/cccl/headers/include/cub/detail/strong_load.cuh +189 -0
- cuda/cccl/headers/include/cub/detail/strong_store.cuh +220 -0
- cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +384 -0
- cuda/cccl/headers/include/cub/detail/type_traits.cuh +187 -0
- cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +73 -0
- cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +56 -0
- cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +596 -0
- cuda/cccl/headers/include/cub/device/device_copy.cuh +276 -0
- cuda/cccl/headers/include/cub/device/device_for.cuh +1063 -0
- cuda/cccl/headers/include/cub/device/device_histogram.cuh +1509 -0
- cuda/cccl/headers/include/cub/device/device_memcpy.cuh +195 -0
- cuda/cccl/headers/include/cub/device/device_merge.cuh +203 -0
- cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +979 -0
- cuda/cccl/headers/include/cub/device/device_partition.cuh +668 -0
- cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3437 -0
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +2518 -0
- cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +370 -0
- cuda/cccl/headers/include/cub/device/device_scan.cuh +2212 -0
- cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1496 -0
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1430 -0
- cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2811 -0
- cuda/cccl/headers/include/cub/device/device_select.cuh +1228 -0
- cuda/cccl/headers/include/cub/device/device_topk.cuh +511 -0
- cuda/cccl/headers/include/cub/device/device_transform.cuh +668 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +315 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +719 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +43 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +198 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1046 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +303 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +473 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1744 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +1310 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +655 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +531 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +313 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +615 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +517 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +602 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +975 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +842 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +341 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +440 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +389 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +627 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +569 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +545 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +261 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/histogram.cuh +505 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/merge_sort.cuh +334 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/radix_sort.cuh +803 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +583 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +189 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +321 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +522 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/three_way_partition.cuh +201 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +1028 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/unique_by_key.cuh +176 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +67 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +118 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +60 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +275 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +76 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +126 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1065 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +493 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +942 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +673 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +618 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +1010 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +398 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1588 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +440 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +85 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +481 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +884 -0
- cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +227 -0
- cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +106 -0
- cuda/cccl/headers/include/cub/grid/grid_queue.cuh +202 -0
- cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +254 -0
- cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +259 -0
- cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +250 -0
- cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +320 -0
- cuda/cccl/headers/include/cub/thread/thread_load.cuh +349 -0
- cuda/cccl/headers/include/cub/thread/thread_operators.cuh +688 -0
- cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +548 -0
- cuda/cccl/headers/include/cub/thread/thread_scan.cuh +498 -0
- cuda/cccl/headers/include/cub/thread/thread_search.cuh +199 -0
- cuda/cccl/headers/include/cub/thread/thread_simd.cuh +458 -0
- cuda/cccl/headers/include/cub/thread/thread_sort.cuh +102 -0
- cuda/cccl/headers/include/cub/thread/thread_store.cuh +365 -0
- cuda/cccl/headers/include/cub/util_allocator.cuh +921 -0
- cuda/cccl/headers/include/cub/util_arch.cuh +167 -0
- cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +95 -0
- cuda/cccl/headers/include/cub/util_debug.cuh +207 -0
- cuda/cccl/headers/include/cub/util_device.cuh +800 -0
- cuda/cccl/headers/include/cub/util_macro.cuh +97 -0
- cuda/cccl/headers/include/cub/util_math.cuh +118 -0
- cuda/cccl/headers/include/cub/util_namespace.cuh +176 -0
- cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +55 -0
- cuda/cccl/headers/include/cub/util_ptx.cuh +513 -0
- cuda/cccl/headers/include/cub/util_temporary_storage.cuh +122 -0
- cuda/cccl/headers/include/cub/util_type.cuh +1120 -0
- cuda/cccl/headers/include/cub/util_vsmem.cuh +253 -0
- cuda/cccl/headers/include/cub/version.cuh +89 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +329 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +177 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +737 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +408 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +952 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +715 -0
- cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +405 -0
- cuda/cccl/headers/include/cub/warp/warp_load.cuh +614 -0
- cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +169 -0
- cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +829 -0
- cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1890 -0
- cuda/cccl/headers/include/cub/warp/warp_store.cuh +521 -0
- cuda/cccl/headers/include/cub/warp/warp_utils.cuh +61 -0
- cuda/cccl/headers/include/cuda/__algorithm/common.h +68 -0
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +196 -0
- cuda/cccl/headers/include/cuda/__algorithm/fill.h +107 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +172 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +217 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +83 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +128 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +210 -0
- cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
- cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +102 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +487 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +74 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_native_handle.h +45 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
- cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +171 -0
- cuda/cccl/headers/include/cuda/__bit/bitfield.h +122 -0
- cuda/cccl/headers/include/cuda/__bit/bitmask.h +90 -0
- cuda/cccl/headers/include/cuda/__cccl_config +37 -0
- cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +124 -0
- cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +178 -0
- cuda/cccl/headers/include/cuda/__cmath/ilog.h +195 -0
- cuda/cccl/headers/include/cuda/__cmath/ipow.h +107 -0
- cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
- cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
- cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
- cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
- cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
- cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
- cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
- cuda/cccl/headers/include/cuda/__complex/complex.h +238 -0
- cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +89 -0
- cuda/cccl/headers/include/cuda/__complex/traits.h +64 -0
- cuda/cccl/headers/include/cuda/__complex_ +28 -0
- cuda/cccl/headers/include/cuda/__device/all_devices.h +140 -0
- cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +537 -0
- cuda/cccl/headers/include/cuda/__device/attributes.h +772 -0
- cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
- cuda/cccl/headers/include/cuda/__device/device_ref.h +156 -0
- cuda/cccl/headers/include/cuda/__device/physical_device.h +172 -0
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +835 -0
- cuda/cccl/headers/include/cuda/__event/event.h +171 -0
- cuda/cccl/headers/include/cuda/__event/event_ref.h +157 -0
- cuda/cccl/headers/include/cuda/__event/timed_event.h +120 -0
- cuda/cccl/headers/include/cuda/__execution/determinism.h +91 -0
- cuda/cccl/headers/include/cuda/__execution/output_ordering.h +89 -0
- cuda/cccl/headers/include/cuda/__execution/require.h +75 -0
- cuda/cccl/headers/include/cuda/__execution/tune.h +70 -0
- cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
- cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +321 -0
- cuda/cccl/headers/include/cuda/__functional/maximum.h +58 -0
- cuda/cccl/headers/include/cuda/__functional/minimum.h +58 -0
- cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +108 -0
- cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
- cuda/cccl/headers/include/cuda/__fwd/barrier_native_handle.h +42 -0
- cuda/cccl/headers/include/cuda/__fwd/complex.h +48 -0
- cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
- cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
- cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
- cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +58 -0
- cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +315 -0
- cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +483 -0
- cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +324 -0
- cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +456 -0
- cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +334 -0
- cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +418 -0
- cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +367 -0
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +528 -0
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +527 -0
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +486 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_function.h +112 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +557 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
- cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +533 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +238 -0
- cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +152 -0
- cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +117 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +60 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +148 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +165 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +53 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +104 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +170 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +59 -0
- cuda/cccl/headers/include/cuda/__memory/address_space.h +227 -0
- cuda/cccl/headers/include/cuda/__memory/align_down.h +56 -0
- cuda/cccl/headers/include/cuda/__memory/align_up.h +56 -0
- cuda/cccl/headers/include/cuda/__memory/aligned_size.h +61 -0
- cuda/cccl/headers/include/cuda/__memory/check_address.h +111 -0
- cuda/cccl/headers/include/cuda/__memory/discard_memory.h +64 -0
- cuda/cccl/headers/include/cuda/__memory/get_device_address.h +58 -0
- cuda/cccl/headers/include/cuda/__memory/is_aligned.h +47 -0
- cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
- cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +82 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +153 -0
- cuda/cccl/headers/include/cuda/__memory_resource/properties.h +113 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource.h +125 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +652 -0
- cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +306 -0
- cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
- cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +59 -0
- cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
- cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +120 -0
- cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2983 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +242 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +197 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +54 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +997 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +318 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +671 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1470 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +132 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +64 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +102 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +64 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +5542 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +399 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +184 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +102 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +143 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +144 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +286 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +290 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2202 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1362 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +236 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +460 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1490 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +132 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +99 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +765 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +58 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +4927 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +4291 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +7110 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +42 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +5063 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +56 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +71 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +1030 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/pragmas/enable_smem_spilling.h +47 -0
- cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
- cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +176 -0
- cuda/cccl/headers/include/cuda/__random/feistel_bijection.h +105 -0
- cuda/cccl/headers/include/cuda/__random/random_bijection.h +88 -0
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +99 -0
- cuda/cccl/headers/include/cuda/__runtime/types.h +41 -0
- cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
- cuda/cccl/headers/include/cuda/__stream/get_stream.h +110 -0
- cuda/cccl/headers/include/cuda/__stream/stream.h +141 -0
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +303 -0
- cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
- cuda/cccl/headers/include/cuda/__type_traits/is_specialization_of.h +37 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/access.h +88 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/any_cast.h +83 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_base.h +148 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_from.h +96 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_fwd.h +128 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +304 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ref.h +337 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +590 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/conversions.h +169 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/dynamic_any_cast.h +107 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/interfaces.h +359 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/iset.h +142 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/overrides.h +64 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/rtti.h +257 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +322 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/storage.h +79 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/tagged_ptr.h +58 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +162 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_functions.h +184 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_ptrs.h +80 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +155 -0
- cuda/cccl/headers/include/cuda/__utility/basic_any.h +507 -0
- cuda/cccl/headers/include/cuda/__utility/immovable.h +50 -0
- cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
- cuda/cccl/headers/include/cuda/__utility/inherit.h +36 -0
- cuda/cccl/headers/include/cuda/__utility/no_init.h +29 -0
- cuda/cccl/headers/include/cuda/__utility/static_for.h +79 -0
- cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
- cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
- cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +251 -0
- cuda/cccl/headers/include/cuda/access_property +26 -0
- cuda/cccl/headers/include/cuda/algorithm +27 -0
- cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
- cuda/cccl/headers/include/cuda/atomic +27 -0
- cuda/cccl/headers/include/cuda/barrier +267 -0
- cuda/cccl/headers/include/cuda/bit +29 -0
- cuda/cccl/headers/include/cuda/cmath +37 -0
- cuda/cccl/headers/include/cuda/devices +33 -0
- cuda/cccl/headers/include/cuda/discard_memory +32 -0
- cuda/cccl/headers/include/cuda/functional +32 -0
- cuda/cccl/headers/include/cuda/iterator +39 -0
- cuda/cccl/headers/include/cuda/latch +27 -0
- cuda/cccl/headers/include/cuda/mdspan +28 -0
- cuda/cccl/headers/include/cuda/memory +35 -0
- cuda/cccl/headers/include/cuda/memory_resource +35 -0
- cuda/cccl/headers/include/cuda/numeric +29 -0
- cuda/cccl/headers/include/cuda/pipeline +579 -0
- cuda/cccl/headers/include/cuda/ptx +129 -0
- cuda/cccl/headers/include/cuda/semaphore +31 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +53 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +58 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +142 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/in_fun_result.h +55 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +90 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +88 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +67 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +139 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/move.h +86 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +93 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +100 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each.h +84 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each_n.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +98 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +155 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
- cuda/cccl/headers/include/cuda/std/__algorithm_ +26 -0
- cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
- cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +136 -0
- cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +208 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +401 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +3971 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +177 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +211 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
- cuda/cccl/headers/include/cuda/std/__atomic/order.h +159 -0
- cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
- cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
- cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +249 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +104 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +72 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +228 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types.h +52 -0
- cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
- cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
- cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
- cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
- cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
- cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +76 -0
- cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +185 -0
- cuda/cccl/headers/include/cuda/std/__bit/countl.h +174 -0
- cuda/cccl/headers/include/cuda/std/__bit/countr.h +185 -0
- cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
- cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
- cuda/cccl/headers/include/cuda/std/__bit/integral.h +126 -0
- cuda/cccl/headers/include/cuda/std/__bit/popcount.h +154 -0
- cuda/cccl/headers/include/cuda/std/__bit/reference.h +1272 -0
- cuda/cccl/headers/include/cuda/std/__bit/rotate.h +94 -0
- cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
- cuda/cccl/headers/include/cuda/std/__cccl/assert.h +161 -0
- cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +206 -0
- cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +673 -0
- cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +217 -0
- cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +51 -0
- cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +56 -0
- cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +88 -0
- cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +131 -0
- cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +123 -0
- cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +344 -0
- cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +91 -0
- cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +74 -0
- cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +160 -0
- cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
- cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +73 -0
- cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/os.h +54 -0
- cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1286 -0
- cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +281 -0
- cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +253 -0
- cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
- cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
- cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
- cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
- cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
- cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +171 -0
- cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
- cuda/cccl/headers/include/cuda/std/__charconv/from_chars.h +154 -0
- cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
- cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +148 -0
- cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
- cuda/cccl/headers/include/cuda/std/__charconv_ +31 -0
- cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +54 -0
- cuda/cccl/headers/include/cuda/std/__chrono/day.h +162 -0
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +503 -0
- cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +55 -0
- cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +46 -0
- cuda/cccl/headers/include/cuda/std/__chrono/month.h +187 -0
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +60 -0
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +80 -0
- cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +259 -0
- cuda/cccl/headers/include/cuda/std/__chrono/year.h +186 -0
- cuda/cccl/headers/include/cuda/std/__cmath/abs.h +127 -0
- cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +88 -0
- cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +200 -0
- cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +784 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +118 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fma.h +125 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +231 -0
- cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +205 -0
- cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +286 -0
- cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +221 -0
- cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +286 -0
- cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +371 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +167 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +205 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +186 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
- cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +101 -0
- cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +534 -0
- cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +287 -0
- cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
- cuda/cccl/headers/include/cuda/std/__cmath/nan.h +54 -0
- cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +206 -0
- cuda/cccl/headers/include/cuda/std/__cmath/roots.h +199 -0
- cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +984 -0
- cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +56 -0
- cuda/cccl/headers/include/cuda/std/__cmath/traits.h +238 -0
- cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +328 -0
- cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
- cuda/cccl/headers/include/cuda/std/__complex/complex.h +669 -0
- cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +411 -0
- cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +117 -0
- cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +216 -0
- cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
- cuda/cccl/headers/include/cuda/std/__complex/literals.h +86 -0
- cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +303 -0
- cuda/cccl/headers/include/cuda/std/__complex/math.h +159 -0
- cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +323 -0
- cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +322 -0
- cuda/cccl/headers/include/cuda/std/__complex/roots.h +214 -0
- cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
- cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
- cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +130 -0
- cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
- cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
- cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
- cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +45 -0
- cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
- cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
- cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +341 -0
- cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +174 -0
- cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +70 -0
- cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +56 -0
- cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
- cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
- cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +100 -0
- cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
- cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
- cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
- cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
- cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
- cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +39 -0
- cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
- cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
- cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
- cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
- cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +70 -0
- cuda/cccl/headers/include/cuda/std/__cstring/memcpy.h +61 -0
- cuda/cccl/headers/include/cuda/std/__cstring/memset.h +46 -0
- cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +62 -0
- cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +139 -0
- cuda/cccl/headers/include/cuda/std/__exception/terminate.h +73 -0
- cuda/cccl/headers/include/cuda/std/__execution/env.h +455 -0
- cuda/cccl/headers/include/cuda/std/__execution/policy.h +88 -0
- cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
- cuda/cccl/headers/include/cuda/std/__expected/expected.h +1941 -0
- cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1050 -0
- cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
- cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +165 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +812 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +376 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +124 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +116 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/decompose.h +69 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +40 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +78 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
- cuda/cccl/headers/include/cuda/std/__format/buffer.h +48 -0
- cuda/cccl/headers/include/cuda/std/__format/concepts.h +69 -0
- cuda/cccl/headers/include/cuda/std/__format/format_arg.h +282 -0
- cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +279 -0
- cuda/cccl/headers/include/cuda/std/__format/format_args.h +122 -0
- cuda/cccl/headers/include/cuda/std/__format/format_context.h +92 -0
- cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
- cuda/cccl/headers/include/cuda/std/__format/format_integral.h +237 -0
- cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +124 -0
- cuda/cccl/headers/include/cuda/std/__format/format_spec_parser.h +1230 -0
- cuda/cccl/headers/include/cuda/std/__format/formatter.h +59 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/bool.h +101 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/char.h +124 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/fp.h +101 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/int.h +174 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/ptr.h +104 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/str.h +178 -0
- cuda/cccl/headers/include/cuda/std/__format/output_utils.h +272 -0
- cuda/cccl/headers/include/cuda/std/__format/parse_arg_id.h +138 -0
- cuda/cccl/headers/include/cuda/std/__format_ +45 -0
- cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
- cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
- cuda/cccl/headers/include/cuda/std/__functional/bind.h +334 -0
- cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +80 -0
- cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +73 -0
- cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
- cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
- cuda/cccl/headers/include/cuda/std/__functional/compose.h +68 -0
- cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +75 -0
- cuda/cccl/headers/include/cuda/std/__functional/function.h +1275 -0
- cuda/cccl/headers/include/cuda/std/__functional/hash.h +649 -0
- cuda/cccl/headers/include/cuda/std/__functional/identity.h +57 -0
- cuda/cccl/headers/include/cuda/std/__functional/invoke.h +296 -0
- cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +41 -0
- cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +66 -0
- cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +211 -0
- cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
- cuda/cccl/headers/include/cuda/std/__functional/operations.h +534 -0
- cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +128 -0
- cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +64 -0
- cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +63 -0
- cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
- cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +113 -0
- cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
- cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
- cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
- cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +262 -0
- cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +53 -0
- cuda/cccl/headers/include/cuda/std/__fwd/array.h +42 -0
- cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +74 -0
- cuda/cccl/headers/include/cuda/std/__fwd/complex.h +75 -0
- cuda/cccl/headers/include/cuda/std/__fwd/expected.h +46 -0
- cuda/cccl/headers/include/cuda/std/__fwd/format.h +84 -0
- cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
- cuda/cccl/headers/include/cuda/std/__fwd/get.h +123 -0
- cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
- cuda/cccl/headers/include/cuda/std/__fwd/iterator.h +43 -0
- cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +122 -0
- cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
- cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
- cuda/cccl/headers/include/cuda/std/__fwd/pair.h +47 -0
- cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +34 -0
- cuda/cccl/headers/include/cuda/std/__fwd/span.h +45 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string.h +112 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +91 -0
- cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
- cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +34 -0
- cuda/cccl/headers/include/cuda/std/__fwd/unexpected.h +40 -0
- cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
- cuda/cccl/headers/include/cuda/std/__internal/features.h +72 -0
- cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +143 -0
- cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
- cuda/cccl/headers/include/cuda/std/__iterator/advance.h +228 -0
- cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +163 -0
- cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +253 -0
- cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
- cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +464 -0
- cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
- cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
- cuda/cccl/headers/include/cuda/std/__iterator/distance.h +126 -0
- cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
- cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
- cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +99 -0
- cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +143 -0
- cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
- cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +107 -0
- cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +146 -0
- cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +161 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +163 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +847 -0
- cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
- cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +432 -0
- cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
- cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
- cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +95 -0
- cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
- cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
- cuda/cccl/headers/include/cuda/std/__iterator/prev.h +90 -0
- cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
- cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +156 -0
- cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
- cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +371 -0
- cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
- cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
- cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
- cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +245 -0
- cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
- cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +617 -0
- cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +753 -0
- cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +78 -0
- cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +54 -0
- cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +139 -0
- cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +132 -0
- cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +321 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +97 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +139 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +352 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +759 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +314 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +307 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +605 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +512 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +193 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +189 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +344 -0
- cuda/cccl/headers/include/cuda/std/__memory/addressof.h +67 -0
- cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +81 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator.h +320 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +84 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +525 -0
- cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +60 -0
- cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
- cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +225 -0
- cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +246 -0
- cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
- cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +46 -0
- cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +246 -0
- cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
- cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +92 -0
- cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +678 -0
- cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +765 -0
- cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +54 -0
- cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
- cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
- cuda/cccl/headers/include/cuda/std/__new/allocate.h +126 -0
- cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
- cuda/cccl/headers/include/cuda/std/__new/launder.h +53 -0
- cuda/cccl/headers/include/cuda/std/__new_ +29 -0
- cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
- cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
- cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
- cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +78 -0
- cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
- cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
- cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
- cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +97 -0
- cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
- cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
- cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
- cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
- cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
- cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +74 -0
- cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
- cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
- cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
- cuda/cccl/headers/include/cuda/std/__optional/optional.h +859 -0
- cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +433 -0
- cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
- cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
- cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
- cuda/cccl/headers/include/cuda/std/__random/is_valid.h +106 -0
- cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +398 -0
- cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +335 -0
- cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +183 -0
- cuda/cccl/headers/include/cuda/std/__random_ +29 -0
- cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
- cuda/cccl/headers/include/cuda/std/__ranges/all.h +98 -0
- cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
- cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +302 -0
- cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
- cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
- cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
- cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
- cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
- cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
- cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +78 -0
- cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
- cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +266 -0
- cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +410 -0
- cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +162 -0
- cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +110 -0
- cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +175 -0
- cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
- cuda/cccl/headers/include/cuda/std/__ranges/rend.h +182 -0
- cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +345 -0
- cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
- cuda/cccl/headers/include/cuda/std/__ranges/size.h +201 -0
- cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +513 -0
- cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +476 -0
- cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
- cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
- cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +53 -0
- cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +183 -0
- cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
- cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +234 -0
- cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
- cuda/cccl/headers/include/cuda/std/__string/char_traits.h +191 -0
- cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +581 -0
- cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
- cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
- cuda/cccl/headers/include/cuda/std/__string_ +29 -0
- cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
- cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +106 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +143 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +120 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +260 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +212 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +70 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +84 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +68 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +290 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +149 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +68 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +173 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +46 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +58 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +51 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +211 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_execution_policy.h +81 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +79 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +84 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +84 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +121 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_reference_wrapper.h +50 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +88 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +202 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +46 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +45 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +58 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +45 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +44 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +122 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +131 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +52 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
- cuda/cccl/headers/include/cuda/std/__utility/as_const.h +52 -0
- cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +34 -0
- cuda/cccl/headers/include/cuda/std/__utility/cmp.h +116 -0
- cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
- cuda/cccl/headers/include/cuda/std/__utility/declval.h +76 -0
- cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +161 -0
- cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
- cuda/cccl/headers/include/cuda/std/__utility/forward.h +59 -0
- cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +55 -0
- cuda/cccl/headers/include/cuda/std/__utility/in_place.h +86 -0
- cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +251 -0
- cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
- cuda/cccl/headers/include/cuda/std/__utility/move.h +74 -0
- cuda/cccl/headers/include/cuda/std/__utility/pair.h +791 -0
- cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
- cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +527 -0
- cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
- cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +63 -0
- cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
- cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
- cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
- cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
- cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
- cuda/cccl/headers/include/cuda/std/array +518 -0
- cuda/cccl/headers/include/cuda/std/atomic +810 -0
- cuda/cccl/headers/include/cuda/std/barrier +42 -0
- cuda/cccl/headers/include/cuda/std/bit +35 -0
- cuda/cccl/headers/include/cuda/std/bitset +994 -0
- cuda/cccl/headers/include/cuda/std/cassert +28 -0
- cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
- cuda/cccl/headers/include/cuda/std/cfloat +59 -0
- cuda/cccl/headers/include/cuda/std/chrono +26 -0
- cuda/cccl/headers/include/cuda/std/climits +61 -0
- cuda/cccl/headers/include/cuda/std/cmath +87 -0
- cuda/cccl/headers/include/cuda/std/complex +50 -0
- cuda/cccl/headers/include/cuda/std/concepts +48 -0
- cuda/cccl/headers/include/cuda/std/cstddef +28 -0
- cuda/cccl/headers/include/cuda/std/cstdint +178 -0
- cuda/cccl/headers/include/cuda/std/cstdlib +30 -0
- cuda/cccl/headers/include/cuda/std/cstring +110 -0
- cuda/cccl/headers/include/cuda/std/ctime +154 -0
- cuda/cccl/headers/include/cuda/std/detail/__config +45 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +207 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1721 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +2509 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/iosfwd +128 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/stdexcept +120 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +1365 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +2144 -0
- cuda/cccl/headers/include/cuda/std/execution +29 -0
- cuda/cccl/headers/include/cuda/std/expected +30 -0
- cuda/cccl/headers/include/cuda/std/functional +56 -0
- cuda/cccl/headers/include/cuda/std/initializer_list +44 -0
- cuda/cccl/headers/include/cuda/std/inplace_vector +2170 -0
- cuda/cccl/headers/include/cuda/std/iterator +70 -0
- cuda/cccl/headers/include/cuda/std/latch +34 -0
- cuda/cccl/headers/include/cuda/std/limits +28 -0
- cuda/cccl/headers/include/cuda/std/linalg +30 -0
- cuda/cccl/headers/include/cuda/std/mdspan +38 -0
- cuda/cccl/headers/include/cuda/std/memory +39 -0
- cuda/cccl/headers/include/cuda/std/numbers +346 -0
- cuda/cccl/headers/include/cuda/std/numeric +41 -0
- cuda/cccl/headers/include/cuda/std/optional +31 -0
- cuda/cccl/headers/include/cuda/std/ranges +69 -0
- cuda/cccl/headers/include/cuda/std/ratio +416 -0
- cuda/cccl/headers/include/cuda/std/semaphore +31 -0
- cuda/cccl/headers/include/cuda/std/source_location +83 -0
- cuda/cccl/headers/include/cuda/std/span +628 -0
- cuda/cccl/headers/include/cuda/std/string_view +925 -0
- cuda/cccl/headers/include/cuda/std/tuple +26 -0
- cuda/cccl/headers/include/cuda/std/type_traits +177 -0
- cuda/cccl/headers/include/cuda/std/utility +70 -0
- cuda/cccl/headers/include/cuda/std/variant +25 -0
- cuda/cccl/headers/include/cuda/std/version +240 -0
- cuda/cccl/headers/include/cuda/stream +31 -0
- cuda/cccl/headers/include/cuda/stream_ref +59 -0
- cuda/cccl/headers/include/cuda/type_traits +27 -0
- cuda/cccl/headers/include/cuda/utility +28 -0
- cuda/cccl/headers/include/cuda/version +16 -0
- cuda/cccl/headers/include/cuda/warp +28 -0
- cuda/cccl/headers/include/cuda/work_stealing +26 -0
- cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
- cuda/cccl/headers/include/nv/detail/__target_macros +718 -0
- cuda/cccl/headers/include/nv/target +240 -0
- cuda/cccl/headers/include/thrust/addressof.h +22 -0
- cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
- cuda/cccl/headers/include/thrust/advance.h +57 -0
- cuda/cccl/headers/include/thrust/allocate_unique.h +299 -0
- cuda/cccl/headers/include/thrust/binary_search.h +1910 -0
- cuda/cccl/headers/include/thrust/complex.h +858 -0
- cuda/cccl/headers/include/thrust/copy.h +506 -0
- cuda/cccl/headers/include/thrust/count.h +245 -0
- cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +95 -0
- cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +626 -0
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +192 -0
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +96 -0
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +81 -0
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +78 -0
- cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +76 -0
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +115 -0
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +116 -0
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +77 -0
- cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +99 -0
- cuda/cccl/headers/include/thrust/detail/binary_search.inl +525 -0
- cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
- cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +255 -0
- cuda/cccl/headers/include/thrust/detail/complex/c99math.h +64 -0
- cuda/cccl/headers/include/thrust/detail/complex/catrig.h +875 -0
- cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +589 -0
- cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +233 -0
- cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +161 -0
- cuda/cccl/headers/include/thrust/detail/complex/cexp.h +195 -0
- cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +173 -0
- cuda/cccl/headers/include/thrust/detail/complex/clog.h +223 -0
- cuda/cccl/headers/include/thrust/detail/complex/clogf.h +210 -0
- cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
- cuda/cccl/headers/include/thrust/detail/complex/cpow.h +50 -0
- cuda/cccl/headers/include/thrust/detail/complex/cproj.h +81 -0
- cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
- cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +168 -0
- cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
- cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
- cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +208 -0
- cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +133 -0
- cuda/cccl/headers/include/thrust/detail/complex/math_private.h +138 -0
- cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
- cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
- cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
- cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
- cuda/cccl/headers/include/thrust/detail/config/device_system.h +55 -0
- cuda/cccl/headers/include/thrust/detail/config/host_system.h +48 -0
- cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
- cuda/cccl/headers/include/thrust/detail/config/namespace.h +162 -0
- cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
- cuda/cccl/headers/include/thrust/detail/config.h +36 -0
- cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +228 -0
- cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +273 -0
- cuda/cccl/headers/include/thrust/detail/copy.h +72 -0
- cuda/cccl/headers/include/thrust/detail/copy.inl +129 -0
- cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
- cuda/cccl/headers/include/thrust/detail/copy_if.inl +102 -0
- cuda/cccl/headers/include/thrust/detail/count.h +55 -0
- cuda/cccl/headers/include/thrust/detail/count.inl +89 -0
- cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
- cuda/cccl/headers/include/thrust/detail/equal.inl +93 -0
- cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
- cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +81 -0
- cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
- cuda/cccl/headers/include/thrust/detail/execution_policy.h +120 -0
- cuda/cccl/headers/include/thrust/detail/extrema.inl +184 -0
- cuda/cccl/headers/include/thrust/detail/fill.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/find.inl +113 -0
- cuda/cccl/headers/include/thrust/detail/for_each.inl +84 -0
- cuda/cccl/headers/include/thrust/detail/function.h +49 -0
- cuda/cccl/headers/include/thrust/detail/functional/actor.h +214 -0
- cuda/cccl/headers/include/thrust/detail/functional/operators.h +386 -0
- cuda/cccl/headers/include/thrust/detail/gather.inl +173 -0
- cuda/cccl/headers/include/thrust/detail/generate.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +62 -0
- cuda/cccl/headers/include/thrust/detail/inner_product.inl +118 -0
- cuda/cccl/headers/include/thrust/detail/internal_functional.h +328 -0
- cuda/cccl/headers/include/thrust/detail/logical.inl +113 -0
- cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +77 -0
- cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
- cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +209 -0
- cuda/cccl/headers/include/thrust/detail/merge.inl +276 -0
- cuda/cccl/headers/include/thrust/detail/mismatch.inl +94 -0
- cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +124 -0
- cuda/cccl/headers/include/thrust/detail/partition.inl +378 -0
- cuda/cccl/headers/include/thrust/detail/pointer.h +309 -0
- cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
- cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
- cuda/cccl/headers/include/thrust/detail/range/head_flags.h +116 -0
- cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +130 -0
- cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
- cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +192 -0
- cuda/cccl/headers/include/thrust/detail/reduce.inl +377 -0
- cuda/cccl/headers/include/thrust/detail/reference.h +494 -0
- cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
- cuda/cccl/headers/include/thrust/detail/remove.inl +213 -0
- cuda/cccl/headers/include/thrust/detail/replace.inl +231 -0
- cuda/cccl/headers/include/thrust/detail/reverse.inl +88 -0
- cuda/cccl/headers/include/thrust/detail/scan.inl +518 -0
- cuda/cccl/headers/include/thrust/detail/scatter.inl +157 -0
- cuda/cccl/headers/include/thrust/detail/seq.h +66 -0
- cuda/cccl/headers/include/thrust/detail/sequence.inl +109 -0
- cuda/cccl/headers/include/thrust/detail/set_operations.inl +981 -0
- cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/sort.inl +373 -0
- cuda/cccl/headers/include/thrust/detail/static_assert.h +58 -0
- cuda/cccl/headers/include/thrust/detail/static_map.h +167 -0
- cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +65 -0
- cuda/cccl/headers/include/thrust/detail/tabulate.inl +62 -0
- cuda/cccl/headers/include/thrust/detail/temporary_array.h +153 -0
- cuda/cccl/headers/include/thrust/detail/temporary_array.inl +120 -0
- cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +81 -0
- cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +69 -0
- cuda/cccl/headers/include/thrust/detail/transform_scan.inl +161 -0
- cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +130 -0
- cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +61 -0
- cuda/cccl/headers/include/thrust/detail/type_deduction.h +62 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +47 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +167 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +69 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +59 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +46 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +89 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
- cuda/cccl/headers/include/thrust/detail/type_traits.h +136 -0
- cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +90 -0
- cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/unique.inl +373 -0
- cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
- cuda/cccl/headers/include/thrust/detail/vector_base.h +613 -0
- cuda/cccl/headers/include/thrust/detail/vector_base.inl +1210 -0
- cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
- cuda/cccl/headers/include/thrust/device_delete.h +74 -0
- cuda/cccl/headers/include/thrust/device_free.h +85 -0
- cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
- cuda/cccl/headers/include/thrust/device_malloc.h +84 -0
- cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
- cuda/cccl/headers/include/thrust/device_new.h +112 -0
- cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
- cuda/cccl/headers/include/thrust/device_ptr.h +196 -0
- cuda/cccl/headers/include/thrust/device_reference.h +983 -0
- cuda/cccl/headers/include/thrust/device_vector.h +576 -0
- cuda/cccl/headers/include/thrust/distance.h +43 -0
- cuda/cccl/headers/include/thrust/equal.h +247 -0
- cuda/cccl/headers/include/thrust/execution_policy.h +251 -0
- cuda/cccl/headers/include/thrust/extrema.h +657 -0
- cuda/cccl/headers/include/thrust/fill.h +200 -0
- cuda/cccl/headers/include/thrust/find.h +382 -0
- cuda/cccl/headers/include/thrust/for_each.h +261 -0
- cuda/cccl/headers/include/thrust/functional.h +395 -0
- cuda/cccl/headers/include/thrust/gather.h +464 -0
- cuda/cccl/headers/include/thrust/generate.h +193 -0
- cuda/cccl/headers/include/thrust/host_vector.h +576 -0
- cuda/cccl/headers/include/thrust/inner_product.h +264 -0
- cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +221 -0
- cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +335 -0
- cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
- cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
- cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +38 -0
- cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +38 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +81 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +60 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +65 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +57 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +182 -0
- cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +58 -0
- cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
- cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
- cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +81 -0
- cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +174 -0
- cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +211 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +659 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +334 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_traversal_tags.h +64 -0
- cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +194 -0
- cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
- cuda/cccl/headers/include/thrust/iterator/retag.h +72 -0
- cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +51 -0
- cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +185 -0
- cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
- cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +152 -0
- cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +226 -0
- cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +351 -0
- cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
- cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +359 -0
- cuda/cccl/headers/include/thrust/logical.h +290 -0
- cuda/cccl/headers/include/thrust/memory.h +299 -0
- cuda/cccl/headers/include/thrust/merge.h +725 -0
- cuda/cccl/headers/include/thrust/mismatch.h +261 -0
- cuda/cccl/headers/include/thrust/mr/allocator.h +229 -0
- cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +41 -0
- cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +528 -0
- cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +118 -0
- cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +67 -0
- cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +67 -0
- cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +38 -0
- cuda/cccl/headers/include/thrust/mr/memory_resource.h +217 -0
- cuda/cccl/headers/include/thrust/mr/new.h +100 -0
- cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +63 -0
- cuda/cccl/headers/include/thrust/mr/pool.h +528 -0
- cuda/cccl/headers/include/thrust/mr/pool_options.h +174 -0
- cuda/cccl/headers/include/thrust/mr/sync_pool.h +114 -0
- cuda/cccl/headers/include/thrust/mr/tls_pool.h +64 -0
- cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
- cuda/cccl/headers/include/thrust/mr/validator.h +56 -0
- cuda/cccl/headers/include/thrust/pair.h +99 -0
- cuda/cccl/headers/include/thrust/partition.h +1391 -0
- cuda/cccl/headers/include/thrust/per_device_resource.h +98 -0
- cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +184 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +155 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +104 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +151 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +53 -0
- cuda/cccl/headers/include/thrust/random/detail/mod.h +101 -0
- cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +187 -0
- cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +160 -0
- cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +63 -0
- cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +201 -0
- cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +198 -0
- cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +200 -0
- cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +183 -0
- cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +187 -0
- cuda/cccl/headers/include/thrust/random/discard_block_engine.h +240 -0
- cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +289 -0
- cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +217 -0
- cuda/cccl/headers/include/thrust/random/normal_distribution.h +257 -0
- cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +247 -0
- cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +261 -0
- cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +258 -0
- cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +255 -0
- cuda/cccl/headers/include/thrust/random.h +120 -0
- cuda/cccl/headers/include/thrust/reduce.h +1113 -0
- cuda/cccl/headers/include/thrust/remove.h +768 -0
- cuda/cccl/headers/include/thrust/replace.h +826 -0
- cuda/cccl/headers/include/thrust/reverse.h +215 -0
- cuda/cccl/headers/include/thrust/scan.h +1671 -0
- cuda/cccl/headers/include/thrust/scatter.h +446 -0
- cuda/cccl/headers/include/thrust/sequence.h +277 -0
- cuda/cccl/headers/include/thrust/set_operations.h +3026 -0
- cuda/cccl/headers/include/thrust/shuffle.h +182 -0
- cuda/cccl/headers/include/thrust/sort.h +1320 -0
- cuda/cccl/headers/include/thrust/swap.h +147 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +109 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +60 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +63 -0
- cuda/cccl/headers/include/thrust/system/cpp/memory.h +106 -0
- cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +72 -0
- cuda/cccl/headers/include/thrust/system/cpp/pointer.h +120 -0
- cuda/cccl/headers/include/thrust/system/cpp/vector.h +96 -0
- cuda/cccl/headers/include/thrust/system/cuda/config.h +126 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +219 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +273 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +255 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +289 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +191 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +593 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +243 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +233 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +64 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +96 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +264 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +476 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +100 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +170 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +91 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +60 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +75 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +80 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +61 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +121 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +57 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +228 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +223 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +81 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +405 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +72 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +785 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +1001 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +107 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +122 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +87 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +341 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +414 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +91 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1734 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +469 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +98 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +61 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +429 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +143 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +119 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +117 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +105 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +289 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +310 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +253 -0
- cuda/cccl/headers/include/thrust/system/cuda/error.h +168 -0
- cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +15 -0
- cuda/cccl/headers/include/thrust/system/cuda/memory.h +122 -0
- cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +122 -0
- cuda/cccl/headers/include/thrust/system/cuda/pointer.h +160 -0
- cuda/cccl/headers/include/thrust/system/cuda/vector.h +108 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +52 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/count.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/find.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +61 -0
- cuda/cccl/headers/include/thrust/system/detail/errno.h +120 -0
- cuda/cccl/headers/include/thrust/system/detail/error_category.inl +302 -0
- cuda/cccl/headers/include/thrust/system/detail/error_code.inl +173 -0
- cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +121 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +53 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +79 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +161 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +384 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +45 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +64 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +58 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +146 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/count.h +48 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +84 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +49 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +60 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +66 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +252 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +54 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/find.h +49 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +137 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +58 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +73 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +96 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +45 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +63 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +60 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +72 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +59 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +64 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +86 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +99 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +148 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +49 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +68 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +129 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +207 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +43 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +71 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +100 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +83 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +186 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +86 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +121 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +95 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +175 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +48 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +67 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +63 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +126 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +72 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +85 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +126 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +232 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +73 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +85 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +104 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +282 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +476 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +54 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +125 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +113 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +175 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +44 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +76 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +41 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +54 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +47 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +54 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +82 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +395 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +50 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +56 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +80 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +113 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +45 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +166 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +45 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +115 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +71 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +113 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +81 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +126 -0
- cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +117 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +70 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +42 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +136 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +49 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +119 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +49 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +71 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +52 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +110 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +62 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +74 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +123 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +43 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +141 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +45 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +50 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +75 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +145 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +301 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +64 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +98 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +179 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +154 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +145 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +206 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +59 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +116 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +55 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +356 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +48 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +124 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +48 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +586 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +58 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +115 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +106 -0
- cuda/cccl/headers/include/thrust/system/detail/system_error.inl +108 -0
- cuda/cccl/headers/include/thrust/system/error_code.h +512 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +54 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +77 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +50 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +74 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +56 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +59 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/count.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +50 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +65 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +127 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +66 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/find.h +53 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +56 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +87 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +93 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +88 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +102 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +54 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +54 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +78 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +64 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +65 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +59 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +103 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +72 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +87 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +73 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scan.inl +172 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +36 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +60 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +265 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +60 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +71 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +67 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +75 -0
- cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +62 -0
- cuda/cccl/headers/include/thrust/system/omp/memory.h +111 -0
- cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +75 -0
- cuda/cccl/headers/include/thrust/system/omp/pointer.h +124 -0
- cuda/cccl/headers/include/thrust/system/omp/vector.h +99 -0
- cuda/cccl/headers/include/thrust/system/system_error.h +185 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +54 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +50 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +73 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +47 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +136 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +109 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +66 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +49 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +51 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +91 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +94 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +77 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +327 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +84 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +98 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +54 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +137 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +61 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +400 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +140 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +76 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +87 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +59 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +312 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +33 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +60 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +295 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +60 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +71 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +67 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +75 -0
- cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +62 -0
- cuda/cccl/headers/include/thrust/system/tbb/memory.h +111 -0
- cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +75 -0
- cuda/cccl/headers/include/thrust/system/tbb/pointer.h +124 -0
- cuda/cccl/headers/include/thrust/system/tbb/vector.h +99 -0
- cuda/cccl/headers/include/thrust/system_error.h +57 -0
- cuda/cccl/headers/include/thrust/tabulate.h +125 -0
- cuda/cccl/headers/include/thrust/transform.h +1045 -0
- cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
- cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
- cuda/cccl/headers/include/thrust/tuple.h +139 -0
- cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +261 -0
- cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +154 -0
- cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
- cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +184 -0
- cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +116 -0
- cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +336 -0
- cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
- cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +63 -0
- cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
- cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
- cuda/cccl/headers/include/thrust/unique.h +1088 -0
- cuda/cccl/headers/include/thrust/universal_allocator.h +93 -0
- cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
- cuda/cccl/headers/include/thrust/universal_vector.h +71 -0
- cuda/cccl/headers/include/thrust/version.h +93 -0
- cuda/cccl/headers/include/thrust/zip_function.h +176 -0
- cuda/cccl/headers/include_paths.py +51 -0
- cuda/cccl/parallel/__init__.py +9 -0
- cuda/cccl/parallel/experimental/__init__.py +24 -0
- cuda/cccl/py.typed +0 -0
- cuda/compute/__init__.py +79 -0
- cuda/compute/_bindings.py +79 -0
- cuda/compute/_bindings.pyi +475 -0
- cuda/compute/_bindings_impl.pyx +2273 -0
- cuda/compute/_caching.py +71 -0
- cuda/compute/_cccl_interop.py +422 -0
- cuda/compute/_utils/__init__.py +0 -0
- cuda/compute/_utils/protocols.py +132 -0
- cuda/compute/_utils/temp_storage_buffer.py +86 -0
- cuda/compute/algorithms/__init__.py +54 -0
- cuda/compute/algorithms/_histogram.py +243 -0
- cuda/compute/algorithms/_merge_sort.py +225 -0
- cuda/compute/algorithms/_radix_sort.py +312 -0
- cuda/compute/algorithms/_reduce.py +182 -0
- cuda/compute/algorithms/_scan.py +331 -0
- cuda/compute/algorithms/_segmented_reduce.py +257 -0
- cuda/compute/algorithms/_three_way_partition.py +261 -0
- cuda/compute/algorithms/_transform.py +329 -0
- cuda/compute/algorithms/_unique_by_key.py +252 -0
- cuda/compute/cccl/.gitkeep +0 -0
- cuda/compute/cu12/_bindings_impl.cp313-win_amd64.pyd +0 -0
- cuda/compute/cu12/cccl/cccl.c.parallel.dll +0 -0
- cuda/compute/cu12/cccl/cccl.c.parallel.lib +0 -0
- cuda/compute/cu13/_bindings_impl.cp313-win_amd64.pyd +0 -0
- cuda/compute/cu13/cccl/cccl.c.parallel.dll +0 -0
- cuda/compute/cu13/cccl/cccl.c.parallel.lib +0 -0
- cuda/compute/iterators/__init__.py +21 -0
- cuda/compute/iterators/_factories.py +219 -0
- cuda/compute/iterators/_iterators.py +817 -0
- cuda/compute/iterators/_zip_iterator.py +199 -0
- cuda/compute/numba_utils.py +53 -0
- cuda/compute/op.py +3 -0
- cuda/compute/struct.py +272 -0
- cuda/compute/typing.py +37 -0
- cuda/coop/__init__.py +8 -0
- cuda/coop/_caching.py +48 -0
- cuda/coop/_common.py +275 -0
- cuda/coop/_nvrtc.py +92 -0
- cuda/coop/_scan_op.py +181 -0
- cuda/coop/_types.py +937 -0
- cuda/coop/_typing.py +107 -0
- cuda/coop/block/__init__.py +39 -0
- cuda/coop/block/_block_exchange.py +251 -0
- cuda/coop/block/_block_load_store.py +215 -0
- cuda/coop/block/_block_merge_sort.py +125 -0
- cuda/coop/block/_block_radix_sort.py +214 -0
- cuda/coop/block/_block_reduce.py +294 -0
- cuda/coop/block/_block_scan.py +983 -0
- cuda/coop/warp/__init__.py +9 -0
- cuda/coop/warp/_warp_merge_sort.py +92 -0
- cuda/coop/warp/_warp_reduce.py +153 -0
- cuda/coop/warp/_warp_scan.py +78 -0
- cuda_cccl-0.3.3.dist-info/METADATA +41 -0
- cuda_cccl-0.3.3.dist-info/RECORD +1968 -0
- cuda_cccl-0.3.3.dist-info/WHEEL +5 -0
- cuda_cccl-0.3.3.dist-info/licenses/LICENSE +1 -0
|
@@ -0,0 +1,1496 @@
|
|
|
1
|
+
/******************************************************************************
|
|
2
|
+
* Copyright (c) 2011, Duane Merrill. All rights reserved.
|
|
3
|
+
* Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
|
|
4
|
+
*
|
|
5
|
+
* Redistribution and use in source and binary forms, with or without
|
|
6
|
+
* modification, are permitted provided that the following conditions are met:
|
|
7
|
+
* * Redistributions of source code must retain the above copyright
|
|
8
|
+
* notice, this list of conditions and the following disclaimer.
|
|
9
|
+
* * Redistributions in binary form must reproduce the above copyright
|
|
10
|
+
* notice, this list of conditions and the following disclaimer in the
|
|
11
|
+
* documentation and/or other materials provided with the distribution.
|
|
12
|
+
* * Neither the name of the NVIDIA CORPORATION nor the
|
|
13
|
+
* names of its contributors may be used to endorse or promote products
|
|
14
|
+
* derived from this software without specific prior written permission.
|
|
15
|
+
*
|
|
16
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
17
|
+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
18
|
+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
19
|
+
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
|
|
20
|
+
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
|
21
|
+
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
22
|
+
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
|
23
|
+
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
24
|
+
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
25
|
+
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
26
|
+
*
|
|
27
|
+
******************************************************************************/
|
|
28
|
+
|
|
29
|
+
//! @file
|
|
30
|
+
//! cub::DeviceSegmentedRadixSort provides device-wide, parallel operations for computing a batched radix sort across
|
|
31
|
+
//! multiple, non-overlapping sequences of data items residing within device-accessible memory.
|
|
32
|
+
|
|
33
|
+
#pragma once
|
|
34
|
+
|
|
35
|
+
#include <cub/config.cuh>
|
|
36
|
+
|
|
37
|
+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
|
|
38
|
+
# pragma GCC system_header
|
|
39
|
+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
|
|
40
|
+
# pragma clang system_header
|
|
41
|
+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
|
|
42
|
+
# pragma system_header
|
|
43
|
+
#endif // no system header
|
|
44
|
+
|
|
45
|
+
#include <cub/detail/choose_offset.cuh>
|
|
46
|
+
#include <cub/device/dispatch/dispatch_radix_sort.cuh>
|
|
47
|
+
|
|
48
|
+
CUB_NAMESPACE_BEGIN
|
|
49
|
+
|
|
50
|
+
//! @rst
|
|
51
|
+
//! DeviceSegmentedRadixSort provides device-wide, parallel operations
|
|
52
|
+
//! for computing a batched radix sort across multiple, non-overlapping
|
|
53
|
+
//! sequences of data items residing within device-accessible memory.
|
|
54
|
+
//!
|
|
55
|
+
//! Overview
|
|
56
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
57
|
+
//!
|
|
58
|
+
//! The `radix sorting method <http://en.wikipedia.org/wiki/Radix_sort>`_
|
|
59
|
+
//! arranges items into ascending (or descending) order. The algorithm relies
|
|
60
|
+
//! upon a positional representation for keys, i.e., each key is comprised of an
|
|
61
|
+
//! ordered sequence of symbols (e.g., digits, characters, etc.) specified from
|
|
62
|
+
//! least-significant to most-significant. For a given input sequence of keys
|
|
63
|
+
//! and a set of rules specifying a total ordering of the symbolic alphabet, the
|
|
64
|
+
//! radix sorting method produces a lexicographic ordering of those keys.
|
|
65
|
+
//!
|
|
66
|
+
//! See Also
|
|
67
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
68
|
+
//!
|
|
69
|
+
//! DeviceSegmentedRadixSort shares its implementation with DeviceRadixSort. See
|
|
70
|
+
//! that algorithm's documentation for more information.
|
|
71
|
+
//!
|
|
72
|
+
//! Segments are not required to be contiguous. Any element of input(s) or
|
|
73
|
+
//! output(s) outside the specified segments will not be accessed nor modified.
|
|
74
|
+
//!
|
|
75
|
+
//! Usage Considerations
|
|
76
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
77
|
+
//!
|
|
78
|
+
//! @cdp_class{DeviceSegmentedRadixSort}
|
|
79
|
+
//!
|
|
80
|
+
//! @endrst
|
|
81
|
+
struct DeviceSegmentedRadixSort
|
|
82
|
+
{
|
|
83
|
+
private:
|
|
84
|
+
// Name reported for NVTX ranges
|
|
85
|
+
_CCCL_HOST_DEVICE static constexpr auto GetName() -> const char*
|
|
86
|
+
{
|
|
87
|
+
return "cub::DeviceSegmentedRadixSort";
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
public:
|
|
91
|
+
//! @name Key-value pairs
|
|
92
|
+
//! @{
|
|
93
|
+
|
|
94
|
+
//! @rst
|
|
95
|
+
//! Sorts segments of key-value pairs into ascending order. (``~2N`` auxiliary storage required)
|
|
96
|
+
//!
|
|
97
|
+
//! - The contents of the input data are not altered by the sorting operation
|
|
98
|
+
//! - When the input is a contiguous sequence of segments, a single sequence
|
|
99
|
+
//! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
|
|
100
|
+
//! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
|
|
101
|
+
//! the latter is specified as ``segment_offsets + 1``).
|
|
102
|
+
//! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
|
|
103
|
+
//! bits can be specified. This can reduce overall sorting overhead and
|
|
104
|
+
//! yield a corresponding performance improvement.
|
|
105
|
+
//! - Let ``in`` be one of ``{d_keys_in, d_values_in}`` and ``out`` be any of
|
|
106
|
+
//! ``{d_keys_out, d_values_out}``. The range ``[out, out + num_items)`` shall
|
|
107
|
+
//! not overlap ``[in, in + num_items)``,
|
|
108
|
+
//! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
|
|
109
|
+
//! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
|
|
110
|
+
//! - @devicestorageNP For sorting using only ``O(P)`` temporary storage, see
|
|
111
|
+
//! the sorting interface using DoubleBuffer wrappers below.
|
|
112
|
+
//! - Segments are not required to be contiguous. For all index values ``i``
|
|
113
|
+
//! outside the specified segments ``d_keys_in[i]``, ``d_values_in[i]``,
|
|
114
|
+
//! ``d_keys_out[i]``, ``d_values_out[i]`` will not be accessed nor modified.
|
|
115
|
+
//! - Note, the size of any segment may not exceed ``INT_MAX``. Please consider using ``DeviceSegmentedSort`` instead,
|
|
116
|
+
//! if the size of at least one of your segments could exceed ``INT_MAX``.
|
|
117
|
+
//! - @devicestorage
|
|
118
|
+
//!
|
|
119
|
+
//! Snippet
|
|
120
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
121
|
+
//!
|
|
122
|
+
//! The code snippet below illustrates the batched sorting of three segments
|
|
123
|
+
//! (with one zero-length segment) of ``int`` keys with associated vector of ``int`` values.
|
|
124
|
+
//!
|
|
125
|
+
//! .. code-block:: c++
|
|
126
|
+
//!
|
|
127
|
+
//! #include <cub/cub.cuh> // or equivalently <cub/device/device_segmented_radix_sort.cuh>
|
|
128
|
+
//!
|
|
129
|
+
//! // Declare, allocate, and initialize device-accessible pointers for sorting data
|
|
130
|
+
//! int num_items; // e.g., 7
|
|
131
|
+
//! int num_segments; // e.g., 3
|
|
132
|
+
//! int *d_offsets; // e.g., [0, 3, 3, 7]
|
|
133
|
+
//! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
134
|
+
//! int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
|
|
135
|
+
//! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6]
|
|
136
|
+
//! int *d_values_out; // e.g., [-, -, -, -, -, -, -]
|
|
137
|
+
//! ...
|
|
138
|
+
//!
|
|
139
|
+
//! // Determine temporary device storage requirements
|
|
140
|
+
//! void *d_temp_storage = nullptr;
|
|
141
|
+
//! size_t temp_storage_bytes = 0;
|
|
142
|
+
//! cub::DeviceSegmentedRadixSort::SortPairs(
|
|
143
|
+
//! d_temp_storage, temp_storage_bytes,
|
|
144
|
+
//! d_keys_in, d_keys_out, d_values_in, d_values_out,
|
|
145
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
146
|
+
//!
|
|
147
|
+
//! // Allocate temporary storage
|
|
148
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
149
|
+
//!
|
|
150
|
+
//! // Run sorting operation
|
|
151
|
+
//! cub::DeviceSegmentedRadixSort::SortPairs(
|
|
152
|
+
//! d_temp_storage, temp_storage_bytes,
|
|
153
|
+
//! d_keys_in, d_keys_out, d_values_in, d_values_out,
|
|
154
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
155
|
+
//!
|
|
156
|
+
//! // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9]
|
|
157
|
+
//! // d_values_out <-- [1, 2, 0, 5, 4, 3, 6]
|
|
158
|
+
//!
|
|
159
|
+
//! @endrst
|
|
160
|
+
//!
|
|
161
|
+
//! @tparam KeyT
|
|
162
|
+
//! **[inferred]** Key type
|
|
163
|
+
//!
|
|
164
|
+
//! @tparam ValueT
|
|
165
|
+
//! **[inferred]** Value type
|
|
166
|
+
//!
|
|
167
|
+
//! @tparam BeginOffsetIteratorT
|
|
168
|
+
//! **[inferred]** Random-access input iterator type for reading segment beginning offsets @iterator
|
|
169
|
+
//!
|
|
170
|
+
//! @tparam EndOffsetIteratorT
|
|
171
|
+
//! **[inferred]** Random-access input iterator type for reading segment ending offsets @iterator
|
|
172
|
+
//!
|
|
173
|
+
//! @param[in] d_temp_storage
|
|
174
|
+
//! Device-accessible allocation of temporary storage. When `nullptr`, the
|
|
175
|
+
//! required allocation size is written to `temp_storage_bytes` and no work is done.
|
|
176
|
+
//!
|
|
177
|
+
//! @param[in,out] temp_storage_bytes
|
|
178
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
179
|
+
//!
|
|
180
|
+
//! @param[in] d_keys_in
|
|
181
|
+
//! Device-accessible pointer to the input sequence of key data to sort
|
|
182
|
+
//!
|
|
183
|
+
//! @param[out] d_keys_out
|
|
184
|
+
//! Device-accessible pointer to the sorted output sequence of key data
|
|
185
|
+
//!
|
|
186
|
+
//! @param[in] d_values_in
|
|
187
|
+
//! Device-accessible pointer to the corresponding input sequence of
|
|
188
|
+
//! associated value items
|
|
189
|
+
//!
|
|
190
|
+
//! @param[out] d_values_out
|
|
191
|
+
//! Device-accessible pointer to the correspondingly-reordered output
|
|
192
|
+
//! sequence of associated value items
|
|
193
|
+
//!
|
|
194
|
+
//! @param[in] num_items
|
|
195
|
+
//! The total number of items within the segmented array, including items not
|
|
196
|
+
//! covered by segments. `num_items` should match the largest element within
|
|
197
|
+
//! the range `[d_end_offsets, d_end_offsets + num_segments)`.
|
|
198
|
+
//!
|
|
199
|
+
//! @param[in] num_segments
|
|
200
|
+
//! The number of segments that comprise the sorting data
|
|
201
|
+
//!
|
|
202
|
+
//! @param[in] d_begin_offsets
|
|
203
|
+
//! Random-access input iterator to the sequence of beginning offsets of
|
|
204
|
+
//! length `num_segments`, such that `d_begin_offsets[i]` is the first
|
|
205
|
+
//! element of the *i*<sup>th</sup> data segment in `d_keys_*` and `d_values_*`
|
|
206
|
+
//!
|
|
207
|
+
//! @param[in] d_end_offsets
|
|
208
|
+
//! @rst
|
|
209
|
+
//! Random-access input iterator to the sequence of ending offsets of length
|
|
210
|
+
//! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
|
|
211
|
+
//! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``. If
|
|
212
|
+
//! ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` segment is considered empty.
|
|
213
|
+
//! @endrst
|
|
214
|
+
//!
|
|
215
|
+
//! @param[in] begin_bit
|
|
216
|
+
//! **[optional]** The least-significant bit index (inclusive) needed for key comparison
|
|
217
|
+
//!
|
|
218
|
+
//! @param[in] end_bit
|
|
219
|
+
//! **[optional]** The most-significant bit index (exclusive) needed for key
|
|
220
|
+
//! comparison (e.g., `sizeof(unsigned int) * 8`)
|
|
221
|
+
//!
|
|
222
|
+
//! @param[in] stream
|
|
223
|
+
//! @rst
|
|
224
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
225
|
+
//! @endrst
|
|
226
|
+
template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
|
|
227
|
+
CUB_RUNTIME_FUNCTION static cudaError_t SortPairs(
|
|
228
|
+
void* d_temp_storage,
|
|
229
|
+
size_t& temp_storage_bytes,
|
|
230
|
+
const KeyT* d_keys_in,
|
|
231
|
+
KeyT* d_keys_out,
|
|
232
|
+
const ValueT* d_values_in,
|
|
233
|
+
ValueT* d_values_out,
|
|
234
|
+
::cuda::std::int64_t num_items,
|
|
235
|
+
::cuda::std::int64_t num_segments,
|
|
236
|
+
BeginOffsetIteratorT d_begin_offsets,
|
|
237
|
+
EndOffsetIteratorT d_end_offsets,
|
|
238
|
+
int begin_bit = 0,
|
|
239
|
+
int end_bit = sizeof(KeyT) * 8,
|
|
240
|
+
cudaStream_t stream = 0)
|
|
241
|
+
{
|
|
242
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
|
|
243
|
+
|
|
244
|
+
// Signed integer type for segment sizes
|
|
245
|
+
using SegmentSizeT = ::cuda::std::int32_t;
|
|
246
|
+
|
|
247
|
+
DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
|
|
248
|
+
DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out);
|
|
249
|
+
|
|
250
|
+
return DispatchSegmentedRadixSort<
|
|
251
|
+
SortOrder::Ascending,
|
|
252
|
+
KeyT,
|
|
253
|
+
ValueT,
|
|
254
|
+
BeginOffsetIteratorT,
|
|
255
|
+
EndOffsetIteratorT,
|
|
256
|
+
SegmentSizeT>::Dispatch(d_temp_storage,
|
|
257
|
+
temp_storage_bytes,
|
|
258
|
+
d_keys,
|
|
259
|
+
d_values,
|
|
260
|
+
num_items,
|
|
261
|
+
num_segments,
|
|
262
|
+
d_begin_offsets,
|
|
263
|
+
d_end_offsets,
|
|
264
|
+
begin_bit,
|
|
265
|
+
end_bit,
|
|
266
|
+
false,
|
|
267
|
+
stream);
|
|
268
|
+
}
|
|
269
|
+
|
|
270
|
+
//! @rst
|
|
271
|
+
//! Sorts segments of key-value pairs into ascending order. (``~N`` auxiliary storage required)
|
|
272
|
+
//!
|
|
273
|
+
//! - The sorting operation is given a pair of key buffers and a corresponding
|
|
274
|
+
//! pair of associated value buffers. Each pair is managed by a DoubleBuffer
|
|
275
|
+
//! structure that indicates which of the two buffers is "current" (and thus
|
|
276
|
+
//! contains the input data to be sorted).
|
|
277
|
+
//! - The contents of both buffers within each pair may be altered by the sorting operation.
|
|
278
|
+
//! - Upon completion, the sorting operation will update the "current"
|
|
279
|
+
//! indicator within each DoubleBuffer wrapper to reference which of the two
|
|
280
|
+
//! buffers now contains the sorted output sequence (a function of the number
|
|
281
|
+
//! of key bits specified and the targeted device architecture).
|
|
282
|
+
//! - When the input is a contiguous sequence of segments, a single sequence
|
|
283
|
+
//! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased for both
|
|
284
|
+
//! the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where the latter is
|
|
285
|
+
//! specified as ``segment_offsets + 1``).
|
|
286
|
+
//! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
|
|
287
|
+
//! bits can be specified. This can reduce overall sorting overhead and yield
|
|
288
|
+
//! a corresponding performance improvement.
|
|
289
|
+
//! - Let ``cur`` be one of ``{d_keys.Current(), d_values.Current()}`` and ``alt``
|
|
290
|
+
//! be any of ``{d_keys.Alternate(), d_values.Alternate()}``. The range
|
|
291
|
+
//! ``[cur, cur + num_items)`` shall not overlap
|
|
292
|
+
//! ``[alt, alt + num_items)``. Both ranges shall not overlap
|
|
293
|
+
//! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
|
|
294
|
+
//! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
|
|
295
|
+
//! - Segments are not required to be contiguous. For all index values ``i``
|
|
296
|
+
//! outside the specified segments ``d_keys.Current()[i]``,
|
|
297
|
+
//! ``d_values.Current()[i]``, ``d_keys.Alternate()[i]``,
|
|
298
|
+
//! ``d_values.Alternate()[i]`` will not be accessed nor modified.
|
|
299
|
+
//! - Note, the size of any segment may not exceed ``INT_MAX``. Please consider using ``DeviceSegmentedSort`` instead,
|
|
300
|
+
//! if the size of at least one of your segments could exceed ``INT_MAX``.
|
|
301
|
+
//! - @devicestorageP
|
|
302
|
+
//! - @devicestorage
|
|
303
|
+
//!
|
|
304
|
+
//! Snippet
|
|
305
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
306
|
+
//!
|
|
307
|
+
//! The code snippet below illustrates the batched sorting of three segments
|
|
308
|
+
//! (with one zero-length segment) of ``int`` keys with associated vector of ``int`` values.
|
|
309
|
+
//!
|
|
310
|
+
//! .. code-block:: c++
|
|
311
|
+
//!
|
|
312
|
+
//! #include <cub/cub.cuh> // or equivalently <cub/device/device_segmented_radix_sort.cuh>
|
|
313
|
+
//!
|
|
314
|
+
//! // Declare, allocate, and initialize device-accessible pointers
|
|
315
|
+
//! // for sorting data
|
|
316
|
+
//! int num_items; // e.g., 7
|
|
317
|
+
//! int num_segments; // e.g., 3
|
|
318
|
+
//! int *d_offsets; // e.g., [0, 3, 3, 7]
|
|
319
|
+
//! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
320
|
+
//! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -]
|
|
321
|
+
//! int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6]
|
|
322
|
+
//! int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -]
|
|
323
|
+
//! ...
|
|
324
|
+
//!
|
|
325
|
+
//! // Create a set of DoubleBuffers to wrap pairs of device pointers
|
|
326
|
+
//! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
|
|
327
|
+
//! cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
|
|
328
|
+
//!
|
|
329
|
+
//! // Determine temporary device storage requirements
|
|
330
|
+
//! void *d_temp_storage = nullptr;
|
|
331
|
+
//! size_t temp_storage_bytes = 0;
|
|
332
|
+
//! cub::DeviceSegmentedRadixSort::SortPairs(
|
|
333
|
+
//! d_temp_storage, temp_storage_bytes, d_keys, d_values,
|
|
334
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
335
|
+
//!
|
|
336
|
+
//! // Allocate temporary storage
|
|
337
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
338
|
+
//!
|
|
339
|
+
//! // Run sorting operation
|
|
340
|
+
//! cub::DeviceSegmentedRadixSort::SortPairs(
|
|
341
|
+
//! d_temp_storage, temp_storage_bytes, d_keys, d_values,
|
|
342
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
343
|
+
//!
|
|
344
|
+
//! // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9]
|
|
345
|
+
//! // d_values.Current() <-- [1, 2, 0, 5, 4, 3, 6]
|
|
346
|
+
//!
|
|
347
|
+
//! @endrst
|
|
348
|
+
//!
|
|
349
|
+
//! @tparam KeyT
|
|
350
|
+
//! **[inferred]** Key type
|
|
351
|
+
//!
|
|
352
|
+
//! @tparam ValueT
|
|
353
|
+
//! **[inferred]** Value type
|
|
354
|
+
//!
|
|
355
|
+
//! @tparam BeginOffsetIteratorT
|
|
356
|
+
//! **[inferred]** Random-access input iterator type for reading segment beginning offsets @iterator
|
|
357
|
+
//!
|
|
358
|
+
//! @tparam EndOffsetIteratorT
|
|
359
|
+
//! **[inferred]** Random-access input iterator type for reading segment ending offsets @iterator
|
|
360
|
+
//!
|
|
361
|
+
//! @param[in] d_temp_storage
|
|
362
|
+
//! Device-accessible allocation of temporary storage. When `nullptr`, the
|
|
363
|
+
//! required allocation size is written to `temp_storage_bytes` and no work is done.
|
|
364
|
+
//!
|
|
365
|
+
//! @param[in,out] temp_storage_bytes
|
|
366
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
367
|
+
//!
|
|
368
|
+
//! @param[in,out] d_keys
|
|
369
|
+
//! Reference to the double-buffer of keys whose "current" device-accessible
|
|
370
|
+
//! buffer contains the unsorted input keys and, upon return, is updated to
|
|
371
|
+
//! point to the sorted output keys
|
|
372
|
+
//!
|
|
373
|
+
//! @param[in,out] d_values
|
|
374
|
+
//! Double-buffer of values whose "current" device-accessible buffer
|
|
375
|
+
//! contains the unsorted input values and, upon return, is updated to point
|
|
376
|
+
//! to the sorted output values
|
|
377
|
+
//!
|
|
378
|
+
//! @param[in] num_items
|
|
379
|
+
//! The total number of items within the segmented array, including items not
|
|
380
|
+
//! covered by segments. `num_items` should match the largest element within
|
|
381
|
+
//! the range `[d_end_offsets, d_end_offsets + num_segments)`.
|
|
382
|
+
//!
|
|
383
|
+
//! @param[in] num_segments
|
|
384
|
+
//! The number of segments that comprise the sorting data
|
|
385
|
+
//!
|
|
386
|
+
//! @param[in] d_begin_offsets
|
|
387
|
+
//! @rst
|
|
388
|
+
//! Random-access input iterator to the sequence of beginning offsets of
|
|
389
|
+
//! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
|
|
390
|
+
//! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
|
|
391
|
+
//! @endrst
|
|
392
|
+
//!
|
|
393
|
+
//! @param[in] d_end_offsets
|
|
394
|
+
//! @rst
|
|
395
|
+
//! Random-access input iterator to the sequence of ending offsets of length
|
|
396
|
+
//! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
|
|
397
|
+
//! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
|
|
398
|
+
//! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` segment is considered empty.
|
|
399
|
+
//! @endrst
|
|
400
|
+
//!
|
|
401
|
+
//! @param[in] begin_bit
|
|
402
|
+
//! **[optional]** The least-significant bit index (inclusive) needed for key comparison
|
|
403
|
+
//!
|
|
404
|
+
//! @param[in] end_bit
|
|
405
|
+
//! **[optional]** The most-significant bit index (exclusive) needed for key
|
|
406
|
+
//! comparison (e.g., `sizeof(unsigned int) * 8`)
|
|
407
|
+
//!
|
|
408
|
+
//! @param[in] stream
|
|
409
|
+
//! @rst
|
|
410
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
411
|
+
//! @endrst
|
|
412
|
+
template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
|
|
413
|
+
CUB_RUNTIME_FUNCTION static cudaError_t SortPairs(
|
|
414
|
+
void* d_temp_storage,
|
|
415
|
+
size_t& temp_storage_bytes,
|
|
416
|
+
DoubleBuffer<KeyT>& d_keys,
|
|
417
|
+
DoubleBuffer<ValueT>& d_values,
|
|
418
|
+
::cuda::std::int64_t num_items,
|
|
419
|
+
::cuda::std::int64_t num_segments,
|
|
420
|
+
BeginOffsetIteratorT d_begin_offsets,
|
|
421
|
+
EndOffsetIteratorT d_end_offsets,
|
|
422
|
+
int begin_bit = 0,
|
|
423
|
+
int end_bit = sizeof(KeyT) * 8,
|
|
424
|
+
cudaStream_t stream = 0)
|
|
425
|
+
{
|
|
426
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
|
|
427
|
+
|
|
428
|
+
// Signed integer type for global offsets
|
|
429
|
+
using SegmentSizeT = ::cuda::std::int32_t;
|
|
430
|
+
|
|
431
|
+
return DispatchSegmentedRadixSort<
|
|
432
|
+
SortOrder::Ascending,
|
|
433
|
+
KeyT,
|
|
434
|
+
ValueT,
|
|
435
|
+
BeginOffsetIteratorT,
|
|
436
|
+
EndOffsetIteratorT,
|
|
437
|
+
SegmentSizeT>::Dispatch(d_temp_storage,
|
|
438
|
+
temp_storage_bytes,
|
|
439
|
+
d_keys,
|
|
440
|
+
d_values,
|
|
441
|
+
num_items,
|
|
442
|
+
num_segments,
|
|
443
|
+
d_begin_offsets,
|
|
444
|
+
d_end_offsets,
|
|
445
|
+
begin_bit,
|
|
446
|
+
end_bit,
|
|
447
|
+
true,
|
|
448
|
+
stream);
|
|
449
|
+
}
|
|
450
|
+
|
|
451
|
+
//! @rst
|
|
452
|
+
//! Sorts segments of key-value pairs into descending order. (``~2N`` auxiliary storage required).
|
|
453
|
+
//!
|
|
454
|
+
//! - The contents of the input data are not altered by the sorting operation
|
|
455
|
+
//! - When the input is a contiguous sequence of segments, a single sequence
|
|
456
|
+
//! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased for both
|
|
457
|
+
//! the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where the latter is
|
|
458
|
+
//! specified as ``segment_offsets + 1``).
|
|
459
|
+
//! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
|
|
460
|
+
//! bits can be specified. This can reduce overall sorting overhead and
|
|
461
|
+
//! yield a corresponding performance improvement.
|
|
462
|
+
//! - Let ``in`` be one of ``{d_keys_in, d_values_in}`` and ``out`` be any of
|
|
463
|
+
//! ``{d_keys_out, d_values_out}``. The range ``[out, out + num_items)`` shall
|
|
464
|
+
//! not overlap ``[in, in + num_items)``,
|
|
465
|
+
//! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
|
|
466
|
+
//! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
|
|
467
|
+
//! - @devicestorageNP For sorting using only ``O(P)`` temporary storage, see
|
|
468
|
+
//! the sorting interface using DoubleBuffer wrappers below.
|
|
469
|
+
//! - Segments are not required to be contiguous. For all index values ``i``
|
|
470
|
+
//! outside the specified segments ``d_keys_in[i]``, ``d_values_in[i]``,
|
|
471
|
+
//! ``d_keys_out[i]``, ``d_values_out[i]`` will not be accessed nor modified.
|
|
472
|
+
//! - Note, the size of any segment may not exceed ``INT_MAX``. Please consider using ``DeviceSegmentedSort`` instead,
|
|
473
|
+
//! if the size of at least one of your segments could exceed ``INT_MAX``.
|
|
474
|
+
//! - @devicestorage
|
|
475
|
+
//!
|
|
476
|
+
//! Snippet
|
|
477
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
478
|
+
//!
|
|
479
|
+
//! The code snippet below illustrates the batched sorting of three segments
|
|
480
|
+
//! (with one zero-length segment) of ``int`` keys with associated vector of ``int`` values.
|
|
481
|
+
//!
|
|
482
|
+
//! .. code-block:: c++
|
|
483
|
+
//!
|
|
484
|
+
//! #include <cub/cub.cuh> // or equivalently <cub/device/device_segmented_radix_sort.cuh>
|
|
485
|
+
//!
|
|
486
|
+
//! // Declare, allocate, and initialize device-accessible pointers
|
|
487
|
+
//! // for sorting data
|
|
488
|
+
//! int num_items; // e.g., 7
|
|
489
|
+
//! int num_segments; // e.g., 3
|
|
490
|
+
//! int *d_offsets; // e.g., [0, 3, 3, 7]
|
|
491
|
+
//! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
492
|
+
//! int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
|
|
493
|
+
//! int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6]
|
|
494
|
+
//! int *d_values_out; // e.g., [-, -, -, -, -, -, -]
|
|
495
|
+
//! ...
|
|
496
|
+
//!
|
|
497
|
+
//! // Determine temporary device storage requirements
|
|
498
|
+
//! void *d_temp_storage = nullptr;
|
|
499
|
+
//! size_t temp_storage_bytes = 0;
|
|
500
|
+
//! cub::DeviceSegmentedRadixSort::SortPairsDescending(
|
|
501
|
+
//! d_temp_storage, temp_storage_bytes,
|
|
502
|
+
//! d_keys_in, d_keys_out, d_values_in, d_values_out,
|
|
503
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
504
|
+
//!
|
|
505
|
+
//! // Allocate temporary storage
|
|
506
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
507
|
+
//!
|
|
508
|
+
//! // Run sorting operation
|
|
509
|
+
//! cub::DeviceSegmentedRadixSort::SortPairsDescending(
|
|
510
|
+
//! d_temp_storage, temp_storage_bytes,
|
|
511
|
+
//! d_keys_in, d_keys_out, d_values_in, d_values_out,
|
|
512
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
513
|
+
//!
|
|
514
|
+
//! // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0]
|
|
515
|
+
//! // d_values_out <-- [0, 2, 1, 6, 3, 4, 5]
|
|
516
|
+
//!
|
|
517
|
+
//! @endrst
|
|
518
|
+
//!
|
|
519
|
+
//! @tparam KeyT
|
|
520
|
+
//! **[inferred]** Key type
|
|
521
|
+
//!
|
|
522
|
+
//! @tparam ValueT
|
|
523
|
+
//! **[inferred]** Value type
|
|
524
|
+
//!
|
|
525
|
+
//! @tparam BeginOffsetIteratorT
|
|
526
|
+
//! **[inferred]** Random-access input iterator type for reading segment
|
|
527
|
+
//! beginning offsets @iterator
|
|
528
|
+
//!
|
|
529
|
+
//! @tparam EndOffsetIteratorT
|
|
530
|
+
//! **[inferred]** Random-access input iterator type for reading segment
|
|
531
|
+
//! ending offsets @iterator
|
|
532
|
+
//!
|
|
533
|
+
//! @param[in] d_temp_storage
|
|
534
|
+
//! Device-accessible allocation of temporary storage. When `nullptr`, the
|
|
535
|
+
//! required allocation size is written to `temp_storage_bytes` and no work is done.
|
|
536
|
+
//!
|
|
537
|
+
//! @param[in,out] temp_storage_bytes
|
|
538
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
539
|
+
//!
|
|
540
|
+
//! @param[in] d_keys_in
|
|
541
|
+
//! Device-accessible pointer to the input sequence of key data to sort
|
|
542
|
+
//!
|
|
543
|
+
//! @param[out] d_keys_out
|
|
544
|
+
//! Device-accessible pointer to the sorted output sequence of key data
|
|
545
|
+
//!
|
|
546
|
+
//! @param[in] d_values_in
|
|
547
|
+
//! Device-accessible pointer to the corresponding input sequence of
|
|
548
|
+
//! associated value items
|
|
549
|
+
//!
|
|
550
|
+
//! @param[out] d_values_out
|
|
551
|
+
//! Device-accessible pointer to the correspondingly-reordered output
|
|
552
|
+
//! sequence of associated value items
|
|
553
|
+
//!
|
|
554
|
+
//! @param[in] num_items
|
|
555
|
+
//! The total number of items within the segmented array, including items not
|
|
556
|
+
//! covered by segments. `num_items` should match the largest element within
|
|
557
|
+
//! the range `[d_end_offsets, d_end_offsets + num_segments)`.
|
|
558
|
+
//!
|
|
559
|
+
//! @param[in] num_segments
|
|
560
|
+
//! The number of segments that comprise the sorting data
|
|
561
|
+
//!
|
|
562
|
+
//! @param[in] d_begin_offsets
|
|
563
|
+
//! @rst
|
|
564
|
+
//! Random-access input iterator to the sequence of beginning offsets of
|
|
565
|
+
//! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
|
|
566
|
+
//! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
|
|
567
|
+
//! @endrst
|
|
568
|
+
//!
|
|
569
|
+
//! @param[in] d_end_offsets
|
|
570
|
+
//! @rst
|
|
571
|
+
//! Random-access input iterator to the sequence of ending offsets of length
|
|
572
|
+
//! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
|
|
573
|
+
//! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
|
|
574
|
+
//! If ``d_end_offsets[i] <= d_begin_offsets[i]``, the *i*\ :sup:`th` segment is considered empty.
|
|
575
|
+
//! @endrst
|
|
576
|
+
//!
|
|
577
|
+
//! @param[in] begin_bit
|
|
578
|
+
//! **[optional]** The least-significant bit index (inclusive) needed for key comparison
|
|
579
|
+
//!
|
|
580
|
+
//! @param[in] end_bit
|
|
581
|
+
//! **[optional]** The most-significant bit index (exclusive) needed for key
|
|
582
|
+
//! comparison (e.g., `sizeof(unsigned int) * 8`)
|
|
583
|
+
//!
|
|
584
|
+
//! @param[in] stream
|
|
585
|
+
//! @rst
|
|
586
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
587
|
+
//! @endrst
|
|
588
|
+
template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
|
|
589
|
+
CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending(
|
|
590
|
+
void* d_temp_storage,
|
|
591
|
+
size_t& temp_storage_bytes,
|
|
592
|
+
const KeyT* d_keys_in,
|
|
593
|
+
KeyT* d_keys_out,
|
|
594
|
+
const ValueT* d_values_in,
|
|
595
|
+
ValueT* d_values_out,
|
|
596
|
+
::cuda::std::int64_t num_items,
|
|
597
|
+
::cuda::std::int64_t num_segments,
|
|
598
|
+
BeginOffsetIteratorT d_begin_offsets,
|
|
599
|
+
EndOffsetIteratorT d_end_offsets,
|
|
600
|
+
int begin_bit = 0,
|
|
601
|
+
int end_bit = sizeof(KeyT) * 8,
|
|
602
|
+
cudaStream_t stream = 0)
|
|
603
|
+
{
|
|
604
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
|
|
605
|
+
|
|
606
|
+
// Signed integer type for global offsets
|
|
607
|
+
using SegmentSizeT = ::cuda::std::int32_t;
|
|
608
|
+
|
|
609
|
+
DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
|
|
610
|
+
DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out);
|
|
611
|
+
|
|
612
|
+
return DispatchSegmentedRadixSort<
|
|
613
|
+
SortOrder::Descending,
|
|
614
|
+
KeyT,
|
|
615
|
+
ValueT,
|
|
616
|
+
BeginOffsetIteratorT,
|
|
617
|
+
EndOffsetIteratorT,
|
|
618
|
+
SegmentSizeT>::Dispatch(d_temp_storage,
|
|
619
|
+
temp_storage_bytes,
|
|
620
|
+
d_keys,
|
|
621
|
+
d_values,
|
|
622
|
+
num_items,
|
|
623
|
+
num_segments,
|
|
624
|
+
d_begin_offsets,
|
|
625
|
+
d_end_offsets,
|
|
626
|
+
begin_bit,
|
|
627
|
+
end_bit,
|
|
628
|
+
false,
|
|
629
|
+
stream);
|
|
630
|
+
}
|
|
631
|
+
|
|
632
|
+
//! @rst
|
|
633
|
+
//! Sorts segments of key-value pairs into descending order. (``~N`` auxiliary storage required).
|
|
634
|
+
//!
|
|
635
|
+
//! - The sorting operation is given a pair of key buffers and a corresponding
|
|
636
|
+
//! pair of associated value buffers. Each pair is managed by a DoubleBuffer
|
|
637
|
+
//! structure that indicates which of the two buffers is "current" (and thus
|
|
638
|
+
//! contains the input data to be sorted).
|
|
639
|
+
//! - The contents of both buffers within each pair may be altered by the
|
|
640
|
+
//! sorting operation.
|
|
641
|
+
//! - Upon completion, the sorting operation will update the "current"
|
|
642
|
+
//! indicator within each DoubleBuffer wrapper to reference which of the two
|
|
643
|
+
//! buffers now contains the sorted output sequence (a function of the number
|
|
644
|
+
//! of key bits specified and the targeted device architecture).
|
|
645
|
+
//! - When the input is a contiguous sequence of segments, a single sequence
|
|
646
|
+
//! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased for both
|
|
647
|
+
//! the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where the latter is
|
|
648
|
+
//! specified as ``segment_offsets + 1``).
|
|
649
|
+
//! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
|
|
650
|
+
//! bits can be specified. This can reduce overall sorting overhead and
|
|
651
|
+
//! yield a corresponding performance improvement.
|
|
652
|
+
//! - Let ``cur`` be one of ``{d_keys.Current(), d_values.Current()}`` and ``alt``
|
|
653
|
+
//! be any of ``{d_keys.Alternate(), d_values.Alternate()}``. The range
|
|
654
|
+
//! ``[cur, cur + num_items)`` shall not overlap
|
|
655
|
+
//! ``[alt, alt + num_items)``. Both ranges shall not overlap
|
|
656
|
+
//! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
|
|
657
|
+
//! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
|
|
658
|
+
//! - Segments are not required to be contiguous. For all index values ``i``
|
|
659
|
+
//! outside the specified segments ``d_keys.Current()[i]``,
|
|
660
|
+
//! ``d_values.Current()[i]``, ``d_keys.Alternate()[i]``,
|
|
661
|
+
//! ``d_values.Alternate()[i]`` will not be accessed nor modified.
|
|
662
|
+
//! In particular, data outside the specified segments is not modified.
|
|
663
|
+
//! - Note, the size of any segment may not exceed ``INT_MAX``. Please consider using ``DeviceSegmentedSort`` instead,
|
|
664
|
+
//! if the size of at least one of your segments could exceed ``INT_MAX``.
|
|
665
|
+
//! - @devicestorageP
|
|
666
|
+
//! - @devicestorage
|
|
667
|
+
//!
|
|
668
|
+
//! Snippet
|
|
669
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
670
|
+
//!
|
|
671
|
+
//! The code snippet below illustrates the batched sorting of three segments
|
|
672
|
+
//! (with one zero-length segment) of ``int`` keys with associated vector of ``int`` values.
|
|
673
|
+
//!
|
|
674
|
+
//! .. code-block:: c++
|
|
675
|
+
//!
|
|
676
|
+
//! #include <cub/cub.cuh> // or equivalently <cub/device/device_segmented_radix_sort.cuh>
|
|
677
|
+
//!
|
|
678
|
+
//! // Declare, allocate, and initialize device-accessible pointers
|
|
679
|
+
//! // for sorting data
|
|
680
|
+
//! int num_items; // e.g., 7
|
|
681
|
+
//! int num_segments; // e.g., 3
|
|
682
|
+
//! int *d_offsets; // e.g., [0, 3, 3, 7]
|
|
683
|
+
//! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
684
|
+
//! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -]
|
|
685
|
+
//! int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6]
|
|
686
|
+
//! int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -]
|
|
687
|
+
//! ...
|
|
688
|
+
//!
|
|
689
|
+
//! // Create a set of DoubleBuffers to wrap pairs of device pointers
|
|
690
|
+
//! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
|
|
691
|
+
//! cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
|
|
692
|
+
//!
|
|
693
|
+
//! // Determine temporary device storage requirements
|
|
694
|
+
//! void *d_temp_storage = nullptr;
|
|
695
|
+
//! size_t temp_storage_bytes = 0;
|
|
696
|
+
//! cub::DeviceSegmentedRadixSort::SortPairsDescending(
|
|
697
|
+
//! d_temp_storage, temp_storage_bytes, d_keys, d_values,
|
|
698
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
699
|
+
//!
|
|
700
|
+
//! // Allocate temporary storage
|
|
701
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
702
|
+
//!
|
|
703
|
+
//! // Run sorting operation
|
|
704
|
+
//! cub::DeviceSegmentedRadixSort::SortPairsDescending(
|
|
705
|
+
//! d_temp_storage, temp_storage_bytes, d_keys, d_values,
|
|
706
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
707
|
+
//!
|
|
708
|
+
//! // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0]
|
|
709
|
+
//! // d_values.Current() <-- [0, 2, 1, 6, 3, 4, 5]
|
|
710
|
+
//!
|
|
711
|
+
//! @endrst
|
|
712
|
+
//!
|
|
713
|
+
//! @tparam KeyT
|
|
714
|
+
//! **[inferred]** Key type
|
|
715
|
+
//!
|
|
716
|
+
//! @tparam ValueT
|
|
717
|
+
//! **[inferred]** Value type
|
|
718
|
+
//!
|
|
719
|
+
//! @tparam BeginOffsetIteratorT
|
|
720
|
+
//! **[inferred]** Random-access input iterator type for reading segment
|
|
721
|
+
//! beginning offsets @iterator
|
|
722
|
+
//!
|
|
723
|
+
//! @tparam EndOffsetIteratorT
|
|
724
|
+
//! **[inferred]** Random-access input iterator type for reading segment
|
|
725
|
+
//! ending offsets @iterator
|
|
726
|
+
//!
|
|
727
|
+
//! @param[in] d_temp_storage
|
|
728
|
+
//! Device-accessible allocation of temporary storage. When `nullptr`, the
|
|
729
|
+
//! required allocation size is written to `temp_storage_bytes` and no work is done.
|
|
730
|
+
//!
|
|
731
|
+
//! @param[in,out] temp_storage_bytes
|
|
732
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
733
|
+
//!
|
|
734
|
+
//! @param[in,out] d_keys
|
|
735
|
+
//! Reference to the double-buffer of keys whose "current" device-accessible
|
|
736
|
+
//! buffer contains the unsorted input keys and, upon return, is updated to
|
|
737
|
+
//! point to the sorted output keys
|
|
738
|
+
//!
|
|
739
|
+
//! @param[in,out] d_values
|
|
740
|
+
//! Double-buffer of values whose "current" device-accessible buffer
|
|
741
|
+
//! contains the unsorted input values and, upon return, is updated to point
|
|
742
|
+
//! to the sorted output values
|
|
743
|
+
//!
|
|
744
|
+
//! @param[in] num_items
|
|
745
|
+
//! The total number of items within the segmented array, including items not
|
|
746
|
+
//! covered by segments. `num_items` should match the largest element within
|
|
747
|
+
//! the range `[d_end_offsets, d_end_offsets + num_segments)`.
|
|
748
|
+
//!
|
|
749
|
+
//! @param[in] num_segments
|
|
750
|
+
//! The number of segments that comprise the sorting data
|
|
751
|
+
//!
|
|
752
|
+
//! @param[in] d_begin_offsets
|
|
753
|
+
//! @rst
|
|
754
|
+
//! Random-access input iterator to the sequence of beginning offsets of
|
|
755
|
+
//! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
|
|
756
|
+
//! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
|
|
757
|
+
//! @endrst
|
|
758
|
+
//!
|
|
759
|
+
//! @param[in] d_end_offsets
|
|
760
|
+
//! @rst
|
|
761
|
+
//! Random-access input iterator to the sequence of ending offsets of length
|
|
762
|
+
//! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
|
|
763
|
+
//! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
|
|
764
|
+
//! If ``d_end_offsets[i] <= d_begin_offsets[i]``, the *i*\ :sup:`th` segment is considered empty.
|
|
765
|
+
//! @endrst
|
|
766
|
+
//!
|
|
767
|
+
//! @param[in] begin_bit
|
|
768
|
+
//! **[optional]** The least-significant bit index (inclusive) needed for key comparison
|
|
769
|
+
//!
|
|
770
|
+
//! @param[in] end_bit
|
|
771
|
+
//! **[optional]** The most-significant bit index (exclusive) needed for key
|
|
772
|
+
//! comparison (e.g., `sizeof(unsigned int) * 8`)
|
|
773
|
+
//!
|
|
774
|
+
//! @param[in] stream
|
|
775
|
+
//! @rst
|
|
776
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
777
|
+
//! @endrst
|
|
778
|
+
template <typename KeyT, typename ValueT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
|
|
779
|
+
CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending(
|
|
780
|
+
void* d_temp_storage,
|
|
781
|
+
size_t& temp_storage_bytes,
|
|
782
|
+
DoubleBuffer<KeyT>& d_keys,
|
|
783
|
+
DoubleBuffer<ValueT>& d_values,
|
|
784
|
+
::cuda::std::int64_t num_items,
|
|
785
|
+
::cuda::std::int64_t num_segments,
|
|
786
|
+
BeginOffsetIteratorT d_begin_offsets,
|
|
787
|
+
EndOffsetIteratorT d_end_offsets,
|
|
788
|
+
int begin_bit = 0,
|
|
789
|
+
int end_bit = sizeof(KeyT) * 8,
|
|
790
|
+
cudaStream_t stream = 0)
|
|
791
|
+
{
|
|
792
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
|
|
793
|
+
|
|
794
|
+
// Signed integer type for global offsets
|
|
795
|
+
using SegmentSizeT = ::cuda::std::int32_t;
|
|
796
|
+
|
|
797
|
+
return DispatchSegmentedRadixSort<
|
|
798
|
+
SortOrder::Descending,
|
|
799
|
+
KeyT,
|
|
800
|
+
ValueT,
|
|
801
|
+
BeginOffsetIteratorT,
|
|
802
|
+
EndOffsetIteratorT,
|
|
803
|
+
SegmentSizeT>::Dispatch(d_temp_storage,
|
|
804
|
+
temp_storage_bytes,
|
|
805
|
+
d_keys,
|
|
806
|
+
d_values,
|
|
807
|
+
num_items,
|
|
808
|
+
num_segments,
|
|
809
|
+
d_begin_offsets,
|
|
810
|
+
d_end_offsets,
|
|
811
|
+
begin_bit,
|
|
812
|
+
end_bit,
|
|
813
|
+
true,
|
|
814
|
+
stream);
|
|
815
|
+
}
|
|
816
|
+
|
|
817
|
+
//! @} end member group
|
|
818
|
+
//! @name Keys-only
|
|
819
|
+
//! @{
|
|
820
|
+
|
|
821
|
+
//! @rst
|
|
822
|
+
//! Sorts segments of keys into ascending order. (``~2N`` auxiliary storage required).
|
|
823
|
+
//!
|
|
824
|
+
//! - The contents of the input data are not altered by the sorting operation
|
|
825
|
+
//! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
|
|
826
|
+
//! bits can be specified. This can reduce overall sorting overhead and
|
|
827
|
+
//! yield a corresponding performance improvement.
|
|
828
|
+
//! - When the input is a contiguous sequence of segments, a single sequence
|
|
829
|
+
//! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased for both
|
|
830
|
+
//! the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where the latter
|
|
831
|
+
//! is specified as ``segment_offsets + 1``).
|
|
832
|
+
//! - The range ``[d_keys_out, d_keys_out + num_items)`` shall not overlap
|
|
833
|
+
//! ``[d_keys_in, d_keys_in + num_items)``,
|
|
834
|
+
//! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
|
|
835
|
+
//! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
|
|
836
|
+
//! - @devicestorageNP For sorting using only ``O(P)`` temporary storage, see
|
|
837
|
+
//! the sorting interface using DoubleBuffer wrappers below.
|
|
838
|
+
//! - Segments are not required to be contiguous. For all index values ``i``
|
|
839
|
+
//! outside the specified segments ``d_keys_in[i]``, ``d_keys_out[i]`` will not
|
|
840
|
+
//! be accessed nor modified.
|
|
841
|
+
//! - Note, the size of any segment may not exceed ``INT_MAX``. Please consider using ``DeviceSegmentedSort`` instead,
|
|
842
|
+
//! if the size of at least one of your segments could exceed ``INT_MAX``.
|
|
843
|
+
//! - @devicestorage
|
|
844
|
+
//!
|
|
845
|
+
//! Snippet
|
|
846
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
847
|
+
//!
|
|
848
|
+
//! The code snippet below illustrates the batched sorting of three segments
|
|
849
|
+
//! (with one zero-length segment) of ``int`` keys.
|
|
850
|
+
//!
|
|
851
|
+
//! .. code-block:: c++
|
|
852
|
+
//!
|
|
853
|
+
//! #include <cub/cub.cuh>
|
|
854
|
+
//! // or equivalently <cub/device/device_segmented_radix_sort.cuh>
|
|
855
|
+
//!
|
|
856
|
+
//! // Declare, allocate, and initialize device-accessible pointers
|
|
857
|
+
//! // for sorting data
|
|
858
|
+
//! int num_items; // e.g., 7
|
|
859
|
+
//! int num_segments; // e.g., 3
|
|
860
|
+
//! int *d_offsets; // e.g., [0, 3, 3, 7]
|
|
861
|
+
//! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
862
|
+
//! int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
|
|
863
|
+
//! ...
|
|
864
|
+
//!
|
|
865
|
+
//! // Determine temporary device storage requirements
|
|
866
|
+
//! void *d_temp_storage = nullptr;
|
|
867
|
+
//! size_t temp_storage_bytes = 0;
|
|
868
|
+
//! cub::DeviceSegmentedRadixSort::SortKeys(
|
|
869
|
+
//! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
|
|
870
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
871
|
+
//!
|
|
872
|
+
//! // Allocate temporary storage
|
|
873
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
874
|
+
//!
|
|
875
|
+
//! // Run sorting operation
|
|
876
|
+
//! cub::DeviceSegmentedRadixSort::SortKeys(
|
|
877
|
+
//! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
|
|
878
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
879
|
+
//!
|
|
880
|
+
//! // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9]
|
|
881
|
+
//!
|
|
882
|
+
//! @endrst
|
|
883
|
+
//!
|
|
884
|
+
//! @tparam KeyT
|
|
885
|
+
//! **[inferred]** Key type
|
|
886
|
+
//!
|
|
887
|
+
//! @tparam BeginOffsetIteratorT
|
|
888
|
+
//! **[inferred]** Random-access input iterator type for reading segment
|
|
889
|
+
//! beginning offsets @iterator
|
|
890
|
+
//!
|
|
891
|
+
//! @tparam EndOffsetIteratorT
|
|
892
|
+
//! **[inferred]** Random-access input iterator type for reading segment
|
|
893
|
+
//! ending offsets @iterator
|
|
894
|
+
//!
|
|
895
|
+
//! @param[in] d_temp_storage
|
|
896
|
+
//! Device-accessible allocation of temporary storage.
|
|
897
|
+
//! When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
|
|
898
|
+
//!
|
|
899
|
+
//! @param[in,out] temp_storage_bytes
|
|
900
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
901
|
+
//!
|
|
902
|
+
//! @param[in] d_keys_in
|
|
903
|
+
//! Device-accessible pointer to the input sequence of key data to sort
|
|
904
|
+
//!
|
|
905
|
+
//! @param[out] d_keys_out
|
|
906
|
+
//! Device-accessible pointer to the sorted output sequence of key data
|
|
907
|
+
//!
|
|
908
|
+
//! @param[in] num_items
|
|
909
|
+
//! The total number of items within the segmented array, including items not
|
|
910
|
+
//! covered by segments. `num_items` should match the largest element within
|
|
911
|
+
//! the range `[d_end_offsets, d_end_offsets + num_segments)`.
|
|
912
|
+
//!
|
|
913
|
+
//! @param[in] num_segments
|
|
914
|
+
//! The number of segments that comprise the sorting data
|
|
915
|
+
//!
|
|
916
|
+
//! @param[in] d_begin_offsets
|
|
917
|
+
//! @rst
|
|
918
|
+
//! Random-access input iterator to the sequence of beginning offsets of
|
|
919
|
+
//! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
|
|
920
|
+
//! element of the *i*\ :sup:`th` data segment in ``d_keys_*``.
|
|
921
|
+
//! @endrst
|
|
922
|
+
//!
|
|
923
|
+
//! @param[in] d_end_offsets
|
|
924
|
+
//! @rst
|
|
925
|
+
//! Random-access input iterator to the sequence of ending offsets of length
|
|
926
|
+
//! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
|
|
927
|
+
//! the *i*\ :sup:`th` data segment in ``d_keys_*``.
|
|
928
|
+
//! If ``d_end_offsets[i] <= d_begin_offsets[i]``, the *i*\ :sup:`th` segment is considered empty.
|
|
929
|
+
//! @endrst
|
|
930
|
+
//!
|
|
931
|
+
//! @param[in] begin_bit
|
|
932
|
+
//! **[optional]** The least-significant bit index (inclusive) needed for key comparison
|
|
933
|
+
//!
|
|
934
|
+
//! @param[in] end_bit
|
|
935
|
+
//! **[optional]** The most-significant bit index (exclusive) needed for key
|
|
936
|
+
//! comparison (e.g., `sizeof(unsigned int) * 8`)
|
|
937
|
+
//!
|
|
938
|
+
//! @param[in] stream
|
|
939
|
+
//! @rst
|
|
940
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
941
|
+
//! @endrst
|
|
942
|
+
template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
|
|
943
|
+
CUB_RUNTIME_FUNCTION static cudaError_t SortKeys(
|
|
944
|
+
void* d_temp_storage,
|
|
945
|
+
size_t& temp_storage_bytes,
|
|
946
|
+
const KeyT* d_keys_in,
|
|
947
|
+
KeyT* d_keys_out,
|
|
948
|
+
::cuda::std::int64_t num_items,
|
|
949
|
+
::cuda::std::int64_t num_segments,
|
|
950
|
+
BeginOffsetIteratorT d_begin_offsets,
|
|
951
|
+
EndOffsetIteratorT d_end_offsets,
|
|
952
|
+
int begin_bit = 0,
|
|
953
|
+
int end_bit = sizeof(KeyT) * 8,
|
|
954
|
+
cudaStream_t stream = 0)
|
|
955
|
+
{
|
|
956
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
|
|
957
|
+
|
|
958
|
+
// Signed integer type for global offsets
|
|
959
|
+
using SegmentSizeT = ::cuda::std::int32_t;
|
|
960
|
+
|
|
961
|
+
// Null value type
|
|
962
|
+
DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
|
|
963
|
+
DoubleBuffer<NullType> d_values;
|
|
964
|
+
|
|
965
|
+
return DispatchSegmentedRadixSort<
|
|
966
|
+
SortOrder::Ascending,
|
|
967
|
+
KeyT,
|
|
968
|
+
NullType,
|
|
969
|
+
BeginOffsetIteratorT,
|
|
970
|
+
EndOffsetIteratorT,
|
|
971
|
+
SegmentSizeT>::Dispatch(d_temp_storage,
|
|
972
|
+
temp_storage_bytes,
|
|
973
|
+
d_keys,
|
|
974
|
+
d_values,
|
|
975
|
+
num_items,
|
|
976
|
+
num_segments,
|
|
977
|
+
d_begin_offsets,
|
|
978
|
+
d_end_offsets,
|
|
979
|
+
begin_bit,
|
|
980
|
+
end_bit,
|
|
981
|
+
false,
|
|
982
|
+
stream);
|
|
983
|
+
}
|
|
984
|
+
|
|
985
|
+
//! @rst
|
|
986
|
+
//! Sorts segments of keys into ascending order. (``~N`` auxiliary storage required).
|
|
987
|
+
//!
|
|
988
|
+
//! - The sorting operation is given a pair of key buffers managed by a
|
|
989
|
+
//! DoubleBuffer structure that indicates which of the two buffers is
|
|
990
|
+
//! "current" (and thus contains the input data to be sorted).
|
|
991
|
+
//! - The contents of both buffers may be altered by the sorting operation.
|
|
992
|
+
//! - Upon completion, the sorting operation will update the "current"
|
|
993
|
+
//! indicator within the DoubleBuffer wrapper to reference which of the two
|
|
994
|
+
//! buffers now contains the sorted output sequence (a function of the
|
|
995
|
+
//! number of key bits specified and the targeted device architecture).
|
|
996
|
+
//! - When the input is a contiguous sequence of segments, a single sequence
|
|
997
|
+
//! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased for both
|
|
998
|
+
//! the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where the latter
|
|
999
|
+
//! is specified as ``segment_offsets + 1``).
|
|
1000
|
+
//! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
|
|
1001
|
+
//! bits can be specified. This can reduce overall sorting overhead and
|
|
1002
|
+
//! yield a corresponding performance improvement.
|
|
1003
|
+
//! - Let ``cur = d_keys.Current()`` and ``alt = d_keys.Alternate()``.
|
|
1004
|
+
//! The range ``[cur, cur + num_items)`` shall not overlap
|
|
1005
|
+
//! ``[alt, alt + num_items)``. Both ranges shall not overlap
|
|
1006
|
+
//! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
|
|
1007
|
+
//! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
|
|
1008
|
+
//! - Segments are not required to be contiguous. For all index values ``i``
|
|
1009
|
+
//! outside the specified segments ``d_keys.Current()[i]``,
|
|
1010
|
+
//! ``d_keys.Alternate()[i]`` will not be accessed nor modified.
|
|
1011
|
+
//! - Note, the size of any segment may not exceed ``INT_MAX``. Please consider using ``DeviceSegmentedSort`` instead,
|
|
1012
|
+
//! if the size of at least one of your segments could exceed ``INT_MAX``.
|
|
1013
|
+
//! - @devicestorageP
|
|
1014
|
+
//! - @devicestorage
|
|
1015
|
+
//!
|
|
1016
|
+
//! Snippet
|
|
1017
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
1018
|
+
//!
|
|
1019
|
+
//! The code snippet below illustrates the batched sorting of three segments
|
|
1020
|
+
//! (with one zero-length segment) of ``int`` keys.
|
|
1021
|
+
//!
|
|
1022
|
+
//! .. code-block:: c++
|
|
1023
|
+
//!
|
|
1024
|
+
//! #include <cub/cub.cuh>
|
|
1025
|
+
//! // or equivalently <cub/device/device_segmented_radix_sort.cuh>
|
|
1026
|
+
//!
|
|
1027
|
+
//! // Declare, allocate, and initialize device-accessible pointers for
|
|
1028
|
+
//! // sorting data
|
|
1029
|
+
//! int num_items; // e.g., 7
|
|
1030
|
+
//! int num_segments; // e.g., 3
|
|
1031
|
+
//! int *d_offsets; // e.g., [0, 3, 3, 7]
|
|
1032
|
+
//! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
1033
|
+
//! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -]
|
|
1034
|
+
//! ...
|
|
1035
|
+
//!
|
|
1036
|
+
//! // Create a DoubleBuffer to wrap the pair of device pointers
|
|
1037
|
+
//! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
|
|
1038
|
+
//!
|
|
1039
|
+
//! // Determine temporary device storage requirements
|
|
1040
|
+
//! void *d_temp_storage = nullptr;
|
|
1041
|
+
//! size_t temp_storage_bytes = 0;
|
|
1042
|
+
//! cub::DeviceSegmentedRadixSort::SortKeys(
|
|
1043
|
+
//! d_temp_storage, temp_storage_bytes, d_keys,
|
|
1044
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
1045
|
+
//!
|
|
1046
|
+
//! // Allocate temporary storage
|
|
1047
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
1048
|
+
//!
|
|
1049
|
+
//! // Run sorting operation
|
|
1050
|
+
//! cub::DeviceSegmentedRadixSort::SortKeys(
|
|
1051
|
+
//! d_temp_storage, temp_storage_bytes, d_keys,
|
|
1052
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
1053
|
+
//!
|
|
1054
|
+
//! // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9]
|
|
1055
|
+
//!
|
|
1056
|
+
//! @endrst
|
|
1057
|
+
//!
|
|
1058
|
+
//! @tparam KeyT
|
|
1059
|
+
//! **[inferred]** Key type
|
|
1060
|
+
//!
|
|
1061
|
+
//! @tparam BeginOffsetIteratorT
|
|
1062
|
+
//! **[inferred]** Random-access input iterator type for reading segment
|
|
1063
|
+
//! beginning offsets @iterator
|
|
1064
|
+
//!
|
|
1065
|
+
//! @tparam EndOffsetIteratorT
|
|
1066
|
+
//! **[inferred]** Random-access input iterator type for reading segment
|
|
1067
|
+
//! ending offsets @iterator
|
|
1068
|
+
//!
|
|
1069
|
+
//! @param[in] d_temp_storage
|
|
1070
|
+
//! Device-accessible allocation of temporary storage. When `nullptr`, the
|
|
1071
|
+
//! required allocation size is written to `temp_storage_bytes` and no work
|
|
1072
|
+
//! is done.
|
|
1073
|
+
//!
|
|
1074
|
+
//! @param[in,out] temp_storage_bytes
|
|
1075
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
1076
|
+
//!
|
|
1077
|
+
//! @param[in,out] d_keys
|
|
1078
|
+
//! Reference to the double-buffer of keys whose "current" device-accessible
|
|
1079
|
+
//! buffer contains the unsorted input keys and, upon return, is updated to
|
|
1080
|
+
//! point to the sorted output keys
|
|
1081
|
+
//!
|
|
1082
|
+
//! @param[in] num_items
|
|
1083
|
+
//! The total number of items within the segmented array, including items not
|
|
1084
|
+
//! covered by segments. `num_items` should match the largest element within
|
|
1085
|
+
//! the range `[d_end_offsets, d_end_offsets + num_segments)`.
|
|
1086
|
+
//!
|
|
1087
|
+
//! @param[in] num_segments
|
|
1088
|
+
//! The number of segments that comprise the sorting data
|
|
1089
|
+
//!
|
|
1090
|
+
//! @param[in] d_begin_offsets
|
|
1091
|
+
//! @rst
|
|
1092
|
+
//! Random-access input iterator to the sequence of beginning offsets of
|
|
1093
|
+
//! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
|
|
1094
|
+
//! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
|
|
1095
|
+
//! @endrst
|
|
1096
|
+
//!
|
|
1097
|
+
//! @param[in] d_end_offsets
|
|
1098
|
+
//! @rst
|
|
1099
|
+
//! Random-access input iterator to the sequence of ending offsets of length
|
|
1100
|
+
//! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
|
|
1101
|
+
//! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
|
|
1102
|
+
//! If ``d_end_offsets[i] - 1 < d_begin_offsets[i]``, the *i*\ :sup:`th` segment is considered empty.
|
|
1103
|
+
//! @endrst
|
|
1104
|
+
//!
|
|
1105
|
+
//! @param[in] begin_bit
|
|
1106
|
+
//! **[optional]** The least-significant bit index (inclusive)
|
|
1107
|
+
//! needed for key comparison
|
|
1108
|
+
//!
|
|
1109
|
+
//! @param[in] end_bit
|
|
1110
|
+
//! **[optional]** The most-significant bit index (exclusive) needed for key
|
|
1111
|
+
//! comparison (e.g., `sizeof(unsigned int) * 8`)
|
|
1112
|
+
//!
|
|
1113
|
+
//! @param[in] stream
|
|
1114
|
+
//! @rst
|
|
1115
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
1116
|
+
//! @endrst
|
|
1117
|
+
template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
|
|
1118
|
+
CUB_RUNTIME_FUNCTION static cudaError_t SortKeys(
|
|
1119
|
+
void* d_temp_storage,
|
|
1120
|
+
size_t& temp_storage_bytes,
|
|
1121
|
+
DoubleBuffer<KeyT>& d_keys,
|
|
1122
|
+
::cuda::std::int64_t num_items,
|
|
1123
|
+
::cuda::std::int64_t num_segments,
|
|
1124
|
+
BeginOffsetIteratorT d_begin_offsets,
|
|
1125
|
+
EndOffsetIteratorT d_end_offsets,
|
|
1126
|
+
int begin_bit = 0,
|
|
1127
|
+
int end_bit = sizeof(KeyT) * 8,
|
|
1128
|
+
cudaStream_t stream = 0)
|
|
1129
|
+
{
|
|
1130
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
|
|
1131
|
+
|
|
1132
|
+
// Signed integer type for global offsets
|
|
1133
|
+
using SegmentSizeT = ::cuda::std::int32_t;
|
|
1134
|
+
|
|
1135
|
+
// Null value type
|
|
1136
|
+
DoubleBuffer<NullType> d_values;
|
|
1137
|
+
|
|
1138
|
+
return DispatchSegmentedRadixSort<
|
|
1139
|
+
SortOrder::Ascending,
|
|
1140
|
+
KeyT,
|
|
1141
|
+
NullType,
|
|
1142
|
+
BeginOffsetIteratorT,
|
|
1143
|
+
EndOffsetIteratorT,
|
|
1144
|
+
SegmentSizeT>::Dispatch(d_temp_storage,
|
|
1145
|
+
temp_storage_bytes,
|
|
1146
|
+
d_keys,
|
|
1147
|
+
d_values,
|
|
1148
|
+
num_items,
|
|
1149
|
+
num_segments,
|
|
1150
|
+
d_begin_offsets,
|
|
1151
|
+
d_end_offsets,
|
|
1152
|
+
begin_bit,
|
|
1153
|
+
end_bit,
|
|
1154
|
+
true,
|
|
1155
|
+
stream);
|
|
1156
|
+
}
|
|
1157
|
+
|
|
1158
|
+
//! @rst
|
|
1159
|
+
//! Sorts segments of keys into descending order. (``~2N`` auxiliary storage required).
|
|
1160
|
+
//!
|
|
1161
|
+
//! - The contents of the input data are not altered by the sorting operation
|
|
1162
|
+
//! - When the input is a contiguous sequence of segments, a single sequence
|
|
1163
|
+
//! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased for both
|
|
1164
|
+
//! the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where the latter
|
|
1165
|
+
//! is specified as ``segment_offsets + 1``).
|
|
1166
|
+
//! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
|
|
1167
|
+
//! bits can be specified. This can reduce overall sorting overhead and
|
|
1168
|
+
//! yield a corresponding performance improvement.
|
|
1169
|
+
//! - The range ``[d_keys_out, d_keys_out + num_items)`` shall not overlap
|
|
1170
|
+
//! ``[d_keys_in, d_keys_in + num_items)``,
|
|
1171
|
+
//! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
|
|
1172
|
+
//! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
|
|
1173
|
+
//! - @devicestorageNP For sorting using only ``O(P)`` temporary storage, see
|
|
1174
|
+
//! the sorting interface using DoubleBuffer wrappers below.
|
|
1175
|
+
//! - Segments are not required to be contiguous. For all index values ``i``
|
|
1176
|
+
//! outside the specified segments ``d_keys_in[i]``, ``d_keys_out[i]`` will not
|
|
1177
|
+
//! be accessed nor modified.
|
|
1178
|
+
//! - Note, the size of any segment may not exceed ``INT_MAX``. Please consider using ``DeviceSegmentedSort`` instead,
|
|
1179
|
+
//! if the size of at least one of your segments could exceed ``INT_MAX``.
|
|
1180
|
+
//! - @devicestorage
|
|
1181
|
+
//!
|
|
1182
|
+
//! Snippet
|
|
1183
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
1184
|
+
//!
|
|
1185
|
+
//! The code snippet below illustrates the batched sorting of three segments
|
|
1186
|
+
//! (with one zero-length segment) of ``int`` keys.
|
|
1187
|
+
//!
|
|
1188
|
+
//! .. code-block:: c++
|
|
1189
|
+
//!
|
|
1190
|
+
//! #include <cub/cub.cuh>
|
|
1191
|
+
//! // or equivalently <cub/device/device_segmented_radix_sort.cuh>
|
|
1192
|
+
//!
|
|
1193
|
+
//! // Declare, allocate, and initialize device-accessible pointers
|
|
1194
|
+
//! // for sorting data
|
|
1195
|
+
//! int num_items; // e.g., 7
|
|
1196
|
+
//! int num_segments; // e.g., 3
|
|
1197
|
+
//! int *d_offsets; // e.g., [0, 3, 3, 7]
|
|
1198
|
+
//! int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
1199
|
+
//! int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
|
|
1200
|
+
//! ...
|
|
1201
|
+
//!
|
|
1202
|
+
//! // Keys are passed directly as separate input and output pointers
|
|
1203
|
+
//! // (d_keys_in and d_keys_out); no DoubleBuffer wrapper is needed here
|
|
1204
|
+
//!
|
|
1205
|
+
//! // Determine temporary device storage requirements
|
|
1206
|
+
//! void *d_temp_storage = nullptr;
|
|
1207
|
+
//! size_t temp_storage_bytes = 0;
|
|
1208
|
+
//! cub::DeviceSegmentedRadixSort::SortKeysDescending(
|
|
1209
|
+
//! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
|
|
1210
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
1211
|
+
//!
|
|
1212
|
+
//! // Allocate temporary storage
|
|
1213
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
1214
|
+
//!
|
|
1215
|
+
//! // Run sorting operation
|
|
1216
|
+
//! cub::DeviceSegmentedRadixSort::SortKeysDescending(
|
|
1217
|
+
//! d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
|
|
1218
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
1219
|
+
//!
|
|
1220
|
+
//! // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0]
|
|
1221
|
+
//!
|
|
1222
|
+
//! @endrst
|
|
1223
|
+
//!
|
|
1224
|
+
//! @tparam KeyT
|
|
1225
|
+
//! **[inferred]** Key type
|
|
1226
|
+
//!
|
|
1227
|
+
//! @tparam BeginOffsetIteratorT
|
|
1228
|
+
//! **[inferred]** Random-access input iterator type for reading segment beginning offsets @iterator
|
|
1229
|
+
//!
|
|
1230
|
+
//! @tparam EndOffsetIteratorT
|
|
1231
|
+
//! **[inferred]** Random-access input iterator type for reading segment ending offsets @iterator
|
|
1232
|
+
//!
|
|
1233
|
+
//! @param[in] d_temp_storage
|
|
1234
|
+
//! Device-accessible allocation of temporary storage. When `nullptr`, the
|
|
1235
|
+
//! required allocation size is written to `temp_storage_bytes` and no work is done.
|
|
1236
|
+
//!
|
|
1237
|
+
//! @param[in,out] temp_storage_bytes
|
|
1238
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
1239
|
+
//!
|
|
1240
|
+
//! @param[in] d_keys_in
|
|
1241
|
+
//! Device-accessible pointer to the input sequence of key data to sort
|
|
1242
|
+
//!
|
|
1243
|
+
//! @param[out] d_keys_out
|
|
1244
|
+
//! Device-accessible pointer to the sorted output sequence of key data
|
|
1245
|
+
//!
|
|
1246
|
+
//! @param[in] num_items
|
|
1247
|
+
//! The total number of items within the segmented array, including items not
|
|
1248
|
+
//! covered by segments. `num_items` should match the largest element within
|
|
1249
|
+
//! the range `[d_end_offsets, d_end_offsets + num_segments)`.
|
|
1250
|
+
//!
|
|
1251
|
+
//! @param[in] num_segments
|
|
1252
|
+
//! The number of segments that comprise the sorting data
|
|
1253
|
+
//!
|
|
1254
|
+
//! @param[in] d_begin_offsets
|
|
1255
|
+
//! @rst
|
|
1256
|
+
//! Random-access input iterator to the sequence of beginning offsets of
|
|
1257
|
+
//! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
|
|
1258
|
+
//! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
|
|
1259
|
+
//! @endrst
|
|
1260
|
+
//!
|
|
1261
|
+
//! @param[in] d_end_offsets
|
|
1262
|
+
//! @rst
|
|
1263
|
+
//! Random-access input iterator to the sequence of ending offsets of length
|
|
1264
|
+
//! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
|
|
1265
|
+
//! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
|
|
1266
|
+
//! If ``d_end_offsets[i] - 1 < d_begin_offsets[i]``, the *i*\ :sup:`th` segment is considered empty.
|
|
1267
|
+
//! @endrst
|
|
1268
|
+
//!
|
|
1269
|
+
//! @param[in] begin_bit
|
|
1270
|
+
//! **[optional]** The least-significant bit index (inclusive) needed for key comparison
|
|
1271
|
+
//!
|
|
1272
|
+
//! @param[in] end_bit
|
|
1273
|
+
//! **[optional]** The most-significant bit index (exclusive) needed for key
|
|
1274
|
+
//! comparison (e.g., `sizeof(unsigned int) * 8`)
|
|
1275
|
+
//!
|
|
1276
|
+
//! @param[in] stream
|
|
1277
|
+
//! @rst
|
|
1278
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
1279
|
+
//! @endrst
|
|
1280
|
+
template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
|
|
1281
|
+
CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending(
|
|
1282
|
+
void* d_temp_storage,
|
|
1283
|
+
size_t& temp_storage_bytes,
|
|
1284
|
+
const KeyT* d_keys_in,
|
|
1285
|
+
KeyT* d_keys_out,
|
|
1286
|
+
::cuda::std::int64_t num_items,
|
|
1287
|
+
::cuda::std::int64_t num_segments,
|
|
1288
|
+
BeginOffsetIteratorT d_begin_offsets,
|
|
1289
|
+
EndOffsetIteratorT d_end_offsets,
|
|
1290
|
+
int begin_bit = 0,
|
|
1291
|
+
int end_bit = sizeof(KeyT) * 8,
|
|
1292
|
+
cudaStream_t stream = 0)
|
|
1293
|
+
{
|
|
1294
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
|
|
1295
|
+
|
|
1296
|
+
// Signed integer type for global offsets
|
|
1297
|
+
using SegmentSizeT = ::cuda::std::int32_t;
|
|
1298
|
+
|
|
1299
|
+
DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
|
|
1300
|
+
DoubleBuffer<NullType> d_values;
|
|
1301
|
+
|
|
1302
|
+
return DispatchSegmentedRadixSort<
|
|
1303
|
+
SortOrder::Descending,
|
|
1304
|
+
KeyT,
|
|
1305
|
+
NullType,
|
|
1306
|
+
BeginOffsetIteratorT,
|
|
1307
|
+
EndOffsetIteratorT,
|
|
1308
|
+
SegmentSizeT>::Dispatch(d_temp_storage,
|
|
1309
|
+
temp_storage_bytes,
|
|
1310
|
+
d_keys,
|
|
1311
|
+
d_values,
|
|
1312
|
+
num_items,
|
|
1313
|
+
num_segments,
|
|
1314
|
+
d_begin_offsets,
|
|
1315
|
+
d_end_offsets,
|
|
1316
|
+
begin_bit,
|
|
1317
|
+
end_bit,
|
|
1318
|
+
false,
|
|
1319
|
+
stream);
|
|
1320
|
+
}
|
|
1321
|
+
|
|
1322
|
+
//! @rst
|
|
1323
|
+
//! Sorts segments of keys into descending order. (``~N`` auxiliary storage required).
|
|
1324
|
+
//!
|
|
1325
|
+
//! - The sorting operation is given a pair of key buffers managed by a
|
|
1326
|
+
//! DoubleBuffer structure that indicates which of the two buffers is
|
|
1327
|
+
//! "current" (and thus contains the input data to be sorted).
|
|
1328
|
+
//! - The contents of both buffers may be altered by the sorting operation.
|
|
1329
|
+
//! - Upon completion, the sorting operation will update the "current"
|
|
1330
|
+
//! indicator within the DoubleBuffer wrapper to reference which of the two
|
|
1331
|
+
//! buffers now contains the sorted output sequence (a function of the
|
|
1332
|
+
//! number of key bits specified and the targeted device architecture).
|
|
1333
|
+
//! - When the input is a contiguous sequence of segments, a single sequence
|
|
1334
|
+
//! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
|
|
1335
|
+
//! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
|
|
1336
|
+
//! the latter is specified as ``segment_offsets + 1``).
|
|
1337
|
+
//! - An optional bit subrange ``[begin_bit, end_bit)`` of differentiating key
|
|
1338
|
+
//! bits can be specified. This can reduce overall sorting overhead and
|
|
1339
|
+
//! yield a corresponding performance improvement.
|
|
1340
|
+
//! - Let ``cur = d_keys.Current()`` and ``alt = d_keys.Alternate()``.
|
|
1341
|
+
//! The range ``[cur, cur + num_items)`` shall not overlap
|
|
1342
|
+
//! ``[alt, alt + num_items)``. Both ranges shall not overlap
|
|
1343
|
+
//! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
|
|
1344
|
+
//! ``[d_end_offsets, d_end_offsets + num_segments)`` in any way.
|
|
1345
|
+
//! - Segments are not required to be contiguous. For all index values ``i``
|
|
1346
|
+
//! outside the specified segments ``d_keys.Current()[i]``,
|
|
1347
|
+
//! ``d_keys.Alternate()[i]`` will not be accessed nor modified.
|
|
1348
|
+
//! - Note, the size of any segment may not exceed ``INT_MAX``. Please consider using ``DeviceSegmentedSort`` instead,
|
|
1349
|
+
//! if the size of at least one of your segments could exceed ``INT_MAX``.
|
|
1350
|
+
//! - @devicestorageP
|
|
1351
|
+
//! - @devicestorage
|
|
1352
|
+
//!
|
|
1353
|
+
//! Snippet
|
|
1354
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
1355
|
+
//!
|
|
1356
|
+
//! The code snippet below illustrates the batched sorting of three segments
|
|
1357
|
+
//! (with one zero-length segment) of ``int`` keys.
|
|
1358
|
+
//!
|
|
1359
|
+
//! .. code-block:: c++
|
|
1360
|
+
//!
|
|
1361
|
+
//! #include <cub/cub.cuh>
|
|
1362
|
+
//! // or equivalently <cub/device/device_segmented_radix_sort.cuh>
|
|
1363
|
+
//!
|
|
1364
|
+
//! // Declare, allocate, and initialize device-accessible pointers
|
|
1365
|
+
//! // for sorting data
|
|
1366
|
+
//! int num_items; // e.g., 7
|
|
1367
|
+
//! int num_segments; // e.g., 3
|
|
1368
|
+
//! int *d_offsets; // e.g., [0, 3, 3, 7]
|
|
1369
|
+
//! int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
1370
|
+
//! int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -]
|
|
1371
|
+
//! ...
|
|
1372
|
+
//!
|
|
1373
|
+
//! // Create a DoubleBuffer to wrap the pair of device pointers
|
|
1374
|
+
//! cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
|
|
1375
|
+
//!
|
|
1376
|
+
//! // Determine temporary device storage requirements
|
|
1377
|
+
//! void *d_temp_storage = nullptr;
|
|
1378
|
+
//! size_t temp_storage_bytes = 0;
|
|
1379
|
+
//! cub::DeviceSegmentedRadixSort::SortKeysDescending(
|
|
1380
|
+
//! d_temp_storage, temp_storage_bytes, d_keys,
|
|
1381
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
1382
|
+
//!
|
|
1383
|
+
//! // Allocate temporary storage
|
|
1384
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
1385
|
+
//!
|
|
1386
|
+
//! // Run sorting operation
|
|
1387
|
+
//! cub::DeviceSegmentedRadixSort::SortKeysDescending(
|
|
1388
|
+
//! d_temp_storage, temp_storage_bytes, d_keys,
|
|
1389
|
+
//! num_items, num_segments, d_offsets, d_offsets + 1);
|
|
1390
|
+
//!
|
|
1391
|
+
//! // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0]
|
|
1392
|
+
//!
|
|
1393
|
+
//! @endrst
|
|
1394
|
+
//!
|
|
1395
|
+
//! @tparam KeyT
|
|
1396
|
+
//! **[inferred]** Key type
|
|
1397
|
+
//!
|
|
1398
|
+
//! @tparam BeginOffsetIteratorT
|
|
1399
|
+
//! **[inferred]** Random-access input iterator type for reading segment
|
|
1400
|
+
//! beginning offsets @iterator
|
|
1401
|
+
//!
|
|
1402
|
+
//! @tparam EndOffsetIteratorT
|
|
1403
|
+
//! **[inferred]** Random-access input iterator type for reading segment
|
|
1404
|
+
//! ending offsets @iterator
|
|
1405
|
+
//!
|
|
1406
|
+
//! @param[in] d_temp_storage
|
|
1407
|
+
//! Device-accessible allocation of temporary storage. When `nullptr`, the
|
|
1408
|
+
//! required allocation size is written to `temp_storage_bytes` and no work is done.
|
|
1409
|
+
//!
|
|
1410
|
+
//! @param[in,out] temp_storage_bytes
|
|
1411
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
1412
|
+
//!
|
|
1413
|
+
//! @param[in,out] d_keys
|
|
1414
|
+
//! Reference to the double-buffer of keys whose "current" device-accessible
|
|
1415
|
+
//! buffer contains the unsorted input keys and, upon return, is updated to
|
|
1416
|
+
//! point to the sorted output keys
|
|
1417
|
+
//!
|
|
1418
|
+
//! @param[in] num_items
|
|
1419
|
+
//! The total number of items within the segmented array, including items not
|
|
1420
|
+
//! covered by segments. `num_items` should match the largest element within
|
|
1421
|
+
//! the range `[d_end_offsets, d_end_offsets + num_segments)`.
|
|
1422
|
+
//!
|
|
1423
|
+
//! @param[in] num_segments
|
|
1424
|
+
//! The number of segments that comprise the sorting data
|
|
1425
|
+
//!
|
|
1426
|
+
//! @param[in] d_begin_offsets
|
|
1427
|
+
//! @rst
|
|
1428
|
+
//! Random-access input iterator to the sequence of beginning offsets of
|
|
1429
|
+
//! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
|
|
1430
|
+
//! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
|
|
1431
|
+
//! @endrst
|
|
1432
|
+
//!
|
|
1433
|
+
//! @param[in] d_end_offsets
|
|
1434
|
+
//! @rst
|
|
1435
|
+
//! Random-access input iterator to the sequence of ending offsets of length
|
|
1436
|
+
//! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
|
|
1437
|
+
//! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
|
|
1438
|
+
//! If ``d_end_offsets[i] - 1 < d_begin_offsets[i]``, the *i*\ :sup:`th` segment is considered empty.
|
|
1439
|
+
//! @endrst
|
|
1440
|
+
//!
|
|
1441
|
+
//! @param[in] begin_bit
|
|
1442
|
+
//! **[optional]** The least-significant bit index (inclusive) needed for key comparison
|
|
1443
|
+
//!
|
|
1444
|
+
//! @param[in] end_bit
|
|
1445
|
+
//! **[optional]** The most-significant bit index (exclusive) needed for key
|
|
1446
|
+
//! comparison (e.g., `sizeof(unsigned int) * 8`)
|
|
1447
|
+
//!
|
|
1448
|
+
//! @param[in] stream
|
|
1449
|
+
//! @rst
|
|
1450
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
1451
|
+
//! @endrst
|
|
1452
|
+
template <typename KeyT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
|
|
1453
|
+
CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending(
|
|
1454
|
+
void* d_temp_storage,
|
|
1455
|
+
size_t& temp_storage_bytes,
|
|
1456
|
+
DoubleBuffer<KeyT>& d_keys,
|
|
1457
|
+
::cuda::std::int64_t num_items,
|
|
1458
|
+
::cuda::std::int64_t num_segments,
|
|
1459
|
+
BeginOffsetIteratorT d_begin_offsets,
|
|
1460
|
+
EndOffsetIteratorT d_end_offsets,
|
|
1461
|
+
int begin_bit = 0,
|
|
1462
|
+
int end_bit = sizeof(KeyT) * 8,
|
|
1463
|
+
cudaStream_t stream = 0)
|
|
1464
|
+
{
|
|
1465
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, GetName());
|
|
1466
|
+
|
|
1467
|
+
// Signed integer type for global offsets
|
|
1468
|
+
using SegmentSizeT = ::cuda::std::int32_t;
|
|
1469
|
+
|
|
1470
|
+
// Null value type
|
|
1471
|
+
DoubleBuffer<NullType> d_values;
|
|
1472
|
+
|
|
1473
|
+
return DispatchSegmentedRadixSort<
|
|
1474
|
+
SortOrder::Descending,
|
|
1475
|
+
KeyT,
|
|
1476
|
+
NullType,
|
|
1477
|
+
BeginOffsetIteratorT,
|
|
1478
|
+
EndOffsetIteratorT,
|
|
1479
|
+
SegmentSizeT>::Dispatch(d_temp_storage,
|
|
1480
|
+
temp_storage_bytes,
|
|
1481
|
+
d_keys,
|
|
1482
|
+
d_values,
|
|
1483
|
+
num_items,
|
|
1484
|
+
num_segments,
|
|
1485
|
+
d_begin_offsets,
|
|
1486
|
+
d_end_offsets,
|
|
1487
|
+
begin_bit,
|
|
1488
|
+
end_bit,
|
|
1489
|
+
true,
|
|
1490
|
+
stream);
|
|
1491
|
+
}
|
|
1492
|
+
|
|
1493
|
+
//! @} end member group
|
|
1494
|
+
};
|
|
1495
|
+
|
|
1496
|
+
CUB_NAMESPACE_END
|