cuda-cccl 0.3.3__cp313-cp313-win_amd64.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/__init__.py +27 -0
- cuda/cccl/_cuda_version_utils.py +24 -0
- cuda/cccl/cooperative/__init__.py +9 -0
- cuda/cccl/cooperative/experimental/__init__.py +24 -0
- cuda/cccl/headers/__init__.py +7 -0
- cuda/cccl/headers/include/__init__.py +1 -0
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +259 -0
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1182 -0
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +81 -0
- cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +709 -0
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +234 -0
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +748 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +786 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +286 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +703 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +555 -0
- cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +619 -0
- cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +806 -0
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +1124 -0
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +589 -0
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +474 -0
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +289 -0
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1117 -0
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +346 -0
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +606 -0
- cuda/cccl/headers/include/cub/agent/agent_topk.cuh +764 -0
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +631 -0
- cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1424 -0
- cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +963 -0
- cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1227 -0
- cuda/cccl/headers/include/cub/block/block_exchange.cuh +1313 -0
- cuda/cccl/headers/include/cub/block/block_histogram.cuh +424 -0
- cuda/cccl/headers/include/cub/block/block_load.cuh +1264 -0
- cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
- cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +800 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1225 -0
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2196 -0
- cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +150 -0
- cuda/cccl/headers/include/cub/block/block_reduce.cuh +667 -0
- cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +434 -0
- cuda/cccl/headers/include/cub/block/block_scan.cuh +2315 -0
- cuda/cccl/headers/include/cub/block/block_shuffle.cuh +346 -0
- cuda/cccl/headers/include/cub/block/block_store.cuh +1247 -0
- cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +624 -0
- cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +86 -0
- cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +240 -0
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +252 -0
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +238 -0
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +281 -0
- cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +790 -0
- cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +538 -0
- cuda/cccl/headers/include/cub/config.cuh +53 -0
- cuda/cccl/headers/include/cub/cub.cuh +120 -0
- cuda/cccl/headers/include/cub/detail/array_utils.cuh +78 -0
- cuda/cccl/headers/include/cub/detail/choose_offset.cuh +161 -0
- cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +74 -0
- cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +96 -0
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +62 -0
- cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +253 -0
- cuda/cccl/headers/include/cub/detail/integer_utils.cuh +88 -0
- cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +142 -0
- cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +100 -0
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +114 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/README.md +71 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/array.h +68 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/json.h +62 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/object.h +100 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/string.h +53 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/value.h +95 -0
- cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +63 -0
- cuda/cccl/headers/include/cub/detail/rfa.cuh +731 -0
- cuda/cccl/headers/include/cub/detail/strong_load.cuh +189 -0
- cuda/cccl/headers/include/cub/detail/strong_store.cuh +220 -0
- cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +384 -0
- cuda/cccl/headers/include/cub/detail/type_traits.cuh +187 -0
- cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +73 -0
- cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +56 -0
- cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +596 -0
- cuda/cccl/headers/include/cub/device/device_copy.cuh +276 -0
- cuda/cccl/headers/include/cub/device/device_for.cuh +1063 -0
- cuda/cccl/headers/include/cub/device/device_histogram.cuh +1509 -0
- cuda/cccl/headers/include/cub/device/device_memcpy.cuh +195 -0
- cuda/cccl/headers/include/cub/device/device_merge.cuh +203 -0
- cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +979 -0
- cuda/cccl/headers/include/cub/device/device_partition.cuh +668 -0
- cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3437 -0
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +2518 -0
- cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +370 -0
- cuda/cccl/headers/include/cub/device/device_scan.cuh +2212 -0
- cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1496 -0
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1430 -0
- cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2811 -0
- cuda/cccl/headers/include/cub/device/device_select.cuh +1228 -0
- cuda/cccl/headers/include/cub/device/device_topk.cuh +511 -0
- cuda/cccl/headers/include/cub/device/device_transform.cuh +668 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +315 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +719 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +43 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +198 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1046 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +303 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +473 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1744 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +1310 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +655 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +531 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +313 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +615 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +517 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +602 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +975 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +842 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +341 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +440 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +389 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +627 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +569 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +545 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +261 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/histogram.cuh +505 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/merge_sort.cuh +334 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/radix_sort.cuh +803 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +583 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +189 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +321 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +522 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/three_way_partition.cuh +201 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +1028 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/unique_by_key.cuh +176 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +67 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +118 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +60 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +275 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +76 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +126 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1065 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +493 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +942 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +673 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +618 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +1010 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +398 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1588 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +440 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +85 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +481 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +884 -0
- cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +227 -0
- cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +106 -0
- cuda/cccl/headers/include/cub/grid/grid_queue.cuh +202 -0
- cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +254 -0
- cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +259 -0
- cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +250 -0
- cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +320 -0
- cuda/cccl/headers/include/cub/thread/thread_load.cuh +349 -0
- cuda/cccl/headers/include/cub/thread/thread_operators.cuh +688 -0
- cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +548 -0
- cuda/cccl/headers/include/cub/thread/thread_scan.cuh +498 -0
- cuda/cccl/headers/include/cub/thread/thread_search.cuh +199 -0
- cuda/cccl/headers/include/cub/thread/thread_simd.cuh +458 -0
- cuda/cccl/headers/include/cub/thread/thread_sort.cuh +102 -0
- cuda/cccl/headers/include/cub/thread/thread_store.cuh +365 -0
- cuda/cccl/headers/include/cub/util_allocator.cuh +921 -0
- cuda/cccl/headers/include/cub/util_arch.cuh +167 -0
- cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +95 -0
- cuda/cccl/headers/include/cub/util_debug.cuh +207 -0
- cuda/cccl/headers/include/cub/util_device.cuh +800 -0
- cuda/cccl/headers/include/cub/util_macro.cuh +97 -0
- cuda/cccl/headers/include/cub/util_math.cuh +118 -0
- cuda/cccl/headers/include/cub/util_namespace.cuh +176 -0
- cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +55 -0
- cuda/cccl/headers/include/cub/util_ptx.cuh +513 -0
- cuda/cccl/headers/include/cub/util_temporary_storage.cuh +122 -0
- cuda/cccl/headers/include/cub/util_type.cuh +1120 -0
- cuda/cccl/headers/include/cub/util_vsmem.cuh +253 -0
- cuda/cccl/headers/include/cub/version.cuh +89 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +329 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +177 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +737 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +408 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +952 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +715 -0
- cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +405 -0
- cuda/cccl/headers/include/cub/warp/warp_load.cuh +614 -0
- cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +169 -0
- cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +829 -0
- cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1890 -0
- cuda/cccl/headers/include/cub/warp/warp_store.cuh +521 -0
- cuda/cccl/headers/include/cub/warp/warp_utils.cuh +61 -0
- cuda/cccl/headers/include/cuda/__algorithm/common.h +68 -0
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +196 -0
- cuda/cccl/headers/include/cuda/__algorithm/fill.h +107 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +172 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +217 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +83 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +128 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +210 -0
- cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
- cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +102 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +487 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +74 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_native_handle.h +45 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
- cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +171 -0
- cuda/cccl/headers/include/cuda/__bit/bitfield.h +122 -0
- cuda/cccl/headers/include/cuda/__bit/bitmask.h +90 -0
- cuda/cccl/headers/include/cuda/__cccl_config +37 -0
- cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +124 -0
- cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +178 -0
- cuda/cccl/headers/include/cuda/__cmath/ilog.h +195 -0
- cuda/cccl/headers/include/cuda/__cmath/ipow.h +107 -0
- cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
- cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
- cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
- cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
- cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
- cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
- cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
- cuda/cccl/headers/include/cuda/__complex/complex.h +238 -0
- cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +89 -0
- cuda/cccl/headers/include/cuda/__complex/traits.h +64 -0
- cuda/cccl/headers/include/cuda/__complex_ +28 -0
- cuda/cccl/headers/include/cuda/__device/all_devices.h +140 -0
- cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +537 -0
- cuda/cccl/headers/include/cuda/__device/attributes.h +772 -0
- cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
- cuda/cccl/headers/include/cuda/__device/device_ref.h +156 -0
- cuda/cccl/headers/include/cuda/__device/physical_device.h +172 -0
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +835 -0
- cuda/cccl/headers/include/cuda/__event/event.h +171 -0
- cuda/cccl/headers/include/cuda/__event/event_ref.h +157 -0
- cuda/cccl/headers/include/cuda/__event/timed_event.h +120 -0
- cuda/cccl/headers/include/cuda/__execution/determinism.h +91 -0
- cuda/cccl/headers/include/cuda/__execution/output_ordering.h +89 -0
- cuda/cccl/headers/include/cuda/__execution/require.h +75 -0
- cuda/cccl/headers/include/cuda/__execution/tune.h +70 -0
- cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
- cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +321 -0
- cuda/cccl/headers/include/cuda/__functional/maximum.h +58 -0
- cuda/cccl/headers/include/cuda/__functional/minimum.h +58 -0
- cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +108 -0
- cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
- cuda/cccl/headers/include/cuda/__fwd/barrier_native_handle.h +42 -0
- cuda/cccl/headers/include/cuda/__fwd/complex.h +48 -0
- cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
- cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
- cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
- cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +58 -0
- cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +315 -0
- cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +483 -0
- cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +324 -0
- cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +456 -0
- cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +334 -0
- cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +418 -0
- cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +367 -0
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +528 -0
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +527 -0
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +486 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_function.h +112 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +557 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
- cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +533 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +238 -0
- cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +152 -0
- cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +117 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +60 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +148 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +165 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +53 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +104 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +170 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +59 -0
- cuda/cccl/headers/include/cuda/__memory/address_space.h +227 -0
- cuda/cccl/headers/include/cuda/__memory/align_down.h +56 -0
- cuda/cccl/headers/include/cuda/__memory/align_up.h +56 -0
- cuda/cccl/headers/include/cuda/__memory/aligned_size.h +61 -0
- cuda/cccl/headers/include/cuda/__memory/check_address.h +111 -0
- cuda/cccl/headers/include/cuda/__memory/discard_memory.h +64 -0
- cuda/cccl/headers/include/cuda/__memory/get_device_address.h +58 -0
- cuda/cccl/headers/include/cuda/__memory/is_aligned.h +47 -0
- cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
- cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +82 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +153 -0
- cuda/cccl/headers/include/cuda/__memory_resource/properties.h +113 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource.h +125 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +652 -0
- cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +306 -0
- cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
- cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +59 -0
- cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
- cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +120 -0
- cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2983 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +242 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +197 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +54 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +997 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +318 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +671 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1470 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +132 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +64 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +102 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +64 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +5542 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +399 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +184 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +102 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +143 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +144 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +286 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +290 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2202 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1362 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +236 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +460 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1490 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +132 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +99 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +765 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +58 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +4927 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +4291 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +7110 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +42 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +5063 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +56 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +71 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +1030 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/pragmas/enable_smem_spilling.h +47 -0
- cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
- cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +176 -0
- cuda/cccl/headers/include/cuda/__random/feistel_bijection.h +105 -0
- cuda/cccl/headers/include/cuda/__random/random_bijection.h +88 -0
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +99 -0
- cuda/cccl/headers/include/cuda/__runtime/types.h +41 -0
- cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
- cuda/cccl/headers/include/cuda/__stream/get_stream.h +110 -0
- cuda/cccl/headers/include/cuda/__stream/stream.h +141 -0
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +303 -0
- cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
- cuda/cccl/headers/include/cuda/__type_traits/is_specialization_of.h +37 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/access.h +88 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/any_cast.h +83 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_base.h +148 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_from.h +96 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_fwd.h +128 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +304 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ref.h +337 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +590 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/conversions.h +169 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/dynamic_any_cast.h +107 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/interfaces.h +359 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/iset.h +142 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/overrides.h +64 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/rtti.h +257 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +322 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/storage.h +79 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/tagged_ptr.h +58 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +162 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_functions.h +184 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_ptrs.h +80 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +155 -0
- cuda/cccl/headers/include/cuda/__utility/basic_any.h +507 -0
- cuda/cccl/headers/include/cuda/__utility/immovable.h +50 -0
- cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
- cuda/cccl/headers/include/cuda/__utility/inherit.h +36 -0
- cuda/cccl/headers/include/cuda/__utility/no_init.h +29 -0
- cuda/cccl/headers/include/cuda/__utility/static_for.h +79 -0
- cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
- cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
- cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +251 -0
- cuda/cccl/headers/include/cuda/access_property +26 -0
- cuda/cccl/headers/include/cuda/algorithm +27 -0
- cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
- cuda/cccl/headers/include/cuda/atomic +27 -0
- cuda/cccl/headers/include/cuda/barrier +267 -0
- cuda/cccl/headers/include/cuda/bit +29 -0
- cuda/cccl/headers/include/cuda/cmath +37 -0
- cuda/cccl/headers/include/cuda/devices +33 -0
- cuda/cccl/headers/include/cuda/discard_memory +32 -0
- cuda/cccl/headers/include/cuda/functional +32 -0
- cuda/cccl/headers/include/cuda/iterator +39 -0
- cuda/cccl/headers/include/cuda/latch +27 -0
- cuda/cccl/headers/include/cuda/mdspan +28 -0
- cuda/cccl/headers/include/cuda/memory +35 -0
- cuda/cccl/headers/include/cuda/memory_resource +35 -0
- cuda/cccl/headers/include/cuda/numeric +29 -0
- cuda/cccl/headers/include/cuda/pipeline +579 -0
- cuda/cccl/headers/include/cuda/ptx +129 -0
- cuda/cccl/headers/include/cuda/semaphore +31 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +53 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +58 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +142 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/in_fun_result.h +55 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +90 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +88 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +67 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +139 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/move.h +86 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +93 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +100 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each.h +84 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each_n.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +98 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +155 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
- cuda/cccl/headers/include/cuda/std/__algorithm_ +26 -0
- cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
- cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +136 -0
- cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +208 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +401 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +3971 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +177 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +211 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
- cuda/cccl/headers/include/cuda/std/__atomic/order.h +159 -0
- cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
- cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
- cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +249 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +104 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +72 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +228 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types.h +52 -0
- cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
- cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
- cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
- cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
- cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
- cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +76 -0
- cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +185 -0
- cuda/cccl/headers/include/cuda/std/__bit/countl.h +174 -0
- cuda/cccl/headers/include/cuda/std/__bit/countr.h +185 -0
- cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
- cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
- cuda/cccl/headers/include/cuda/std/__bit/integral.h +126 -0
- cuda/cccl/headers/include/cuda/std/__bit/popcount.h +154 -0
- cuda/cccl/headers/include/cuda/std/__bit/reference.h +1272 -0
- cuda/cccl/headers/include/cuda/std/__bit/rotate.h +94 -0
- cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
- cuda/cccl/headers/include/cuda/std/__cccl/assert.h +161 -0
- cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +206 -0
- cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +673 -0
- cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +217 -0
- cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +51 -0
- cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +56 -0
- cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +88 -0
- cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +131 -0
- cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +123 -0
- cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +344 -0
- cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +91 -0
- cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +74 -0
- cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +160 -0
- cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
- cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +73 -0
- cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/os.h +54 -0
- cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1286 -0
- cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +281 -0
- cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +253 -0
- cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
- cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
- cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
- cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
- cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
- cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +171 -0
- cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
- cuda/cccl/headers/include/cuda/std/__charconv/from_chars.h +154 -0
- cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
- cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +148 -0
- cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
- cuda/cccl/headers/include/cuda/std/__charconv_ +31 -0
- cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +54 -0
- cuda/cccl/headers/include/cuda/std/__chrono/day.h +162 -0
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +503 -0
- cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +55 -0
- cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +46 -0
- cuda/cccl/headers/include/cuda/std/__chrono/month.h +187 -0
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +60 -0
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +80 -0
- cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +259 -0
- cuda/cccl/headers/include/cuda/std/__chrono/year.h +186 -0
- cuda/cccl/headers/include/cuda/std/__cmath/abs.h +127 -0
- cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +88 -0
- cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +200 -0
- cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +784 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +118 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fma.h +125 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +231 -0
- cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +205 -0
- cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +286 -0
- cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +221 -0
- cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +286 -0
- cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +371 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +167 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +205 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +186 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
- cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +101 -0
- cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +534 -0
- cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +287 -0
- cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
- cuda/cccl/headers/include/cuda/std/__cmath/nan.h +54 -0
- cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +206 -0
- cuda/cccl/headers/include/cuda/std/__cmath/roots.h +199 -0
- cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +984 -0
- cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +56 -0
- cuda/cccl/headers/include/cuda/std/__cmath/traits.h +238 -0
- cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +328 -0
- cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
- cuda/cccl/headers/include/cuda/std/__complex/complex.h +669 -0
- cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +411 -0
- cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +117 -0
- cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +216 -0
- cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
- cuda/cccl/headers/include/cuda/std/__complex/literals.h +86 -0
- cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +303 -0
- cuda/cccl/headers/include/cuda/std/__complex/math.h +159 -0
- cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +323 -0
- cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +322 -0
- cuda/cccl/headers/include/cuda/std/__complex/roots.h +214 -0
- cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
- cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
- cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +130 -0
- cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
- cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
- cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
- cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +45 -0
- cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
- cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
- cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +341 -0
- cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +174 -0
- cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +70 -0
- cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +56 -0
- cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
- cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
- cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +100 -0
- cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
- cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
- cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
- cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
- cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
- cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +39 -0
- cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
- cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
- cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
- cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
- cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +70 -0
- cuda/cccl/headers/include/cuda/std/__cstring/memcpy.h +61 -0
- cuda/cccl/headers/include/cuda/std/__cstring/memset.h +46 -0
- cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +62 -0
- cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +139 -0
- cuda/cccl/headers/include/cuda/std/__exception/terminate.h +73 -0
- cuda/cccl/headers/include/cuda/std/__execution/env.h +455 -0
- cuda/cccl/headers/include/cuda/std/__execution/policy.h +88 -0
- cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
- cuda/cccl/headers/include/cuda/std/__expected/expected.h +1941 -0
- cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1050 -0
- cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
- cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +165 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +812 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +376 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +124 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +116 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/decompose.h +69 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +40 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +78 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
- cuda/cccl/headers/include/cuda/std/__format/buffer.h +48 -0
- cuda/cccl/headers/include/cuda/std/__format/concepts.h +69 -0
- cuda/cccl/headers/include/cuda/std/__format/format_arg.h +282 -0
- cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +279 -0
- cuda/cccl/headers/include/cuda/std/__format/format_args.h +122 -0
- cuda/cccl/headers/include/cuda/std/__format/format_context.h +92 -0
- cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
- cuda/cccl/headers/include/cuda/std/__format/format_integral.h +237 -0
- cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +124 -0
- cuda/cccl/headers/include/cuda/std/__format/format_spec_parser.h +1230 -0
- cuda/cccl/headers/include/cuda/std/__format/formatter.h +59 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/bool.h +101 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/char.h +124 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/fp.h +101 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/int.h +174 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/ptr.h +104 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/str.h +178 -0
- cuda/cccl/headers/include/cuda/std/__format/output_utils.h +272 -0
- cuda/cccl/headers/include/cuda/std/__format/parse_arg_id.h +138 -0
- cuda/cccl/headers/include/cuda/std/__format_ +45 -0
- cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
- cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
- cuda/cccl/headers/include/cuda/std/__functional/bind.h +334 -0
- cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +80 -0
- cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +73 -0
- cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
- cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
- cuda/cccl/headers/include/cuda/std/__functional/compose.h +68 -0
- cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +75 -0
- cuda/cccl/headers/include/cuda/std/__functional/function.h +1275 -0
- cuda/cccl/headers/include/cuda/std/__functional/hash.h +649 -0
- cuda/cccl/headers/include/cuda/std/__functional/identity.h +57 -0
- cuda/cccl/headers/include/cuda/std/__functional/invoke.h +296 -0
- cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +41 -0
- cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +66 -0
- cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +211 -0
- cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
- cuda/cccl/headers/include/cuda/std/__functional/operations.h +534 -0
- cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +128 -0
- cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +64 -0
- cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +63 -0
- cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
- cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +113 -0
- cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
- cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
- cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
- cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +262 -0
- cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +53 -0
- cuda/cccl/headers/include/cuda/std/__fwd/array.h +42 -0
- cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +74 -0
- cuda/cccl/headers/include/cuda/std/__fwd/complex.h +75 -0
- cuda/cccl/headers/include/cuda/std/__fwd/expected.h +46 -0
- cuda/cccl/headers/include/cuda/std/__fwd/format.h +84 -0
- cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
- cuda/cccl/headers/include/cuda/std/__fwd/get.h +123 -0
- cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
- cuda/cccl/headers/include/cuda/std/__fwd/iterator.h +43 -0
- cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +122 -0
- cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
- cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
- cuda/cccl/headers/include/cuda/std/__fwd/pair.h +47 -0
- cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +34 -0
- cuda/cccl/headers/include/cuda/std/__fwd/span.h +45 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string.h +112 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +91 -0
- cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
- cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +34 -0
- cuda/cccl/headers/include/cuda/std/__fwd/unexpected.h +40 -0
- cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
- cuda/cccl/headers/include/cuda/std/__internal/features.h +72 -0
- cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +143 -0
- cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
- cuda/cccl/headers/include/cuda/std/__iterator/advance.h +228 -0
- cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +163 -0
- cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +253 -0
- cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
- cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +464 -0
- cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
- cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
- cuda/cccl/headers/include/cuda/std/__iterator/distance.h +126 -0
- cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
- cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
- cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +99 -0
- cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +143 -0
- cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
- cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +107 -0
- cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +146 -0
- cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +161 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +163 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +847 -0
- cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
- cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +432 -0
- cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
- cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
- cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +95 -0
- cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
- cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
- cuda/cccl/headers/include/cuda/std/__iterator/prev.h +90 -0
- cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
- cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +156 -0
- cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
- cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +371 -0
- cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
- cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
- cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
- cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +245 -0
- cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
- cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +617 -0
- cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +753 -0
- cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +78 -0
- cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +54 -0
- cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +139 -0
- cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +132 -0
- cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +321 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +97 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +139 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +352 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +759 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +314 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +307 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +605 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +512 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +193 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +189 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +344 -0
- cuda/cccl/headers/include/cuda/std/__memory/addressof.h +67 -0
- cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +81 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator.h +320 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +84 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +525 -0
- cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +60 -0
- cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
- cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +225 -0
- cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +246 -0
- cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
- cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +46 -0
- cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +246 -0
- cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
- cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +92 -0
- cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +678 -0
- cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +765 -0
- cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +54 -0
- cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
- cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
- cuda/cccl/headers/include/cuda/std/__new/allocate.h +126 -0
- cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
- cuda/cccl/headers/include/cuda/std/__new/launder.h +53 -0
- cuda/cccl/headers/include/cuda/std/__new_ +29 -0
- cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
- cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
- cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
- cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +78 -0
- cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
- cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
- cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
- cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +97 -0
- cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
- cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
- cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
- cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
- cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
- cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +74 -0
- cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
- cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
- cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
- cuda/cccl/headers/include/cuda/std/__optional/optional.h +859 -0
- cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +433 -0
- cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
- cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
- cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
- cuda/cccl/headers/include/cuda/std/__random/is_valid.h +106 -0
- cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +398 -0
- cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +335 -0
- cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +183 -0
- cuda/cccl/headers/include/cuda/std/__random_ +29 -0
- cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
- cuda/cccl/headers/include/cuda/std/__ranges/all.h +98 -0
- cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
- cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +302 -0
- cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
- cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
- cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
- cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
- cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
- cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
- cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +78 -0
- cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
- cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +266 -0
- cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +410 -0
- cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +162 -0
- cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +110 -0
- cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +175 -0
- cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
- cuda/cccl/headers/include/cuda/std/__ranges/rend.h +182 -0
- cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +345 -0
- cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
- cuda/cccl/headers/include/cuda/std/__ranges/size.h +201 -0
- cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +513 -0
- cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +476 -0
- cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
- cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
- cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +53 -0
- cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +183 -0
- cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
- cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +234 -0
- cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
- cuda/cccl/headers/include/cuda/std/__string/char_traits.h +191 -0
- cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +581 -0
- cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
- cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
- cuda/cccl/headers/include/cuda/std/__string_ +29 -0
- cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
- cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +106 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +143 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +120 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +260 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +212 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +70 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +84 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +68 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +290 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +149 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +68 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +173 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +46 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +58 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +51 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +211 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_execution_policy.h +81 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +79 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +84 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +84 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +121 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_reference_wrapper.h +50 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +88 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +202 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +46 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +45 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +58 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +45 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +44 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +122 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +131 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +52 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
- cuda/cccl/headers/include/cuda/std/__utility/as_const.h +52 -0
- cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +34 -0
- cuda/cccl/headers/include/cuda/std/__utility/cmp.h +116 -0
- cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
- cuda/cccl/headers/include/cuda/std/__utility/declval.h +76 -0
- cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +161 -0
- cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
- cuda/cccl/headers/include/cuda/std/__utility/forward.h +59 -0
- cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +55 -0
- cuda/cccl/headers/include/cuda/std/__utility/in_place.h +86 -0
- cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +251 -0
- cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
- cuda/cccl/headers/include/cuda/std/__utility/move.h +74 -0
- cuda/cccl/headers/include/cuda/std/__utility/pair.h +791 -0
- cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
- cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +527 -0
- cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
- cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +63 -0
- cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
- cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
- cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
- cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
- cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
- cuda/cccl/headers/include/cuda/std/array +518 -0
- cuda/cccl/headers/include/cuda/std/atomic +810 -0
- cuda/cccl/headers/include/cuda/std/barrier +42 -0
- cuda/cccl/headers/include/cuda/std/bit +35 -0
- cuda/cccl/headers/include/cuda/std/bitset +994 -0
- cuda/cccl/headers/include/cuda/std/cassert +28 -0
- cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
- cuda/cccl/headers/include/cuda/std/cfloat +59 -0
- cuda/cccl/headers/include/cuda/std/chrono +26 -0
- cuda/cccl/headers/include/cuda/std/climits +61 -0
- cuda/cccl/headers/include/cuda/std/cmath +87 -0
- cuda/cccl/headers/include/cuda/std/complex +50 -0
- cuda/cccl/headers/include/cuda/std/concepts +48 -0
- cuda/cccl/headers/include/cuda/std/cstddef +28 -0
- cuda/cccl/headers/include/cuda/std/cstdint +178 -0
- cuda/cccl/headers/include/cuda/std/cstdlib +30 -0
- cuda/cccl/headers/include/cuda/std/cstring +110 -0
- cuda/cccl/headers/include/cuda/std/ctime +154 -0
- cuda/cccl/headers/include/cuda/std/detail/__config +45 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +207 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1721 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +2509 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/iosfwd +128 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/stdexcept +120 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +1365 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +2144 -0
- cuda/cccl/headers/include/cuda/std/execution +29 -0
- cuda/cccl/headers/include/cuda/std/expected +30 -0
- cuda/cccl/headers/include/cuda/std/functional +56 -0
- cuda/cccl/headers/include/cuda/std/initializer_list +44 -0
- cuda/cccl/headers/include/cuda/std/inplace_vector +2170 -0
- cuda/cccl/headers/include/cuda/std/iterator +70 -0
- cuda/cccl/headers/include/cuda/std/latch +34 -0
- cuda/cccl/headers/include/cuda/std/limits +28 -0
- cuda/cccl/headers/include/cuda/std/linalg +30 -0
- cuda/cccl/headers/include/cuda/std/mdspan +38 -0
- cuda/cccl/headers/include/cuda/std/memory +39 -0
- cuda/cccl/headers/include/cuda/std/numbers +346 -0
- cuda/cccl/headers/include/cuda/std/numeric +41 -0
- cuda/cccl/headers/include/cuda/std/optional +31 -0
- cuda/cccl/headers/include/cuda/std/ranges +69 -0
- cuda/cccl/headers/include/cuda/std/ratio +416 -0
- cuda/cccl/headers/include/cuda/std/semaphore +31 -0
- cuda/cccl/headers/include/cuda/std/source_location +83 -0
- cuda/cccl/headers/include/cuda/std/span +628 -0
- cuda/cccl/headers/include/cuda/std/string_view +925 -0
- cuda/cccl/headers/include/cuda/std/tuple +26 -0
- cuda/cccl/headers/include/cuda/std/type_traits +177 -0
- cuda/cccl/headers/include/cuda/std/utility +70 -0
- cuda/cccl/headers/include/cuda/std/variant +25 -0
- cuda/cccl/headers/include/cuda/std/version +240 -0
- cuda/cccl/headers/include/cuda/stream +31 -0
- cuda/cccl/headers/include/cuda/stream_ref +59 -0
- cuda/cccl/headers/include/cuda/type_traits +27 -0
- cuda/cccl/headers/include/cuda/utility +28 -0
- cuda/cccl/headers/include/cuda/version +16 -0
- cuda/cccl/headers/include/cuda/warp +28 -0
- cuda/cccl/headers/include/cuda/work_stealing +26 -0
- cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
- cuda/cccl/headers/include/nv/detail/__target_macros +718 -0
- cuda/cccl/headers/include/nv/target +240 -0
- cuda/cccl/headers/include/thrust/addressof.h +22 -0
- cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
- cuda/cccl/headers/include/thrust/advance.h +57 -0
- cuda/cccl/headers/include/thrust/allocate_unique.h +299 -0
- cuda/cccl/headers/include/thrust/binary_search.h +1910 -0
- cuda/cccl/headers/include/thrust/complex.h +858 -0
- cuda/cccl/headers/include/thrust/copy.h +506 -0
- cuda/cccl/headers/include/thrust/count.h +245 -0
- cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +95 -0
- cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +626 -0
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +192 -0
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +96 -0
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +81 -0
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +78 -0
- cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +76 -0
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +115 -0
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +116 -0
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +77 -0
- cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +99 -0
- cuda/cccl/headers/include/thrust/detail/binary_search.inl +525 -0
- cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
- cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +255 -0
- cuda/cccl/headers/include/thrust/detail/complex/c99math.h +64 -0
- cuda/cccl/headers/include/thrust/detail/complex/catrig.h +875 -0
- cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +589 -0
- cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +233 -0
- cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +161 -0
- cuda/cccl/headers/include/thrust/detail/complex/cexp.h +195 -0
- cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +173 -0
- cuda/cccl/headers/include/thrust/detail/complex/clog.h +223 -0
- cuda/cccl/headers/include/thrust/detail/complex/clogf.h +210 -0
- cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
- cuda/cccl/headers/include/thrust/detail/complex/cpow.h +50 -0
- cuda/cccl/headers/include/thrust/detail/complex/cproj.h +81 -0
- cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
- cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +168 -0
- cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
- cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
- cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +208 -0
- cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +133 -0
- cuda/cccl/headers/include/thrust/detail/complex/math_private.h +138 -0
- cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
- cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
- cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
- cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
- cuda/cccl/headers/include/thrust/detail/config/device_system.h +55 -0
- cuda/cccl/headers/include/thrust/detail/config/host_system.h +48 -0
- cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
- cuda/cccl/headers/include/thrust/detail/config/namespace.h +162 -0
- cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
- cuda/cccl/headers/include/thrust/detail/config.h +36 -0
- cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +228 -0
- cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +273 -0
- cuda/cccl/headers/include/thrust/detail/copy.h +72 -0
- cuda/cccl/headers/include/thrust/detail/copy.inl +129 -0
- cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
- cuda/cccl/headers/include/thrust/detail/copy_if.inl +102 -0
- cuda/cccl/headers/include/thrust/detail/count.h +55 -0
- cuda/cccl/headers/include/thrust/detail/count.inl +89 -0
- cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
- cuda/cccl/headers/include/thrust/detail/equal.inl +93 -0
- cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
- cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +81 -0
- cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
- cuda/cccl/headers/include/thrust/detail/execution_policy.h +120 -0
- cuda/cccl/headers/include/thrust/detail/extrema.inl +184 -0
- cuda/cccl/headers/include/thrust/detail/fill.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/find.inl +113 -0
- cuda/cccl/headers/include/thrust/detail/for_each.inl +84 -0
- cuda/cccl/headers/include/thrust/detail/function.h +49 -0
- cuda/cccl/headers/include/thrust/detail/functional/actor.h +214 -0
- cuda/cccl/headers/include/thrust/detail/functional/operators.h +386 -0
- cuda/cccl/headers/include/thrust/detail/gather.inl +173 -0
- cuda/cccl/headers/include/thrust/detail/generate.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +62 -0
- cuda/cccl/headers/include/thrust/detail/inner_product.inl +118 -0
- cuda/cccl/headers/include/thrust/detail/internal_functional.h +328 -0
- cuda/cccl/headers/include/thrust/detail/logical.inl +113 -0
- cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +77 -0
- cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
- cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +209 -0
- cuda/cccl/headers/include/thrust/detail/merge.inl +276 -0
- cuda/cccl/headers/include/thrust/detail/mismatch.inl +94 -0
- cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +124 -0
- cuda/cccl/headers/include/thrust/detail/partition.inl +378 -0
- cuda/cccl/headers/include/thrust/detail/pointer.h +309 -0
- cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
- cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
- cuda/cccl/headers/include/thrust/detail/range/head_flags.h +116 -0
- cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +130 -0
- cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
- cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +192 -0
- cuda/cccl/headers/include/thrust/detail/reduce.inl +377 -0
- cuda/cccl/headers/include/thrust/detail/reference.h +494 -0
- cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
- cuda/cccl/headers/include/thrust/detail/remove.inl +213 -0
- cuda/cccl/headers/include/thrust/detail/replace.inl +231 -0
- cuda/cccl/headers/include/thrust/detail/reverse.inl +88 -0
- cuda/cccl/headers/include/thrust/detail/scan.inl +518 -0
- cuda/cccl/headers/include/thrust/detail/scatter.inl +157 -0
- cuda/cccl/headers/include/thrust/detail/seq.h +66 -0
- cuda/cccl/headers/include/thrust/detail/sequence.inl +109 -0
- cuda/cccl/headers/include/thrust/detail/set_operations.inl +981 -0
- cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/sort.inl +373 -0
- cuda/cccl/headers/include/thrust/detail/static_assert.h +58 -0
- cuda/cccl/headers/include/thrust/detail/static_map.h +167 -0
- cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +65 -0
- cuda/cccl/headers/include/thrust/detail/tabulate.inl +62 -0
- cuda/cccl/headers/include/thrust/detail/temporary_array.h +153 -0
- cuda/cccl/headers/include/thrust/detail/temporary_array.inl +120 -0
- cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +81 -0
- cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +69 -0
- cuda/cccl/headers/include/thrust/detail/transform_scan.inl +161 -0
- cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +130 -0
- cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +61 -0
- cuda/cccl/headers/include/thrust/detail/type_deduction.h +62 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +47 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +167 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +69 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +59 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +46 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +89 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
- cuda/cccl/headers/include/thrust/detail/type_traits.h +136 -0
- cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +90 -0
- cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/unique.inl +373 -0
- cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
- cuda/cccl/headers/include/thrust/detail/vector_base.h +613 -0
- cuda/cccl/headers/include/thrust/detail/vector_base.inl +1210 -0
- cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
- cuda/cccl/headers/include/thrust/device_delete.h +74 -0
- cuda/cccl/headers/include/thrust/device_free.h +85 -0
- cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
- cuda/cccl/headers/include/thrust/device_malloc.h +84 -0
- cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
- cuda/cccl/headers/include/thrust/device_new.h +112 -0
- cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
- cuda/cccl/headers/include/thrust/device_ptr.h +196 -0
- cuda/cccl/headers/include/thrust/device_reference.h +983 -0
- cuda/cccl/headers/include/thrust/device_vector.h +576 -0
- cuda/cccl/headers/include/thrust/distance.h +43 -0
- cuda/cccl/headers/include/thrust/equal.h +247 -0
- cuda/cccl/headers/include/thrust/execution_policy.h +251 -0
- cuda/cccl/headers/include/thrust/extrema.h +657 -0
- cuda/cccl/headers/include/thrust/fill.h +200 -0
- cuda/cccl/headers/include/thrust/find.h +382 -0
- cuda/cccl/headers/include/thrust/for_each.h +261 -0
- cuda/cccl/headers/include/thrust/functional.h +395 -0
- cuda/cccl/headers/include/thrust/gather.h +464 -0
- cuda/cccl/headers/include/thrust/generate.h +193 -0
- cuda/cccl/headers/include/thrust/host_vector.h +576 -0
- cuda/cccl/headers/include/thrust/inner_product.h +264 -0
- cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +221 -0
- cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +335 -0
- cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
- cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
- cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +38 -0
- cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +38 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +81 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +60 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +65 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +57 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +182 -0
- cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +58 -0
- cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
- cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
- cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +81 -0
- cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +174 -0
- cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +211 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +659 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +334 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_traversal_tags.h +64 -0
- cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +194 -0
- cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
- cuda/cccl/headers/include/thrust/iterator/retag.h +72 -0
- cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +51 -0
- cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +185 -0
- cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
- cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +152 -0
- cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +226 -0
- cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +351 -0
- cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
- cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +359 -0
- cuda/cccl/headers/include/thrust/logical.h +290 -0
- cuda/cccl/headers/include/thrust/memory.h +299 -0
- cuda/cccl/headers/include/thrust/merge.h +725 -0
- cuda/cccl/headers/include/thrust/mismatch.h +261 -0
- cuda/cccl/headers/include/thrust/mr/allocator.h +229 -0
- cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +41 -0
- cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +528 -0
- cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +118 -0
- cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +67 -0
- cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +67 -0
- cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +38 -0
- cuda/cccl/headers/include/thrust/mr/memory_resource.h +217 -0
- cuda/cccl/headers/include/thrust/mr/new.h +100 -0
- cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +63 -0
- cuda/cccl/headers/include/thrust/mr/pool.h +528 -0
- cuda/cccl/headers/include/thrust/mr/pool_options.h +174 -0
- cuda/cccl/headers/include/thrust/mr/sync_pool.h +114 -0
- cuda/cccl/headers/include/thrust/mr/tls_pool.h +64 -0
- cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
- cuda/cccl/headers/include/thrust/mr/validator.h +56 -0
- cuda/cccl/headers/include/thrust/pair.h +99 -0
- cuda/cccl/headers/include/thrust/partition.h +1391 -0
- cuda/cccl/headers/include/thrust/per_device_resource.h +98 -0
- cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +184 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +155 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +104 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +151 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +53 -0
- cuda/cccl/headers/include/thrust/random/detail/mod.h +101 -0
- cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +187 -0
- cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +160 -0
- cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +63 -0
- cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +201 -0
- cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +198 -0
- cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +200 -0
- cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +183 -0
- cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +187 -0
- cuda/cccl/headers/include/thrust/random/discard_block_engine.h +240 -0
- cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +289 -0
- cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +217 -0
- cuda/cccl/headers/include/thrust/random/normal_distribution.h +257 -0
- cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +247 -0
- cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +261 -0
- cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +258 -0
- cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +255 -0
- cuda/cccl/headers/include/thrust/random.h +120 -0
- cuda/cccl/headers/include/thrust/reduce.h +1113 -0
- cuda/cccl/headers/include/thrust/remove.h +768 -0
- cuda/cccl/headers/include/thrust/replace.h +826 -0
- cuda/cccl/headers/include/thrust/reverse.h +215 -0
- cuda/cccl/headers/include/thrust/scan.h +1671 -0
- cuda/cccl/headers/include/thrust/scatter.h +446 -0
- cuda/cccl/headers/include/thrust/sequence.h +277 -0
- cuda/cccl/headers/include/thrust/set_operations.h +3026 -0
- cuda/cccl/headers/include/thrust/shuffle.h +182 -0
- cuda/cccl/headers/include/thrust/sort.h +1320 -0
- cuda/cccl/headers/include/thrust/swap.h +147 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +109 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +60 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +63 -0
- cuda/cccl/headers/include/thrust/system/cpp/memory.h +106 -0
- cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +72 -0
- cuda/cccl/headers/include/thrust/system/cpp/pointer.h +120 -0
- cuda/cccl/headers/include/thrust/system/cpp/vector.h +96 -0
- cuda/cccl/headers/include/thrust/system/cuda/config.h +126 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +219 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +273 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +255 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +289 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +191 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +593 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +243 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +233 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +64 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +96 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +264 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +476 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +100 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +170 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +91 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +60 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +75 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +80 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +61 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +121 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +57 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +228 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +223 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +81 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +405 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +72 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +785 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +1001 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +107 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +122 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +87 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +341 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +414 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +91 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1734 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +469 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +98 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +61 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +429 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +143 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +119 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +117 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +105 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +289 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +310 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +253 -0
- cuda/cccl/headers/include/thrust/system/cuda/error.h +168 -0
- cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +15 -0
- cuda/cccl/headers/include/thrust/system/cuda/memory.h +122 -0
- cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +122 -0
- cuda/cccl/headers/include/thrust/system/cuda/pointer.h +160 -0
- cuda/cccl/headers/include/thrust/system/cuda/vector.h +108 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +52 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/count.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/find.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +61 -0
- cuda/cccl/headers/include/thrust/system/detail/errno.h +120 -0
- cuda/cccl/headers/include/thrust/system/detail/error_category.inl +302 -0
- cuda/cccl/headers/include/thrust/system/detail/error_code.inl +173 -0
- cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +121 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +53 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +79 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +161 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +384 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +45 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +64 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +58 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +146 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/count.h +48 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +84 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +49 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +60 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +66 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +252 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +54 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/find.h +49 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +137 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +58 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +73 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +96 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +45 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +63 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +60 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +72 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +59 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +64 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +86 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +99 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +148 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +49 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +68 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +129 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +207 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +43 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +71 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +100 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +83 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +186 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +86 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +121 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +95 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +175 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +48 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +67 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +63 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +126 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +72 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +85 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +126 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +232 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +73 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +85 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +104 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +282 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +476 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +54 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +125 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +113 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +175 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +44 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +76 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +41 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +54 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +47 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +54 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +82 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +395 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +50 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +56 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +80 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +113 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +45 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +166 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +45 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +115 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +71 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +113 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +81 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +126 -0
- cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +117 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +70 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +42 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +136 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +49 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +119 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +49 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +71 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +52 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +110 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +62 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +74 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +123 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +43 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +141 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +45 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +50 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +75 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +145 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +301 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +64 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +98 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +179 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +154 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +145 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +206 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +59 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +116 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +55 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +356 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +48 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +124 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +48 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +586 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +58 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +115 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +106 -0
- cuda/cccl/headers/include/thrust/system/detail/system_error.inl +108 -0
- cuda/cccl/headers/include/thrust/system/error_code.h +512 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +54 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +77 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +50 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +74 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +56 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +59 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/count.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +50 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +65 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +127 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +66 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/find.h +53 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +56 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +87 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +93 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +88 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +102 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +54 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +54 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +78 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +64 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +65 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +59 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +103 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +72 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +87 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +73 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scan.inl +172 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +36 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +60 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +265 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +60 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +71 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +67 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +75 -0
- cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +62 -0
- cuda/cccl/headers/include/thrust/system/omp/memory.h +111 -0
- cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +75 -0
- cuda/cccl/headers/include/thrust/system/omp/pointer.h +124 -0
- cuda/cccl/headers/include/thrust/system/omp/vector.h +99 -0
- cuda/cccl/headers/include/thrust/system/system_error.h +185 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +54 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +50 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +73 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +47 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +136 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +109 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +66 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +49 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +51 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +91 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +94 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +77 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +327 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +84 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +98 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +54 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +137 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +61 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +400 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +140 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +76 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +87 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +59 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +312 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +33 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +60 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +295 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +60 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +71 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +67 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +75 -0
- cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +62 -0
- cuda/cccl/headers/include/thrust/system/tbb/memory.h +111 -0
- cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +75 -0
- cuda/cccl/headers/include/thrust/system/tbb/pointer.h +124 -0
- cuda/cccl/headers/include/thrust/system/tbb/vector.h +99 -0
- cuda/cccl/headers/include/thrust/system_error.h +57 -0
- cuda/cccl/headers/include/thrust/tabulate.h +125 -0
- cuda/cccl/headers/include/thrust/transform.h +1045 -0
- cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
- cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
- cuda/cccl/headers/include/thrust/tuple.h +139 -0
- cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +261 -0
- cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +154 -0
- cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
- cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +184 -0
- cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +116 -0
- cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +336 -0
- cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
- cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +63 -0
- cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
- cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
- cuda/cccl/headers/include/thrust/unique.h +1088 -0
- cuda/cccl/headers/include/thrust/universal_allocator.h +93 -0
- cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
- cuda/cccl/headers/include/thrust/universal_vector.h +71 -0
- cuda/cccl/headers/include/thrust/version.h +93 -0
- cuda/cccl/headers/include/thrust/zip_function.h +176 -0
- cuda/cccl/headers/include_paths.py +51 -0
- cuda/cccl/parallel/__init__.py +9 -0
- cuda/cccl/parallel/experimental/__init__.py +24 -0
- cuda/cccl/py.typed +0 -0
- cuda/compute/__init__.py +79 -0
- cuda/compute/_bindings.py +79 -0
- cuda/compute/_bindings.pyi +475 -0
- cuda/compute/_bindings_impl.pyx +2273 -0
- cuda/compute/_caching.py +71 -0
- cuda/compute/_cccl_interop.py +422 -0
- cuda/compute/_utils/__init__.py +0 -0
- cuda/compute/_utils/protocols.py +132 -0
- cuda/compute/_utils/temp_storage_buffer.py +86 -0
- cuda/compute/algorithms/__init__.py +54 -0
- cuda/compute/algorithms/_histogram.py +243 -0
- cuda/compute/algorithms/_merge_sort.py +225 -0
- cuda/compute/algorithms/_radix_sort.py +312 -0
- cuda/compute/algorithms/_reduce.py +182 -0
- cuda/compute/algorithms/_scan.py +331 -0
- cuda/compute/algorithms/_segmented_reduce.py +257 -0
- cuda/compute/algorithms/_three_way_partition.py +261 -0
- cuda/compute/algorithms/_transform.py +329 -0
- cuda/compute/algorithms/_unique_by_key.py +252 -0
- cuda/compute/cccl/.gitkeep +0 -0
- cuda/compute/cu12/_bindings_impl.cp313-win_amd64.pyd +0 -0
- cuda/compute/cu12/cccl/cccl.c.parallel.dll +0 -0
- cuda/compute/cu12/cccl/cccl.c.parallel.lib +0 -0
- cuda/compute/cu13/_bindings_impl.cp313-win_amd64.pyd +0 -0
- cuda/compute/cu13/cccl/cccl.c.parallel.dll +0 -0
- cuda/compute/cu13/cccl/cccl.c.parallel.lib +0 -0
- cuda/compute/iterators/__init__.py +21 -0
- cuda/compute/iterators/_factories.py +219 -0
- cuda/compute/iterators/_iterators.py +817 -0
- cuda/compute/iterators/_zip_iterator.py +199 -0
- cuda/compute/numba_utils.py +53 -0
- cuda/compute/op.py +3 -0
- cuda/compute/struct.py +272 -0
- cuda/compute/typing.py +37 -0
- cuda/coop/__init__.py +8 -0
- cuda/coop/_caching.py +48 -0
- cuda/coop/_common.py +275 -0
- cuda/coop/_nvrtc.py +92 -0
- cuda/coop/_scan_op.py +181 -0
- cuda/coop/_types.py +937 -0
- cuda/coop/_typing.py +107 -0
- cuda/coop/block/__init__.py +39 -0
- cuda/coop/block/_block_exchange.py +251 -0
- cuda/coop/block/_block_load_store.py +215 -0
- cuda/coop/block/_block_merge_sort.py +125 -0
- cuda/coop/block/_block_radix_sort.py +214 -0
- cuda/coop/block/_block_reduce.py +294 -0
- cuda/coop/block/_block_scan.py +983 -0
- cuda/coop/warp/__init__.py +9 -0
- cuda/coop/warp/_warp_merge_sort.py +92 -0
- cuda/coop/warp/_warp_reduce.py +153 -0
- cuda/coop/warp/_warp_scan.py +78 -0
- cuda_cccl-0.3.3.dist-info/METADATA +41 -0
- cuda_cccl-0.3.3.dist-info/RECORD +1968 -0
- cuda_cccl-0.3.3.dist-info/WHEEL +5 -0
- cuda_cccl-0.3.3.dist-info/licenses/LICENSE +1 -0
|
@@ -0,0 +1,2212 @@
|
|
|
1
|
+
/******************************************************************************
|
|
2
|
+
* Copyright (c) 2011, Duane Merrill. All rights reserved.
|
|
3
|
+
* Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
|
|
4
|
+
*
|
|
5
|
+
* Redistribution and use in source and binary forms, with or without
|
|
6
|
+
* modification, are permitted provided that the following conditions are met:
|
|
7
|
+
* * Redistributions of source code must retain the above copyright
|
|
8
|
+
* notice, this list of conditions and the following disclaimer.
|
|
9
|
+
* * Redistributions in binary form must reproduce the above copyright
|
|
10
|
+
* notice, this list of conditions and the following disclaimer in the
|
|
11
|
+
* documentation and/or other materials provided with the distribution.
|
|
12
|
+
* * Neither the name of the NVIDIA CORPORATION nor the
|
|
13
|
+
* names of its contributors may be used to endorse or promote products
|
|
14
|
+
* derived from this software without specific prior written permission.
|
|
15
|
+
*
|
|
16
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
17
|
+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
18
|
+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
19
|
+
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
|
|
20
|
+
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
|
21
|
+
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
22
|
+
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
|
23
|
+
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
24
|
+
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
25
|
+
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
26
|
+
*
|
|
27
|
+
******************************************************************************/
|
|
28
|
+
|
|
29
|
+
//! @file
|
|
30
|
+
//! cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data
|
|
31
|
+
//! items residing within device-accessible memory.
|
|
32
|
+
|
|
33
|
+
#pragma once
|
|
34
|
+
|
|
35
|
+
#include <cub/config.cuh>
|
|
36
|
+
|
|
37
|
+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
|
|
38
|
+
# pragma GCC system_header
|
|
39
|
+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
|
|
40
|
+
# pragma clang system_header
|
|
41
|
+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
|
|
42
|
+
# pragma system_header
|
|
43
|
+
#endif // no system header
|
|
44
|
+
|
|
45
|
+
#include <cub/detail/choose_offset.cuh>
|
|
46
|
+
#include <cub/detail/device_memory_resource.cuh>
|
|
47
|
+
#include <cub/detail/temporary_storage.cuh>
|
|
48
|
+
#include <cub/device/dispatch/dispatch_scan.cuh>
|
|
49
|
+
#include <cub/device/dispatch/dispatch_scan_by_key.cuh>
|
|
50
|
+
#include <cub/thread/thread_operators.cuh>
|
|
51
|
+
|
|
52
|
+
#include <cuda/__execution/determinism.h>
|
|
53
|
+
#include <cuda/__execution/require.h>
|
|
54
|
+
#include <cuda/__execution/tune.h>
|
|
55
|
+
#include <cuda/__memory_resource/get_memory_resource.h>
|
|
56
|
+
#include <cuda/__stream/get_stream.h>
|
|
57
|
+
#include <cuda/std/__execution/env.h>
|
|
58
|
+
#include <cuda/std/__functional/invoke.h>
|
|
59
|
+
|
|
60
|
+
CUB_NAMESPACE_BEGIN
|
|
61
|
+
|
|
62
|
+
namespace detail::scan
|
|
63
|
+
{
|
|
64
|
+
struct get_tuning_query_t
|
|
65
|
+
{};
|
|
66
|
+
|
|
67
|
+
template <class Derived>
|
|
68
|
+
struct tuning
|
|
69
|
+
{
|
|
70
|
+
[[nodiscard]] _CCCL_NODEBUG_API constexpr Derived query(const get_tuning_query_t&) const noexcept
|
|
71
|
+
{
|
|
72
|
+
return static_cast<const Derived&>(*this);
|
|
73
|
+
}
|
|
74
|
+
};
|
|
75
|
+
|
|
76
|
+
struct default_tuning : tuning<default_tuning>
|
|
77
|
+
{
|
|
78
|
+
template <typename InputValueT, typename OutputValueT, typename AccumT, typename OffsetT, typename ScanOpT>
|
|
79
|
+
using fn = policy_hub<InputValueT, OutputValueT, AccumT, OffsetT, ScanOpT>;
|
|
80
|
+
};
|
|
81
|
+
|
|
82
|
+
} // namespace detail::scan
|
|
83
|
+
|
|
84
|
+
//! @rst
|
|
85
|
+
//! DeviceScan provides device-wide, parallel operations for computing a
|
|
86
|
+
//! prefix scan across a sequence of data items residing within
|
|
87
|
+
//! device-accessible memory.
|
|
88
|
+
//!
|
|
89
|
+
//! Overview
|
|
90
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
91
|
+
//!
|
|
92
|
+
//! Given a sequence of input elements and a binary reduction operator, a
|
|
93
|
+
//! `prefix scan <http://en.wikipedia.org/wiki/Prefix_sum>`_ produces an output
|
|
94
|
+
//! sequence where each element is computed to be the reduction of the elements
|
|
95
|
+
//! occurring earlier in the input sequence. *Prefix sum* connotes a prefix scan
|
|
96
|
+
//! with the addition operator. The term *inclusive* indicates that the
|
|
97
|
+
//! *i*\ :sup:`th` output reduction incorporates the *i*\ :sup:`th` input.
|
|
98
|
+
//! The term *exclusive* indicates the *i*\ :sup:`th` input is not
|
|
99
|
+
//! incorporated into the *i*\ :sup:`th` output reduction. When the input and
|
|
100
|
+
//! output sequences are the same, the scan is performed in-place.
|
|
101
|
+
//!
|
|
102
|
+
//! In order to provide an efficient parallel implementation, the binary reduction operator must be associative. That
|
|
103
|
+
//! is, ``op(op(a, b), c)`` must be equivalent to ``op(a, op(b, c))`` for any input values ``a``, ``b``, and ``c``.
|
|
104
|
+
//!
|
|
105
|
+
//! As of CUB 1.0.1 (2013), CUB's device-wide scan APIs have implemented our
|
|
106
|
+
//! *"decoupled look-back"* algorithm for performing global prefix scan with
|
|
107
|
+
//! only a single pass through the input data, as described in our 2016 technical
|
|
108
|
+
//! report [1]_. The central idea is to leverage a small, constant factor of
|
|
109
|
+
//! redundant work in order to overlap the latencies of global prefix
|
|
110
|
+
//! propagation with local computation. As such, our algorithm requires only
|
|
111
|
+
//! ``~2n`` data movement (``n`` inputs are read, ``n`` outputs are written), and
|
|
112
|
+
//! typically proceeds at "memcpy" speeds. Our algorithm supports inplace operations.
|
|
113
|
+
//!
|
|
114
|
+
//! .. [1] Duane Merrill and Michael Garland. `Single-pass Parallel Prefix Scan with Decoupled Look-back
|
|
115
|
+
//! <https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back>`_,
|
|
116
|
+
//! *NVIDIA Technical Report NVR-2016-002*, 2016.
|
|
117
|
+
//!
|
|
118
|
+
//! Usage Considerations
|
|
119
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
120
|
+
//!
|
|
121
|
+
//! @cdp_class{DeviceScan}
|
|
122
|
+
//!
|
|
123
|
+
//! Performance
|
|
124
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
125
|
+
//!
|
|
126
|
+
//! @linear_performance{prefix scan}
|
|
127
|
+
//!
|
|
128
|
+
//! @endrst
|
|
129
|
+
struct DeviceScan
|
|
130
|
+
{
|
|
131
|
+
//! @cond
|
|
132
|
+
template <typename TuningEnvT,
|
|
133
|
+
typename InputIteratorT,
|
|
134
|
+
typename OutputIteratorT,
|
|
135
|
+
typename ScanOpT,
|
|
136
|
+
typename InitValueT,
|
|
137
|
+
typename NumItemsT,
|
|
138
|
+
::cuda::execution::determinism::__determinism_t Determinism,
|
|
139
|
+
ForceInclusive EnforceInclusive = ForceInclusive::No>
|
|
140
|
+
CUB_RUNTIME_FUNCTION static cudaError_t scan_impl_determinism(
|
|
141
|
+
void* d_temp_storage,
|
|
142
|
+
size_t& temp_storage_bytes,
|
|
143
|
+
InputIteratorT d_in,
|
|
144
|
+
OutputIteratorT d_out,
|
|
145
|
+
ScanOpT scan_op,
|
|
146
|
+
InitValueT init,
|
|
147
|
+
NumItemsT num_items,
|
|
148
|
+
::cuda::execution::determinism::__determinism_holder_t<Determinism>,
|
|
149
|
+
cudaStream_t stream)
|
|
150
|
+
{
|
|
151
|
+
using scan_tuning_t = ::cuda::std::execution::
|
|
152
|
+
__query_result_or_t<TuningEnvT, detail::scan::get_tuning_query_t, detail::scan::default_tuning>;
|
|
153
|
+
|
|
154
|
+
// Unsigned integer type for global offsets
|
|
155
|
+
using offset_t = detail::choose_offset_t<NumItemsT>;
|
|
156
|
+
|
|
157
|
+
using accum_t =
|
|
158
|
+
::cuda::std::__accumulator_t<ScanOpT,
|
|
159
|
+
cub::detail::it_value_t<InputIteratorT>,
|
|
160
|
+
::cuda::std::_If<::cuda::std::is_same_v<InitValueT, NullType>,
|
|
161
|
+
cub::detail::it_value_t<InputIteratorT>,
|
|
162
|
+
typename InitValueT::value_type>>;
|
|
163
|
+
|
|
164
|
+
using policy_t = typename scan_tuning_t::
|
|
165
|
+
template fn<detail::it_value_t<InputIteratorT>, detail::it_value_t<OutputIteratorT>, accum_t, offset_t, ScanOpT>;
|
|
166
|
+
|
|
167
|
+
using dispatch_t =
|
|
168
|
+
DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, InitValueT, offset_t, accum_t, EnforceInclusive, policy_t>;
|
|
169
|
+
|
|
170
|
+
return dispatch_t::Dispatch(
|
|
171
|
+
d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init, static_cast<offset_t>(num_items), stream);
|
|
172
|
+
}
|
|
173
|
+
//! @endcond
|
|
174
|
+
|
|
175
|
+
//! @cond
|
|
176
|
+
template <typename InputIteratorT,
|
|
177
|
+
typename OutputIteratorT,
|
|
178
|
+
typename ScanOpT,
|
|
179
|
+
typename InitValueT,
|
|
180
|
+
typename NumItemsT,
|
|
181
|
+
ForceInclusive EnforceInclusive = ForceInclusive::No,
|
|
182
|
+
typename EnvT>
|
|
183
|
+
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t scan_impl_env(
|
|
184
|
+
InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op, InitValueT init, NumItemsT num_items, EnvT env)
|
|
185
|
+
{
|
|
186
|
+
static_assert(!_CUDA_STD_EXEC::__queryable_with<EnvT, _CUDA_EXEC::determinism::__get_determinism_t>,
|
|
187
|
+
"Determinism should be used inside requires to have an effect.");
|
|
188
|
+
|
|
189
|
+
using requirements_t =
|
|
190
|
+
_CUDA_STD_EXEC::__query_result_or_t<EnvT, _CUDA_EXEC::__get_requirements_t, _CUDA_STD_EXEC::env<>>;
|
|
191
|
+
|
|
192
|
+
using requested_determinism_t =
|
|
193
|
+
_CUDA_STD_EXEC::__query_result_or_t<requirements_t, //
|
|
194
|
+
_CUDA_EXEC::determinism::__get_determinism_t,
|
|
195
|
+
_CUDA_EXEC::determinism::run_to_run_t>;
|
|
196
|
+
|
|
197
|
+
// Static assert to reject gpu_to_gpu determinism since it's not implemented
|
|
198
|
+
static_assert(!::cuda::std::is_same_v<requested_determinism_t, ::cuda::execution::determinism::gpu_to_gpu_t>,
|
|
199
|
+
"gpu_to_gpu determinism is not supported");
|
|
200
|
+
|
|
201
|
+
static_assert(!::cuda::std::is_same_v<requested_determinism_t, ::cuda::execution::determinism::not_guaranteed_t>,
|
|
202
|
+
"not_guaranteed determinism is not supported");
|
|
203
|
+
|
|
204
|
+
using determinism_t = ::cuda::execution::determinism::run_to_run_t;
|
|
205
|
+
|
|
206
|
+
// Query relevant properties from the environment
|
|
207
|
+
auto stream = _CUDA_STD_EXEC::__query_or(env, ::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}});
|
|
208
|
+
auto mr = _CUDA_STD_EXEC::__query_or(env, ::cuda::mr::__get_memory_resource, detail::device_memory_resource{});
|
|
209
|
+
|
|
210
|
+
void* d_temp_storage = nullptr;
|
|
211
|
+
size_t temp_storage_bytes = 0;
|
|
212
|
+
|
|
213
|
+
using tuning_t = _CUDA_STD_EXEC::__query_result_or_t<EnvT, _CUDA_EXEC::__get_tuning_t, _CUDA_STD_EXEC::env<>>;
|
|
214
|
+
|
|
215
|
+
// Query the required temporary storage size
|
|
216
|
+
cudaError_t error = scan_impl_determinism<tuning_t>(
|
|
217
|
+
d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init, num_items, determinism_t{}, stream.get());
|
|
218
|
+
|
|
219
|
+
if (error != cudaSuccess)
|
|
220
|
+
{
|
|
221
|
+
return error;
|
|
222
|
+
}
|
|
223
|
+
|
|
224
|
+
// TODO(gevtushenko): use uninitialized buffer when it's available
|
|
225
|
+
error = CubDebug(detail::temporary_storage::allocate(stream, d_temp_storage, temp_storage_bytes, mr));
|
|
226
|
+
if (error != cudaSuccess)
|
|
227
|
+
{
|
|
228
|
+
return error;
|
|
229
|
+
}
|
|
230
|
+
|
|
231
|
+
// Run the algorithm
|
|
232
|
+
error = scan_impl_determinism<tuning_t>(
|
|
233
|
+
d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, init, num_items, determinism_t{}, stream.get());
|
|
234
|
+
|
|
235
|
+
// Try to deallocate regardless of the error to avoid memory leaks
|
|
236
|
+
cudaError_t deallocate_error =
|
|
237
|
+
CubDebug(detail::temporary_storage::deallocate(stream, d_temp_storage, temp_storage_bytes, mr));
|
|
238
|
+
|
|
239
|
+
if (error != cudaSuccess)
|
|
240
|
+
{
|
|
241
|
+
// Scan error takes precedence over deallocation error since it happens first
|
|
242
|
+
return error;
|
|
243
|
+
}
|
|
244
|
+
|
|
245
|
+
return deallocate_error;
|
|
246
|
+
}
|
|
247
|
+
//! @endcond
|
|
248
|
+
|
|
249
|
+
//! @name Exclusive scans
|
|
250
|
+
//! @{
|
|
251
|
+
|
|
252
|
+
//! @rst
|
|
253
|
+
//! Computes a device-wide exclusive prefix sum.
|
|
254
|
+
//! The value of ``0`` is applied as the initial value, and is assigned to ``*d_out``.
|
|
255
|
+
//!
|
|
256
|
+
//! - Supports non-commutative sum operators.
|
|
257
|
+
//! - Results are not deterministic for pseudo-associative operators (e.g.,
|
|
258
|
+
//! addition of floating-point types). Results for pseudo-associative
|
|
259
|
+
//! operators may vary from run to run. Additional details can be found in
|
|
260
|
+
//! the @lookback description.
|
|
261
|
+
//! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place.
|
|
262
|
+
//! The range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
|
|
263
|
+
//! shall not overlap in any other way.
|
|
264
|
+
//! - @devicestorage
|
|
265
|
+
//!
|
|
266
|
+
//! Snippet
|
|
267
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
268
|
+
//!
|
|
269
|
+
//! The code snippet below illustrates the exclusive prefix sum of an ``int``
|
|
270
|
+
//! device vector.
|
|
271
|
+
//!
|
|
272
|
+
//! .. code-block:: c++
|
|
273
|
+
//!
|
|
274
|
+
//! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
|
|
275
|
+
//!
|
|
276
|
+
//! // Declare, allocate, and initialize device-accessible pointers for
|
|
277
|
+
//! // input and output
|
|
278
|
+
//! int num_items; // e.g., 7
|
|
279
|
+
//! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
280
|
+
//! int *d_out; // e.g., [ , , , , , , ]
|
|
281
|
+
//! ...
|
|
282
|
+
//!
|
|
283
|
+
//! // Determine temporary device storage requirements
|
|
284
|
+
//! void *d_temp_storage = nullptr;
|
|
285
|
+
//! size_t temp_storage_bytes = 0;
|
|
286
|
+
//! cub::DeviceScan::ExclusiveSum(
|
|
287
|
+
//! d_temp_storage, temp_storage_bytes,
|
|
288
|
+
//! d_in, d_out, num_items);
|
|
289
|
+
//!
|
|
290
|
+
//! // Allocate temporary storage
|
|
291
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
292
|
+
//!
|
|
293
|
+
//! // Run exclusive prefix sum
|
|
294
|
+
//! cub::DeviceScan::ExclusiveSum(
|
|
295
|
+
//! d_temp_storage, temp_storage_bytes,
|
|
296
|
+
//! d_in, d_out, num_items);
|
|
297
|
+
//!
|
|
298
|
+
//! // d_out <-- [0, 8, 14, 21, 26, 29, 29]
|
|
299
|
+
//!
|
|
300
|
+
//! @endrst
|
|
301
|
+
//!
|
|
302
|
+
//! @tparam InputIteratorT
|
|
303
|
+
//! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
|
|
304
|
+
//!
|
|
305
|
+
//! @tparam OutputIteratorT
|
|
306
|
+
//! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
|
|
307
|
+
//!
|
|
308
|
+
//! @tparam NumItemsT
|
|
309
|
+
//! **[inferred]** An integral type representing the number of input elements
|
|
310
|
+
//!
|
|
311
|
+
//! @param[in] d_temp_storage
|
|
312
|
+
//! Device-accessible allocation of temporary storage. When `nullptr`, the
|
|
313
|
+
//! required allocation size is written to `temp_storage_bytes` and no work is done.
|
|
314
|
+
//!
|
|
315
|
+
//! @param[in,out] temp_storage_bytes
|
|
316
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
317
|
+
//!
|
|
318
|
+
//! @param[in] d_in
|
|
319
|
+
//! Random-access iterator to the input sequence of data items
|
|
320
|
+
//!
|
|
321
|
+
//! @param[out] d_out
|
|
322
|
+
//! Random-access iterator to the output sequence of data items
|
|
323
|
+
//!
|
|
324
|
+
//! @param[in] num_items
|
|
325
|
+
//! Total number of input items (i.e., the length of `d_in`)
|
|
326
|
+
//!
|
|
327
|
+
//! @param[in] stream
|
|
328
|
+
//! @rst
|
|
329
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
330
|
+
//! @endrst
|
|
331
|
+
template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
|
|
332
|
+
CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum(
|
|
333
|
+
void* d_temp_storage,
|
|
334
|
+
size_t& temp_storage_bytes,
|
|
335
|
+
InputIteratorT d_in,
|
|
336
|
+
OutputIteratorT d_out,
|
|
337
|
+
NumItemsT num_items,
|
|
338
|
+
cudaStream_t stream = 0)
|
|
339
|
+
{
|
|
340
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveSum");
|
|
341
|
+
|
|
342
|
+
// Unsigned integer type for global offsets
|
|
343
|
+
using OffsetT = detail::choose_offset_t<NumItemsT>;
|
|
344
|
+
using InitT = cub::detail::it_value_t<InputIteratorT>;
|
|
345
|
+
|
|
346
|
+
// Initial value
|
|
347
|
+
InitT init_value{};
|
|
348
|
+
|
|
349
|
+
return DispatchScan<InputIteratorT, OutputIteratorT, ::cuda::std::plus<>, detail::InputValue<InitT>, OffsetT>::
|
|
350
|
+
Dispatch(d_temp_storage,
|
|
351
|
+
temp_storage_bytes,
|
|
352
|
+
d_in,
|
|
353
|
+
d_out,
|
|
354
|
+
::cuda::std::plus<>{},
|
|
355
|
+
detail::InputValue<InitT>(init_value),
|
|
356
|
+
num_items,
|
|
357
|
+
stream);
|
|
358
|
+
}
|
|
359
|
+
|
|
360
|
+
//! @rst
|
|
361
|
+
//! Computes a device-wide exclusive prefix sum.
|
|
362
|
+
//! The value of ``0`` is applied as the initial value, and is assigned to ``*d_out``.
|
|
363
|
+
//!
|
|
364
|
+
//! - Supports non-commutative sum operators.
|
|
365
|
+
//! - Results are not deterministic for pseudo-associative operators (e.g.,
|
|
366
|
+
//! addition of floating-point types). Results for pseudo-associative
|
|
367
|
+
//! operators may vary from run to run. Additional details can be found in
|
|
368
|
+
//! the @lookback description.
|
|
369
|
+
//! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place.
|
|
370
|
+
//! The range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
|
|
371
|
+
//! shall not overlap in any other way.
|
|
372
|
+
//! - @devicestorage
|
|
373
|
+
//!
|
|
374
|
+
//! Preconditions
|
|
375
|
+
//! +++++++++++++
|
|
376
|
+
//!
|
|
377
|
+
//! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place.
|
|
378
|
+
//! The range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
|
|
379
|
+
//! shall not overlap in any other way.
|
|
380
|
+
//! - ``d_in`` and ``d_out`` must not be null pointers
|
|
381
|
+
//!
|
|
382
|
+
//! Snippet
|
|
383
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
384
|
+
//!
|
|
385
|
+
//! The code snippet below illustrates the exclusive prefix sum of a
|
|
386
|
+
//! device vector of ``float`` data elements.
|
|
387
|
+
//!
|
|
388
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_scan_env_api.cu
|
|
389
|
+
//! :language: c++
|
|
390
|
+
//! :dedent:
|
|
391
|
+
//! :start-after: example-begin exclusive-sum-env-determinism
|
|
392
|
+
//! :end-before: example-end exclusive-sum-env-determinism
|
|
393
|
+
//!
|
|
394
|
+
//! @endrst
|
|
395
|
+
//!
|
|
396
|
+
//! @tparam InputIteratorT
|
|
397
|
+
//! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
|
|
398
|
+
//!
|
|
399
|
+
//! @tparam OutputIteratorT
|
|
400
|
+
//! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
|
|
401
|
+
//!
|
|
402
|
+
//! @tparam NumItemsT
|
|
403
|
+
//! **[inferred]** An integral type representing the number of input elements
|
|
404
|
+
//!
|
|
405
|
+
//! @tparam EnvT
|
|
406
|
+
//! **[inferred]** Execution environment type. Default is `_CUDA_STD_EXEC::env<>`.
|
|
407
|
+
//!
|
|
408
|
+
//! @param[in] d_in
|
|
409
|
+
//! Random-access iterator to the input sequence of data items
|
|
410
|
+
//!
|
|
411
|
+
//! @param[out] d_out
|
|
412
|
+
//! Random-access iterator to the output sequence of data items
|
|
413
|
+
//!
|
|
414
|
+
//! @param[in] num_items
|
|
415
|
+
//! Total number of input items (i.e., the length of `d_in`)
|
|
416
|
+
//!
|
|
417
|
+
//! @param[in] env
|
|
418
|
+
//! @rst
|
|
419
|
+
//! **[optional]** Execution environment. Default is ``_CUDA_STD_EXEC::env{}``.
|
|
420
|
+
//! @endrst
|
|
421
|
+
template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT, typename EnvT = _CUDA_STD_EXEC::env<>>
|
|
422
|
+
[[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t
|
|
423
|
+
ExclusiveSum(InputIteratorT d_in, OutputIteratorT d_out, NumItemsT num_items, EnvT env = {})
|
|
424
|
+
{
|
|
425
|
+
_CCCL_NVTX_RANGE_SCOPE("cub::DeviceScan::ExclusiveSum");
|
|
426
|
+
|
|
427
|
+
using InitT = cub::detail::it_value_t<InputIteratorT>;
|
|
428
|
+
|
|
429
|
+
// Initial value
|
|
430
|
+
InitT init_value{};
|
|
431
|
+
|
|
432
|
+
return scan_impl_env(d_in, d_out, ::cuda::std::plus<>{}, detail::InputValue<InitT>(init_value), num_items, env);
|
|
433
|
+
}
|
|
434
|
+
|
|
435
|
+
//! @rst
|
|
436
|
+
//! Computes a device-wide exclusive prefix sum in-place.
|
|
437
|
+
//! The value of ``0`` is applied as the initial value, and is assigned to ``*d_data``.
|
|
438
|
+
//!
|
|
439
|
+
//! - Supports non-commutative sum operators.
|
|
440
|
+
//! - Results are not deterministic for pseudo-associative operators (e.g.,
|
|
441
|
+
//! addition of floating-point types). Results for pseudo-associative
|
|
442
|
+
//! operators may vary from run to run. Additional details can be found in
|
|
443
|
+
//! the @lookback description.
|
|
444
|
+
//! - @devicestorage
|
|
445
|
+
//!
|
|
446
|
+
//! Snippet
|
|
447
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
448
|
+
//!
|
|
449
|
+
//! The code snippet below illustrates the exclusive prefix sum of an ``int``
|
|
450
|
+
//! device vector.
|
|
451
|
+
//!
|
|
452
|
+
//! .. code-block:: c++
|
|
453
|
+
//!
|
|
454
|
+
//! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
|
|
455
|
+
//!
|
|
456
|
+
//! // Declare, allocate, and initialize device-accessible pointers for
|
|
457
|
+
//! // input and output
|
|
458
|
+
//! int num_items; // e.g., 7
|
|
459
|
+
//! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
460
|
+
//! ...
|
|
461
|
+
//!
|
|
462
|
+
//! // Determine temporary device storage requirements
|
|
463
|
+
//! void *d_temp_storage = nullptr;
|
|
464
|
+
//! size_t temp_storage_bytes = 0;
|
|
465
|
+
//! cub::DeviceScan::ExclusiveSum(
|
|
466
|
+
//! d_temp_storage, temp_storage_bytes,
|
|
467
|
+
//! d_data, num_items);
|
|
468
|
+
//!
|
|
469
|
+
//! // Allocate temporary storage
|
|
470
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
471
|
+
//!
|
|
472
|
+
//! // Run exclusive prefix sum
|
|
473
|
+
//! cub::DeviceScan::ExclusiveSum(
|
|
474
|
+
//! d_temp_storage, temp_storage_bytes,
|
|
475
|
+
//! d_data, num_items);
|
|
476
|
+
//!
|
|
477
|
+
//! // d_data <-- [0, 8, 14, 21, 26, 29, 29]
|
|
478
|
+
//!
|
|
479
|
+
//! @endrst
|
|
480
|
+
//!
|
|
481
|
+
//! @tparam IteratorT
|
|
482
|
+
//! **[inferred]** Random-access iterator type for reading scan inputs and writing scan outputs
|
|
483
|
+
//!
|
|
484
|
+
//! @tparam NumItemsT
|
|
485
|
+
//! **[inferred]** An integral type representing the number of input elements
|
|
486
|
+
//!
|
|
487
|
+
//! @param[in] d_temp_storage
|
|
488
|
+
//! Device-accessible allocation of temporary storage. When `nullptr`, the
|
|
489
|
+
//! required allocation size is written to `temp_storage_bytes` and no work is done.
|
|
490
|
+
//!
|
|
491
|
+
//! @param[in,out] temp_storage_bytes
|
|
492
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
493
|
+
//!
|
|
494
|
+
//! @param[in,out] d_data
|
|
495
|
+
//! Random-access iterator to the sequence of data items
|
|
496
|
+
//!
|
|
497
|
+
//! @param[in] num_items
|
|
498
|
+
//! Total number of input items (i.e., the length of `d_data`)
|
|
499
|
+
//!
|
|
500
|
+
//! @param[in] stream
|
|
501
|
+
//! @rst
|
|
502
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
503
|
+
//! @endrst
|
|
504
|
+
template <typename IteratorT, typename NumItemsT>
|
|
505
|
+
CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSum(
|
|
506
|
+
void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, NumItemsT num_items, cudaStream_t stream = 0)
|
|
507
|
+
{
|
|
508
|
+
return ExclusiveSum(d_temp_storage, temp_storage_bytes, d_data, d_data, num_items, stream);
|
|
509
|
+
}
|
|
510
|
+
|
|
511
|
+
//! @rst
|
|
512
|
+
//! Computes a device-wide exclusive prefix scan using the specified
|
|
513
|
+
//! binary associative ``scan_op`` functor. The ``init_value`` value is applied as
|
|
514
|
+
//! the initial value, and is assigned to ``*d_out``.
|
|
515
|
+
//!
|
|
516
|
+
//! - Supports non-commutative scan operators.
|
|
517
|
+
//! - Results are not deterministic for pseudo-associative operators (e.g.,
|
|
518
|
+
//! addition of floating-point types). Results for pseudo-associative
|
|
519
|
+
//! operators may vary from run to run. Additional details can be found in
|
|
520
|
+
//! the @lookback description.
|
|
521
|
+
//! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The
|
|
522
|
+
//! range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
|
|
523
|
+
//! shall not overlap in any other way.
|
|
524
|
+
//! - @devicestorage
|
|
525
|
+
//!
|
|
526
|
+
//! Snippet
|
|
527
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
528
|
+
//!
|
|
529
|
+
//! The code snippet below illustrates the exclusive prefix min-scan of an ``int`` device vector
|
|
530
|
+
//!
|
|
531
|
+
//! .. code-block:: c++
|
|
532
|
+
//!
|
|
533
|
+
//! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
|
|
534
|
+
//! #include <cuda/std/climits> // for INT_MAX
|
|
535
|
+
//!
|
|
536
|
+
//! // CustomMin functor
|
|
537
|
+
//! struct CustomMin
|
|
538
|
+
//! {
|
|
539
|
+
//! template <typename T>
|
|
540
|
+
//! __host__ __device__ __forceinline__
|
|
541
|
+
//! T operator()(const T &a, const T &b) const {
|
|
542
|
+
//! return (b < a) ? b : a;
|
|
543
|
+
//! }
|
|
544
|
+
//! };
|
|
545
|
+
//!
|
|
546
|
+
//! // Declare, allocate, and initialize device-accessible pointers for
|
|
547
|
+
//! // input and output
|
|
548
|
+
//! int num_items; // e.g., 7
|
|
549
|
+
//! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
550
|
+
//! int *d_out; // e.g., [ , , , , , , ]
|
|
551
|
+
//! CustomMin min_op;
|
|
552
|
+
//! ...
|
|
553
|
+
//!
|
|
554
|
+
//! // Determine temporary device storage requirements for exclusive
|
|
555
|
+
//! // prefix scan
|
|
556
|
+
//! void *d_temp_storage = nullptr;
|
|
557
|
+
//! size_t temp_storage_bytes = 0;
|
|
558
|
+
//! cub::DeviceScan::ExclusiveScan(
|
|
559
|
+
//! d_temp_storage, temp_storage_bytes,
|
|
560
|
+
//! d_in, d_out, min_op, (int) INT_MAX, num_items);
|
|
561
|
+
//!
|
|
562
|
+
//! // Allocate temporary storage for exclusive prefix scan
|
|
563
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
564
|
+
//!
|
|
565
|
+
//! // Run exclusive prefix min-scan
|
|
566
|
+
//! cub::DeviceScan::ExclusiveScan(
|
|
567
|
+
//! d_temp_storage, temp_storage_bytes,
|
|
568
|
+
//! d_in, d_out, min_op, (int) INT_MAX, num_items);
|
|
569
|
+
//!
|
|
570
|
+
//! // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0]
|
|
571
|
+
//!
|
|
572
|
+
//! @endrst
|
|
573
|
+
//!
|
|
574
|
+
//! @tparam InputIteratorT
|
|
575
|
+
//! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
|
|
576
|
+
//!
|
|
577
|
+
//! @tparam OutputIteratorT
|
|
578
|
+
//! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
|
|
579
|
+
//!
|
|
580
|
+
//! @tparam ScanOpT
|
|
581
|
+
//! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
|
|
582
|
+
//!
|
|
583
|
+
//! @tparam InitValueT
|
|
584
|
+
//! **[inferred]** Type of the `init_value`
|
|
585
|
+
//!
|
|
586
|
+
//! @tparam NumItemsT
|
|
587
|
+
//! **[inferred]** An integral type representing the number of input elements
|
|
588
|
+
//!
|
|
589
|
+
//! @param[in] d_temp_storage
|
|
590
|
+
//! Device-accessible allocation of temporary storage. When `nullptr`, the
|
|
591
|
+
//! required allocation size is written to `temp_storage_bytes` and no work is done.
|
|
592
|
+
//!
|
|
593
|
+
//! @param[in,out] temp_storage_bytes
|
|
594
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
595
|
+
//!
|
|
596
|
+
//! @param[in] d_in
|
|
597
|
+
//! Random-access iterator to the input sequence of data items
|
|
598
|
+
//!
|
|
599
|
+
//! @param[out] d_out
|
|
600
|
+
//! Random-access iterator to the output sequence of data items
|
|
601
|
+
//!
|
|
602
|
+
//! @param[in] scan_op
|
|
603
|
+
//! Binary associative scan functor
|
|
604
|
+
//!
|
|
605
|
+
//! @param[in] init_value
|
|
606
|
+
//! Initial value to seed the exclusive scan (and is assigned to `*d_out`)
|
|
607
|
+
//!
|
|
608
|
+
//! @param[in] num_items
|
|
609
|
+
//! Total number of input items (i.e., the length of `d_in`)
|
|
610
|
+
//!
|
|
611
|
+
//! @param[in] stream
|
|
612
|
+
//! @rst
|
|
613
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
614
|
+
//! @endrst
|
|
615
|
+
template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitValueT, typename NumItemsT>
|
|
616
|
+
CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
|
|
617
|
+
void* d_temp_storage,
|
|
618
|
+
size_t& temp_storage_bytes,
|
|
619
|
+
InputIteratorT d_in,
|
|
620
|
+
OutputIteratorT d_out,
|
|
621
|
+
ScanOpT scan_op,
|
|
622
|
+
InitValueT init_value,
|
|
623
|
+
NumItemsT num_items,
|
|
624
|
+
cudaStream_t stream = 0)
|
|
625
|
+
{
|
|
626
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveScan");
|
|
627
|
+
|
|
628
|
+
// Unsigned integer type for global offsets
|
|
629
|
+
using OffsetT = detail::choose_offset_t<NumItemsT>;
|
|
630
|
+
|
|
631
|
+
return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, detail::InputValue<InitValueT>, OffsetT>::Dispatch(
|
|
632
|
+
d_temp_storage,
|
|
633
|
+
temp_storage_bytes,
|
|
634
|
+
d_in,
|
|
635
|
+
d_out,
|
|
636
|
+
scan_op,
|
|
637
|
+
detail::InputValue<InitValueT>(init_value),
|
|
638
|
+
num_items,
|
|
639
|
+
stream);
|
|
640
|
+
}
|
|
641
|
+
|
|
642
|
+
//! @rst
|
|
643
|
+
//! Computes a device-wide exclusive prefix scan using the specified
|
|
644
|
+
//! binary associative ``scan_op`` functor. The ``init_value`` value is applied as
|
|
645
|
+
//! the initial value, and is assigned to ``*d_out``.
|
|
646
|
+
//!
|
|
647
|
+
//! - Supports non-commutative scan operators.
|
|
648
|
+
//! - Results are not deterministic for pseudo-associative operators (e.g.,
|
|
649
|
+
//! addition of floating-point types). Results for pseudo-associative
|
|
650
|
+
//! operators may vary from run to run. Additional details can be found in
|
|
651
|
+
//! the @lookback description.
|
|
652
|
+
//! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The
|
|
653
|
+
//! range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
|
|
654
|
+
//! shall not overlap in any other way.
|
|
655
|
+
//! - @devicestorage
|
|
656
|
+
//!
|
|
657
|
+
//! Snippet
|
|
658
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
659
|
+
//!
|
|
660
|
+
//! The code snippet below illustrates a user-defined exclusive-scan of a
|
|
661
|
+
//! device vector of ``float`` data elements.
|
|
662
|
+
//!
|
|
663
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_scan_env_api.cu
|
|
664
|
+
//! :language: c++
|
|
665
|
+
//! :dedent:
|
|
666
|
+
//! :start-after: example-begin exclusive-scan-env-determinism
|
|
667
|
+
//! :end-before: example-end exclusive-scan-env-determinism
|
|
668
|
+
//!
|
|
669
|
+
//! @endrst
|
|
670
|
+
//!
|
|
671
|
+
//! @tparam InputIteratorT
|
|
672
|
+
//! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
|
|
673
|
+
//!
|
|
674
|
+
//! @tparam OutputIteratorT
|
|
675
|
+
//! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
|
|
676
|
+
//!
|
|
677
|
+
//! @tparam ScanOpT
|
|
678
|
+
//! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
|
|
679
|
+
//!
|
|
680
|
+
//! @tparam InitValueT
|
|
681
|
+
//! **[inferred]** Type of the `init_value`
|
|
682
|
+
//!
|
|
683
|
+
//! @tparam NumItemsT
|
|
684
|
+
//! **[inferred]** An integral type representing the number of input elements
|
|
685
|
+
//!
|
|
686
|
+
//! @tparam EnvT
|
|
687
|
+
//! **[inferred]** Execution environment type. Default is `_CUDA_STD_EXEC::env<>`.
|
|
688
|
+
//!
|
|
689
|
+
//! @param[in] d_in
|
|
690
|
+
//! Random-access iterator to the input sequence of data items
|
|
691
|
+
//!
|
|
692
|
+
//! @param[out] d_out
|
|
693
|
+
//! Random-access iterator to the output sequence of data items
|
|
694
|
+
//!
|
|
695
|
+
//! @param[in] scan_op
|
|
696
|
+
//! Binary associative scan functor
|
|
697
|
+
//!
|
|
698
|
+
//! @param[in] init_value
|
|
699
|
+
//! Initial value to seed the exclusive scan (and is assigned to `*d_out`)
|
|
700
|
+
//!
|
|
701
|
+
//! @param[in] num_items
|
|
702
|
+
//! Total number of input items (i.e., the length of `d_in`)
|
|
703
|
+
//!
|
|
704
|
+
//! @param[in] env
|
|
705
|
+
//! @rst
|
|
706
|
+
//! **[optional]** Execution environment. Default is `_CUDA_STD_EXEC::env{}`.
|
|
707
|
+
//! @endrst
|
|
708
|
+
template <typename InputIteratorT,
|
|
709
|
+
typename OutputIteratorT,
|
|
710
|
+
typename ScanOpT,
|
|
711
|
+
typename InitValueT,
|
|
712
|
+
typename NumItemsT,
|
|
713
|
+
typename EnvT = _CUDA_STD_EXEC::env<>>
|
|
714
|
+
[[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
|
|
715
|
+
InputIteratorT d_in,
|
|
716
|
+
OutputIteratorT d_out,
|
|
717
|
+
ScanOpT scan_op,
|
|
718
|
+
InitValueT init_value,
|
|
719
|
+
NumItemsT num_items,
|
|
720
|
+
EnvT env = {})
|
|
721
|
+
{
|
|
722
|
+
_CCCL_NVTX_RANGE_SCOPE("cub::DeviceScan::ExclusiveScan");
|
|
723
|
+
|
|
724
|
+
return scan_impl_env(d_in, d_out, scan_op, detail::InputValue<InitValueT>(init_value), num_items, env);
|
|
725
|
+
}
|
|
726
|
+
|
|
727
|
+
//! @rst
|
|
728
|
+
//! Computes a device-wide exclusive prefix scan using the specified
|
|
729
|
+
//! binary associative ``scan_op`` functor. The ``init_value`` value is applied as
|
|
730
|
+
//! the initial value, and is assigned to ``*d_data``.
|
|
731
|
+
//!
|
|
732
|
+
//! - Supports non-commutative scan operators.
|
|
733
|
+
//! - Results are not deterministic for pseudo-associative operators (e.g.,
|
|
734
|
+
//! addition of floating-point types). Results for pseudo-associative
|
|
735
|
+
//! operators may vary from run to run. Additional details can be found in
|
|
736
|
+
//! the @lookback description.
|
|
737
|
+
//! - @devicestorage
|
|
738
|
+
//!
|
|
739
|
+
//! Snippet
|
|
740
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
741
|
+
//!
|
|
742
|
+
//! The code snippet below illustrates the exclusive prefix min-scan of an
|
|
743
|
+
//! ``int`` device vector:
|
|
744
|
+
//!
|
|
745
|
+
//! .. code-block:: c++
|
|
746
|
+
//!
|
|
747
|
+
//! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
|
|
748
|
+
//! #include <cuda/std/climits> // for INT_MAX
|
|
749
|
+
//!
|
|
750
|
+
//! // CustomMin functor
|
|
751
|
+
//! struct CustomMin
|
|
752
|
+
//! {
|
|
753
|
+
//! template <typename T>
|
|
754
|
+
//! __host__ __device__ __forceinline__
|
|
755
|
+
//! T operator()(const T &a, const T &b) const {
|
|
756
|
+
//! return (b < a) ? b : a;
|
|
757
|
+
//! }
|
|
758
|
+
//! };
|
|
759
|
+
//!
|
|
760
|
+
//! // Declare, allocate, and initialize device-accessible pointers for
|
|
761
|
+
//! // input and output
|
|
762
|
+
//! int num_items; // e.g., 7
|
|
763
|
+
//! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
764
|
+
//! CustomMin min_op;
|
|
765
|
+
//! ...
|
|
766
|
+
//!
|
|
767
|
+
//! // Determine temporary device storage requirements for exclusive
|
|
768
|
+
//! // prefix scan
|
|
769
|
+
//! void *d_temp_storage = nullptr;
|
|
770
|
+
//! size_t temp_storage_bytes = 0;
|
|
771
|
+
//! cub::DeviceScan::ExclusiveScan(
|
|
772
|
+
//! d_temp_storage, temp_storage_bytes,
|
|
773
|
+
//! d_data, min_op, (int) INT_MAX, num_items);
|
|
774
|
+
//!
|
|
775
|
+
//! // Allocate temporary storage for exclusive prefix scan
|
|
776
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
777
|
+
//!
|
|
778
|
+
//! // Run exclusive prefix min-scan
|
|
779
|
+
//! cub::DeviceScan::ExclusiveScan(
|
|
780
|
+
//! d_temp_storage, temp_storage_bytes,
|
|
781
|
+
//! d_data, min_op, (int) INT_MAX, num_items);
|
|
782
|
+
//!
|
|
783
|
+
//! // d_data <-- [2147483647, 8, 6, 6, 5, 3, 0]
|
|
784
|
+
//!
|
|
785
|
+
//! @endrst
|
|
786
|
+
//!
|
|
787
|
+
//! @tparam IteratorT
|
|
788
|
+
//! **[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs
|
|
789
|
+
//!
|
|
790
|
+
//! @tparam ScanOpT
|
|
791
|
+
//! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
|
|
792
|
+
//!
|
|
793
|
+
//! @tparam InitValueT
|
|
794
|
+
//! **[inferred]** Type of the `init_value`
|
|
795
|
+
//!
|
|
796
|
+
//! @tparam NumItemsT
|
|
797
|
+
//! **[inferred]** An integral type representing the number of input elements
|
|
798
|
+
//!
|
|
799
|
+
//! @param[in] d_temp_storage
|
|
800
|
+
//! Device-accessible allocation of temporary storage. When `nullptr`, the
|
|
801
|
+
//! required allocation size is written to `temp_storage_bytes` and no work is done.
|
|
802
|
+
//!
|
|
803
|
+
//! @param[in,out] temp_storage_bytes
|
|
804
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
805
|
+
//!
|
|
806
|
+
//! @param[in,out] d_data
|
|
807
|
+
//! Random-access iterator to the sequence of data items
|
|
808
|
+
//!
|
|
809
|
+
//! @param[in] scan_op
|
|
810
|
+
//! Binary associative scan functor
|
|
811
|
+
//!
|
|
812
|
+
//! @param[in] init_value
|
|
813
|
+
//! Initial value to seed the exclusive scan (and is assigned to `*d_data`)
|
|
814
|
+
//!
|
|
815
|
+
//! @param[in] num_items
|
|
816
|
+
//! Total number of input items (i.e., the length of `d_data`)
|
|
817
|
+
//!
|
|
818
|
+
//! @param[in] stream
|
|
819
|
+
//! @rst
|
|
820
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
821
|
+
//! @endrst
|
|
822
|
+
template <typename IteratorT, typename ScanOpT, typename InitValueT, typename NumItemsT>
|
|
823
|
+
CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
|
|
824
|
+
void* d_temp_storage,
|
|
825
|
+
size_t& temp_storage_bytes,
|
|
826
|
+
IteratorT d_data,
|
|
827
|
+
ScanOpT scan_op,
|
|
828
|
+
InitValueT init_value,
|
|
829
|
+
NumItemsT num_items,
|
|
830
|
+
cudaStream_t stream = 0)
|
|
831
|
+
{
|
|
832
|
+
return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, init_value, num_items, stream);
|
|
833
|
+
}
|
|
834
|
+
|
|
835
|
+
//! @rst
|
|
836
|
+
//! Computes a device-wide exclusive prefix scan using the specified
|
|
837
|
+
//! binary associative ``scan_op`` functor. The ``init_value`` value is provided as a future value.
|
|
838
|
+
//!
|
|
839
|
+
//! - Supports non-commutative scan operators.
|
|
840
|
+
//! - Results are not deterministic for pseudo-associative operators (e.g.,
|
|
841
|
+
//! addition of floating-point types). Results for pseudo-associative
|
|
842
|
+
//! operators may vary from run to run. Additional details can be found in
|
|
843
|
+
//! the @lookback description.
|
|
844
|
+
//! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place.
|
|
845
|
+
//! The range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
|
|
846
|
+
//! shall not overlap in any other way.
|
|
847
|
+
//! - @devicestorage
|
|
848
|
+
//!
|
|
849
|
+
//! Snippet
|
|
850
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
851
|
+
//!
|
|
852
|
+
//! The code snippet below illustrates the exclusive prefix min-scan of an ``int`` device vector
|
|
853
|
+
//!
|
|
854
|
+
//! .. code-block:: c++
|
|
855
|
+
//!
|
|
856
|
+
//! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
|
|
857
|
+
//! #include <cuda/std/climits> // for INT_MAX
|
|
858
|
+
//!
|
|
859
|
+
//! // CustomMin functor
|
|
860
|
+
//! struct CustomMin
|
|
861
|
+
//! {
|
|
862
|
+
//! template <typename T>
|
|
863
|
+
//! __host__ __device__ __forceinline__
|
|
864
|
+
//! T operator()(const T &a, const T &b) const {
|
|
865
|
+
//! return (b < a) ? b : a;
|
|
866
|
+
//! }
|
|
867
|
+
//! };
|
|
868
|
+
//!
|
|
869
|
+
//! // Declare, allocate, and initialize device-accessible pointers for
|
|
870
|
+
//! // input and output
|
|
871
|
+
//! int num_items; // e.g., 7
|
|
872
|
+
//! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
873
|
+
//! int *d_out; // e.g., [ , , , , , , ]
|
|
874
|
+
//! int *d_init_iter; // e.g., INT_MAX
|
|
875
|
+
//! CustomMin min_op;
|
|
876
|
+
//!
|
|
877
|
+
//! auto future_init_value =
|
|
878
|
+
//! cub::FutureValue<int, int*>(d_init_iter);
|
|
879
|
+
//!
|
|
880
|
+
//! ...
|
|
881
|
+
//!
|
|
882
|
+
//! // Determine temporary device storage requirements for exclusive
|
|
883
|
+
//! // prefix scan
|
|
884
|
+
//! void *d_temp_storage = nullptr;
|
|
885
|
+
//! size_t temp_storage_bytes = 0;
|
|
886
|
+
//! cub::DeviceScan::ExclusiveScan(
|
|
887
|
+
//! d_temp_storage, temp_storage_bytes,
|
|
888
|
+
//! d_in, d_out, min_op, future_init_value, num_items);
|
|
889
|
+
//!
|
|
890
|
+
//! // Allocate temporary storage for exclusive prefix scan
|
|
891
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
892
|
+
//!
|
|
893
|
+
//! // Run exclusive prefix min-scan
|
|
894
|
+
//! cub::DeviceScan::ExclusiveScan(
|
|
895
|
+
//! d_temp_storage, temp_storage_bytes,
|
|
896
|
+
//! d_in, d_out, min_op, future_init_value, num_items);
|
|
897
|
+
//!
|
|
898
|
+
//! // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0]
|
|
899
|
+
//!
|
|
900
|
+
//! @endrst
|
|
901
|
+
//!
|
|
902
|
+
//! @tparam InputIteratorT
|
|
903
|
+
//! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
|
|
904
|
+
//!
|
|
905
|
+
//! @tparam OutputIteratorT
|
|
906
|
+
//! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
|
|
907
|
+
//!
|
|
908
|
+
//! @tparam ScanOpT
|
|
909
|
+
//! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
|
|
910
|
+
//!
|
|
911
|
+
//! @tparam InitValueT
|
|
912
|
+
//! **[inferred]** Type of the `init_value`
|
|
913
|
+
//!
|
|
914
|
+
//! @tparam NumItemsT
|
|
915
|
+
//! **[inferred]** An integral type representing the number of input elements
|
|
916
|
+
//!
|
|
917
|
+
//! @param[in] d_temp_storage
|
|
918
|
+
//! Device-accessible allocation of temporary storage. When `nullptr`, the
|
|
919
|
+
//! required allocation size is written to `temp_storage_bytes` and no work is done.
|
|
920
|
+
//!
|
|
921
|
+
//! @param[in,out] temp_storage_bytes
|
|
922
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
923
|
+
//!
|
|
924
|
+
//! @param[in] d_in
|
|
925
|
+
//! Pointer to the input sequence of data items
|
|
926
|
+
//!
|
|
927
|
+
//! @param[out] d_out
|
|
928
|
+
//! Pointer to the output sequence of data items
|
|
929
|
+
//!
|
|
930
|
+
//! @param[in] scan_op
|
|
931
|
+
//! Binary associative scan functor
|
|
932
|
+
//!
|
|
933
|
+
//! @param[in] init_value
|
|
934
|
+
//! Initial value to seed the exclusive scan (and is assigned to `*d_out`)
|
|
935
|
+
//!
|
|
936
|
+
//! @param[in] num_items
|
|
937
|
+
//! Total number of input items (i.e., the length of `d_in`)
|
|
938
|
+
//!
|
|
939
|
+
//! @param[in] stream
|
|
940
|
+
//! @rst
|
|
941
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
942
|
+
//! @endrst
|
|
943
|
+
template <typename InputIteratorT,
|
|
944
|
+
typename OutputIteratorT,
|
|
945
|
+
typename ScanOpT,
|
|
946
|
+
typename InitValueT,
|
|
947
|
+
typename InitValueIterT = InitValueT*,
|
|
948
|
+
typename NumItemsT = int>
|
|
949
|
+
CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
|
|
950
|
+
void* d_temp_storage,
|
|
951
|
+
size_t& temp_storage_bytes,
|
|
952
|
+
InputIteratorT d_in,
|
|
953
|
+
OutputIteratorT d_out,
|
|
954
|
+
ScanOpT scan_op,
|
|
955
|
+
FutureValue<InitValueT, InitValueIterT> init_value,
|
|
956
|
+
NumItemsT num_items,
|
|
957
|
+
cudaStream_t stream = 0)
|
|
958
|
+
{
|
|
959
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveScan");
|
|
960
|
+
|
|
961
|
+
// Unsigned integer type for global offsets
|
|
962
|
+
using OffsetT = detail::choose_offset_t<NumItemsT>;
|
|
963
|
+
|
|
964
|
+
return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, detail::InputValue<InitValueT>, OffsetT>::Dispatch(
|
|
965
|
+
d_temp_storage,
|
|
966
|
+
temp_storage_bytes,
|
|
967
|
+
d_in,
|
|
968
|
+
d_out,
|
|
969
|
+
scan_op,
|
|
970
|
+
detail::InputValue<InitValueT>(init_value),
|
|
971
|
+
num_items,
|
|
972
|
+
stream);
|
|
973
|
+
}
|
|
974
|
+
|
|
975
|
+
//! @rst
|
|
976
|
+
//! Computes a device-wide exclusive prefix scan using the specified binary associative ``scan_op`` functor.
|
|
977
|
+
//! The ``init_value`` value is provided as a future value.
|
|
978
|
+
//!
|
|
979
|
+
//! - Supports non-commutative scan operators.
|
|
980
|
+
//! - Results are not deterministic for pseudo-associative operators (e.g.,
|
|
981
|
+
//! addition of floating-point types). Results for pseudo-associative
|
|
982
|
+
//! operators may vary from run to run. Additional details can be found in
|
|
983
|
+
//! the @lookback description.
|
|
984
|
+
//! - @devicestorage
|
|
985
|
+
//!
|
|
986
|
+
//! Snippet
|
|
987
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
988
|
+
//!
|
|
989
|
+
//! The code snippet below illustrates the exclusive prefix min-scan of an ``int`` device vector
|
|
990
|
+
//!
|
|
991
|
+
//! .. code-block:: c++
|
|
992
|
+
//!
|
|
993
|
+
//! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
|
|
994
|
+
//! #include <cuda/std/climits> // for INT_MAX
|
|
995
|
+
//!
|
|
996
|
+
//! // CustomMin functor
|
|
997
|
+
//! struct CustomMin
|
|
998
|
+
//! {
|
|
999
|
+
//! template <typename T>
|
|
1000
|
+
//! __host__ __device__ __forceinline__
|
|
1001
|
+
//! T operator()(const T &a, const T &b) const {
|
|
1002
|
+
//! return (b < a) ? b : a;
|
|
1003
|
+
//! }
|
|
1004
|
+
//! };
|
|
1005
|
+
//!
|
|
1006
|
+
//! // Declare, allocate, and initialize device-accessible pointers for
|
|
1007
|
+
//! // input and output
|
|
1008
|
+
//! int num_items; // e.g., 7
|
|
1009
|
+
//! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
1010
|
+
//! int *d_init_iter; // e.g., INT_MAX
|
|
1011
|
+
//! CustomMin min_op;
|
|
1012
|
+
//!
|
|
1013
|
+
//! auto future_init_value =
|
|
1014
|
+
//! cub::FutureValue<int, int*>(d_init_iter);
|
|
1015
|
+
//!
|
|
1016
|
+
//! ...
|
|
1017
|
+
//!
|
|
1018
|
+
//! // Determine temporary device storage requirements for exclusive
|
|
1019
|
+
//! // prefix scan
|
|
1020
|
+
//! void *d_temp_storage = nullptr;
|
|
1021
|
+
//! size_t temp_storage_bytes = 0;
|
|
1022
|
+
//! cub::DeviceScan::ExclusiveScan(
|
|
1023
|
+
//! d_temp_storage, temp_storage_bytes,
|
|
1024
|
+
//! d_data, min_op, future_init_value, num_items);
|
|
1025
|
+
//!
|
|
1026
|
+
//! // Allocate temporary storage for exclusive prefix scan
|
|
1027
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
1028
|
+
//!
|
|
1029
|
+
//! // Run exclusive prefix min-scan
|
|
1030
|
+
//! cub::DeviceScan::ExclusiveScan(
|
|
1031
|
+
//! d_temp_storage, temp_storage_bytes,
|
|
1032
|
+
//! d_data, min_op, future_init_value, num_items);
|
|
1033
|
+
//!
|
|
1034
|
+
//! // d_data <-- [2147483647, 8, 6, 6, 5, 3, 0]
|
|
1035
|
+
//!
|
|
1036
|
+
//! @endrst
|
|
1037
|
+
//!
|
|
1038
|
+
//! @tparam IteratorT
|
|
1039
|
+
//! **[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs
|
|
1040
|
+
//!
|
|
1041
|
+
//! @tparam ScanOpT
|
|
1042
|
+
//! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
|
|
1043
|
+
//!
|
|
1044
|
+
//! @tparam InitValueT
|
|
1045
|
+
//! **[inferred]** Type of the `init_value`
|
|
1046
|
+
//!
|
|
1047
|
+
//! @tparam NumItemsT
|
|
1048
|
+
//! **[inferred]** An integral type representing the number of input elements
|
|
1049
|
+
//!
|
|
1050
|
+
//! @param[in] d_temp_storage
|
|
1051
|
+
//! Device-accessible allocation of temporary storage. When `nullptr`, the
|
|
1052
|
+
//! required allocation size is written to `temp_storage_bytes` and no work is done.
|
|
1053
|
+
//!
|
|
1054
|
+
//! @param[in,out] temp_storage_bytes
|
|
1055
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
1056
|
+
//!
|
|
1057
|
+
//! @param[in,out] d_data
|
|
1058
|
+
//! Pointer to the sequence of data items
|
|
1059
|
+
//!
|
|
1060
|
+
//! @param[in] scan_op
|
|
1061
|
+
//! Binary associative scan functor
|
|
1062
|
+
//!
|
|
1063
|
+
//! @param[in] init_value
|
|
1064
|
+
//! Initial value to seed the exclusive scan (and is assigned to `*d_data`)
|
|
1065
|
+
//!
|
|
1066
|
+
//! @param[in] num_items
|
|
1067
|
+
//! Total number of input items (i.e., the length of `d_data`)
|
|
1068
|
+
//!
|
|
1069
|
+
//! @param[in] stream
|
|
1070
|
+
//! @rst
|
|
1071
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
1072
|
+
//! @endrst
|
|
1073
|
+
template <typename IteratorT,
|
|
1074
|
+
typename ScanOpT,
|
|
1075
|
+
typename InitValueT,
|
|
1076
|
+
typename InitValueIterT = InitValueT*,
|
|
1077
|
+
typename NumItemsT = int>
|
|
1078
|
+
CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScan(
|
|
1079
|
+
void* d_temp_storage,
|
|
1080
|
+
size_t& temp_storage_bytes,
|
|
1081
|
+
IteratorT d_data,
|
|
1082
|
+
ScanOpT scan_op,
|
|
1083
|
+
FutureValue<InitValueT, InitValueIterT> init_value,
|
|
1084
|
+
NumItemsT num_items,
|
|
1085
|
+
cudaStream_t stream = 0)
|
|
1086
|
+
{
|
|
1087
|
+
return ExclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, init_value, num_items, stream);
|
|
1088
|
+
}
|
|
1089
|
+
|
|
1090
|
+
//! @} end member group
|
|
1091
|
+
|
|
1092
|
+
//! @name Inclusive scans
|
|
1093
|
+
//! @{
|
|
1094
|
+
|
|
1095
|
+
//! @rst
|
|
1096
|
+
//! Computes a device-wide inclusive prefix sum.
|
|
1097
|
+
//!
|
|
1098
|
+
//! - Supports non-commutative sum operators.
|
|
1099
|
+
//! - Results are not deterministic for pseudo-associative operators (e.g.,
|
|
1100
|
+
//! addition of floating-point types). Results for pseudo-associative
|
|
1101
|
+
//! operators may vary from run to run. Additional details can be found in
|
|
1102
|
+
//! the @lookback description.
|
|
1103
|
+
//! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The
|
|
1104
|
+
//! range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
|
|
1105
|
+
//! shall not overlap in any other way.
|
|
1106
|
+
//! - @devicestorage
|
|
1107
|
+
//!
|
|
1108
|
+
//! Snippet
|
|
1109
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
1110
|
+
//!
|
|
1111
|
+
//! The code snippet below illustrates the inclusive prefix sum of an ``int`` device vector.
|
|
1112
|
+
//!
|
|
1113
|
+
//! .. code-block:: c++
|
|
1114
|
+
//!
|
|
1115
|
+
//! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
|
|
1116
|
+
//!
|
|
1117
|
+
//! // Declare, allocate, and initialize device-accessible pointers for
|
|
1118
|
+
//! // input and output
|
|
1119
|
+
//! int num_items; // e.g., 7
|
|
1120
|
+
//! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
1121
|
+
//! int *d_out; // e.g., [ , , , , , , ]
|
|
1122
|
+
//! ...
|
|
1123
|
+
//!
|
|
1124
|
+
//! // Determine temporary device storage requirements for inclusive
|
|
1125
|
+
//! // prefix sum
|
|
1126
|
+
//! void *d_temp_storage = nullptr;
|
|
1127
|
+
//! size_t temp_storage_bytes = 0;
|
|
1128
|
+
//! cub::DeviceScan::InclusiveSum(
|
|
1129
|
+
//! d_temp_storage, temp_storage_bytes,
|
|
1130
|
+
//! d_in, d_out, num_items);
|
|
1131
|
+
//!
|
|
1132
|
+
//! // Allocate temporary storage for inclusive prefix sum
|
|
1133
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
1134
|
+
//!
|
|
1135
|
+
//! // Run inclusive prefix sum
|
|
1136
|
+
//! cub::DeviceScan::InclusiveSum(
|
|
1137
|
+
//! d_temp_storage, temp_storage_bytes,
|
|
1138
|
+
//! d_in, d_out, num_items);
|
|
1139
|
+
//!
|
|
1140
|
+
//! // d_out <-- [8, 14, 21, 26, 29, 29, 38]
|
|
1141
|
+
//!
|
|
1142
|
+
//! @endrst
|
|
1143
|
+
//!
|
|
1144
|
+
//! @tparam InputIteratorT
|
|
1145
|
+
//! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
|
|
1146
|
+
//!
|
|
1147
|
+
//! @tparam OutputIteratorT
|
|
1148
|
+
//! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
|
|
1149
|
+
//!
|
|
1150
|
+
//! @tparam NumItemsT
|
|
1151
|
+
//! **[inferred]** An integral type representing the number of input elements
|
|
1152
|
+
//!
|
|
1153
|
+
//! @param[in] d_temp_storage
|
|
1154
|
+
//! Device-accessible allocation of temporary storage. When `nullptr`, the
|
|
1155
|
+
//! required allocation size is written to `temp_storage_bytes` and no work is done.
|
|
1156
|
+
//!
|
|
1157
|
+
//! @param[in,out] temp_storage_bytes
|
|
1158
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
1159
|
+
//!
|
|
1160
|
+
//! @param[in] d_in
|
|
1161
|
+
//! Random-access iterator to the input sequence of data items
|
|
1162
|
+
//!
|
|
1163
|
+
//! @param[out] d_out
|
|
1164
|
+
//! Random-access iterator to the output sequence of data items
|
|
1165
|
+
//!
|
|
1166
|
+
//! @param[in] num_items
|
|
1167
|
+
//! Total number of input items (i.e., the length of `d_in`)
|
|
1168
|
+
//!
|
|
1169
|
+
//! @param[in] stream
|
|
1170
|
+
//! @rst
|
|
1171
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
1172
|
+
//! @endrst
|
|
1173
|
+
template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
|
|
1174
|
+
CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum(
|
|
1175
|
+
void* d_temp_storage,
|
|
1176
|
+
size_t& temp_storage_bytes,
|
|
1177
|
+
InputIteratorT d_in,
|
|
1178
|
+
OutputIteratorT d_out,
|
|
1179
|
+
NumItemsT num_items,
|
|
1180
|
+
cudaStream_t stream = 0)
|
|
1181
|
+
{
|
|
1182
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveSum");
|
|
1183
|
+
|
|
1184
|
+
// Unsigned integer type for global offsets
|
|
1185
|
+
using OffsetT = detail::choose_offset_t<NumItemsT>;
|
|
1186
|
+
|
|
1187
|
+
return DispatchScan<InputIteratorT, OutputIteratorT, ::cuda::std::plus<>, NullType, OffsetT>::Dispatch(
|
|
1188
|
+
d_temp_storage, temp_storage_bytes, d_in, d_out, ::cuda::std::plus<>{}, NullType{}, num_items, stream);
|
|
1189
|
+
}
|
|
1190
|
+
|
|
1191
|
+
//! @rst
|
|
1192
|
+
//! Computes a device-wide inclusive prefix sum in-place.
|
|
1193
|
+
//!
|
|
1194
|
+
//! - Supports non-commutative sum operators.
|
|
1195
|
+
//! - Results are not deterministic for pseudo-associative operators (e.g.,
|
|
1196
|
+
//! addition of floating-point types). Results for pseudo-associative
|
|
1197
|
+
//! operators may vary from run to run. Additional details can be found in
|
|
1198
|
+
//! the @lookback description.
|
|
1199
|
+
//! - @devicestorage
|
|
1200
|
+
//!
|
|
1201
|
+
//! Snippet
|
|
1202
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
1203
|
+
//!
|
|
1204
|
+
//! The code snippet below illustrates the inclusive prefix sum of an ``int`` device vector.
|
|
1205
|
+
//!
|
|
1206
|
+
//! .. code-block:: c++
|
|
1207
|
+
//!
|
|
1208
|
+
//! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
|
|
1209
|
+
//!
|
|
1210
|
+
//! // Declare, allocate, and initialize device-accessible pointers for
|
|
1211
|
+
//! // input and output
|
|
1212
|
+
//! int num_items; // e.g., 7
|
|
1213
|
+
//! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
1214
|
+
//! ...
|
|
1215
|
+
//!
|
|
1216
|
+
//! // Determine temporary device storage requirements for inclusive
|
|
1217
|
+
//! // prefix sum
|
|
1218
|
+
//! void *d_temp_storage = nullptr;
|
|
1219
|
+
//! size_t temp_storage_bytes = 0;
|
|
1220
|
+
//! cub::DeviceScan::InclusiveSum(
|
|
1221
|
+
//! d_temp_storage, temp_storage_bytes,
|
|
1222
|
+
//! d_data, num_items);
|
|
1223
|
+
//!
|
|
1224
|
+
//! // Allocate temporary storage for inclusive prefix sum
|
|
1225
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
1226
|
+
//!
|
|
1227
|
+
//! // Run inclusive prefix sum
|
|
1228
|
+
//! cub::DeviceScan::InclusiveSum(
|
|
1229
|
+
//! d_temp_storage, temp_storage_bytes,
|
|
1230
|
+
//! d_data, num_items);
|
|
1231
|
+
//!
|
|
1232
|
+
//! // d_data <-- [8, 14, 21, 26, 29, 29, 38]
|
|
1233
|
+
//!
|
|
1234
|
+
//! @endrst
|
|
1235
|
+
//!
|
|
1236
|
+
//! @tparam IteratorT
|
|
1237
|
+
//! **[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs
|
|
1238
|
+
//!
|
|
1239
|
+
//! @tparam NumItemsT
|
|
1240
|
+
//! **[inferred]** An integral type representing the number of input elements
|
|
1241
|
+
//!
|
|
1242
|
+
//! @param[in] d_temp_storage
|
|
1243
|
+
//! Device-accessible allocation of temporary storage. When `nullptr`, the
|
|
1244
|
+
//! required allocation size is written to `temp_storage_bytes` and no work is done.
|
|
1245
|
+
//!
|
|
1246
|
+
//! @param[in,out] temp_storage_bytes
|
|
1247
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
1248
|
+
//!
|
|
1249
|
+
//! @param[in,out] d_data
|
|
1250
|
+
//! Random-access iterator to the sequence of data items
|
|
1251
|
+
//!
|
|
1252
|
+
//! @param[in] num_items
|
|
1253
|
+
//! Total number of input items (i.e., the length of `d_data`)
|
|
1254
|
+
//!
|
|
1255
|
+
//! @param[in] stream
|
|
1256
|
+
//! @rst
|
|
1257
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
1258
|
+
//! @endrst
|
|
1259
|
+
template <typename IteratorT, typename NumItemsT>
|
|
1260
|
+
CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSum(
|
|
1261
|
+
void* d_temp_storage, size_t& temp_storage_bytes, IteratorT d_data, NumItemsT num_items, cudaStream_t stream = 0)
|
|
1262
|
+
{
|
|
1263
|
+
return InclusiveSum(d_temp_storage, temp_storage_bytes, d_data, d_data, num_items, stream);
|
|
1264
|
+
}
|
|
1265
|
+
|
|
1266
|
+
//! @rst
|
|
1267
|
+
//! Computes a device-wide inclusive prefix scan using the specified binary associative ``scan_op`` functor.
|
|
1268
|
+
//!
|
|
1269
|
+
//! - Supports non-commutative scan operators.
|
|
1270
|
+
//! - Results are not deterministic for pseudo-associative operators (e.g.,
|
|
1271
|
+
//! addition of floating-point types). Results for pseudo-associative
|
|
1272
|
+
//! operators may vary from run to run. Additional details can be found in
|
|
1273
|
+
//! the @lookback description.
|
|
1274
|
+
//! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The
|
|
1275
|
+
//! range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
|
|
1276
|
+
//! shall not overlap in any other way.
|
|
1277
|
+
//! - @devicestorage
|
|
1278
|
+
//!
|
|
1279
|
+
//! Snippet
|
|
1280
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
1281
|
+
//!
|
|
1282
|
+
//! The code snippet below illustrates the inclusive prefix min-scan of an ``int`` device vector.
|
|
1283
|
+
//!
|
|
1284
|
+
//! .. code-block:: c++
|
|
1285
|
+
//!
|
|
1286
|
+
//! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
|
|
1287
|
+
//! #include <cuda/std/climits> // for INT_MAX
|
|
1288
|
+
//!
|
|
1289
|
+
//! // CustomMin functor
|
|
1290
|
+
//! struct CustomMin
|
|
1291
|
+
//! {
|
|
1292
|
+
//! template <typename T>
|
|
1293
|
+
//! __host__ __device__ __forceinline__
|
|
1294
|
+
//! T operator()(const T &a, const T &b) const {
|
|
1295
|
+
//! return (b < a) ? b : a;
|
|
1296
|
+
//! }
|
|
1297
|
+
//! };
|
|
1298
|
+
//!
|
|
1299
|
+
//! // Declare, allocate, and initialize device-accessible pointers for
|
|
1300
|
+
//! // input and output
|
|
1301
|
+
//! int num_items; // e.g., 7
|
|
1302
|
+
//! int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
1303
|
+
//! int *d_out; // e.g., [ , , , , , , ]
|
|
1304
|
+
//! CustomMin min_op;
|
|
1305
|
+
//! ...
|
|
1306
|
+
//!
|
|
1307
|
+
//! // Determine temporary device storage requirements for inclusive
|
|
1308
|
+
//! // prefix scan
|
|
1309
|
+
//! void *d_temp_storage = nullptr;
|
|
1310
|
+
//! size_t temp_storage_bytes = 0;
|
|
1311
|
+
//! cub::DeviceScan::InclusiveScan(
|
|
1312
|
+
//! d_temp_storage, temp_storage_bytes,
|
|
1313
|
+
//! d_in, d_out, min_op, num_items);
|
|
1314
|
+
//!
|
|
1315
|
+
//! // Allocate temporary storage for inclusive prefix scan
|
|
1316
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
1317
|
+
//!
|
|
1318
|
+
//! // Run inclusive prefix min-scan
|
|
1319
|
+
//! cub::DeviceScan::InclusiveScan(
|
|
1320
|
+
//! d_temp_storage, temp_storage_bytes,
|
|
1321
|
+
//! d_in, d_out, min_op, num_items);
|
|
1322
|
+
//!
|
|
1323
|
+
//! // d_out <-- [8, 6, 6, 5, 3, 0, 0]
|
|
1324
|
+
//!
|
|
1325
|
+
//! @endrst
|
|
1326
|
+
//!
|
|
1327
|
+
//! @tparam InputIteratorT
|
|
1328
|
+
//! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
|
|
1329
|
+
//!
|
|
1330
|
+
//! @tparam OutputIteratorT
|
|
1331
|
+
//! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
|
|
1332
|
+
//!
|
|
1333
|
+
//! @tparam ScanOpT
|
|
1334
|
+
//! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
|
|
1335
|
+
//!
|
|
1336
|
+
//! @tparam NumItemsT
|
|
1337
|
+
//! **[inferred]** An integral type representing the number of input elements
|
|
1338
|
+
//!
|
|
1339
|
+
//! @param[in] d_temp_storage
|
|
1340
|
+
//! Device-accessible allocation of temporary storage.
|
|
1341
|
+
//! When `nullptr`, the required allocation size is written to
|
|
1342
|
+
//! `temp_storage_bytes` and no work is done.
|
|
1343
|
+
//!
|
|
1344
|
+
//! @param[in,out] temp_storage_bytes
|
|
1345
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
1346
|
+
//!
|
|
1347
|
+
//! @param[in] d_in
|
|
1348
|
+
//! Random-access iterator to the input sequence of data items
|
|
1349
|
+
//!
|
|
1350
|
+
//! @param[out] d_out
|
|
1351
|
+
//! Random-access iterator to the output sequence of data items
|
|
1352
|
+
//!
|
|
1353
|
+
//! @param[in] scan_op
|
|
1354
|
+
//! Binary associative scan functor
|
|
1355
|
+
//!
|
|
1356
|
+
//! @param[in] num_items
|
|
1357
|
+
//! Total number of input items (i.e., the length of `d_in`)
|
|
1358
|
+
//!
|
|
1359
|
+
//! @param[in] stream
|
|
1360
|
+
//! @rst
|
|
1361
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
1362
|
+
//! @endrst
|
|
1363
|
+
template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename NumItemsT>
|
|
1364
|
+
CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan(
|
|
1365
|
+
void* d_temp_storage,
|
|
1366
|
+
size_t& temp_storage_bytes,
|
|
1367
|
+
InputIteratorT d_in,
|
|
1368
|
+
OutputIteratorT d_out,
|
|
1369
|
+
ScanOpT scan_op,
|
|
1370
|
+
NumItemsT num_items,
|
|
1371
|
+
cudaStream_t stream = 0)
|
|
1372
|
+
{
|
|
1373
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveScan");
|
|
1374
|
+
|
|
1375
|
+
// Unsigned integer type for global offsets
|
|
1376
|
+
using OffsetT = detail::choose_offset_t<NumItemsT>;
|
|
1377
|
+
|
|
1378
|
+
return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, NullType, OffsetT>::Dispatch(
|
|
1379
|
+
d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, NullType(), num_items, stream);
|
|
1380
|
+
}
|
|
1381
|
+
|
|
1382
|
+
//! @rst
|
|
1383
|
+
//! Computes a device-wide inclusive prefix scan using the specified binary associative ``scan_op`` functor.
|
|
1384
|
+
//! The result of applying the ``scan_op`` binary operator to ``init_value`` and ``*d_in``
|
|
1385
|
+
//! is assigned to ``*d_out``.
|
|
1386
|
+
//!
|
|
1387
|
+
//! - Supports non-commutative scan operators.
|
|
1388
|
+
//! - Results are not deterministic for pseudo-associative operators (e.g.,
|
|
1389
|
+
//! addition of floating-point types). Results for pseudo-associative
|
|
1390
|
+
//! operators may vary from run to run. Additional details can be found in
|
|
1391
|
+
//! the @lookback description.
|
|
1392
|
+
//! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The
|
|
1393
|
+
//! range ``[d_in, d_in + num_items)`` and ``[d_out, d_out + num_items)``
|
|
1394
|
+
//! shall not overlap in any other way.
|
|
1395
|
+
//! - @devicestorage
|
|
1396
|
+
//!
|
|
1397
|
+
//! Snippet
|
|
1398
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
1399
|
+
//!
|
|
1400
|
+
//! The code snippet below illustrates the inclusive max-scan of an ``int`` device vector.
|
|
1401
|
+
//!
|
|
1402
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_scan_api.cu
|
|
1403
|
+
//! :language: c++
|
|
1404
|
+
//! :dedent:
|
|
1405
|
+
//! :start-after: example-begin device-inclusive-scan
|
|
1406
|
+
//! :end-before: example-end device-inclusive-scan
|
|
1407
|
+
//!
|
|
1408
|
+
//! @endrst
|
|
1409
|
+
//!
|
|
1410
|
+
//! @tparam InputIteratorT
|
|
1411
|
+
//! **[inferred]** Random-access input iterator type for reading scan inputs @iterator
|
|
1412
|
+
//!
|
|
1413
|
+
//! @tparam OutputIteratorT
|
|
1414
|
+
//! **[inferred]** Random-access output iterator type for writing scan outputs @iterator
|
|
1415
|
+
//!
|
|
1416
|
+
//! @tparam ScanOpT
|
|
1417
|
+
//! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
|
|
1418
|
+
//!
|
|
1419
|
+
//! @tparam InitValueT
|
|
1420
|
+
//! **[inferred]** Type of the `init_value`
|
|
1421
|
+
//!
|
|
1422
|
+
//! @tparam NumItemsT
|
|
1423
|
+
//! **[inferred]** An integral type representing the number of input elements
|
|
1424
|
+
//!
|
|
1425
|
+
//! @param[in] d_temp_storage
|
|
1426
|
+
//! Device-accessible allocation of temporary storage.
|
|
1427
|
+
//! When `nullptr`, the required allocation size is written to
|
|
1428
|
+
//! `temp_storage_bytes` and no work is done.
|
|
1429
|
+
//!
|
|
1430
|
+
//! @param[in,out] temp_storage_bytes
|
|
1431
|
+
//! Reference to the size in bytes of the `d_temp_storage` allocation
|
|
1432
|
+
//!
|
|
1433
|
+
//! @param[in] d_in
|
|
1434
|
+
//! Random-access iterator to the input sequence of data items
|
|
1435
|
+
//!
|
|
1436
|
+
//! @param[out] d_out
|
|
1437
|
+
//! Random-access iterator to the output sequence of data items
|
|
1438
|
+
//!
|
|
1439
|
+
//! @param[in] scan_op
|
|
1440
|
+
//! Binary associative scan functor
|
|
1441
|
+
//!
|
|
1442
|
+
//! @param[in] init_value
|
|
1443
|
+
//! Initial value to seed the inclusive scan (`scan_op(init_value, d_in[0])`
|
|
1444
|
+
//! is assigned to `*d_out`)
|
|
1445
|
+
//!
|
|
1446
|
+
//! @param[in] num_items
|
|
1447
|
+
//! Total number of input items (i.e., the length of `d_in`)
|
|
1448
|
+
//!
|
|
1449
|
+
//! @param[in] stream
|
|
1450
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream 0.
|
|
1451
|
+
template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitValueT, typename NumItemsT>
|
|
1452
|
+
CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScanInit(
|
|
1453
|
+
void* d_temp_storage,
|
|
1454
|
+
size_t& temp_storage_bytes,
|
|
1455
|
+
InputIteratorT d_in,
|
|
1456
|
+
OutputIteratorT d_out,
|
|
1457
|
+
ScanOpT scan_op,
|
|
1458
|
+
InitValueT init_value,
|
|
1459
|
+
NumItemsT num_items,
|
|
1460
|
+
cudaStream_t stream = 0)
|
|
1461
|
+
{
|
|
1462
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveScanInit");
|
|
1463
|
+
|
|
1464
|
+
// Unsigned integer type for global offsets
|
|
1465
|
+
using OffsetT = detail::choose_offset_t<NumItemsT>;
|
|
1466
|
+
using AccumT = ::cuda::std::__accumulator_t<ScanOpT, cub::detail::it_value_t<InputIteratorT>, InitValueT>;
|
|
1467
|
+
|
|
1468
|
+
return DispatchScan<
|
|
1469
|
+
InputIteratorT,
|
|
1470
|
+
OutputIteratorT,
|
|
1471
|
+
ScanOpT,
|
|
1472
|
+
detail::InputValue<InitValueT>,
|
|
1473
|
+
OffsetT,
|
|
1474
|
+
AccumT,
|
|
1475
|
+
ForceInclusive::Yes>::Dispatch(d_temp_storage,
|
|
1476
|
+
temp_storage_bytes,
|
|
1477
|
+
d_in,
|
|
1478
|
+
d_out,
|
|
1479
|
+
scan_op,
|
|
1480
|
+
detail::InputValue<InitValueT>(init_value),
|
|
1481
|
+
num_items,
|
|
1482
|
+
stream);
|
|
1483
|
+
}
|
|
1484
|
+
|
|
1485
|
+
//! @rst
|
|
1486
|
+
//! Computes a device-wide inclusive prefix scan using the specified binary associative ``scan_op`` functor.
|
|
1487
|
+
//!
|
|
1488
|
+
//! - Supports non-commutative scan operators.
|
|
1489
|
+
//! - Results are not deterministic for pseudo-associative operators (e.g.,
|
|
1490
|
+
//! addition of floating-point types). Results for pseudo-associative
|
|
1491
|
+
//! operators may vary from run to run. Additional details can be found in
|
|
1492
|
+
//! the @lookback description.
|
|
1493
|
+
//! - @devicestorage
|
|
1494
|
+
//!
|
|
1495
|
+
//! Snippet
|
|
1496
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
1497
|
+
//!
|
|
1498
|
+
//! The code snippet below illustrates the inclusive prefix min-scan of an ``int`` device vector.
|
|
1499
|
+
//!
|
|
1500
|
+
//! .. code-block:: c++
|
|
1501
|
+
//!
|
|
1502
|
+
//! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
|
|
1503
|
+
//! #include <cuda/std/climits> // for INT_MAX
|
|
1504
|
+
//!
|
|
1505
|
+
//! // CustomMin functor
|
|
1506
|
+
//! struct CustomMin
|
|
1507
|
+
//! {
|
|
1508
|
+
//! template <typename T>
|
|
1509
|
+
//! __host__ __device__ __forceinline__
|
|
1510
|
+
//! T operator()(const T &a, const T &b) const {
|
|
1511
|
+
//! return (b < a) ? b : a;
|
|
1512
|
+
//! }
|
|
1513
|
+
//! };
|
|
1514
|
+
//!
|
|
1515
|
+
//! // Declare, allocate, and initialize device-accessible pointers for
|
|
1516
|
+
//! // input and output
|
|
1517
|
+
//! int num_items; // e.g., 7
|
|
1518
|
+
//! int *d_data; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
1519
|
+
//! CustomMin min_op;
|
|
1520
|
+
//! ...
|
|
1521
|
+
//!
|
|
1522
|
+
//! // Determine temporary device storage requirements for inclusive
|
|
1523
|
+
//! // prefix scan
|
|
1524
|
+
//! void *d_temp_storage = nullptr;
|
|
1525
|
+
//! size_t temp_storage_bytes = 0;
|
|
1526
|
+
//! cub::DeviceScan::InclusiveScan(
|
|
1527
|
+
//! d_temp_storage, temp_storage_bytes,
|
|
1528
|
+
//! d_data, min_op, num_items);
|
|
1529
|
+
//!
|
|
1530
|
+
//! // Allocate temporary storage for inclusive prefix scan
|
|
1531
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
1532
|
+
//!
|
|
1533
|
+
//! // Run inclusive prefix min-scan
|
|
1534
|
+
//! cub::DeviceScan::InclusiveScan(
|
|
1535
|
+
//! d_temp_storage, temp_storage_bytes,
|
|
1536
|
+
//! d_data, min_op, num_items);
|
|
1537
|
+
//!
|
|
1538
|
+
//! // d_data <-- [8, 6, 6, 5, 3, 0, 0]
|
|
1539
|
+
//!
|
|
1540
|
+
//! @endrst
|
|
1541
|
+
//!
|
|
1542
|
+
//! @tparam IteratorT
|
|
1543
|
+
//! **[inferred]** Random-access input iterator type for reading scan inputs and writing scan outputs
|
|
1544
|
+
//!
|
|
1545
|
+
//! @tparam ScanOpT
|
|
1546
|
+
//! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
|
|
1547
|
+
//!
|
|
1548
|
+
//! @tparam NumItemsT
|
|
1549
|
+
//! **[inferred]** An integral type representing the number of input elements
|
|
1550
|
+
//!
|
|
1551
|
+
//! @param[in] d_temp_storage
|
|
1552
|
+
//! Device-accessible allocation of temporary storage.
|
|
1553
|
+
//! When `nullptr`, the required allocation size is written to
|
|
1554
|
+
//! `temp_storage_bytes` and no work is done.
|
|
1555
|
+
//!
|
|
1556
|
+
//! @param[in,out] temp_storage_bytes
|
|
1557
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
1558
|
+
//!
|
|
1559
|
+
//! @param[in,out] d_data
|
|
1560
|
+
//! Random-access iterator to the sequence of data items
|
|
1561
|
+
//!
|
|
1562
|
+
//! @param[in] scan_op
|
|
1563
|
+
//! Binary associative scan functor
|
|
1564
|
+
//!
|
|
1565
|
+
//! @param[in] num_items
|
|
1566
|
+
//! Total number of input items (i.e., the length of `d_data`)
|
|
1567
|
+
//!
|
|
1568
|
+
//! @param[in] stream
|
|
1569
|
+
//! @rst
|
|
1570
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
1571
|
+
//! @endrst
|
|
1572
|
+
template <typename IteratorT, typename ScanOpT, typename NumItemsT>
|
|
1573
|
+
CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScan(
|
|
1574
|
+
void* d_temp_storage,
|
|
1575
|
+
size_t& temp_storage_bytes,
|
|
1576
|
+
IteratorT d_data,
|
|
1577
|
+
ScanOpT scan_op,
|
|
1578
|
+
NumItemsT num_items,
|
|
1579
|
+
cudaStream_t stream = 0)
|
|
1580
|
+
{
|
|
1581
|
+
return InclusiveScan(d_temp_storage, temp_storage_bytes, d_data, d_data, scan_op, num_items, stream);
|
|
1582
|
+
}
|
|
1583
|
+
//! @} end member group
|
|
1584
|
+
|
|
1585
|
+
//! @name Scans by key
|
|
1586
|
+
//! @{
|
|
1587
|
+
|
|
1588
|
+
//! @rst
|
|
1589
|
+
//! Computes a device-wide exclusive prefix sum-by-key with key equality
|
|
1590
|
+
//! defined by ``equality_op``. The value of ``0`` is applied as the initial
|
|
1591
|
+
//! value, and is assigned to the beginning of each segment in ``d_values_out``.
|
|
1592
|
+
//!
|
|
1593
|
+
//! - Supports non-commutative sum operators.
|
|
1594
|
+
//! - Results are not deterministic for pseudo-associative operators (e.g.,
|
|
1595
|
+
//! addition of floating-point types). Results for pseudo-associative
|
|
1596
|
+
//! operators may vary from run to run. Additional details can be found in
|
|
1597
|
+
//! the @lookback description.
|
|
1598
|
+
//! - ``d_keys_in`` may equal ``d_values_out`` but the range
|
|
1599
|
+
//! ``[d_keys_in, d_keys_in + num_items)`` and the range
|
|
1600
|
+
//! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
|
|
1601
|
+
//! - ``d_values_in`` may equal ``d_values_out`` but the range
|
|
1602
|
+
//! ``[d_values_in, d_values_in + num_items)`` and the range
|
|
1603
|
+
//! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
|
|
1604
|
+
//! - @devicestorage
|
|
1605
|
+
//!
|
|
1606
|
+
//! Snippet
|
|
1607
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
1608
|
+
//!
|
|
1609
|
+
//! The code snippet below illustrates the exclusive prefix sum-by-key of an ``int`` device vector.
|
|
1610
|
+
//!
|
|
1611
|
+
//! .. code-block:: c++
|
|
1612
|
+
//!
|
|
1613
|
+
//! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
|
|
1614
|
+
//!
|
|
1615
|
+
//! // Declare, allocate, and initialize device-accessible pointers for
|
|
1616
|
+
//! // input and output
|
|
1617
|
+
//! int num_items; // e.g., 7
|
|
1618
|
+
//! int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2]
|
|
1619
|
+
//! int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
1620
|
+
//! int *d_values_out; // e.g., [ , , , , , , ]
|
|
1621
|
+
//! ...
|
|
1622
|
+
//!
|
|
1623
|
+
//! // Determine temporary device storage requirements
|
|
1624
|
+
//! void *d_temp_storage = nullptr;
|
|
1625
|
+
//! size_t temp_storage_bytes = 0;
|
|
1626
|
+
//! cub::DeviceScan::ExclusiveSumByKey(
|
|
1627
|
+
//! d_temp_storage, temp_storage_bytes,
|
|
1628
|
+
//! d_keys_in, d_values_in, d_values_out, num_items);
|
|
1629
|
+
//!
|
|
1630
|
+
//! // Allocate temporary storage
|
|
1631
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
1632
|
+
//!
|
|
1633
|
+
//! // Run exclusive prefix sum
|
|
1634
|
+
//! cub::DeviceScan::ExclusiveSumByKey(
|
|
1635
|
+
//! d_temp_storage, temp_storage_bytes,
|
|
1636
|
+
//! d_keys_in, d_values_in, d_values_out, num_items);
|
|
1637
|
+
//!
|
|
1638
|
+
//! // d_values_out <-- [0, 8, 0, 7, 12, 0, 0]
|
|
1639
|
+
//!
|
|
1640
|
+
//! @endrst
|
|
1641
|
+
//!
|
|
1642
|
+
//! @tparam KeysInputIteratorT
|
|
1643
|
+
//! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator
|
|
1644
|
+
//!
|
|
1645
|
+
//! @tparam ValuesInputIteratorT
|
|
1646
|
+
//! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator
|
|
1647
|
+
//!
|
|
1648
|
+
//! @tparam ValuesOutputIteratorT
|
|
1649
|
+
//! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator
|
|
1650
|
+
//!
|
|
1651
|
+
//! @tparam EqualityOpT
|
|
1652
|
+
//! **[inferred]** Functor type having member
|
|
1653
|
+
//! `bool operator()(const T &a, const T &b)` for binary operations that define the equality of keys
|
|
1654
|
+
//!
|
|
1655
|
+
//! @tparam NumItemsT
|
|
1656
|
+
//! **[inferred]** An integral type representing the number of input elements
|
|
1657
|
+
//!
|
|
1658
|
+
//! @param[in] d_temp_storage
|
|
1659
|
+
//! Device-accessible allocation of temporary storage. When `nullptr`, the
|
|
1660
|
+
//! required allocation size is written to `temp_storage_bytes` and no work is done.
|
|
1661
|
+
//!
|
|
1662
|
+
//! @param[in,out] temp_storage_bytes
|
|
1663
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
1664
|
+
//!
|
|
1665
|
+
//! @param[in] d_keys_in
|
|
1666
|
+
//! Random-access input iterator to the input sequence of key items
|
|
1667
|
+
//!
|
|
1668
|
+
//! @param[in] d_values_in
|
|
1669
|
+
//! Random-access input iterator to the input sequence of value items
|
|
1670
|
+
//!
|
|
1671
|
+
//! @param[out] d_values_out
|
|
1672
|
+
//! Random-access output iterator to the output sequence of value items
|
|
1673
|
+
//!
|
|
1674
|
+
//! @param[in] num_items
|
|
1675
|
+
//! Total number of input items (i.e., the length of `d_keys_in` and `d_values_in`)
|
|
1676
|
+
//!
|
|
1677
|
+
//! @param[in] equality_op
|
|
1678
|
+
//! Binary functor that defines the equality of keys.
|
|
1679
|
+
//! Default is cuda::std::equal_to<>{}.
|
|
1680
|
+
//!
|
|
1681
|
+
//! @param[in] stream
|
|
1682
|
+
//! @rst
|
|
1683
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
1684
|
+
//! @endrst
|
|
1685
|
+
template <typename KeysInputIteratorT,
|
|
1686
|
+
typename ValuesInputIteratorT,
|
|
1687
|
+
typename ValuesOutputIteratorT,
|
|
1688
|
+
typename EqualityOpT = ::cuda::std::equal_to<>,
|
|
1689
|
+
typename NumItemsT = uint32_t>
|
|
1690
|
+
CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSumByKey(
|
|
1691
|
+
void* d_temp_storage,
|
|
1692
|
+
size_t& temp_storage_bytes,
|
|
1693
|
+
KeysInputIteratorT d_keys_in,
|
|
1694
|
+
ValuesInputIteratorT d_values_in,
|
|
1695
|
+
ValuesOutputIteratorT d_values_out,
|
|
1696
|
+
NumItemsT num_items,
|
|
1697
|
+
EqualityOpT equality_op = EqualityOpT(),
|
|
1698
|
+
cudaStream_t stream = 0)
|
|
1699
|
+
{
|
|
1700
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveSumByKey");
|
|
1701
|
+
|
|
1702
|
+
// Unsigned integer type for global offsets
|
|
1703
|
+
using OffsetT = detail::choose_offset_t<NumItemsT>;
|
|
1704
|
+
using InitT = cub::detail::it_value_t<ValuesInputIteratorT>;
|
|
1705
|
+
|
|
1706
|
+
// Initial value
|
|
1707
|
+
InitT init_value{};
|
|
1708
|
+
|
|
1709
|
+
return DispatchScanByKey<
|
|
1710
|
+
KeysInputIteratorT,
|
|
1711
|
+
ValuesInputIteratorT,
|
|
1712
|
+
ValuesOutputIteratorT,
|
|
1713
|
+
EqualityOpT,
|
|
1714
|
+
::cuda::std::plus<>,
|
|
1715
|
+
InitT,
|
|
1716
|
+
OffsetT>::Dispatch(d_temp_storage,
|
|
1717
|
+
temp_storage_bytes,
|
|
1718
|
+
d_keys_in,
|
|
1719
|
+
d_values_in,
|
|
1720
|
+
d_values_out,
|
|
1721
|
+
equality_op,
|
|
1722
|
+
::cuda::std::plus<>{},
|
|
1723
|
+
init_value,
|
|
1724
|
+
num_items,
|
|
1725
|
+
stream);
|
|
1726
|
+
}
|
|
1727
|
+
|
|
1728
|
+
//! @rst
|
|
1729
|
+
//! Computes a device-wide exclusive prefix scan-by-key using the
|
|
1730
|
+
//! specified binary associative ``scan_op`` functor. The key equality is defined by
|
|
1731
|
+
//! ``equality_op``. The ``init_value`` value is applied as the initial
|
|
1732
|
+
//! value, and is assigned to the beginning of each segment in ``d_values_out``.
|
|
1733
|
+
//!
|
|
1734
|
+
//! - Supports non-commutative scan operators.
|
|
1735
|
+
//! - Results are not deterministic for pseudo-associative operators (e.g.,
|
|
1736
|
+
//! addition of floating-point types). Results for pseudo-associative
|
|
1737
|
+
//! operators may vary from run to run. Additional details can be found in
|
|
1738
|
+
//! the @lookback description.
|
|
1739
|
+
//! - ``d_keys_in`` may equal ``d_values_out`` but the range
|
|
1740
|
+
//! ``[d_keys_in, d_keys_in + num_items)`` and the range
|
|
1741
|
+
//! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
|
|
1742
|
+
//! - ``d_values_in`` may equal ``d_values_out`` but the range
|
|
1743
|
+
//! ``[d_values_in, d_values_in + num_items)`` and the range
|
|
1744
|
+
//! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
|
|
1745
|
+
//! - @devicestorage
|
|
1746
|
+
//!
|
|
1747
|
+
//! Snippet
|
|
1748
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
1749
|
+
//!
|
|
1750
|
+
//! The code snippet below illustrates the exclusive prefix min-scan-by-key of an ``int`` device vector
|
|
1751
|
+
//!
|
|
1752
|
+
//! .. code-block:: c++
|
|
1753
|
+
//!
|
|
1754
|
+
//! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
|
|
1755
|
+
//! #include <cuda/std/climits> // for INT_MAX
|
|
1756
|
+
//!
|
|
1757
|
+
//! // CustomMin functor
|
|
1758
|
+
//! struct CustomMin
|
|
1759
|
+
//! {
|
|
1760
|
+
//! template <typename T>
|
|
1761
|
+
//! __host__ __device__ __forceinline__
|
|
1762
|
+
//! T operator()(const T &a, const T &b) const {
|
|
1763
|
+
//! return (b < a) ? b : a;
|
|
1764
|
+
//! }
|
|
1765
|
+
//! };
|
|
1766
|
+
//!
|
|
1767
|
+
//! // CustomEqual functor
|
|
1768
|
+
//! struct CustomEqual
|
|
1769
|
+
//! {
|
|
1770
|
+
//! template <typename T>
|
|
1771
|
+
//! __host__ __device__ __forceinline__
|
|
1772
|
+
//! bool operator()(const T &a, const T &b) const {
|
|
1773
|
+
//! return a == b;
|
|
1774
|
+
//! }
|
|
1775
|
+
//! };
|
|
1776
|
+
//!
|
|
1777
|
+
//! // Declare, allocate, and initialize device-accessible pointers for
|
|
1778
|
+
//! // input and output
|
|
1779
|
+
//! int num_items; // e.g., 7
|
|
1780
|
+
//! int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2]
|
|
1781
|
+
//! int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
1782
|
+
//! int *d_values_out; // e.g., [ , , , , , , ]
|
|
1783
|
+
//! CustomMin min_op;
|
|
1784
|
+
//! CustomEqual equality_op;
|
|
1785
|
+
//! ...
|
|
1786
|
+
//!
|
|
1787
|
+
//! // Determine temporary device storage requirements for exclusive
|
|
1788
|
+
//! // prefix scan
|
|
1789
|
+
//! void *d_temp_storage = nullptr;
|
|
1790
|
+
//! size_t temp_storage_bytes = 0;
|
|
1791
|
+
//! cub::DeviceScan::ExclusiveScanByKey(
|
|
1792
|
+
//! d_temp_storage, temp_storage_bytes,
|
|
1793
|
+
//! d_keys_in, d_values_in, d_values_out, min_op,
|
|
1794
|
+
//! (int) INT_MAX, num_items, equality_op);
|
|
1795
|
+
//!
|
|
1796
|
+
//! // Allocate temporary storage for exclusive prefix scan
|
|
1797
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
1798
|
+
//!
|
|
1799
|
+
//! // Run exclusive prefix min-scan
|
|
1800
|
+
//! cub::DeviceScan::ExclusiveScanByKey(
|
|
1801
|
+
//! d_temp_storage, temp_storage_bytes,
|
|
1802
|
+
//! d_keys_in, d_values_in, d_values_out, min_op,
|
|
1803
|
+
//! (int) INT_MAX, num_items, equality_op);
|
|
1804
|
+
//!
|
|
1805
|
+
//! // d_values_out <-- [2147483647, 8, 2147483647, 7, 5, 2147483647, 0]
|
|
1806
|
+
//!
|
|
1807
|
+
//! @endrst
|
|
1808
|
+
//!
|
|
1809
|
+
//! @tparam KeysInputIteratorT
|
|
1810
|
+
//! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator
|
|
1811
|
+
//!
|
|
1812
|
+
//! @tparam ValuesInputIteratorT
|
|
1813
|
+
//! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator
|
|
1814
|
+
//!
|
|
1815
|
+
//! @tparam ValuesOutputIteratorT
|
|
1816
|
+
//! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator
|
|
1817
|
+
//!
|
|
1818
|
+
//! @tparam ScanOpT
|
|
1819
|
+
//! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
|
|
1820
|
+
//!
|
|
1821
|
+
//! @tparam InitValueT
|
|
1822
|
+
//! **[inferred]** Type of the `init_value`
|
|
1823
|
+
//!
|
|
1824
|
+
//! @tparam EqualityOpT
|
|
1825
|
+
//! **[inferred]** Functor type having member
|
|
1826
|
+
//! `bool operator()(const T &a, const T &b)` for binary operations that define the equality of keys
|
|
1827
|
+
//!
|
|
1828
|
+
//! @tparam NumItemsT
|
|
1829
|
+
//! **[inferred]** An integral type representing the number of input elements
|
|
1830
|
+
//!
|
|
1831
|
+
//! @param[in] d_temp_storage
|
|
1832
|
+
//! Device-accessible allocation of temporary storage. When `nullptr`, the
|
|
1833
|
+
//! required allocation size is written to `temp_storage_bytes` and no work is done.
|
|
1834
|
+
//!
|
|
1835
|
+
//! @param[in,out] temp_storage_bytes
|
|
1836
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
1837
|
+
//!
|
|
1838
|
+
//! @param[in] d_keys_in
|
|
1839
|
+
//! Random-access input iterator to the input sequence of key items
|
|
1840
|
+
//!
|
|
1841
|
+
//! @param[in] d_values_in
|
|
1842
|
+
//! Random-access input iterator to the input sequence of value items
|
|
1843
|
+
//!
|
|
1844
|
+
//! @param[out] d_values_out
|
|
1845
|
+
//! Random-access output iterator to the output sequence of value items
|
|
1846
|
+
//!
|
|
1847
|
+
//! @param[in] scan_op
|
|
1848
|
+
//! Binary associative scan functor
|
|
1849
|
+
//!
|
|
1850
|
+
//! @param[in] init_value
|
|
1851
|
+
//! Initial value to seed the exclusive scan (and is assigned to the
|
|
1852
|
+
//! beginning of each segment in `d_values_out`)
|
|
1853
|
+
//!
|
|
1854
|
+
//! @param[in] num_items
|
|
1855
|
+
//! Total number of input items (i.e., the length of `d_keys_in` and
|
|
1856
|
+
//! `d_values_in`)
|
|
1857
|
+
//!
|
|
1858
|
+
//! @param[in] equality_op
|
|
1859
|
+
//! Binary functor that defines the equality of keys.
|
|
1860
|
+
//! Default is cuda::std::equal_to<>{}.
|
|
1861
|
+
//!
|
|
1862
|
+
//! @param[in] stream
|
|
1863
|
+
//! @rst
|
|
1864
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
1865
|
+
//! @endrst
|
|
1866
|
+
template <typename KeysInputIteratorT,
          typename ValuesInputIteratorT,
          typename ValuesOutputIteratorT,
          typename ScanOpT,
          typename InitValueT,
          typename EqualityOpT = ::cuda::std::equal_to<>,
          typename NumItemsT   = uint32_t>
CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScanByKey(
  void* d_temp_storage,
  size_t& temp_storage_bytes,
  KeysInputIteratorT d_keys_in,
  ValuesInputIteratorT d_values_in,
  ValuesOutputIteratorT d_values_out,
  ScanOpT scan_op,
  InitValueT init_value,
  NumItemsT num_items,
  EqualityOpT equality_op = EqualityOpT(),
  cudaStream_t stream     = 0)
{
  _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveScanByKey");

  // Offset type for global indexing, selected from NumItemsT by detail::choose_offset_t.
  using offset_t = detail::choose_offset_t<NumItemsT>;

  // Dispatcher instantiated for the caller's iterators, operators and init-value type.
  using dispatch_t =
    DispatchScanByKey<KeysInputIteratorT,
                      ValuesInputIteratorT,
                      ValuesOutputIteratorT,
                      EqualityOpT,
                      ScanOpT,
                      InitValueT,
                      offset_t>;

  // Note the argument order expected by Dispatch: equality_op precedes scan_op,
  // which differs from the order of this function's own parameters.
  return dispatch_t::Dispatch(
    d_temp_storage,
    temp_storage_bytes,
    d_keys_in,
    d_values_in,
    d_values_out,
    equality_op,
    scan_op,
    init_value,
    num_items,
    stream);
}
|
|
1908
|
+
|
|
1909
|
+
//! @rst
|
|
1910
|
+
//! Computes a device-wide inclusive prefix sum-by-key with key equality defined by ``equality_op``.
|
|
1911
|
+
//!
|
|
1912
|
+
//! - Supports non-commutative sum operators.
|
|
1913
|
+
//! - Results are not deterministic for pseudo-associative operators (e.g.,
|
|
1914
|
+
//! addition of floating-point types). Results for pseudo-associative
|
|
1915
|
+
//! operators may vary from run to run. Additional details can be found in
|
|
1916
|
+
//! the @lookback description.
|
|
1917
|
+
//! - ``d_keys_in`` may equal ``d_values_out`` but the range
|
|
1918
|
+
//! ``[d_keys_in, d_keys_in + num_items)`` and the range
|
|
1919
|
+
//! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
|
|
1920
|
+
//! - ``d_values_in`` may equal ``d_values_out`` but the range
|
|
1921
|
+
//! ``[d_values_in, d_values_in + num_items)`` and the range
|
|
1922
|
+
//! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
|
|
1923
|
+
//! - @devicestorage
|
|
1924
|
+
//!
|
|
1925
|
+
//! Snippet
|
|
1926
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
1927
|
+
//!
|
|
1928
|
+
//! The code snippet below illustrates the inclusive prefix sum-by-key of an ``int`` device vector.
|
|
1929
|
+
//!
|
|
1930
|
+
//! .. code-block:: c++
|
|
1931
|
+
//!
|
|
1932
|
+
//! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
|
|
1933
|
+
//!
|
|
1934
|
+
//! // Declare, allocate, and initialize device-accessible pointers for
|
|
1935
|
+
//! // input and output
|
|
1936
|
+
//! int num_items; // e.g., 7
|
|
1937
|
+
//! int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2]
|
|
1938
|
+
//! int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
1939
|
+
//! int *d_values_out; // e.g., [ , , , , , , ]
|
|
1940
|
+
//! ...
|
|
1941
|
+
//!
|
|
1942
|
+
//! // Determine temporary device storage requirements for inclusive prefix sum
|
|
1943
|
+
//! void *d_temp_storage = nullptr;
|
|
1944
|
+
//! size_t temp_storage_bytes = 0;
|
|
1945
|
+
//! cub::DeviceScan::InclusiveSumByKey(
|
|
1946
|
+
//! d_temp_storage, temp_storage_bytes,
|
|
1947
|
+
//! d_keys_in, d_values_in, d_values_out, num_items);
|
|
1948
|
+
//!
|
|
1949
|
+
//! // Allocate temporary storage for inclusive prefix sum
|
|
1950
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
1951
|
+
//!
|
|
1952
|
+
//! // Run inclusive prefix sum
|
|
1953
|
+
//! cub::DeviceScan::InclusiveSumByKey(
|
|
1954
|
+
//! d_temp_storage, temp_storage_bytes,
|
|
1955
|
+
//! d_keys_in, d_values_in, d_values_out, num_items);
|
|
1956
|
+
//!
|
|
1957
|
+
//! // d_values_out <-- [8, 14, 7, 12, 15, 0, 9]
|
|
1958
|
+
//!
|
|
1959
|
+
//! @endrst
|
|
1960
|
+
//!
|
|
1961
|
+
//! @tparam KeysInputIteratorT
|
|
1962
|
+
//! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator
|
|
1963
|
+
//!
|
|
1964
|
+
//! @tparam ValuesInputIteratorT
|
|
1965
|
+
//! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator
|
|
1966
|
+
//!
|
|
1967
|
+
//! @tparam ValuesOutputIteratorT
|
|
1968
|
+
//! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator
|
|
1969
|
+
//!
|
|
1970
|
+
//! @tparam EqualityOpT
|
|
1971
|
+
//! **[inferred]** Functor type having member
|
|
1972
|
+
//! `bool operator()(const T &a, const T &b)` for binary operations that define the equality of keys
|
|
1973
|
+
//!
|
|
1974
|
+
//! @tparam NumItemsT
|
|
1975
|
+
//! **[inferred]** An integral type representing the number of input elements
|
|
1976
|
+
//!
|
|
1977
|
+
//! @param[in] d_temp_storage
|
|
1978
|
+
//! Device-accessible allocation of temporary storage.
|
|
1979
|
+
//! When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
|
|
1980
|
+
//!
|
|
1981
|
+
//! @param[in,out] temp_storage_bytes
|
|
1982
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
1983
|
+
//!
|
|
1984
|
+
//! @param[in] d_keys_in
|
|
1985
|
+
//! Random-access input iterator to the input sequence of key items
|
|
1986
|
+
//!
|
|
1987
|
+
//! @param[in] d_values_in
|
|
1988
|
+
//! Random-access input iterator to the input sequence of value items
|
|
1989
|
+
//!
|
|
1990
|
+
//! @param[out] d_values_out
|
|
1991
|
+
//! Random-access output iterator to the output sequence of value items
|
|
1992
|
+
//!
|
|
1993
|
+
//! @param[in] num_items
|
|
1994
|
+
//! Total number of input items (i.e., the length of `d_keys_in` and `d_values_in`)
|
|
1995
|
+
//!
|
|
1996
|
+
//! @param[in] equality_op
|
|
1997
|
+
//! Binary functor that defines the equality of keys.
|
|
1998
|
+
//! Default is cuda::std::equal_to<>{}.
|
|
1999
|
+
//!
|
|
2000
|
+
//! @param[in] stream
|
|
2001
|
+
//! @rst
|
|
2002
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
2003
|
+
//! @endrst
|
|
2004
|
+
template <typename KeysInputIteratorT,
          typename ValuesInputIteratorT,
          typename ValuesOutputIteratorT,
          typename EqualityOpT = ::cuda::std::equal_to<>,
          typename NumItemsT   = uint32_t>
CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSumByKey(
  void* d_temp_storage,
  size_t& temp_storage_bytes,
  KeysInputIteratorT d_keys_in,
  ValuesInputIteratorT d_values_in,
  ValuesOutputIteratorT d_values_out,
  NumItemsT num_items,
  EqualityOpT equality_op = EqualityOpT(),
  cudaStream_t stream     = 0)
{
  _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveSumByKey");

  // Offset type for global indexing, selected from NumItemsT by detail::choose_offset_t.
  using offset_t = detail::choose_offset_t<NumItemsT>;

  // The sum variant always scans with the transparent plus functor.
  using sum_op_t = ::cuda::std::plus<>;

  // NullType fills the init-value slot, since this overload takes no init_value.
  using dispatch_t =
    DispatchScanByKey<KeysInputIteratorT,
                      ValuesInputIteratorT,
                      ValuesOutputIteratorT,
                      EqualityOpT,
                      sum_op_t,
                      NullType,
                      offset_t>;

  return dispatch_t::Dispatch(
    d_temp_storage,
    temp_storage_bytes,
    d_keys_in,
    d_values_in,
    d_values_out,
    equality_op,
    sum_op_t{},
    NullType{},
    num_items,
    stream);
}
|
|
2042
|
+
|
|
2043
|
+
//! @rst
|
|
2044
|
+
//! Computes a device-wide inclusive prefix scan-by-key using the
|
|
2045
|
+
//! specified binary associative ``scan_op`` functor. The key equality is defined by ``equality_op``.
|
|
2046
|
+
//!
|
|
2047
|
+
//! - Supports non-commutative scan operators.
|
|
2048
|
+
//! - Results are not deterministic for pseudo-associative operators (e.g.,
|
|
2049
|
+
//! addition of floating-point types). Results for pseudo-associative
|
|
2050
|
+
//! operators may vary from run to run. Additional details can be found in
|
|
2051
|
+
//! the @lookback description.
|
|
2052
|
+
//! - ``d_keys_in`` may equal ``d_values_out`` but the range
|
|
2053
|
+
//! ``[d_keys_in, d_keys_in + num_items)`` and the range
|
|
2054
|
+
//! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
|
|
2055
|
+
//! - ``d_values_in`` may equal ``d_values_out`` but the range
|
|
2056
|
+
//! ``[d_values_in, d_values_in + num_items)`` and the range
|
|
2057
|
+
//! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise.
|
|
2058
|
+
//! - @devicestorage
|
|
2059
|
+
//!
|
|
2060
|
+
//! Snippet
|
|
2061
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
2062
|
+
//!
|
|
2063
|
+
//! The code snippet below illustrates the inclusive prefix min-scan-by-key of an ``int`` device vector.
|
|
2064
|
+
//!
|
|
2065
|
+
//! .. code-block:: c++
|
|
2066
|
+
//!
|
|
2067
|
+
//! #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
|
|
2068
|
+
//! #include <cuda/std/climits> // for INT_MAX
|
|
2069
|
+
//!
|
|
2070
|
+
//! // CustomMin functor
|
|
2071
|
+
//! struct CustomMin
|
|
2072
|
+
//! {
|
|
2073
|
+
//! template <typename T>
|
|
2074
|
+
//! __host__ __device__ __forceinline__
|
|
2075
|
+
//! T operator()(const T &a, const T &b) const {
|
|
2076
|
+
//! return (b < a) ? b : a;
|
|
2077
|
+
//! }
|
|
2078
|
+
//! };
|
|
2079
|
+
//!
|
|
2080
|
+
//! // CustomEqual functor
|
|
2081
|
+
//! struct CustomEqual
|
|
2082
|
+
//! {
|
|
2083
|
+
//! template <typename T>
|
|
2084
|
+
//! __host__ __device__ __forceinline__
|
|
2085
|
+
//! bool operator()(const T &a, const T &b) const {
|
|
2086
|
+
//! return a == b;
|
|
2087
|
+
//! }
|
|
2088
|
+
//! };
|
|
2089
|
+
//!
|
|
2090
|
+
//! // Declare, allocate, and initialize device-accessible pointers for
|
|
2091
|
+
//! // input and output
|
|
2092
|
+
//! int num_items; // e.g., 7
|
|
2093
|
+
//! int *d_keys_in; // e.g., [0, 0, 1, 1, 1, 2, 2]
|
|
2094
|
+
//! int *d_values_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
|
|
2095
|
+
//! int *d_values_out; // e.g., [ , , , , , , ]
|
|
2096
|
+
//! CustomMin min_op;
|
|
2097
|
+
//! CustomEqual equality_op;
|
|
2098
|
+
//! ...
|
|
2099
|
+
//!
|
|
2100
|
+
//! // Determine temporary device storage requirements for inclusive prefix scan
|
|
2101
|
+
//! void *d_temp_storage = nullptr;
|
|
2102
|
+
//! size_t temp_storage_bytes = 0;
|
|
2103
|
+
//! cub::DeviceScan::InclusiveScanByKey(
|
|
2104
|
+
//! d_temp_storage, temp_storage_bytes,
|
|
2105
|
+
//! d_keys_in, d_values_in, d_values_out, min_op, num_items, equality_op);
|
|
2106
|
+
//!
|
|
2107
|
+
//! // Allocate temporary storage for inclusive prefix scan
|
|
2108
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
2109
|
+
//!
|
|
2110
|
+
//! // Run inclusive prefix min-scan
|
|
2111
|
+
//! cub::DeviceScan::InclusiveScanByKey(
|
|
2112
|
+
//! d_temp_storage, temp_storage_bytes,
|
|
2113
|
+
//! d_keys_in, d_values_in, d_values_out, min_op, num_items, equality_op);
|
|
2114
|
+
//!
|
|
2115
|
+
//! // d_values_out <-- [8, 6, 7, 5, 3, 0, 0]
|
|
2116
|
+
//!
|
|
2117
|
+
//! @endrst
|
|
2118
|
+
//!
|
|
2119
|
+
//! @tparam KeysInputIteratorT
|
|
2120
|
+
//! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator
|
|
2121
|
+
//!
|
|
2122
|
+
//! @tparam ValuesInputIteratorT
|
|
2123
|
+
//! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator
|
|
2124
|
+
//!
|
|
2125
|
+
//! @tparam ValuesOutputIteratorT
|
|
2126
|
+
//! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator
|
|
2127
|
+
//!
|
|
2128
|
+
//! @tparam ScanOpT
|
|
2129
|
+
//! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
|
|
2130
|
+
//!
|
|
2131
|
+
//! @tparam EqualityOpT
|
|
2132
|
+
//! **[inferred]** Functor type having member
|
|
2133
|
+
//! `bool operator()(const T &a, const T &b)` for binary operations that define the equality of keys
|
|
2134
|
+
//!
|
|
2135
|
+
//! @tparam NumItemsT
|
|
2136
|
+
//! **[inferred]** An integral type representing the number of input elements
|
|
2137
|
+
//!
|
|
2138
|
+
//! @param[in] d_temp_storage
|
|
2139
|
+
//! Device-accessible allocation of temporary storage.
|
|
2140
|
+
//! When `nullptr`, the required allocation size is written to `temp_storage_bytes` and no work is done.
|
|
2141
|
+
//!
|
|
2142
|
+
//! @param[in,out] temp_storage_bytes
|
|
2143
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
2144
|
+
//!
|
|
2145
|
+
//! @param[in] d_keys_in
|
|
2146
|
+
//! Random-access input iterator to the input sequence of key items
|
|
2147
|
+
//!
|
|
2148
|
+
//! @param[in] d_values_in
|
|
2149
|
+
//! Random-access input iterator to the input sequence of value items
|
|
2150
|
+
//!
|
|
2151
|
+
//! @param[out] d_values_out
|
|
2152
|
+
//! Random-access output iterator to the output sequence of value items
|
|
2153
|
+
//!
|
|
2154
|
+
//! @param[in] scan_op
|
|
2155
|
+
//! Binary associative scan functor
|
|
2156
|
+
//!
|
|
2157
|
+
//! @param[in] num_items
|
|
2158
|
+
//! Total number of input items (i.e., the length of `d_keys_in` and `d_values_in`)
|
|
2159
|
+
//!
|
|
2160
|
+
//! @param[in] equality_op
|
|
2161
|
+
//! Binary functor that defines the equality of keys.
|
|
2162
|
+
//! Default is cuda::std::equal_to<>{}.
|
|
2163
|
+
//!
|
|
2164
|
+
//! @param[in] stream
|
|
2165
|
+
//! @rst
|
|
2166
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
2167
|
+
//! @endrst
|
|
2168
|
+
template <typename KeysInputIteratorT,
          typename ValuesInputIteratorT,
          typename ValuesOutputIteratorT,
          typename ScanOpT,
          typename EqualityOpT = ::cuda::std::equal_to<>,
          typename NumItemsT   = uint32_t>
CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScanByKey(
  void* d_temp_storage,
  size_t& temp_storage_bytes,
  KeysInputIteratorT d_keys_in,
  ValuesInputIteratorT d_values_in,
  ValuesOutputIteratorT d_values_out,
  ScanOpT scan_op,
  NumItemsT num_items,
  EqualityOpT equality_op = EqualityOpT(),
  cudaStream_t stream     = 0)
{
  _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::InclusiveScanByKey");

  // Offset type for global indexing, selected from NumItemsT by detail::choose_offset_t.
  using offset_t = detail::choose_offset_t<NumItemsT>;

  // NullType fills the init-value slot, since this overload takes no init_value.
  using dispatch_t =
    DispatchScanByKey<KeysInputIteratorT,
                      ValuesInputIteratorT,
                      ValuesOutputIteratorT,
                      EqualityOpT,
                      ScanOpT,
                      NullType,
                      offset_t>;

  // Note the argument order expected by Dispatch: equality_op precedes scan_op,
  // which differs from the order of this function's own parameters.
  return dispatch_t::Dispatch(
    d_temp_storage,
    temp_storage_bytes,
    d_keys_in,
    d_values_in,
    d_values_out,
    equality_op,
    scan_op,
    NullType(),
    num_items,
    stream);
}
|
|
2208
|
+
|
|
2209
|
+
//! @} end member group
|
|
2210
|
+
};
|
|
2211
|
+
|
|
2212
|
+
CUB_NAMESPACE_END
|