cuda-cccl 0.3.3__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/__init__.py +27 -0
- cuda/cccl/_cuda_version_utils.py +24 -0
- cuda/cccl/cooperative/__init__.py +9 -0
- cuda/cccl/cooperative/experimental/__init__.py +24 -0
- cuda/cccl/headers/__init__.py +7 -0
- cuda/cccl/headers/include/__init__.py +1 -0
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +259 -0
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1182 -0
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +81 -0
- cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +709 -0
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +234 -0
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +748 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +786 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +286 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +703 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +555 -0
- cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +619 -0
- cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +806 -0
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +1124 -0
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +589 -0
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +474 -0
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +289 -0
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1117 -0
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +346 -0
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +606 -0
- cuda/cccl/headers/include/cub/agent/agent_topk.cuh +764 -0
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +631 -0
- cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1424 -0
- cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +963 -0
- cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1227 -0
- cuda/cccl/headers/include/cub/block/block_exchange.cuh +1313 -0
- cuda/cccl/headers/include/cub/block/block_histogram.cuh +424 -0
- cuda/cccl/headers/include/cub/block/block_load.cuh +1264 -0
- cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
- cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +800 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1225 -0
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2196 -0
- cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +150 -0
- cuda/cccl/headers/include/cub/block/block_reduce.cuh +667 -0
- cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +434 -0
- cuda/cccl/headers/include/cub/block/block_scan.cuh +2315 -0
- cuda/cccl/headers/include/cub/block/block_shuffle.cuh +346 -0
- cuda/cccl/headers/include/cub/block/block_store.cuh +1247 -0
- cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +624 -0
- cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +86 -0
- cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +240 -0
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +252 -0
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +238 -0
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +281 -0
- cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +790 -0
- cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +538 -0
- cuda/cccl/headers/include/cub/config.cuh +53 -0
- cuda/cccl/headers/include/cub/cub.cuh +120 -0
- cuda/cccl/headers/include/cub/detail/array_utils.cuh +78 -0
- cuda/cccl/headers/include/cub/detail/choose_offset.cuh +161 -0
- cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +74 -0
- cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +96 -0
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +62 -0
- cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +253 -0
- cuda/cccl/headers/include/cub/detail/integer_utils.cuh +88 -0
- cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +142 -0
- cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +100 -0
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +114 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/README.md +71 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/array.h +68 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/json.h +62 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/object.h +100 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/string.h +53 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/value.h +95 -0
- cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +63 -0
- cuda/cccl/headers/include/cub/detail/rfa.cuh +731 -0
- cuda/cccl/headers/include/cub/detail/strong_load.cuh +189 -0
- cuda/cccl/headers/include/cub/detail/strong_store.cuh +220 -0
- cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +384 -0
- cuda/cccl/headers/include/cub/detail/type_traits.cuh +187 -0
- cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +73 -0
- cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +56 -0
- cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +596 -0
- cuda/cccl/headers/include/cub/device/device_copy.cuh +276 -0
- cuda/cccl/headers/include/cub/device/device_for.cuh +1063 -0
- cuda/cccl/headers/include/cub/device/device_histogram.cuh +1509 -0
- cuda/cccl/headers/include/cub/device/device_memcpy.cuh +195 -0
- cuda/cccl/headers/include/cub/device/device_merge.cuh +203 -0
- cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +979 -0
- cuda/cccl/headers/include/cub/device/device_partition.cuh +668 -0
- cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3437 -0
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +2518 -0
- cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +370 -0
- cuda/cccl/headers/include/cub/device/device_scan.cuh +2212 -0
- cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1496 -0
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1430 -0
- cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2811 -0
- cuda/cccl/headers/include/cub/device/device_select.cuh +1228 -0
- cuda/cccl/headers/include/cub/device/device_topk.cuh +511 -0
- cuda/cccl/headers/include/cub/device/device_transform.cuh +668 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +315 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +719 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +43 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +198 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1046 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +303 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +473 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1744 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +1310 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +655 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +531 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +313 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +615 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +517 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +602 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +975 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +842 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +341 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +440 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +389 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +627 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +569 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +545 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +261 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/histogram.cuh +505 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/merge_sort.cuh +334 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/radix_sort.cuh +803 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +583 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +189 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +321 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +522 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/three_way_partition.cuh +201 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +1028 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/unique_by_key.cuh +176 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +67 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +118 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +60 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +275 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +76 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +126 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1065 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +493 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +942 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +673 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +618 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +1010 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +398 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1588 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +440 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +85 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +481 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +884 -0
- cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +227 -0
- cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +106 -0
- cuda/cccl/headers/include/cub/grid/grid_queue.cuh +202 -0
- cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +254 -0
- cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +259 -0
- cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +250 -0
- cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +320 -0
- cuda/cccl/headers/include/cub/thread/thread_load.cuh +349 -0
- cuda/cccl/headers/include/cub/thread/thread_operators.cuh +688 -0
- cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +548 -0
- cuda/cccl/headers/include/cub/thread/thread_scan.cuh +498 -0
- cuda/cccl/headers/include/cub/thread/thread_search.cuh +199 -0
- cuda/cccl/headers/include/cub/thread/thread_simd.cuh +458 -0
- cuda/cccl/headers/include/cub/thread/thread_sort.cuh +102 -0
- cuda/cccl/headers/include/cub/thread/thread_store.cuh +365 -0
- cuda/cccl/headers/include/cub/util_allocator.cuh +921 -0
- cuda/cccl/headers/include/cub/util_arch.cuh +167 -0
- cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +95 -0
- cuda/cccl/headers/include/cub/util_debug.cuh +207 -0
- cuda/cccl/headers/include/cub/util_device.cuh +800 -0
- cuda/cccl/headers/include/cub/util_macro.cuh +97 -0
- cuda/cccl/headers/include/cub/util_math.cuh +118 -0
- cuda/cccl/headers/include/cub/util_namespace.cuh +176 -0
- cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +55 -0
- cuda/cccl/headers/include/cub/util_ptx.cuh +513 -0
- cuda/cccl/headers/include/cub/util_temporary_storage.cuh +122 -0
- cuda/cccl/headers/include/cub/util_type.cuh +1120 -0
- cuda/cccl/headers/include/cub/util_vsmem.cuh +253 -0
- cuda/cccl/headers/include/cub/version.cuh +89 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +329 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +177 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +737 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +408 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +952 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +715 -0
- cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +405 -0
- cuda/cccl/headers/include/cub/warp/warp_load.cuh +614 -0
- cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +169 -0
- cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +829 -0
- cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1890 -0
- cuda/cccl/headers/include/cub/warp/warp_store.cuh +521 -0
- cuda/cccl/headers/include/cub/warp/warp_utils.cuh +61 -0
- cuda/cccl/headers/include/cuda/__algorithm/common.h +68 -0
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +196 -0
- cuda/cccl/headers/include/cuda/__algorithm/fill.h +107 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +172 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +217 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +83 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +128 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +210 -0
- cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
- cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +102 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +487 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +74 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_native_handle.h +45 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
- cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +171 -0
- cuda/cccl/headers/include/cuda/__bit/bitfield.h +122 -0
- cuda/cccl/headers/include/cuda/__bit/bitmask.h +90 -0
- cuda/cccl/headers/include/cuda/__cccl_config +37 -0
- cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +124 -0
- cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +178 -0
- cuda/cccl/headers/include/cuda/__cmath/ilog.h +195 -0
- cuda/cccl/headers/include/cuda/__cmath/ipow.h +107 -0
- cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
- cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
- cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
- cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
- cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
- cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
- cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
- cuda/cccl/headers/include/cuda/__complex/complex.h +238 -0
- cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +89 -0
- cuda/cccl/headers/include/cuda/__complex/traits.h +64 -0
- cuda/cccl/headers/include/cuda/__complex_ +28 -0
- cuda/cccl/headers/include/cuda/__device/all_devices.h +140 -0
- cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +537 -0
- cuda/cccl/headers/include/cuda/__device/attributes.h +772 -0
- cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
- cuda/cccl/headers/include/cuda/__device/device_ref.h +156 -0
- cuda/cccl/headers/include/cuda/__device/physical_device.h +172 -0
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +835 -0
- cuda/cccl/headers/include/cuda/__event/event.h +171 -0
- cuda/cccl/headers/include/cuda/__event/event_ref.h +157 -0
- cuda/cccl/headers/include/cuda/__event/timed_event.h +120 -0
- cuda/cccl/headers/include/cuda/__execution/determinism.h +91 -0
- cuda/cccl/headers/include/cuda/__execution/output_ordering.h +89 -0
- cuda/cccl/headers/include/cuda/__execution/require.h +75 -0
- cuda/cccl/headers/include/cuda/__execution/tune.h +70 -0
- cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
- cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +321 -0
- cuda/cccl/headers/include/cuda/__functional/maximum.h +58 -0
- cuda/cccl/headers/include/cuda/__functional/minimum.h +58 -0
- cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +108 -0
- cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
- cuda/cccl/headers/include/cuda/__fwd/barrier_native_handle.h +42 -0
- cuda/cccl/headers/include/cuda/__fwd/complex.h +48 -0
- cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
- cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
- cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
- cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +58 -0
- cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +315 -0
- cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +483 -0
- cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +324 -0
- cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +456 -0
- cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +334 -0
- cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +418 -0
- cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +367 -0
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +528 -0
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +527 -0
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +486 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_function.h +112 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +557 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
- cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +533 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +238 -0
- cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +152 -0
- cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +117 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +60 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +148 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +165 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +53 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +104 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +170 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +59 -0
- cuda/cccl/headers/include/cuda/__memory/address_space.h +227 -0
- cuda/cccl/headers/include/cuda/__memory/align_down.h +56 -0
- cuda/cccl/headers/include/cuda/__memory/align_up.h +56 -0
- cuda/cccl/headers/include/cuda/__memory/aligned_size.h +61 -0
- cuda/cccl/headers/include/cuda/__memory/check_address.h +111 -0
- cuda/cccl/headers/include/cuda/__memory/discard_memory.h +64 -0
- cuda/cccl/headers/include/cuda/__memory/get_device_address.h +58 -0
- cuda/cccl/headers/include/cuda/__memory/is_aligned.h +47 -0
- cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
- cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +82 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +153 -0
- cuda/cccl/headers/include/cuda/__memory_resource/properties.h +113 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource.h +125 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +652 -0
- cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +306 -0
- cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
- cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +59 -0
- cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
- cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +120 -0
- cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2983 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +242 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +197 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +54 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +997 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +318 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +671 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1470 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +132 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +64 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +102 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +64 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +5542 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +399 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +184 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +102 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +143 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +144 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +286 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +290 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2202 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1362 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +236 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +460 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1490 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +132 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +99 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +765 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +58 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +4927 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +4291 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +7110 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +42 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +5063 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +56 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +71 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +1030 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/pragmas/enable_smem_spilling.h +47 -0
- cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
- cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +176 -0
- cuda/cccl/headers/include/cuda/__random/feistel_bijection.h +105 -0
- cuda/cccl/headers/include/cuda/__random/random_bijection.h +88 -0
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +99 -0
- cuda/cccl/headers/include/cuda/__runtime/types.h +41 -0
- cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
- cuda/cccl/headers/include/cuda/__stream/get_stream.h +110 -0
- cuda/cccl/headers/include/cuda/__stream/stream.h +141 -0
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +303 -0
- cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
- cuda/cccl/headers/include/cuda/__type_traits/is_specialization_of.h +37 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/access.h +88 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/any_cast.h +83 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_base.h +148 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_from.h +96 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_fwd.h +128 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +304 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ref.h +337 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +590 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/conversions.h +169 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/dynamic_any_cast.h +107 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/interfaces.h +359 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/iset.h +142 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/overrides.h +64 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/rtti.h +257 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +322 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/storage.h +79 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/tagged_ptr.h +58 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +162 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_functions.h +184 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_ptrs.h +80 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +155 -0
- cuda/cccl/headers/include/cuda/__utility/basic_any.h +507 -0
- cuda/cccl/headers/include/cuda/__utility/immovable.h +50 -0
- cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
- cuda/cccl/headers/include/cuda/__utility/inherit.h +36 -0
- cuda/cccl/headers/include/cuda/__utility/no_init.h +29 -0
- cuda/cccl/headers/include/cuda/__utility/static_for.h +79 -0
- cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
- cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
- cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +251 -0
- cuda/cccl/headers/include/cuda/access_property +26 -0
- cuda/cccl/headers/include/cuda/algorithm +27 -0
- cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
- cuda/cccl/headers/include/cuda/atomic +27 -0
- cuda/cccl/headers/include/cuda/barrier +267 -0
- cuda/cccl/headers/include/cuda/bit +29 -0
- cuda/cccl/headers/include/cuda/cmath +37 -0
- cuda/cccl/headers/include/cuda/devices +33 -0
- cuda/cccl/headers/include/cuda/discard_memory +32 -0
- cuda/cccl/headers/include/cuda/functional +32 -0
- cuda/cccl/headers/include/cuda/iterator +39 -0
- cuda/cccl/headers/include/cuda/latch +27 -0
- cuda/cccl/headers/include/cuda/mdspan +28 -0
- cuda/cccl/headers/include/cuda/memory +35 -0
- cuda/cccl/headers/include/cuda/memory_resource +35 -0
- cuda/cccl/headers/include/cuda/numeric +29 -0
- cuda/cccl/headers/include/cuda/pipeline +579 -0
- cuda/cccl/headers/include/cuda/ptx +129 -0
- cuda/cccl/headers/include/cuda/semaphore +31 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +53 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +58 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +142 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/in_fun_result.h +55 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +90 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +88 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +67 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +139 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/move.h +86 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +93 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +100 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each.h +84 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each_n.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +98 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +155 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
- cuda/cccl/headers/include/cuda/std/__algorithm_ +26 -0
- cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
- cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +136 -0
- cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +208 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +401 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +3971 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +177 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +211 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
- cuda/cccl/headers/include/cuda/std/__atomic/order.h +159 -0
- cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
- cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
- cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +249 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +104 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +72 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +228 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types.h +52 -0
- cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
- cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
- cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
- cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
- cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
- cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +76 -0
- cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +185 -0
- cuda/cccl/headers/include/cuda/std/__bit/countl.h +174 -0
- cuda/cccl/headers/include/cuda/std/__bit/countr.h +185 -0
- cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
- cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
- cuda/cccl/headers/include/cuda/std/__bit/integral.h +126 -0
- cuda/cccl/headers/include/cuda/std/__bit/popcount.h +154 -0
- cuda/cccl/headers/include/cuda/std/__bit/reference.h +1272 -0
- cuda/cccl/headers/include/cuda/std/__bit/rotate.h +94 -0
- cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
- cuda/cccl/headers/include/cuda/std/__cccl/assert.h +161 -0
- cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +206 -0
- cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +673 -0
- cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +217 -0
- cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +51 -0
- cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +56 -0
- cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +88 -0
- cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +131 -0
- cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +123 -0
- cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +344 -0
- cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +91 -0
- cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +74 -0
- cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +160 -0
- cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
- cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +73 -0
- cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/os.h +54 -0
- cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1286 -0
- cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +281 -0
- cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +253 -0
- cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
- cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
- cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
- cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
- cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
- cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +171 -0
- cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
- cuda/cccl/headers/include/cuda/std/__charconv/from_chars.h +154 -0
- cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
- cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +148 -0
- cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
- cuda/cccl/headers/include/cuda/std/__charconv_ +31 -0
- cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +54 -0
- cuda/cccl/headers/include/cuda/std/__chrono/day.h +162 -0
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +503 -0
- cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +55 -0
- cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +46 -0
- cuda/cccl/headers/include/cuda/std/__chrono/month.h +187 -0
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +60 -0
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +80 -0
- cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +259 -0
- cuda/cccl/headers/include/cuda/std/__chrono/year.h +186 -0
- cuda/cccl/headers/include/cuda/std/__cmath/abs.h +127 -0
- cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +88 -0
- cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +200 -0
- cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +784 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +118 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fma.h +125 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +231 -0
- cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +205 -0
- cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +286 -0
- cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +221 -0
- cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +286 -0
- cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +371 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +167 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +205 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +186 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
- cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +101 -0
- cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +534 -0
- cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +287 -0
- cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
- cuda/cccl/headers/include/cuda/std/__cmath/nan.h +54 -0
- cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +206 -0
- cuda/cccl/headers/include/cuda/std/__cmath/roots.h +199 -0
- cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +984 -0
- cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +56 -0
- cuda/cccl/headers/include/cuda/std/__cmath/traits.h +238 -0
- cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +328 -0
- cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
- cuda/cccl/headers/include/cuda/std/__complex/complex.h +669 -0
- cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +411 -0
- cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +117 -0
- cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +216 -0
- cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
- cuda/cccl/headers/include/cuda/std/__complex/literals.h +86 -0
- cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +303 -0
- cuda/cccl/headers/include/cuda/std/__complex/math.h +159 -0
- cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +323 -0
- cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +322 -0
- cuda/cccl/headers/include/cuda/std/__complex/roots.h +214 -0
- cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
- cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
- cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +130 -0
- cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
- cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
- cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
- cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +45 -0
- cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
- cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
- cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +341 -0
- cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +174 -0
- cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +70 -0
- cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +56 -0
- cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
- cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
- cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +100 -0
- cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
- cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
- cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
- cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
- cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
- cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +39 -0
- cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
- cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
- cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
- cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
- cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +70 -0
- cuda/cccl/headers/include/cuda/std/__cstring/memcpy.h +61 -0
- cuda/cccl/headers/include/cuda/std/__cstring/memset.h +46 -0
- cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +62 -0
- cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +139 -0
- cuda/cccl/headers/include/cuda/std/__exception/terminate.h +73 -0
- cuda/cccl/headers/include/cuda/std/__execution/env.h +455 -0
- cuda/cccl/headers/include/cuda/std/__execution/policy.h +88 -0
- cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
- cuda/cccl/headers/include/cuda/std/__expected/expected.h +1941 -0
- cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1050 -0
- cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
- cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +165 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +812 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +376 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +124 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +116 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/decompose.h +69 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +40 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +78 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
- cuda/cccl/headers/include/cuda/std/__format/buffer.h +48 -0
- cuda/cccl/headers/include/cuda/std/__format/concepts.h +69 -0
- cuda/cccl/headers/include/cuda/std/__format/format_arg.h +282 -0
- cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +279 -0
- cuda/cccl/headers/include/cuda/std/__format/format_args.h +122 -0
- cuda/cccl/headers/include/cuda/std/__format/format_context.h +92 -0
- cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
- cuda/cccl/headers/include/cuda/std/__format/format_integral.h +237 -0
- cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +124 -0
- cuda/cccl/headers/include/cuda/std/__format/format_spec_parser.h +1230 -0
- cuda/cccl/headers/include/cuda/std/__format/formatter.h +59 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/bool.h +101 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/char.h +124 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/fp.h +101 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/int.h +174 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/ptr.h +104 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/str.h +178 -0
- cuda/cccl/headers/include/cuda/std/__format/output_utils.h +272 -0
- cuda/cccl/headers/include/cuda/std/__format/parse_arg_id.h +138 -0
- cuda/cccl/headers/include/cuda/std/__format_ +45 -0
- cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
- cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
- cuda/cccl/headers/include/cuda/std/__functional/bind.h +334 -0
- cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +80 -0
- cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +73 -0
- cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
- cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
- cuda/cccl/headers/include/cuda/std/__functional/compose.h +68 -0
- cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +75 -0
- cuda/cccl/headers/include/cuda/std/__functional/function.h +1275 -0
- cuda/cccl/headers/include/cuda/std/__functional/hash.h +649 -0
- cuda/cccl/headers/include/cuda/std/__functional/identity.h +57 -0
- cuda/cccl/headers/include/cuda/std/__functional/invoke.h +296 -0
- cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +41 -0
- cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +66 -0
- cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +211 -0
- cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
- cuda/cccl/headers/include/cuda/std/__functional/operations.h +534 -0
- cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +128 -0
- cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +64 -0
- cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +63 -0
- cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
- cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +113 -0
- cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
- cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
- cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
- cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +262 -0
- cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +53 -0
- cuda/cccl/headers/include/cuda/std/__fwd/array.h +42 -0
- cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +74 -0
- cuda/cccl/headers/include/cuda/std/__fwd/complex.h +75 -0
- cuda/cccl/headers/include/cuda/std/__fwd/expected.h +46 -0
- cuda/cccl/headers/include/cuda/std/__fwd/format.h +84 -0
- cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
- cuda/cccl/headers/include/cuda/std/__fwd/get.h +123 -0
- cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
- cuda/cccl/headers/include/cuda/std/__fwd/iterator.h +43 -0
- cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +122 -0
- cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
- cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
- cuda/cccl/headers/include/cuda/std/__fwd/pair.h +47 -0
- cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +34 -0
- cuda/cccl/headers/include/cuda/std/__fwd/span.h +45 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string.h +112 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +91 -0
- cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
- cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +34 -0
- cuda/cccl/headers/include/cuda/std/__fwd/unexpected.h +40 -0
- cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
- cuda/cccl/headers/include/cuda/std/__internal/features.h +72 -0
- cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +143 -0
- cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
- cuda/cccl/headers/include/cuda/std/__iterator/advance.h +228 -0
- cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +163 -0
- cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +253 -0
- cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
- cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +464 -0
- cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
- cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
- cuda/cccl/headers/include/cuda/std/__iterator/distance.h +126 -0
- cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
- cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
- cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +99 -0
- cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +143 -0
- cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
- cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +107 -0
- cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +146 -0
- cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +161 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +163 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +847 -0
- cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
- cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +432 -0
- cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
- cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
- cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +95 -0
- cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
- cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
- cuda/cccl/headers/include/cuda/std/__iterator/prev.h +90 -0
- cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
- cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +156 -0
- cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
- cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +371 -0
- cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
- cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
- cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
- cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +245 -0
- cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
- cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +617 -0
- cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +753 -0
- cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +78 -0
- cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +54 -0
- cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +139 -0
- cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +132 -0
- cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +321 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +97 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +139 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +352 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +759 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +314 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +307 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +605 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +512 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +193 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +189 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +344 -0
- cuda/cccl/headers/include/cuda/std/__memory/addressof.h +67 -0
- cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +81 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator.h +320 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +84 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +525 -0
- cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +60 -0
- cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
- cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +225 -0
- cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +246 -0
- cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
- cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +46 -0
- cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +246 -0
- cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
- cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +92 -0
- cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +678 -0
- cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +765 -0
- cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +54 -0
- cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
- cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
- cuda/cccl/headers/include/cuda/std/__new/allocate.h +126 -0
- cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
- cuda/cccl/headers/include/cuda/std/__new/launder.h +53 -0
- cuda/cccl/headers/include/cuda/std/__new_ +29 -0
- cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
- cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
- cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
- cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +78 -0
- cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
- cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
- cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
- cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +97 -0
- cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
- cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
- cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
- cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
- cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
- cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +74 -0
- cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
- cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
- cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
- cuda/cccl/headers/include/cuda/std/__optional/optional.h +859 -0
- cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +433 -0
- cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
- cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
- cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
- cuda/cccl/headers/include/cuda/std/__random/is_valid.h +106 -0
- cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +398 -0
- cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +335 -0
- cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +183 -0
- cuda/cccl/headers/include/cuda/std/__random_ +29 -0
- cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
- cuda/cccl/headers/include/cuda/std/__ranges/all.h +98 -0
- cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
- cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +302 -0
- cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
- cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
- cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
- cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
- cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
- cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
- cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +78 -0
- cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
- cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +266 -0
- cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +410 -0
- cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +162 -0
- cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +110 -0
- cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +175 -0
- cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
- cuda/cccl/headers/include/cuda/std/__ranges/rend.h +182 -0
- cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +345 -0
- cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
- cuda/cccl/headers/include/cuda/std/__ranges/size.h +201 -0
- cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +513 -0
- cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +476 -0
- cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
- cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
- cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +53 -0
- cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +183 -0
- cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
- cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +234 -0
- cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
- cuda/cccl/headers/include/cuda/std/__string/char_traits.h +191 -0
- cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +581 -0
- cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
- cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
- cuda/cccl/headers/include/cuda/std/__string_ +29 -0
- cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
- cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +106 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +143 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +120 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +260 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +212 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +70 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +84 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +68 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +290 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +149 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +68 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +173 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +46 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +58 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +51 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +211 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_execution_policy.h +81 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +79 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +84 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +84 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +121 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_reference_wrapper.h +50 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +88 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +202 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +46 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +45 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +58 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +45 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +44 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +122 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +131 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +52 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
- cuda/cccl/headers/include/cuda/std/__utility/as_const.h +52 -0
- cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +34 -0
- cuda/cccl/headers/include/cuda/std/__utility/cmp.h +116 -0
- cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
- cuda/cccl/headers/include/cuda/std/__utility/declval.h +76 -0
- cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +161 -0
- cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
- cuda/cccl/headers/include/cuda/std/__utility/forward.h +59 -0
- cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +55 -0
- cuda/cccl/headers/include/cuda/std/__utility/in_place.h +86 -0
- cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +251 -0
- cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
- cuda/cccl/headers/include/cuda/std/__utility/move.h +74 -0
- cuda/cccl/headers/include/cuda/std/__utility/pair.h +791 -0
- cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
- cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +527 -0
- cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
- cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +63 -0
- cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
- cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
- cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
- cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
- cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
- cuda/cccl/headers/include/cuda/std/array +518 -0
- cuda/cccl/headers/include/cuda/std/atomic +810 -0
- cuda/cccl/headers/include/cuda/std/barrier +42 -0
- cuda/cccl/headers/include/cuda/std/bit +35 -0
- cuda/cccl/headers/include/cuda/std/bitset +994 -0
- cuda/cccl/headers/include/cuda/std/cassert +28 -0
- cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
- cuda/cccl/headers/include/cuda/std/cfloat +59 -0
- cuda/cccl/headers/include/cuda/std/chrono +26 -0
- cuda/cccl/headers/include/cuda/std/climits +61 -0
- cuda/cccl/headers/include/cuda/std/cmath +87 -0
- cuda/cccl/headers/include/cuda/std/complex +50 -0
- cuda/cccl/headers/include/cuda/std/concepts +48 -0
- cuda/cccl/headers/include/cuda/std/cstddef +28 -0
- cuda/cccl/headers/include/cuda/std/cstdint +178 -0
- cuda/cccl/headers/include/cuda/std/cstdlib +30 -0
- cuda/cccl/headers/include/cuda/std/cstring +110 -0
- cuda/cccl/headers/include/cuda/std/ctime +154 -0
- cuda/cccl/headers/include/cuda/std/detail/__config +45 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +207 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1721 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +2509 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/iosfwd +128 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/stdexcept +120 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +1365 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +2144 -0
- cuda/cccl/headers/include/cuda/std/execution +29 -0
- cuda/cccl/headers/include/cuda/std/expected +30 -0
- cuda/cccl/headers/include/cuda/std/functional +56 -0
- cuda/cccl/headers/include/cuda/std/initializer_list +44 -0
- cuda/cccl/headers/include/cuda/std/inplace_vector +2170 -0
- cuda/cccl/headers/include/cuda/std/iterator +70 -0
- cuda/cccl/headers/include/cuda/std/latch +34 -0
- cuda/cccl/headers/include/cuda/std/limits +28 -0
- cuda/cccl/headers/include/cuda/std/linalg +30 -0
- cuda/cccl/headers/include/cuda/std/mdspan +38 -0
- cuda/cccl/headers/include/cuda/std/memory +39 -0
- cuda/cccl/headers/include/cuda/std/numbers +346 -0
- cuda/cccl/headers/include/cuda/std/numeric +41 -0
- cuda/cccl/headers/include/cuda/std/optional +31 -0
- cuda/cccl/headers/include/cuda/std/ranges +69 -0
- cuda/cccl/headers/include/cuda/std/ratio +416 -0
- cuda/cccl/headers/include/cuda/std/semaphore +31 -0
- cuda/cccl/headers/include/cuda/std/source_location +83 -0
- cuda/cccl/headers/include/cuda/std/span +628 -0
- cuda/cccl/headers/include/cuda/std/string_view +925 -0
- cuda/cccl/headers/include/cuda/std/tuple +26 -0
- cuda/cccl/headers/include/cuda/std/type_traits +177 -0
- cuda/cccl/headers/include/cuda/std/utility +70 -0
- cuda/cccl/headers/include/cuda/std/variant +25 -0
- cuda/cccl/headers/include/cuda/std/version +240 -0
- cuda/cccl/headers/include/cuda/stream +31 -0
- cuda/cccl/headers/include/cuda/stream_ref +59 -0
- cuda/cccl/headers/include/cuda/type_traits +27 -0
- cuda/cccl/headers/include/cuda/utility +28 -0
- cuda/cccl/headers/include/cuda/version +16 -0
- cuda/cccl/headers/include/cuda/warp +28 -0
- cuda/cccl/headers/include/cuda/work_stealing +26 -0
- cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
- cuda/cccl/headers/include/nv/detail/__target_macros +718 -0
- cuda/cccl/headers/include/nv/target +240 -0
- cuda/cccl/headers/include/thrust/addressof.h +22 -0
- cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
- cuda/cccl/headers/include/thrust/advance.h +57 -0
- cuda/cccl/headers/include/thrust/allocate_unique.h +299 -0
- cuda/cccl/headers/include/thrust/binary_search.h +1910 -0
- cuda/cccl/headers/include/thrust/complex.h +858 -0
- cuda/cccl/headers/include/thrust/copy.h +506 -0
- cuda/cccl/headers/include/thrust/count.h +245 -0
- cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +95 -0
- cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +626 -0
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +192 -0
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +96 -0
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +81 -0
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +78 -0
- cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +76 -0
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +115 -0
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +116 -0
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +77 -0
- cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +99 -0
- cuda/cccl/headers/include/thrust/detail/binary_search.inl +525 -0
- cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
- cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +255 -0
- cuda/cccl/headers/include/thrust/detail/complex/c99math.h +64 -0
- cuda/cccl/headers/include/thrust/detail/complex/catrig.h +875 -0
- cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +589 -0
- cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +233 -0
- cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +161 -0
- cuda/cccl/headers/include/thrust/detail/complex/cexp.h +195 -0
- cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +173 -0
- cuda/cccl/headers/include/thrust/detail/complex/clog.h +223 -0
- cuda/cccl/headers/include/thrust/detail/complex/clogf.h +210 -0
- cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
- cuda/cccl/headers/include/thrust/detail/complex/cpow.h +50 -0
- cuda/cccl/headers/include/thrust/detail/complex/cproj.h +81 -0
- cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
- cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +168 -0
- cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
- cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
- cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +208 -0
- cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +133 -0
- cuda/cccl/headers/include/thrust/detail/complex/math_private.h +138 -0
- cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
- cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
- cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
- cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
- cuda/cccl/headers/include/thrust/detail/config/device_system.h +55 -0
- cuda/cccl/headers/include/thrust/detail/config/host_system.h +48 -0
- cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
- cuda/cccl/headers/include/thrust/detail/config/namespace.h +162 -0
- cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
- cuda/cccl/headers/include/thrust/detail/config.h +36 -0
- cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +228 -0
- cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +273 -0
- cuda/cccl/headers/include/thrust/detail/copy.h +72 -0
- cuda/cccl/headers/include/thrust/detail/copy.inl +129 -0
- cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
- cuda/cccl/headers/include/thrust/detail/copy_if.inl +102 -0
- cuda/cccl/headers/include/thrust/detail/count.h +55 -0
- cuda/cccl/headers/include/thrust/detail/count.inl +89 -0
- cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
- cuda/cccl/headers/include/thrust/detail/equal.inl +93 -0
- cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
- cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +81 -0
- cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
- cuda/cccl/headers/include/thrust/detail/execution_policy.h +120 -0
- cuda/cccl/headers/include/thrust/detail/extrema.inl +184 -0
- cuda/cccl/headers/include/thrust/detail/fill.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/find.inl +113 -0
- cuda/cccl/headers/include/thrust/detail/for_each.inl +84 -0
- cuda/cccl/headers/include/thrust/detail/function.h +49 -0
- cuda/cccl/headers/include/thrust/detail/functional/actor.h +214 -0
- cuda/cccl/headers/include/thrust/detail/functional/operators.h +386 -0
- cuda/cccl/headers/include/thrust/detail/gather.inl +173 -0
- cuda/cccl/headers/include/thrust/detail/generate.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +62 -0
- cuda/cccl/headers/include/thrust/detail/inner_product.inl +118 -0
- cuda/cccl/headers/include/thrust/detail/internal_functional.h +328 -0
- cuda/cccl/headers/include/thrust/detail/logical.inl +113 -0
- cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +77 -0
- cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
- cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +209 -0
- cuda/cccl/headers/include/thrust/detail/merge.inl +276 -0
- cuda/cccl/headers/include/thrust/detail/mismatch.inl +94 -0
- cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +124 -0
- cuda/cccl/headers/include/thrust/detail/partition.inl +378 -0
- cuda/cccl/headers/include/thrust/detail/pointer.h +309 -0
- cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
- cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
- cuda/cccl/headers/include/thrust/detail/range/head_flags.h +116 -0
- cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +130 -0
- cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
- cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +192 -0
- cuda/cccl/headers/include/thrust/detail/reduce.inl +377 -0
- cuda/cccl/headers/include/thrust/detail/reference.h +494 -0
- cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
- cuda/cccl/headers/include/thrust/detail/remove.inl +213 -0
- cuda/cccl/headers/include/thrust/detail/replace.inl +231 -0
- cuda/cccl/headers/include/thrust/detail/reverse.inl +88 -0
- cuda/cccl/headers/include/thrust/detail/scan.inl +518 -0
- cuda/cccl/headers/include/thrust/detail/scatter.inl +157 -0
- cuda/cccl/headers/include/thrust/detail/seq.h +66 -0
- cuda/cccl/headers/include/thrust/detail/sequence.inl +109 -0
- cuda/cccl/headers/include/thrust/detail/set_operations.inl +981 -0
- cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/sort.inl +373 -0
- cuda/cccl/headers/include/thrust/detail/static_assert.h +58 -0
- cuda/cccl/headers/include/thrust/detail/static_map.h +167 -0
- cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +65 -0
- cuda/cccl/headers/include/thrust/detail/tabulate.inl +62 -0
- cuda/cccl/headers/include/thrust/detail/temporary_array.h +153 -0
- cuda/cccl/headers/include/thrust/detail/temporary_array.inl +120 -0
- cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +81 -0
- cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +69 -0
- cuda/cccl/headers/include/thrust/detail/transform_scan.inl +161 -0
- cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +130 -0
- cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +61 -0
- cuda/cccl/headers/include/thrust/detail/type_deduction.h +62 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +47 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +167 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +69 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +59 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +46 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +89 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
- cuda/cccl/headers/include/thrust/detail/type_traits.h +136 -0
- cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +90 -0
- cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/unique.inl +373 -0
- cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
- cuda/cccl/headers/include/thrust/detail/vector_base.h +613 -0
- cuda/cccl/headers/include/thrust/detail/vector_base.inl +1210 -0
- cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
- cuda/cccl/headers/include/thrust/device_delete.h +74 -0
- cuda/cccl/headers/include/thrust/device_free.h +85 -0
- cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
- cuda/cccl/headers/include/thrust/device_malloc.h +84 -0
- cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
- cuda/cccl/headers/include/thrust/device_new.h +112 -0
- cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
- cuda/cccl/headers/include/thrust/device_ptr.h +196 -0
- cuda/cccl/headers/include/thrust/device_reference.h +983 -0
- cuda/cccl/headers/include/thrust/device_vector.h +576 -0
- cuda/cccl/headers/include/thrust/distance.h +43 -0
- cuda/cccl/headers/include/thrust/equal.h +247 -0
- cuda/cccl/headers/include/thrust/execution_policy.h +251 -0
- cuda/cccl/headers/include/thrust/extrema.h +657 -0
- cuda/cccl/headers/include/thrust/fill.h +200 -0
- cuda/cccl/headers/include/thrust/find.h +382 -0
- cuda/cccl/headers/include/thrust/for_each.h +261 -0
- cuda/cccl/headers/include/thrust/functional.h +395 -0
- cuda/cccl/headers/include/thrust/gather.h +464 -0
- cuda/cccl/headers/include/thrust/generate.h +193 -0
- cuda/cccl/headers/include/thrust/host_vector.h +576 -0
- cuda/cccl/headers/include/thrust/inner_product.h +264 -0
- cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +221 -0
- cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +335 -0
- cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
- cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
- cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +38 -0
- cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +38 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +81 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +60 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +65 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +57 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +182 -0
- cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +58 -0
- cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
- cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
- cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +81 -0
- cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +174 -0
- cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +211 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +659 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +334 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_traversal_tags.h +64 -0
- cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +194 -0
- cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
- cuda/cccl/headers/include/thrust/iterator/retag.h +72 -0
- cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +51 -0
- cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +185 -0
- cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
- cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +152 -0
- cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +226 -0
- cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +351 -0
- cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
- cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +359 -0
- cuda/cccl/headers/include/thrust/logical.h +290 -0
- cuda/cccl/headers/include/thrust/memory.h +299 -0
- cuda/cccl/headers/include/thrust/merge.h +725 -0
- cuda/cccl/headers/include/thrust/mismatch.h +261 -0
- cuda/cccl/headers/include/thrust/mr/allocator.h +229 -0
- cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +41 -0
- cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +528 -0
- cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +118 -0
- cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +67 -0
- cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +67 -0
- cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +38 -0
- cuda/cccl/headers/include/thrust/mr/memory_resource.h +217 -0
- cuda/cccl/headers/include/thrust/mr/new.h +100 -0
- cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +63 -0
- cuda/cccl/headers/include/thrust/mr/pool.h +528 -0
- cuda/cccl/headers/include/thrust/mr/pool_options.h +174 -0
- cuda/cccl/headers/include/thrust/mr/sync_pool.h +114 -0
- cuda/cccl/headers/include/thrust/mr/tls_pool.h +64 -0
- cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
- cuda/cccl/headers/include/thrust/mr/validator.h +56 -0
- cuda/cccl/headers/include/thrust/pair.h +99 -0
- cuda/cccl/headers/include/thrust/partition.h +1391 -0
- cuda/cccl/headers/include/thrust/per_device_resource.h +98 -0
- cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +184 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +155 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +104 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +151 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +53 -0
- cuda/cccl/headers/include/thrust/random/detail/mod.h +101 -0
- cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +187 -0
- cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +160 -0
- cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +63 -0
- cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +201 -0
- cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +198 -0
- cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +200 -0
- cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +183 -0
- cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +187 -0
- cuda/cccl/headers/include/thrust/random/discard_block_engine.h +240 -0
- cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +289 -0
- cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +217 -0
- cuda/cccl/headers/include/thrust/random/normal_distribution.h +257 -0
- cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +247 -0
- cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +261 -0
- cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +258 -0
- cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +255 -0
- cuda/cccl/headers/include/thrust/random.h +120 -0
- cuda/cccl/headers/include/thrust/reduce.h +1113 -0
- cuda/cccl/headers/include/thrust/remove.h +768 -0
- cuda/cccl/headers/include/thrust/replace.h +826 -0
- cuda/cccl/headers/include/thrust/reverse.h +215 -0
- cuda/cccl/headers/include/thrust/scan.h +1671 -0
- cuda/cccl/headers/include/thrust/scatter.h +446 -0
- cuda/cccl/headers/include/thrust/sequence.h +277 -0
- cuda/cccl/headers/include/thrust/set_operations.h +3026 -0
- cuda/cccl/headers/include/thrust/shuffle.h +182 -0
- cuda/cccl/headers/include/thrust/sort.h +1320 -0
- cuda/cccl/headers/include/thrust/swap.h +147 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +109 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +60 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +63 -0
- cuda/cccl/headers/include/thrust/system/cpp/memory.h +106 -0
- cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +72 -0
- cuda/cccl/headers/include/thrust/system/cpp/pointer.h +120 -0
- cuda/cccl/headers/include/thrust/system/cpp/vector.h +96 -0
- cuda/cccl/headers/include/thrust/system/cuda/config.h +126 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +219 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +273 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +255 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +289 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +191 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +593 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +243 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +233 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +64 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +96 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +264 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +476 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +100 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +170 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +91 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +60 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +75 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +80 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +61 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +121 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +57 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +228 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +223 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +81 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +405 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +72 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +785 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +1001 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +107 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +122 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +87 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +341 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +414 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +91 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1734 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +469 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +98 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +61 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +429 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +143 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +119 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +117 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +105 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +289 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +310 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +253 -0
- cuda/cccl/headers/include/thrust/system/cuda/error.h +168 -0
- cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +15 -0
- cuda/cccl/headers/include/thrust/system/cuda/memory.h +122 -0
- cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +122 -0
- cuda/cccl/headers/include/thrust/system/cuda/pointer.h +160 -0
- cuda/cccl/headers/include/thrust/system/cuda/vector.h +108 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +52 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/count.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/find.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +61 -0
- cuda/cccl/headers/include/thrust/system/detail/errno.h +120 -0
- cuda/cccl/headers/include/thrust/system/detail/error_category.inl +302 -0
- cuda/cccl/headers/include/thrust/system/detail/error_code.inl +173 -0
- cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +121 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +53 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +79 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +161 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +384 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +45 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +64 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +58 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +146 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/count.h +48 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +84 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +49 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +60 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +66 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +252 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +54 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/find.h +49 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +137 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +58 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +73 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +96 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +45 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +63 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +60 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +72 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +59 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +64 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +86 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +99 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +148 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +49 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +68 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +129 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +207 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +43 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +71 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +100 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +83 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +186 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +86 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +121 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +95 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +175 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +48 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +67 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +63 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +126 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +72 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +85 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +126 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +232 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +73 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +85 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +104 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +282 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +476 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +54 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +125 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +113 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +175 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +44 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +76 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +41 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +54 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +47 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +54 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +82 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +395 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +50 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +56 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +80 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +113 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +45 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +166 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +45 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +115 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +71 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +113 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +81 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +126 -0
- cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +117 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +70 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +42 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +136 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +49 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +119 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +49 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +71 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +52 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +110 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +62 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +74 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +123 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +43 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +141 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +45 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +50 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +75 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +145 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +301 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +64 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +98 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +179 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +154 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +145 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +206 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +59 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +116 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +55 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +356 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +48 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +124 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +48 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +586 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +58 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +115 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +106 -0
- cuda/cccl/headers/include/thrust/system/detail/system_error.inl +108 -0
- cuda/cccl/headers/include/thrust/system/error_code.h +512 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +54 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +77 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +50 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +74 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +56 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +59 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/count.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +50 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +65 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +127 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +66 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/find.h +53 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +56 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +87 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +93 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +88 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +102 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +54 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +54 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +78 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +64 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +65 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +59 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +103 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +72 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +87 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +73 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scan.inl +172 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +36 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +60 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +265 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +60 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +71 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +67 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +75 -0
- cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +62 -0
- cuda/cccl/headers/include/thrust/system/omp/memory.h +111 -0
- cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +75 -0
- cuda/cccl/headers/include/thrust/system/omp/pointer.h +124 -0
- cuda/cccl/headers/include/thrust/system/omp/vector.h +99 -0
- cuda/cccl/headers/include/thrust/system/system_error.h +185 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +54 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +50 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +73 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +47 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +136 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +109 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +66 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +49 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +51 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +91 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +94 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +77 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +327 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +84 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +98 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +54 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +137 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +61 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +400 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +140 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +76 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +87 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +59 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +312 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +33 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +60 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +295 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +60 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +71 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +67 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +75 -0
- cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +62 -0
- cuda/cccl/headers/include/thrust/system/tbb/memory.h +111 -0
- cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +75 -0
- cuda/cccl/headers/include/thrust/system/tbb/pointer.h +124 -0
- cuda/cccl/headers/include/thrust/system/tbb/vector.h +99 -0
- cuda/cccl/headers/include/thrust/system_error.h +57 -0
- cuda/cccl/headers/include/thrust/tabulate.h +125 -0
- cuda/cccl/headers/include/thrust/transform.h +1045 -0
- cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
- cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
- cuda/cccl/headers/include/thrust/tuple.h +139 -0
- cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +261 -0
- cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +154 -0
- cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
- cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +184 -0
- cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +116 -0
- cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +336 -0
- cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
- cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +63 -0
- cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
- cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
- cuda/cccl/headers/include/thrust/unique.h +1088 -0
- cuda/cccl/headers/include/thrust/universal_allocator.h +93 -0
- cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
- cuda/cccl/headers/include/thrust/universal_vector.h +71 -0
- cuda/cccl/headers/include/thrust/version.h +93 -0
- cuda/cccl/headers/include/thrust/zip_function.h +176 -0
- cuda/cccl/headers/include_paths.py +51 -0
- cuda/cccl/parallel/__init__.py +9 -0
- cuda/cccl/parallel/experimental/__init__.py +24 -0
- cuda/cccl/py.typed +0 -0
- cuda/compute/__init__.py +79 -0
- cuda/compute/_bindings.py +79 -0
- cuda/compute/_bindings.pyi +475 -0
- cuda/compute/_bindings_impl.pyx +2273 -0
- cuda/compute/_caching.py +71 -0
- cuda/compute/_cccl_interop.py +422 -0
- cuda/compute/_utils/__init__.py +0 -0
- cuda/compute/_utils/protocols.py +132 -0
- cuda/compute/_utils/temp_storage_buffer.py +86 -0
- cuda/compute/algorithms/__init__.py +54 -0
- cuda/compute/algorithms/_histogram.py +243 -0
- cuda/compute/algorithms/_merge_sort.py +225 -0
- cuda/compute/algorithms/_radix_sort.py +312 -0
- cuda/compute/algorithms/_reduce.py +182 -0
- cuda/compute/algorithms/_scan.py +331 -0
- cuda/compute/algorithms/_segmented_reduce.py +257 -0
- cuda/compute/algorithms/_three_way_partition.py +261 -0
- cuda/compute/algorithms/_transform.py +329 -0
- cuda/compute/algorithms/_unique_by_key.py +252 -0
- cuda/compute/cccl/.gitkeep +0 -0
- cuda/compute/cu12/_bindings_impl.cp313-win_amd64.pyd +0 -0
- cuda/compute/cu12/cccl/cccl.c.parallel.dll +0 -0
- cuda/compute/cu12/cccl/cccl.c.parallel.lib +0 -0
- cuda/compute/cu13/_bindings_impl.cp313-win_amd64.pyd +0 -0
- cuda/compute/cu13/cccl/cccl.c.parallel.dll +0 -0
- cuda/compute/cu13/cccl/cccl.c.parallel.lib +0 -0
- cuda/compute/iterators/__init__.py +21 -0
- cuda/compute/iterators/_factories.py +219 -0
- cuda/compute/iterators/_iterators.py +817 -0
- cuda/compute/iterators/_zip_iterator.py +199 -0
- cuda/compute/numba_utils.py +53 -0
- cuda/compute/op.py +3 -0
- cuda/compute/struct.py +272 -0
- cuda/compute/typing.py +37 -0
- cuda/coop/__init__.py +8 -0
- cuda/coop/_caching.py +48 -0
- cuda/coop/_common.py +275 -0
- cuda/coop/_nvrtc.py +92 -0
- cuda/coop/_scan_op.py +181 -0
- cuda/coop/_types.py +937 -0
- cuda/coop/_typing.py +107 -0
- cuda/coop/block/__init__.py +39 -0
- cuda/coop/block/_block_exchange.py +251 -0
- cuda/coop/block/_block_load_store.py +215 -0
- cuda/coop/block/_block_merge_sort.py +125 -0
- cuda/coop/block/_block_radix_sort.py +214 -0
- cuda/coop/block/_block_reduce.py +294 -0
- cuda/coop/block/_block_scan.py +983 -0
- cuda/coop/warp/__init__.py +9 -0
- cuda/coop/warp/_warp_merge_sort.py +92 -0
- cuda/coop/warp/_warp_reduce.py +153 -0
- cuda/coop/warp/_warp_scan.py +78 -0
- cuda_cccl-0.3.3.dist-info/METADATA +41 -0
- cuda_cccl-0.3.3.dist-info/RECORD +1968 -0
- cuda_cccl-0.3.3.dist-info/WHEEL +5 -0
- cuda_cccl-0.3.3.dist-info/licenses/LICENSE +1 -0
|
@@ -0,0 +1,1430 @@
|
|
|
1
|
+
/******************************************************************************
|
|
2
|
+
* Copyright (c) 2011, Duane Merrill. All rights reserved.
|
|
3
|
+
* Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
|
|
4
|
+
*
|
|
5
|
+
* Redistribution and use in source and binary forms, with or without
|
|
6
|
+
* modification, are permitted provided that the following conditions are met:
|
|
7
|
+
* * Redistributions of source code must retain the above copyright
|
|
8
|
+
* notice, this list of conditions and the following disclaimer.
|
|
9
|
+
* * Redistributions in binary form must reproduce the above copyright
|
|
10
|
+
* notice, this list of conditions and the following disclaimer in the
|
|
11
|
+
* documentation and/or other materials provided with the distribution.
|
|
12
|
+
* * Neither the name of the NVIDIA CORPORATION nor the
|
|
13
|
+
* names of its contributors may be used to endorse or promote products
|
|
14
|
+
* derived from this software without specific prior written permission.
|
|
15
|
+
*
|
|
16
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
17
|
+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
18
|
+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
19
|
+
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
|
|
20
|
+
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
|
21
|
+
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
22
|
+
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
|
23
|
+
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
24
|
+
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
25
|
+
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
26
|
+
*
|
|
27
|
+
******************************************************************************/
|
|
28
|
+
|
|
29
|
+
//! @file
|
|
30
|
+
//! cub::DeviceSegmentedReduce provides device-wide, parallel operations for computing a batched reduction across
|
|
31
|
+
//! multiple sequences of data items residing within device-accessible memory.
|
|
32
|
+
|
|
33
|
+
#pragma once
|
|
34
|
+
|
|
35
|
+
#include <cub/config.cuh>
|
|
36
|
+
|
|
37
|
+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
|
|
38
|
+
# pragma GCC system_header
|
|
39
|
+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
|
|
40
|
+
# pragma clang system_header
|
|
41
|
+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
|
|
42
|
+
# pragma system_header
|
|
43
|
+
#endif // no system header
|
|
44
|
+
|
|
45
|
+
#include <cub/detail/choose_offset.cuh>
|
|
46
|
+
#include <cub/device/dispatch/dispatch_reduce.cuh>
|
|
47
|
+
#include <cub/device/dispatch/dispatch_reduce_by_key.cuh>
|
|
48
|
+
#include <cub/iterator/arg_index_input_iterator.cuh>
|
|
49
|
+
#include <cub/util_type.cuh>
|
|
50
|
+
|
|
51
|
+
#include <thrust/iterator/counting_iterator.h>
|
|
52
|
+
#include <thrust/iterator/transform_iterator.h>
|
|
53
|
+
|
|
54
|
+
#include <cuda/__functional/maximum.h>
|
|
55
|
+
#include <cuda/__functional/minimum.h>
|
|
56
|
+
#include <cuda/std/__functional/operations.h>
|
|
57
|
+
#include <cuda/std/__iterator/iterator_traits.h>
|
|
58
|
+
#include <cuda/std/__type_traits/integral_constant.h>
|
|
59
|
+
#include <cuda/std/__type_traits/is_integral.h>
|
|
60
|
+
#include <cuda/std/__type_traits/void_t.h>
|
|
61
|
+
#include <cuda/std/__utility/pair.h>
|
|
62
|
+
#include <cuda/std/cstdint>
|
|
63
|
+
#include <cuda/std/limits>
|
|
64
|
+
|
|
65
|
+
CUB_NAMESPACE_BEGIN
|
|
66
|
+
|
|
67
|
+
//! @rst
|
|
68
|
+
//! DeviceSegmentedReduce provides device-wide, parallel operations for
|
|
69
|
+
//! computing a reduction across multiple sequences of data items
|
|
70
|
+
//! residing within device-accessible memory.
|
|
71
|
+
//!
|
|
72
|
+
//! Overview
|
|
73
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
74
|
+
//!
|
|
75
|
+
//! A `reduction <http://en.wikipedia.org/wiki/Reduce_(higher-order_function)>`_
|
|
76
|
+
//! (or *fold*) uses a binary combining operator to compute a single aggregate
|
|
77
|
+
//! from a sequence of input elements.
|
|
78
|
+
//!
|
|
79
|
+
//! Usage Considerations
|
|
80
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
81
|
+
//!
|
|
82
|
+
//! @cdp_class{DeviceSegmentedReduce}
|
|
83
|
+
//!
|
|
84
|
+
//! @endrst
|
|
85
|
+
struct DeviceSegmentedReduce
|
|
86
|
+
{
|
|
87
|
+
//! @rst
|
|
88
|
+
//! Computes a device-wide segmented reduction using the specified
|
|
89
|
+
//! binary ``reduction_op`` functor.
|
|
90
|
+
//!
|
|
91
|
+
//! - Does not support binary reduction operators that are non-commutative.
|
|
92
|
+
//! - Provides "run-to-run" determinism for pseudo-associative reduction
|
|
93
|
+
//! (e.g., addition of floating point types) on the same GPU device.
|
|
94
|
+
//! However, results for pseudo-associative reduction may be inconsistent
|
|
95
|
+
//! from one device to another device of a different compute-capability
|
|
96
|
+
//! because CUB can employ different tile-sizing for different architectures.
|
|
97
|
+
//! - When the input is a contiguous sequence of segments, a single sequence
|
|
98
|
+
//! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
|
|
99
|
+
//! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
|
|
100
|
+
//! the latter is specified as ``segment_offsets + 1``).
|
|
101
|
+
//! - Let ``s`` be in ``[0, num_segments)``. The range
|
|
102
|
+
//! ``[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])`` shall not
|
|
103
|
+
//! overlap ``[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])``,
|
|
104
|
+
//! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
|
|
105
|
+
//! ``[d_end_offsets, d_end_offsets + num_segments)``.
|
|
106
|
+
//! - @devicestorage
|
|
107
|
+
//!
|
|
108
|
+
//! Snippet
|
|
109
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
110
|
+
//!
|
|
111
|
+
//! The code snippet below illustrates a custom min-reduction of a device vector of ``int`` data elements.
|
|
112
|
+
//!
|
|
113
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_segmented_reduce_api.cu
|
|
114
|
+
//! :language: c++
|
|
115
|
+
//! :dedent:
|
|
116
|
+
//! :start-after: example-begin segmented-reduce-reduce
|
|
117
|
+
//! :end-before: example-end segmented-reduce-reduce
|
|
118
|
+
//!
|
|
119
|
+
//! @endrst
|
|
120
|
+
//!
|
|
121
|
+
//! @tparam InputIteratorT
|
|
122
|
+
//! **[inferred]** Random-access input iterator type for reading input items @iterator
|
|
123
|
+
//!
|
|
124
|
+
//! @tparam OutputIteratorT
|
|
125
|
+
//! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
|
|
126
|
+
//!
|
|
127
|
+
//! @tparam BeginOffsetIteratorT
|
|
128
|
+
//! **[inferred]** Random-access input iterator type for reading segment beginning offsets @iterator
|
|
129
|
+
//!
|
|
130
|
+
//! @tparam EndOffsetIteratorT
|
|
131
|
+
//! **[inferred]** Random-access input iterator type for reading segment ending offsets @iterator
|
|
132
|
+
//!
|
|
133
|
+
//! @tparam ReductionOpT
|
|
134
|
+
//! **[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)`
|
|
135
|
+
//!
|
|
136
|
+
//! @tparam T
|
|
137
|
+
//! **[inferred]** Data element type that is convertible to the `value` type of `InputIteratorT`
|
|
138
|
+
//!
|
|
139
|
+
//! @param[in] d_temp_storage
|
|
140
|
+
//! Device-accessible allocation of temporary storage. When `nullptr`, the
|
|
141
|
+
//! required allocation size is written to `temp_storage_bytes` and no work is done.
|
|
142
|
+
//!
|
|
143
|
+
//! @param[in,out] temp_storage_bytes
|
|
144
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
145
|
+
//!
|
|
146
|
+
//! @param[in] d_in
|
|
147
|
+
//! Pointer to the input sequence of data items
|
|
148
|
+
//!
|
|
149
|
+
//! @param[out] d_out
|
|
150
|
+
//! Pointer to the output aggregate
|
|
151
|
+
//!
|
|
152
|
+
//! @param[in] num_segments
|
|
153
|
+
//! The number of segments that comprise the segmented reduction data
|
|
154
|
+
//!
|
|
155
|
+
//! @param[in] d_begin_offsets
|
|
156
|
+
//! @rst
|
|
157
|
+
//! Random-access input iterator to the sequence of beginning offsets of
|
|
158
|
+
//! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
|
|
159
|
+
//! element of the *i*\ :sup:`th` data segment in ``d_in``
|
|
160
|
+
//! @endrst
|
|
161
|
+
//!
|
|
162
|
+
//! @param[in] d_end_offsets
|
|
163
|
+
//! @rst
|
|
164
|
+
//! Random-access input iterator to the sequence of ending offsets of length
|
|
165
|
+
//! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
|
|
166
|
+
//! the *i*\ :sup:`th` data segment in ``d_in``.
|
|
167
|
+
//! If ``d_end_offsets[i] - 1 < d_begin_offsets[i]``, the *i*\ :sup:`th` segment is considered empty.
|
|
168
|
+
//! @endrst
|
|
169
|
+
//!
|
|
170
|
+
//! @param[in] reduction_op
|
|
171
|
+
//! Binary reduction functor
|
|
172
|
+
//!
|
|
173
|
+
//! @param[in] initial_value
|
|
174
|
+
//! Initial value of the reduction for each segment
|
|
175
|
+
//!
|
|
176
|
+
//! @param[in] stream
|
|
177
|
+
//! @rst
|
|
178
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
179
|
+
//! @endrst
|
|
180
|
+
template <typename InputIteratorT,
|
|
181
|
+
typename OutputIteratorT,
|
|
182
|
+
typename BeginOffsetIteratorT,
|
|
183
|
+
typename EndOffsetIteratorT,
|
|
184
|
+
typename ReductionOpT,
|
|
185
|
+
typename T>
|
|
186
|
+
CUB_RUNTIME_FUNCTION static cudaError_t Reduce(
|
|
187
|
+
void* d_temp_storage,
|
|
188
|
+
size_t& temp_storage_bytes,
|
|
189
|
+
InputIteratorT d_in,
|
|
190
|
+
OutputIteratorT d_out,
|
|
191
|
+
::cuda::std::int64_t num_segments,
|
|
192
|
+
BeginOffsetIteratorT d_begin_offsets,
|
|
193
|
+
EndOffsetIteratorT d_end_offsets,
|
|
194
|
+
ReductionOpT reduction_op,
|
|
195
|
+
T initial_value,
|
|
196
|
+
cudaStream_t stream = 0)
|
|
197
|
+
{
|
|
198
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::Reduce");
|
|
199
|
+
|
|
200
|
+
using OffsetT = detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
|
|
201
|
+
static_assert(::cuda::std::is_integral_v<OffsetT>, "Offset iterator value type should be integral.");
|
|
202
|
+
if constexpr (::cuda::std::is_integral_v<OffsetT>)
|
|
203
|
+
{
|
|
204
|
+
return DispatchSegmentedReduce<
|
|
205
|
+
InputIteratorT,
|
|
206
|
+
OutputIteratorT,
|
|
207
|
+
BeginOffsetIteratorT,
|
|
208
|
+
EndOffsetIteratorT,
|
|
209
|
+
OffsetT,
|
|
210
|
+
ReductionOpT,
|
|
211
|
+
T>::Dispatch(d_temp_storage,
|
|
212
|
+
temp_storage_bytes,
|
|
213
|
+
d_in,
|
|
214
|
+
d_out,
|
|
215
|
+
num_segments,
|
|
216
|
+
d_begin_offsets,
|
|
217
|
+
d_end_offsets,
|
|
218
|
+
reduction_op,
|
|
219
|
+
initial_value, // caller-supplied initial value for each segment
|
|
220
|
+
stream);
|
|
221
|
+
}
|
|
222
|
+
_CCCL_UNREACHABLE();
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
//! @rst
|
|
226
|
+
//! Computes a device-wide segmented reduction using the specified
|
|
227
|
+
//! binary ``reduction_op`` functor and a fixed segment size.
|
|
228
|
+
//!
|
|
229
|
+
//! - Does not support binary reduction operators that are non-commutative.
|
|
230
|
+
//! - Provides "run-to-run" determinism for pseudo-associative reduction
|
|
231
|
+
//! (e.g., addition of floating point types) on the same GPU device.
|
|
232
|
+
//! However, results for pseudo-associative reduction may be inconsistent
|
|
233
|
+
//! from one device to another device of a different compute-capability
|
|
234
|
+
//! because CUB can employ different tile-sizing for different architectures.
|
|
235
|
+
//! - @devicestorage
|
|
236
|
+
//!
|
|
237
|
+
//! Snippet
|
|
238
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
239
|
+
//!
|
|
240
|
+
//! The code snippet below illustrates a custom min-reduction of a device vector of ``int`` data elements.
|
|
241
|
+
//!
|
|
242
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_segmented_reduce_api.cu
|
|
243
|
+
//! :language: c++
|
|
244
|
+
//! :dedent:
|
|
245
|
+
//! :start-after: example-begin fixed-size-segmented-reduce-reduce
|
|
246
|
+
//! :end-before: example-end fixed-size-segmented-reduce-reduce
|
|
247
|
+
//!
|
|
248
|
+
//! @endrst
|
|
249
|
+
//!
|
|
250
|
+
//! @tparam InputIteratorT
|
|
251
|
+
//! **[inferred]** Random-access input iterator type for reading input items @iterator
|
|
252
|
+
//!
|
|
253
|
+
//! @tparam OutputIteratorT
|
|
254
|
+
//! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
|
|
255
|
+
//!
|
|
256
|
+
//! @tparam ReductionOpT
|
|
257
|
+
//! **[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)`
|
|
258
|
+
//!
|
|
259
|
+
//! @tparam T
|
|
260
|
+
//! **[inferred]** Data element type that is convertible to the `value` type of `InputIteratorT`
|
|
261
|
+
//!
|
|
262
|
+
//! @param[in] d_temp_storage
|
|
263
|
+
//! Device-accessible allocation of temporary storage. When `nullptr`, the
|
|
264
|
+
//! required allocation size is written to `temp_storage_bytes` and no work is done.
|
|
265
|
+
//!
|
|
266
|
+
//! @param[in,out] temp_storage_bytes
|
|
267
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
268
|
+
//!
|
|
269
|
+
//! @param[in] d_in
|
|
270
|
+
//! Pointer to the input sequence of data items
|
|
271
|
+
//!
|
|
272
|
+
//! @param[out] d_out
|
|
273
|
+
//! Pointer to the output aggregates
|
|
274
|
+
//!
|
|
275
|
+
//! @param[in] num_segments
|
|
276
|
+
//! The number of segments that comprise the segmented reduction data
|
|
277
|
+
//!
|
|
278
|
+
//! @param[in] segment_size
|
|
279
|
+
//! The fixed segment size of each segment
|
|
280
|
+
//!
|
|
281
|
+
//! @param[in] reduction_op
|
|
282
|
+
//! Binary reduction functor
|
|
283
|
+
//!
|
|
284
|
+
//! @param[in] initial_value
|
|
285
|
+
//! Initial value of the reduction for each segment
|
|
286
|
+
//!
|
|
287
|
+
//! @param[in] stream
|
|
288
|
+
//! @rst
|
|
289
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
290
|
+
//! @endrst
|
|
291
|
+
template <typename InputIteratorT, typename OutputIteratorT, typename ReductionOpT, typename T>
|
|
292
|
+
CUB_RUNTIME_FUNCTION static cudaError_t Reduce(
|
|
293
|
+
void* d_temp_storage,
|
|
294
|
+
size_t& temp_storage_bytes,
|
|
295
|
+
InputIteratorT d_in,
|
|
296
|
+
OutputIteratorT d_out,
|
|
297
|
+
::cuda::std::int64_t num_segments,
|
|
298
|
+
int segment_size,
|
|
299
|
+
ReductionOpT reduction_op,
|
|
300
|
+
T initial_value,
|
|
301
|
+
cudaStream_t stream = 0)
|
|
302
|
+
{
|
|
303
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::Reduce");
|
|
304
|
+
|
|
305
|
+
// `offset_t` a.k.a `SegmentSizeT` is fixed to `int` type now, but later can be changed to accept
|
|
306
|
+
// integral constant or larger integral types
|
|
307
|
+
using offset_t = int;
|
|
308
|
+
|
|
309
|
+
return detail::reduce::
|
|
310
|
+
DispatchFixedSizeSegmentedReduce<InputIteratorT, OutputIteratorT, offset_t, ReductionOpT, T>::Dispatch(
|
|
311
|
+
d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, segment_size, reduction_op, initial_value, stream);
|
|
312
|
+
}
|
|
313
|
+
|
|
314
|
+
//! @rst
|
|
315
|
+
//! Computes a device-wide segmented sum using the addition (``+``) operator.
|
|
316
|
+
//!
|
|
317
|
+
//! - Uses ``0`` as the initial value of the reduction for each segment.
|
|
318
|
+
//! - When the input is a contiguous sequence of segments, a single sequence
|
|
319
|
+
//! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
|
|
320
|
+
//! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
|
|
321
|
+
//! the latter is specified as ``segment_offsets + 1``).
|
|
322
|
+
//! - Does not support ``+`` operators that are non-commutative.
|
|
323
|
+
//! - Let ``s`` be in ``[0, num_segments)``. The range
|
|
324
|
+
//! ``[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])`` shall not
|
|
325
|
+
//! overlap ``[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])``,
|
|
326
|
+
//! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
|
|
327
|
+
//! ``[d_end_offsets, d_end_offsets + num_segments)``.
|
|
328
|
+
//! - @devicestorage
|
|
329
|
+
//!
|
|
330
|
+
//! Snippet
|
|
331
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
332
|
+
//!
|
|
333
|
+
//! The code snippet below illustrates the sum reduction of a device vector of ``int`` data elements.
|
|
334
|
+
//!
|
|
335
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_segmented_reduce_api.cu
|
|
336
|
+
//! :language: c++
|
|
337
|
+
//! :dedent:
|
|
338
|
+
//! :start-after: example-begin segmented-reduce-sum
|
|
339
|
+
//! :end-before: example-end segmented-reduce-sum
|
|
340
|
+
//!
|
|
341
|
+
//! @endrst
|
|
342
|
+
//!
|
|
343
|
+
//! @tparam InputIteratorT
|
|
344
|
+
//! **[inferred]** Random-access input iterator type for reading input items @iterator
|
|
345
|
+
//!
|
|
346
|
+
//! @tparam OutputIteratorT
|
|
347
|
+
//! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
|
|
348
|
+
//!
|
|
349
|
+
//! @tparam BeginOffsetIteratorT
|
|
350
|
+
//! **[inferred]** Random-access input iterator type for reading segment beginning offsets @iterator
|
|
351
|
+
//!
|
|
352
|
+
//! @tparam EndOffsetIteratorT
|
|
353
|
+
//! **[inferred]** Random-access input iterator type for reading segment ending offsets @iterator
|
|
354
|
+
//!
|
|
355
|
+
//! @param[in] d_temp_storage
|
|
356
|
+
//! Device-accessible allocation of temporary storage. When `nullptr`, the
|
|
357
|
+
//! required allocation size is written to `temp_storage_bytes` and no work is done.
|
|
358
|
+
//!
|
|
359
|
+
//! @param[in,out] temp_storage_bytes
|
|
360
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
361
|
+
//!
|
|
362
|
+
//! @param[in] d_in
|
|
363
|
+
//! Pointer to the input sequence of data items
|
|
364
|
+
//!
|
|
365
|
+
//! @param[out] d_out
|
|
366
|
+
//! Pointer to the output aggregate
|
|
367
|
+
//!
|
|
368
|
+
//! @param[in] num_segments
|
|
369
|
+
//! The number of segments that comprise the segmented reduction data
|
|
370
|
+
//!
|
|
371
|
+
//! @param[in] d_begin_offsets
|
|
372
|
+
//! @rst
|
|
373
|
+
//! Random-access input iterator to the sequence of beginning offsets of
|
|
374
|
+
//! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
|
|
375
|
+
//! element of the *i*\ :sup:`th` data segment in ``d_in``
|
|
376
|
+
//! @endrst
|
|
377
|
+
//!
|
|
378
|
+
//! @param[in] d_end_offsets
|
|
379
|
+
//! @rst
|
|
380
|
+
//! Random-access input iterator to the sequence of ending offsets of length
|
|
381
|
+
//! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
|
|
382
|
+
//! the *i*\ :sup:`th` data segment in ``d_in``.
|
|
383
|
+
//! If ``d_end_offsets[i] - 1 < d_begin_offsets[i]``, the *i*\ :sup:`th` segment is considered empty.
|
|
384
|
+
//! @endrst
|
|
385
|
+
//!
|
|
386
|
+
//! @param[in] stream
|
|
387
|
+
//! @rst
|
|
388
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
389
|
+
//! @endrst
|
|
390
|
+
template <typename InputIteratorT,
|
|
391
|
+
typename OutputIteratorT,
|
|
392
|
+
typename BeginOffsetIteratorT,
|
|
393
|
+
typename EndOffsetIteratorT,
|
|
394
|
+
typename = ::cuda::std::void_t<typename ::cuda::std::iterator_traits<BeginOffsetIteratorT>::value_type,
|
|
395
|
+
typename ::cuda::std::iterator_traits<EndOffsetIteratorT>::value_type>>
|
|
396
|
+
CUB_RUNTIME_FUNCTION static cudaError_t
|
|
397
|
+
Sum(void* d_temp_storage,
|
|
398
|
+
size_t& temp_storage_bytes,
|
|
399
|
+
InputIteratorT d_in,
|
|
400
|
+
OutputIteratorT d_out,
|
|
401
|
+
::cuda::std::int64_t num_segments,
|
|
402
|
+
BeginOffsetIteratorT d_begin_offsets,
|
|
403
|
+
EndOffsetIteratorT d_end_offsets,
|
|
404
|
+
cudaStream_t stream = 0)
|
|
405
|
+
{
|
|
406
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::Sum");
|
|
407
|
+
|
|
408
|
+
using OffsetT = detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
|
|
409
|
+
using OutputT = detail::non_void_value_t<OutputIteratorT, detail::it_value_t<InputIteratorT>>;
|
|
410
|
+
using init_t = OutputT;
|
|
411
|
+
static_assert(::cuda::std::is_integral_v<OffsetT>, "Offset iterator value type should be integral.");
|
|
412
|
+
if constexpr (::cuda::std::is_integral_v<OffsetT>)
|
|
413
|
+
{
|
|
414
|
+
return DispatchSegmentedReduce<
|
|
415
|
+
InputIteratorT,
|
|
416
|
+
OutputIteratorT,
|
|
417
|
+
BeginOffsetIteratorT,
|
|
418
|
+
EndOffsetIteratorT,
|
|
419
|
+
OffsetT,
|
|
420
|
+
::cuda::std::plus<>,
|
|
421
|
+
init_t>::Dispatch(d_temp_storage,
|
|
422
|
+
temp_storage_bytes,
|
|
423
|
+
d_in,
|
|
424
|
+
d_out,
|
|
425
|
+
num_segments,
|
|
426
|
+
d_begin_offsets,
|
|
427
|
+
d_end_offsets,
|
|
428
|
+
::cuda::std::plus<>{},
|
|
429
|
+
init_t{}, // zero-initialize
|
|
430
|
+
stream);
|
|
431
|
+
}
|
|
432
|
+
_CCCL_UNREACHABLE();
|
|
433
|
+
}
|
|
434
|
+
|
|
435
|
+
//! @rst
|
|
436
|
+
//! Computes a device-wide segmented sum using the addition (``+``) operator.
|
|
437
|
+
//!
|
|
438
|
+
//! - Uses ``0`` as the initial value of the reduction for each segment.
|
|
439
|
+
//! - @devicestorage
|
|
440
|
+
//!
|
|
441
|
+
//! Snippet
|
|
442
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
443
|
+
//!
|
|
444
|
+
//! The code snippet below illustrates the sum reduction of a device vector of ``int`` data elements.
|
|
445
|
+
//!
|
|
446
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_segmented_reduce_api.cu
|
|
447
|
+
//! :language: c++
|
|
448
|
+
//! :dedent:
|
|
449
|
+
//! :start-after: example-begin fixed-size-segmented-reduce-sum
|
|
450
|
+
//! :end-before: example-end fixed-size-segmented-reduce-sum
|
|
451
|
+
//!
|
|
452
|
+
//! @endrst
|
|
453
|
+
//!
|
|
454
|
+
//! @tparam InputIteratorT
|
|
455
|
+
//! **[inferred]** Random-access input iterator type for reading input items @iterator
|
|
456
|
+
//!
|
|
457
|
+
//! @tparam OutputIteratorT
|
|
458
|
+
//! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
|
|
459
|
+
//!
|
|
460
|
+
//! @param[in] d_temp_storage
|
|
461
|
+
//! Device-accessible allocation of temporary storage. When `nullptr`, the
|
|
462
|
+
//! required allocation size is written to `temp_storage_bytes` and no work is done.
|
|
463
|
+
//!
|
|
464
|
+
//! @param[in,out] temp_storage_bytes
|
|
465
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
466
|
+
//!
|
|
467
|
+
//! @param[in] d_in
|
|
468
|
+
//! Pointer to the input sequence of data items
|
|
469
|
+
//!
|
|
470
|
+
//! @param[out] d_out
|
|
471
|
+
//! Pointer to the output aggregate
|
|
472
|
+
//!
|
|
473
|
+
//! @param[in] num_segments
|
|
474
|
+
//! The number of segments that comprise the segmented reduction data
|
|
475
|
+
//!
|
|
476
|
+
//! @param[in] segment_size
|
|
477
|
+
//! The fixed segment size of each segment
|
|
478
|
+
//!
|
|
479
|
+
//! @param[in] stream
|
|
480
|
+
//! @rst
|
|
481
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
482
|
+
//! @endrst
|
|
483
|
+
template <typename InputIteratorT, typename OutputIteratorT>
|
|
484
|
+
CUB_RUNTIME_FUNCTION static cudaError_t
|
|
485
|
+
Sum(void* d_temp_storage,
|
|
486
|
+
size_t& temp_storage_bytes,
|
|
487
|
+
InputIteratorT d_in,
|
|
488
|
+
OutputIteratorT d_out,
|
|
489
|
+
::cuda::std::int64_t num_segments,
|
|
490
|
+
int segment_size,
|
|
491
|
+
cudaStream_t stream = 0)
|
|
492
|
+
{
|
|
493
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::Sum");
|
|
494
|
+
|
|
495
|
+
// `offset_t` a.k.a `SegmentSizeT` is fixed to `int` type now, but later can be changed to accept
|
|
496
|
+
// integral constant or larger integral types
|
|
497
|
+
using offset_t = int;
|
|
498
|
+
using output_t = detail::non_void_value_t<OutputIteratorT, detail::it_value_t<InputIteratorT>>;
|
|
499
|
+
|
|
500
|
+
return detail::reduce::
|
|
501
|
+
DispatchFixedSizeSegmentedReduce<InputIteratorT, OutputIteratorT, offset_t, ::cuda::std::plus<>, output_t>::Dispatch(
|
|
502
|
+
d_temp_storage,
|
|
503
|
+
temp_storage_bytes,
|
|
504
|
+
d_in,
|
|
505
|
+
d_out,
|
|
506
|
+
num_segments,
|
|
507
|
+
segment_size,
|
|
508
|
+
::cuda::std::plus{},
|
|
509
|
+
output_t{},
|
|
510
|
+
stream);
|
|
511
|
+
}
|
|
512
|
+
|
|
513
|
+
//! @rst
|
|
514
|
+
//! Computes a device-wide segmented minimum using the less-than (``<``) operator.
|
|
515
|
+
//!
|
|
516
|
+
//! - Uses ``::cuda::std::numeric_limits<T>::max()`` as the initial value of the reduction for each segment.
|
|
517
|
+
//! - When the input is a contiguous sequence of segments, a single sequence
|
|
518
|
+
//! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased for both
|
|
519
|
+
//! the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where the latter is
|
|
520
|
+
//! specified as ``segment_offsets + 1``).
|
|
521
|
+
//! - Does not support ``<`` operators that are non-commutative.
|
|
522
|
+
//! - Let ``s`` be in ``[0, num_segments)``. The range
|
|
523
|
+
//! ``[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])`` shall not
|
|
524
|
+
//! overlap ``[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])``,
|
|
525
|
+
//! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
|
|
526
|
+
//! ``[d_end_offsets, d_end_offsets + num_segments)``.
|
|
527
|
+
//! - @devicestorage
|
|
528
|
+
//!
|
|
529
|
+
//! Snippet
|
|
530
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
531
|
+
//!
|
|
532
|
+
//! The code snippet below illustrates the min-reduction of a device vector of ``int`` data elements.
|
|
533
|
+
//!
|
|
534
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_segmented_reduce_api.cu
|
|
535
|
+
//! :language: c++
|
|
536
|
+
//! :dedent:
|
|
537
|
+
//! :start-after: example-begin segmented-reduce-custommin
|
|
538
|
+
//! :end-before: example-end segmented-reduce-custommin
|
|
539
|
+
//!
|
|
540
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_segmented_reduce_api.cu
|
|
541
|
+
//! :language: c++
|
|
542
|
+
//! :dedent:
|
|
543
|
+
//! :start-after: example-begin segmented-reduce-min
|
|
544
|
+
//! :end-before: example-end segmented-reduce-min
|
|
545
|
+
//!
|
|
546
|
+
//! @endrst
|
|
547
|
+
//!
|
|
548
|
+
//! @tparam InputIteratorT
|
|
549
|
+
//! **[inferred]** Random-access input iterator type for reading input items @iterator
|
|
550
|
+
//!
|
|
551
|
+
//! @tparam OutputIteratorT
|
|
552
|
+
//! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
|
|
553
|
+
//!
|
|
554
|
+
//! @tparam BeginOffsetIteratorT
|
|
555
|
+
//! **[inferred]** Random-access input iterator type for reading segment beginning offsets @iterator
|
|
556
|
+
//!
|
|
557
|
+
//! @tparam EndOffsetIteratorT
|
|
558
|
+
//! **[inferred]** Random-access input iterator type for reading segment ending offsets @iterator
|
|
559
|
+
//!
|
|
560
|
+
//! @param[in] d_temp_storage
|
|
561
|
+
//! Device-accessible allocation of temporary storage. When `nullptr`, the
|
|
562
|
+
//! required allocation size is written to `temp_storage_bytes` and no work is done.
|
|
563
|
+
//!
|
|
564
|
+
//! @param[in,out] temp_storage_bytes
|
|
565
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
566
|
+
//!
|
|
567
|
+
//! @param[in] d_in
|
|
568
|
+
//! Pointer to the input sequence of data items
|
|
569
|
+
//!
|
|
570
|
+
//! @param[out] d_out
|
|
571
|
+
//! Pointer to the output aggregate
|
|
572
|
+
//!
|
|
573
|
+
//! @param[in] num_segments
|
|
574
|
+
//! The number of segments that comprise the segmented reduction data
|
|
575
|
+
//!
|
|
576
|
+
//! @param[in] d_begin_offsets
|
|
577
|
+
//! @rst
|
|
578
|
+
//! Random-access input iterator to the sequence of beginning offsets of
|
|
579
|
+
//! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
|
|
580
|
+
//! element of the *i*\ :sup:`th` data segment in ``d_in``
|
|
581
|
+
//! @endrst
|
|
582
|
+
//!
|
|
583
|
+
//! @param[in] d_end_offsets
|
|
584
|
+
//! @rst
|
|
585
|
+
//! Random-access input iterator to the sequence of ending offsets of length
|
|
586
|
+
//! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
|
|
587
|
+
//! the *i*\ :sup:`th` data segment in ``d_in``.
|
|
588
|
+
//! If ``d_end_offsets[i] <= d_begin_offsets[i]``, the *i*\ :sup:`th` segment is considered empty.
|
|
589
|
+
//! @endrst
|
|
590
|
+
//!
|
|
591
|
+
//! @param[in] stream
|
|
592
|
+
//! @rst
|
|
593
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
594
|
+
//! @endrst
|
|
595
|
+
template <typename InputIteratorT,
|
|
596
|
+
typename OutputIteratorT,
|
|
597
|
+
typename BeginOffsetIteratorT,
|
|
598
|
+
typename EndOffsetIteratorT,
|
|
599
|
+
typename = ::cuda::std::void_t<typename ::cuda::std::iterator_traits<BeginOffsetIteratorT>::value_type,
|
|
600
|
+
typename ::cuda::std::iterator_traits<EndOffsetIteratorT>::value_type>>
|
|
601
|
+
CUB_RUNTIME_FUNCTION static cudaError_t
|
|
602
|
+
Min(void* d_temp_storage,
|
|
603
|
+
size_t& temp_storage_bytes,
|
|
604
|
+
InputIteratorT d_in,
|
|
605
|
+
OutputIteratorT d_out,
|
|
606
|
+
::cuda::std::int64_t num_segments,
|
|
607
|
+
BeginOffsetIteratorT d_begin_offsets,
|
|
608
|
+
EndOffsetIteratorT d_end_offsets,
|
|
609
|
+
cudaStream_t stream = 0)
|
|
610
|
+
{
|
|
611
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::Min");
|
|
612
|
+
|
|
613
|
+
using OffsetT = detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
|
|
614
|
+
using InputT = detail::it_value_t<InputIteratorT>;
|
|
615
|
+
using init_t = InputT;
|
|
616
|
+
static_assert(::cuda::std::is_integral_v<OffsetT>, "Offset iterator value type should be integral.");
|
|
617
|
+
if constexpr (::cuda::std::is_integral_v<OffsetT>)
|
|
618
|
+
{
|
|
619
|
+
return DispatchSegmentedReduce<
|
|
620
|
+
InputIteratorT,
|
|
621
|
+
OutputIteratorT,
|
|
622
|
+
BeginOffsetIteratorT,
|
|
623
|
+
EndOffsetIteratorT,
|
|
624
|
+
OffsetT,
|
|
625
|
+
::cuda::minimum<>,
|
|
626
|
+
init_t>::Dispatch(d_temp_storage,
|
|
627
|
+
temp_storage_bytes,
|
|
628
|
+
d_in,
|
|
629
|
+
d_out,
|
|
630
|
+
num_segments,
|
|
631
|
+
d_begin_offsets,
|
|
632
|
+
d_end_offsets,
|
|
633
|
+
::cuda::minimum<>{},
|
|
634
|
+
::cuda::std::numeric_limits<init_t>::max(),
|
|
635
|
+
stream);
|
|
636
|
+
}
|
|
637
|
+
_CCCL_UNREACHABLE();
|
|
638
|
+
}
|
|
639
|
+
|
|
640
|
+
//! @rst
|
|
641
|
+
//! Computes a device-wide segmented minimum using the less-than (``<``) operator.
|
|
642
|
+
//!
|
|
643
|
+
//! - Uses ``::cuda::std::numeric_limits<T>::max()`` as the initial value of the reduction for each segment.
|
|
644
|
+
//!
|
|
645
|
+
//! Snippet
|
|
646
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
647
|
+
//!
|
|
648
|
+
//! The code snippet below illustrates the min-reduction of a device vector of ``int`` data elements.
|
|
649
|
+
//!
|
|
650
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_segmented_reduce_api.cu
|
|
651
|
+
//! :language: c++
|
|
652
|
+
//! :dedent:
|
|
653
|
+
//! :start-after: example-begin segmented-reduce-custommin
|
|
654
|
+
//! :end-before: example-end segmented-reduce-custommin
|
|
655
|
+
//!
|
|
656
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_segmented_reduce_api.cu
|
|
657
|
+
//! :language: c++
|
|
658
|
+
//! :dedent:
|
|
659
|
+
//! :start-after: example-begin fixed-size-segmented-reduce-min
|
|
660
|
+
//! :end-before: example-end fixed-size-segmented-reduce-min
|
|
661
|
+
//!
|
|
662
|
+
//! @endrst
|
|
663
|
+
//!
|
|
664
|
+
//! @tparam InputIteratorT
|
|
665
|
+
//! **[inferred]** Random-access input iterator type for reading input items @iterator
|
|
666
|
+
//!
|
|
667
|
+
//! @tparam OutputIteratorT
|
|
668
|
+
//! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
|
|
669
|
+
//!
|
|
670
|
+
//! @param[in] d_temp_storage
|
|
671
|
+
//! Device-accessible allocation of temporary storage. When `nullptr`, the
|
|
672
|
+
//! required allocation size is written to `temp_storage_bytes` and no work is done.
|
|
673
|
+
//!
|
|
674
|
+
//! @param[in,out] temp_storage_bytes
|
|
675
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
676
|
+
//!
|
|
677
|
+
//! @param[in] d_in
|
|
678
|
+
//! Pointer to the input sequence of data items
|
|
679
|
+
//!
|
|
680
|
+
//! @param[out] d_out
|
|
681
|
+
//! Pointer to the output aggregate
|
|
682
|
+
//!
|
|
683
|
+
//! @param[in] num_segments
|
|
684
|
+
//! The number of segments that comprise the segmented reduction data
|
|
685
|
+
//!
|
|
686
|
+
//! @param[in] segment_size
|
|
687
|
+
//! The fixed segment size of each segment
|
|
688
|
+
//!
|
|
689
|
+
//! @param[in] stream
|
|
690
|
+
//! @rst
|
|
691
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
692
|
+
//! @endrst
|
|
693
|
+
template <typename InputIteratorT, typename OutputIteratorT>
|
|
694
|
+
CUB_RUNTIME_FUNCTION static cudaError_t
|
|
695
|
+
Min(void* d_temp_storage,
|
|
696
|
+
size_t& temp_storage_bytes,
|
|
697
|
+
InputIteratorT d_in,
|
|
698
|
+
OutputIteratorT d_out,
|
|
699
|
+
::cuda::std::int64_t num_segments,
|
|
700
|
+
int segment_size,
|
|
701
|
+
cudaStream_t stream = 0)
|
|
702
|
+
{
|
|
703
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::Min");
|
|
704
|
+
|
|
705
|
+
// `offset_t` a.k.a `SegmentSizeT` is fixed to `int` type now, but later can be changed to accept
|
|
706
|
+
// integral constant or larger integral types
|
|
707
|
+
using offset_t = int;
|
|
708
|
+
using input_t = detail::it_value_t<InputIteratorT>;
|
|
709
|
+
|
|
710
|
+
return detail::reduce::
|
|
711
|
+
DispatchFixedSizeSegmentedReduce<InputIteratorT, OutputIteratorT, offset_t, ::cuda::minimum<>, input_t>::Dispatch(
|
|
712
|
+
d_temp_storage,
|
|
713
|
+
temp_storage_bytes,
|
|
714
|
+
d_in,
|
|
715
|
+
d_out,
|
|
716
|
+
num_segments,
|
|
717
|
+
segment_size,
|
|
718
|
+
::cuda::minimum<>{},
|
|
719
|
+
::cuda::std::numeric_limits<input_t>::max(),
|
|
720
|
+
stream);
|
|
721
|
+
}
|
|
722
|
+
|
|
723
|
+
//! @rst
|
|
724
|
+
//! Finds the first device-wide minimum in each segment using the
|
|
725
|
+
//! less-than (``<``) operator, also returning the in-segment index of that item.
|
|
726
|
+
//!
|
|
727
|
+
//! - The output value type of ``d_out`` is ``cub::KeyValuePair<int, T>``
|
|
728
|
+
//! (assuming the value type of ``d_in`` is ``T``)
|
|
729
|
+
//!
|
|
730
|
+
//! - The minimum of the *i*\ :sup:`th` segment is written to
|
|
731
|
+
//! ``d_out[i].value`` and its offset in that segment is written to ``d_out[i].key``.
|
|
732
|
+
//! - The ``{1, ::cuda::std::numeric_limits<T>::max()}`` tuple is produced for zero-length inputs
|
|
733
|
+
//!
|
|
734
|
+
//! - When the input is a contiguous sequence of segments, a single sequence
|
|
735
|
+
//! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased for both
|
|
736
|
+
//! the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where the latter
|
|
737
|
+
//! is specified as ``segment_offsets + 1``).
|
|
738
|
+
//! - Does not support ``<`` operators that are non-commutative.
|
|
739
|
+
//! - Let ``s`` be in ``[0, num_segments)``. The range
|
|
740
|
+
//! ``[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])`` shall not
|
|
741
|
+
//! overlap ``[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])``,
|
|
742
|
+
//! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
|
|
743
|
+
//! ``[d_end_offsets, d_end_offsets + num_segments)``.
|
|
744
|
+
//! - @devicestorage
|
|
745
|
+
//!
|
|
746
|
+
//! Snippet
|
|
747
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
748
|
+
//!
|
|
749
|
+
//! The code snippet below illustrates the argmin-reduction of a device vector of ``int`` data elements.
|
|
750
|
+
//!
|
|
751
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_segmented_reduce_api.cu
|
|
752
|
+
//! :language: c++
|
|
753
|
+
//! :dedent:
|
|
754
|
+
//! :start-after: example-begin segmented-reduce-argmin
|
|
755
|
+
//! :end-before: example-end segmented-reduce-argmin
|
|
756
|
+
//!
|
|
757
|
+
//! @endrst
|
|
758
|
+
//!
|
|
759
|
+
//! @tparam InputIteratorT
|
|
760
|
+
//! **[inferred]** Random-access input iterator type for reading input items (of some type `T`) @iterator
|
|
761
|
+
//!
|
|
762
|
+
//! @tparam OutputIteratorT
|
|
763
|
+
//! **[inferred]** Output iterator type for recording the reduced aggregate
|
|
764
|
+
//! (having value type `KeyValuePair<int, T>`) @iterator
|
|
765
|
+
//!
|
|
766
|
+
//! @tparam BeginOffsetIteratorT
|
|
767
|
+
//! **[inferred]** Random-access input iterator type for reading segment
|
|
768
|
+
//! beginning offsets @iterator
|
|
769
|
+
//!
|
|
770
|
+
//! @tparam EndOffsetIteratorT
|
|
771
|
+
//! **[inferred]** Random-access input iterator type for reading segment
|
|
772
|
+
//! ending offsets @iterator
|
|
773
|
+
//!
|
|
774
|
+
//! @param[in] d_temp_storage
|
|
775
|
+
//! Device-accessible allocation of temporary storage. When `nullptr`, the
|
|
776
|
+
//! required allocation size is written to `temp_storage_bytes` and no work is done.
|
|
777
|
+
//!
|
|
778
|
+
//! @param[in,out] temp_storage_bytes
|
|
779
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
780
|
+
//!
|
|
781
|
+
//! @param[in] d_in
|
|
782
|
+
//! Pointer to the input sequence of data items
|
|
783
|
+
//!
|
|
784
|
+
//! @param[out] d_out
|
|
785
|
+
//! Pointer to the output aggregate
|
|
786
|
+
//!
|
|
787
|
+
//! @param[in] num_segments
|
|
788
|
+
//! The number of segments that comprise the segmented reduction data
|
|
789
|
+
//!
|
|
790
|
+
//! @param[in] d_begin_offsets
|
|
791
|
+
//! @rst
|
|
792
|
+
//! Random-access input iterator to the sequence of beginning offsets of
|
|
793
|
+
//! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
|
|
794
|
+
//! element of the *i*\ :sup:`th` data segment in ``d_in``
|
|
795
|
+
//! @endrst
|
|
796
|
+
//!
|
|
797
|
+
//! @param[in] d_end_offsets
|
|
798
|
+
//! @rst
|
|
799
|
+
//! Random-access input iterator to the sequence of ending offsets of length
|
|
800
|
+
//! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
|
|
801
|
+
//! the *i*\ :sup:`th` data segment in ``d_in``.
|
|
802
|
+
//! If ``d_end_offsets[i] <= d_begin_offsets[i]``, the *i*\ :sup:`th` segment is considered empty.
|
|
803
|
+
//! @endrst
|
|
804
|
+
//!
|
|
805
|
+
//! @param[in] stream
|
|
806
|
+
//! @rst
|
|
807
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
808
|
+
//! @endrst
|
|
809
|
+
template <typename InputIteratorT,
|
|
810
|
+
typename OutputIteratorT,
|
|
811
|
+
typename BeginOffsetIteratorT,
|
|
812
|
+
typename EndOffsetIteratorT,
|
|
813
|
+
typename = ::cuda::std::void_t<typename ::cuda::std::iterator_traits<BeginOffsetIteratorT>::value_type,
|
|
814
|
+
typename ::cuda::std::iterator_traits<EndOffsetIteratorT>::value_type>>
|
|
815
|
+
CUB_RUNTIME_FUNCTION static cudaError_t ArgMin(
|
|
816
|
+
void* d_temp_storage,
|
|
817
|
+
size_t& temp_storage_bytes,
|
|
818
|
+
InputIteratorT d_in,
|
|
819
|
+
OutputIteratorT d_out,
|
|
820
|
+
::cuda::std::int64_t num_segments,
|
|
821
|
+
BeginOffsetIteratorT d_begin_offsets,
|
|
822
|
+
EndOffsetIteratorT d_end_offsets,
|
|
823
|
+
cudaStream_t stream = 0)
|
|
824
|
+
{
|
|
825
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::ArgMin");
|
|
826
|
+
|
|
827
|
+
// Using common iterator value type is a breaking change, see:
|
|
828
|
+
// https://github.com/NVIDIA/cccl/pull/414#discussion_r1330632615
|
|
829
|
+
using OffsetT = int; // detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
|
|
830
|
+
|
|
831
|
+
using InputValueT = detail::it_value_t<InputIteratorT>;
|
|
832
|
+
using OutputTupleT = detail::non_void_value_t<OutputIteratorT, KeyValuePair<OffsetT, InputValueT>>;
|
|
833
|
+
using OutputValueT = typename OutputTupleT::Value;
|
|
834
|
+
using AccumT = OutputTupleT;
|
|
835
|
+
using InitT = detail::reduce::empty_problem_init_t<AccumT>;
|
|
836
|
+
|
|
837
|
+
// Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
|
|
838
|
+
using ArgIndexInputIteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
|
|
839
|
+
ArgIndexInputIteratorT d_indexed_in(d_in);
|
|
840
|
+
|
|
841
|
+
InitT initial_value{AccumT(1, ::cuda::std::numeric_limits<InputValueT>::max())};
|
|
842
|
+
|
|
843
|
+
static_assert(::cuda::std::is_integral_v<OffsetT>, "Offset iterator value type should be integral.");
|
|
844
|
+
if constexpr (::cuda::std::is_integral_v<OffsetT>)
|
|
845
|
+
{
|
|
846
|
+
return DispatchSegmentedReduce<
|
|
847
|
+
ArgIndexInputIteratorT,
|
|
848
|
+
OutputIteratorT,
|
|
849
|
+
BeginOffsetIteratorT,
|
|
850
|
+
EndOffsetIteratorT,
|
|
851
|
+
OffsetT,
|
|
852
|
+
cub::ArgMin,
|
|
853
|
+
InitT,
|
|
854
|
+
AccumT>::Dispatch(d_temp_storage,
|
|
855
|
+
temp_storage_bytes,
|
|
856
|
+
d_indexed_in,
|
|
857
|
+
d_out,
|
|
858
|
+
num_segments,
|
|
859
|
+
d_begin_offsets,
|
|
860
|
+
d_end_offsets,
|
|
861
|
+
cub::ArgMin{},
|
|
862
|
+
initial_value,
|
|
863
|
+
stream);
|
|
864
|
+
}
|
|
865
|
+
_CCCL_UNREACHABLE();
|
|
866
|
+
}
|
|
867
|
+
|
|
868
|
+
//! @rst
|
|
869
|
+
//! Finds the first device-wide minimum in each segment using the
|
|
870
|
+
//! less-than (``<``) operator, also returning the in-segment index of that item.
|
|
871
|
+
//!
|
|
872
|
+
//! - The output value type of ``d_out`` is ``::cuda::std::pair<int, T>``
|
|
873
|
+
//! (assuming the value type of ``d_in`` is ``T``)
|
|
874
|
+
//!
|
|
875
|
+
//! - The minimum of the *i*\ :sup:`th` segment is written to
|
|
876
|
+
//! ``d_out[i].second`` and its offset in that segment is written to ``d_out[i].first``.
|
|
877
|
+
//! - The ``{1, ::cuda::std::numeric_limits<T>::max()}`` tuple is produced for zero-length inputs
|
|
878
|
+
//!
|
|
879
|
+
//! Snippet
|
|
880
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
881
|
+
//!
|
|
882
|
+
//! The code snippet below illustrates the argmin-reduction of a device vector of ``int`` data elements.
|
|
883
|
+
//!
|
|
884
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_segmented_reduce_api.cu
|
|
885
|
+
//! :language: c++
|
|
886
|
+
//! :dedent:
|
|
887
|
+
//! :start-after: example-begin fixed-size-segmented-reduce-argmin
|
|
888
|
+
//! :end-before: example-end fixed-size-segmented-reduce-argmin
|
|
889
|
+
//!
|
|
890
|
+
//! @endrst
|
|
891
|
+
//!
|
|
892
|
+
//! @tparam InputIteratorT
|
|
893
|
+
//! **[inferred]** Random-access input iterator type for reading input items (of some type `T`) @iterator
|
|
894
|
+
//!
|
|
895
|
+
//! @tparam OutputIteratorT
|
|
896
|
+
//! **[inferred]** Output iterator type for recording the reduced aggregate
|
|
897
|
+
//! (having value type `cuda::std::pair<int, T>`) @iterator
|
|
898
|
+
//!
|
|
899
|
+
//! @param[in] d_temp_storage
|
|
900
|
+
//! Device-accessible allocation of temporary storage. When `nullptr`, the
|
|
901
|
+
//! required allocation size is written to `temp_storage_bytes` and no work is done.
|
|
902
|
+
//!
|
|
903
|
+
//! @param[in,out] temp_storage_bytes
|
|
904
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
905
|
+
//!
|
|
906
|
+
//! @param[in] d_in
|
|
907
|
+
//! Pointer to the input sequence of data items
|
|
908
|
+
//!
|
|
909
|
+
//! @param[out] d_out
|
|
910
|
+
//! Pointer to the output aggregate
|
|
911
|
+
//!
|
|
912
|
+
//! @param[in] num_segments
|
|
913
|
+
//! The number of segments that comprise the segmented reduction data
|
|
914
|
+
//!
|
|
915
|
+
//! @param[in] segment_size
|
|
916
|
+
//! The fixed segment size of each segment
|
|
917
|
+
//!
|
|
918
|
+
//! @param[in] stream
|
|
919
|
+
//! @rst
|
|
920
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
921
|
+
//! @endrst
|
|
922
|
+
template <typename InputIteratorT, typename OutputIteratorT>
|
|
923
|
+
CUB_RUNTIME_FUNCTION static cudaError_t ArgMin(
|
|
924
|
+
void* d_temp_storage,
|
|
925
|
+
size_t& temp_storage_bytes,
|
|
926
|
+
InputIteratorT d_in,
|
|
927
|
+
OutputIteratorT d_out,
|
|
928
|
+
::cuda::std::int64_t num_segments,
|
|
929
|
+
int segment_size,
|
|
930
|
+
cudaStream_t stream = 0)
|
|
931
|
+
{
|
|
932
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::ArgMin");
|
|
933
|
+
|
|
934
|
+
// `offset_t` a.k.a `SegmentSizeT` is fixed to `int` type now, but later can be changed to accept
|
|
935
|
+
// integral constant or larger integral types
|
|
936
|
+
using offset_t = int;
|
|
937
|
+
|
|
938
|
+
// The input type
|
|
939
|
+
using input_value_t = cub::detail::it_value_t<InputIteratorT>;
|
|
940
|
+
|
|
941
|
+
// The output tuple type
|
|
942
|
+
using output_tuple_t = cub::detail::non_void_value_t<OutputIteratorT, ::cuda::std::pair<offset_t, input_value_t>>;
|
|
943
|
+
|
|
944
|
+
using accum_t = output_tuple_t;
|
|
945
|
+
|
|
946
|
+
using init_t = detail::reduce::empty_problem_init_t<accum_t>;
|
|
947
|
+
|
|
948
|
+
// The output value type
|
|
949
|
+
using output_value_t = typename output_tuple_t::second_type;
|
|
950
|
+
|
|
951
|
+
// Wrapped input iterator to produce index-value <offset_t, InputT> tuples
|
|
952
|
+
auto d_indexed_in = THRUST_NS_QUALIFIER::make_transform_iterator(
|
|
953
|
+
THRUST_NS_QUALIFIER::counting_iterator<::cuda::std::int64_t>{0},
|
|
954
|
+
detail::reduce::generate_idx_value<InputIteratorT, output_value_t>(d_in, segment_size));
|
|
955
|
+
|
|
956
|
+
using arg_index_input_iterator_t = decltype(d_indexed_in);
|
|
957
|
+
|
|
958
|
+
// Initial value
|
|
959
|
+
init_t initial_value{accum_t(1, ::cuda::std::numeric_limits<input_value_t>::max())};
|
|
960
|
+
|
|
961
|
+
return detail::reduce::DispatchFixedSizeSegmentedReduce<
|
|
962
|
+
arg_index_input_iterator_t,
|
|
963
|
+
OutputIteratorT,
|
|
964
|
+
offset_t,
|
|
965
|
+
cub::detail::arg_min,
|
|
966
|
+
init_t,
|
|
967
|
+
accum_t>::Dispatch(d_temp_storage,
|
|
968
|
+
temp_storage_bytes,
|
|
969
|
+
d_indexed_in,
|
|
970
|
+
d_out,
|
|
971
|
+
num_segments,
|
|
972
|
+
segment_size,
|
|
973
|
+
cub::detail::arg_min(),
|
|
974
|
+
initial_value,
|
|
975
|
+
stream);
|
|
976
|
+
}
|
|
977
|
+
|
|
978
|
+
//! @rst
|
|
979
|
+
//! Computes a device-wide segmented maximum using the greater-than (``>``) operator.
|
|
980
|
+
//!
|
|
981
|
+
//! - Uses ``::cuda::std::numeric_limits<T>::lowest()`` as the initial value of the reduction.
|
|
982
|
+
//! - When the input is a contiguous sequence of segments, a single sequence
|
|
983
|
+
//! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
|
|
984
|
+
//! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
|
|
985
|
+
//! the latter is specified as ``segment_offsets + 1``).
|
|
986
|
+
//! - Does not support ``>`` operators that are non-commutative.
|
|
987
|
+
//! - Let ``s`` be in ``[0, num_segments)``. The range
|
|
988
|
+
//! ``[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])`` shall not
|
|
989
|
+
//! overlap ``[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])``,
|
|
990
|
+
//! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
|
|
991
|
+
//! ``[d_end_offsets, d_end_offsets + num_segments)``.
|
|
992
|
+
//! - @devicestorage
|
|
993
|
+
//!
|
|
994
|
+
//! Snippet
|
|
995
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
996
|
+
//!
|
|
997
|
+
//! The code snippet below illustrates the max-reduction of a device vector of ``int`` data elements.
|
|
998
|
+
//!
|
|
999
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_segmented_reduce_api.cu
|
|
1000
|
+
//! :language: c++
|
|
1001
|
+
//! :dedent:
|
|
1002
|
+
//! :start-after: example-begin segmented-reduce-max
|
|
1003
|
+
//! :end-before: example-end segmented-reduce-max
|
|
1004
|
+
//!
|
|
1005
|
+
//! @endrst
|
|
1006
|
+
//!
|
|
1007
|
+
//! @tparam InputIteratorT
|
|
1008
|
+
//! **[inferred]** Random-access input iterator type for reading input items @iterator
|
|
1009
|
+
//!
|
|
1010
|
+
//! @tparam OutputIteratorT
|
|
1011
|
+
//! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
|
|
1012
|
+
//!
|
|
1013
|
+
//! @tparam BeginOffsetIteratorT
|
|
1014
|
+
//! **[inferred]** Random-access input iterator type for reading segment beginning offsets @iterator
|
|
1015
|
+
//!
|
|
1016
|
+
//! @tparam EndOffsetIteratorT
|
|
1017
|
+
//! **[inferred]** Random-access input iterator type for reading segment ending offsets @iterator
|
|
1018
|
+
//!
|
|
1019
|
+
//! @param[in] d_temp_storage
|
|
1020
|
+
//! Device-accessible allocation of temporary storage. When `nullptr`, the
|
|
1021
|
+
//! required allocation size is written to `temp_storage_bytes` and no work is done.
|
|
1022
|
+
//!
|
|
1023
|
+
//! @param[in,out] temp_storage_bytes
|
|
1024
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
1025
|
+
//!
|
|
1026
|
+
//! @param[in] d_in
|
|
1027
|
+
//! Pointer to the input sequence of data items
|
|
1028
|
+
//!
|
|
1029
|
+
//! @param[out] d_out
|
|
1030
|
+
//! Pointer to the output aggregate
|
|
1031
|
+
//!
|
|
1032
|
+
//! @param[in] num_segments
|
|
1033
|
+
//! The number of segments that comprise the segmented reduction data
|
|
1034
|
+
//!
|
|
1035
|
+
//! @param[in] d_begin_offsets
|
|
1036
|
+
//! @rst
|
|
1037
|
+
//! Random-access input iterator to the sequence of beginning offsets of
|
|
1038
|
+
//! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
|
|
1039
|
+
//! element of the *i*\ :sup:`th` data segment in ``d_in``
|
|
1040
|
+
//! @endrst
|
|
1041
|
+
//!
|
|
1042
|
+
//! @param[in] d_end_offsets
|
|
1043
|
+
//! @rst
|
|
1044
|
+
//! Random-access input iterator to the sequence of ending offsets of length
|
|
1045
|
+
//! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
|
|
1046
|
+
//! the *i*\ :sup:`th` data segment in ``d_in``.
|
|
1047
|
+
//! If ``d_end_offsets[i] <= d_begin_offsets[i]``, the *i*\ :sup:`th` segment is considered empty.
|
|
1048
|
+
//! @endrst
|
|
1049
|
+
//!
|
|
1050
|
+
//! @param[in] stream
|
|
1051
|
+
//! @rst
|
|
1052
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
1053
|
+
//! @endrst
|
|
1054
|
+
template <typename InputIteratorT,
|
|
1055
|
+
typename OutputIteratorT,
|
|
1056
|
+
typename BeginOffsetIteratorT,
|
|
1057
|
+
typename EndOffsetIteratorT,
|
|
1058
|
+
typename = ::cuda::std::void_t<typename ::cuda::std::iterator_traits<BeginOffsetIteratorT>::value_type,
|
|
1059
|
+
typename ::cuda::std::iterator_traits<EndOffsetIteratorT>::value_type>>
|
|
1060
|
+
CUB_RUNTIME_FUNCTION static cudaError_t
|
|
1061
|
+
Max(void* d_temp_storage,
|
|
1062
|
+
size_t& temp_storage_bytes,
|
|
1063
|
+
InputIteratorT d_in,
|
|
1064
|
+
OutputIteratorT d_out,
|
|
1065
|
+
::cuda::std::int64_t num_segments,
|
|
1066
|
+
BeginOffsetIteratorT d_begin_offsets,
|
|
1067
|
+
EndOffsetIteratorT d_end_offsets,
|
|
1068
|
+
cudaStream_t stream = 0)
|
|
1069
|
+
{
|
|
1070
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::Max");
|
|
1071
|
+
|
|
1072
|
+
using OffsetT = detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
|
|
1073
|
+
using InputT = cub::detail::it_value_t<InputIteratorT>;
|
|
1074
|
+
using init_t = InputT;
|
|
1075
|
+
|
|
1076
|
+
static_assert(::cuda::std::is_integral_v<OffsetT>, "Offset iterator value type should be integral.");
|
|
1077
|
+
if constexpr (::cuda::std::is_integral_v<OffsetT>)
|
|
1078
|
+
{
|
|
1079
|
+
return DispatchSegmentedReduce<
|
|
1080
|
+
InputIteratorT,
|
|
1081
|
+
OutputIteratorT,
|
|
1082
|
+
BeginOffsetIteratorT,
|
|
1083
|
+
EndOffsetIteratorT,
|
|
1084
|
+
OffsetT,
|
|
1085
|
+
::cuda::maximum<>,
|
|
1086
|
+
init_t>::Dispatch(d_temp_storage,
|
|
1087
|
+
temp_storage_bytes,
|
|
1088
|
+
d_in,
|
|
1089
|
+
d_out,
|
|
1090
|
+
num_segments,
|
|
1091
|
+
d_begin_offsets,
|
|
1092
|
+
d_end_offsets,
|
|
1093
|
+
::cuda::maximum<>{},
|
|
1094
|
+
::cuda::std::numeric_limits<init_t>::lowest(),
|
|
1095
|
+
stream);
|
|
1096
|
+
}
|
|
1097
|
+
_CCCL_UNREACHABLE();
|
|
1098
|
+
}
|
|
1099
|
+
|
|
1100
|
+
//! @rst
|
|
1101
|
+
//! Computes a device-wide segmented maximum using the greater-than (``>``) operator.
|
|
1102
|
+
//!
|
|
1103
|
+
//! - Uses ``::cuda::std::numeric_limits<T>::lowest()`` as the initial value of the reduction.
|
|
1104
|
+
//!
|
|
1105
|
+
//! Snippet
|
|
1106
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
1107
|
+
//!
|
|
1108
|
+
//! The code snippet below illustrates the max-reduction of a device vector of ``int`` data elements.
|
|
1109
|
+
//!
|
|
1110
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_segmented_reduce_api.cu
|
|
1111
|
+
//! :language: c++
|
|
1112
|
+
//! :dedent:
|
|
1113
|
+
//! :start-after: example-begin fixed-size-segmented-reduce-max
|
|
1114
|
+
//! :end-before: example-end fixed-size-segmented-reduce-max
|
|
1115
|
+
//!
|
|
1116
|
+
//! @endrst
|
|
1117
|
+
//!
|
|
1118
|
+
//! @tparam InputIteratorT
|
|
1119
|
+
//! **[inferred]** Random-access input iterator type for reading input items @iterator
|
|
1120
|
+
//!
|
|
1121
|
+
//! @tparam OutputIteratorT
|
|
1122
|
+
//! **[inferred]** Output iterator type for recording the reduced aggregate @iterator
|
|
1123
|
+
//!
|
|
1124
|
+
//! @param[in] d_temp_storage
|
|
1125
|
+
//! Device-accessible allocation of temporary storage. When `nullptr`, the
|
|
1126
|
+
//! required allocation size is written to `temp_storage_bytes` and no work is done.
|
|
1127
|
+
//!
|
|
1128
|
+
//! @param[in,out] temp_storage_bytes
|
|
1129
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
1130
|
+
//!
|
|
1131
|
+
//! @param[in] d_in
|
|
1132
|
+
//! Pointer to the input sequence of data items
|
|
1133
|
+
//!
|
|
1134
|
+
//! @param[out] d_out
|
|
1135
|
+
//! Pointer to the output aggregate
|
|
1136
|
+
//!
|
|
1137
|
+
//! @param[in] num_segments
|
|
1138
|
+
//! The number of segments that comprise the segmented reduction data
|
|
1139
|
+
//!
|
|
1140
|
+
//! @param[in] segment_size
|
|
1141
|
+
//! The fixed segment size of each segment
|
|
1142
|
+
//!
|
|
1143
|
+
//! @param[in] stream
|
|
1144
|
+
//! @rst
|
|
1145
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
1146
|
+
//! @endrst
|
|
1147
|
+
template <typename InputIteratorT, typename OutputIteratorT>
|
|
1148
|
+
CUB_RUNTIME_FUNCTION static cudaError_t
|
|
1149
|
+
Max(void* d_temp_storage,
|
|
1150
|
+
size_t& temp_storage_bytes,
|
|
1151
|
+
InputIteratorT d_in,
|
|
1152
|
+
OutputIteratorT d_out,
|
|
1153
|
+
::cuda::std::int64_t num_segments,
|
|
1154
|
+
int segment_size,
|
|
1155
|
+
cudaStream_t stream = 0)
|
|
1156
|
+
{
|
|
1157
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::Max");
|
|
1158
|
+
|
|
1159
|
+
// `offset_t` a.k.a `SegmentSizeT` is fixed to `int` type now, but later can be changed to accept
|
|
1160
|
+
// integral constant or larger integral types
|
|
1161
|
+
using offset_t = int;
|
|
1162
|
+
using input_t = detail::it_value_t<InputIteratorT>;
|
|
1163
|
+
|
|
1164
|
+
return detail::reduce::
|
|
1165
|
+
DispatchFixedSizeSegmentedReduce<InputIteratorT, OutputIteratorT, offset_t, ::cuda::maximum<>, input_t>::Dispatch(
|
|
1166
|
+
d_temp_storage,
|
|
1167
|
+
temp_storage_bytes,
|
|
1168
|
+
d_in,
|
|
1169
|
+
d_out,
|
|
1170
|
+
num_segments,
|
|
1171
|
+
segment_size,
|
|
1172
|
+
::cuda::maximum<>{},
|
|
1173
|
+
::cuda::std::numeric_limits<input_t>::lowest(),
|
|
1174
|
+
stream);
|
|
1175
|
+
}
|
|
1176
|
+
|
|
1177
|
+
//! @rst
|
|
1178
|
+
//! Finds the first device-wide maximum in each segment using the
|
|
1179
|
+
//! greater-than (``>``) operator, also returning the in-segment index of that item
|
|
1180
|
+
//!
|
|
1181
|
+
//! - The output value type of ``d_out`` is ``cub::KeyValuePair<int, T>``
|
|
1182
|
+
//! (assuming the value type of ``d_in`` is ``T``)
|
|
1183
|
+
//!
|
|
1184
|
+
//! - The maximum of the *i*\ :sup:`th` segment is written to
|
|
1185
|
+
//! ``d_out[i].value`` and its offset in that segment is written to ``d_out[i].key``.
|
|
1186
|
+
//! - The ``{1, ::cuda::std::numeric_limits<T>::lowest()}`` tuple is produced for zero-length inputs
|
|
1187
|
+
//!
|
|
1188
|
+
//! - When the input is a contiguous sequence of segments, a single sequence
|
|
1189
|
+
//! ``segment_offsets`` (of length ``num_segments + 1``) can be aliased
|
|
1190
|
+
//! for both the ``d_begin_offsets`` and ``d_end_offsets`` parameters (where
|
|
1191
|
+
//! the latter is specified as ``segment_offsets + 1``).
|
|
1192
|
+
//! - Does not support ``>`` operators that are non-commutative.
|
|
1193
|
+
//! - Let ``s`` be in ``[0, num_segments)``. The range
|
|
1194
|
+
//! ``[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])`` shall not
|
|
1195
|
+
//! overlap ``[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])``,
|
|
1196
|
+
//! ``[d_begin_offsets, d_begin_offsets + num_segments)`` nor
|
|
1197
|
+
//! ``[d_end_offsets, d_end_offsets + num_segments)``.
|
|
1198
|
+
//! - @devicestorage
|
|
1199
|
+
//!
|
|
1200
|
+
//! Snippet
|
|
1201
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
1202
|
+
//!
|
|
1203
|
+
//! The code snippet below illustrates the argmax-reduction of a device vector
|
|
1204
|
+
//! of `int` data elements.
|
|
1205
|
+
//!
|
|
1206
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_segmented_reduce_api.cu
|
|
1207
|
+
//! :language: c++
|
|
1208
|
+
//! :dedent:
|
|
1209
|
+
//! :start-after: example-begin segmented-reduce-argmax
|
|
1210
|
+
//! :end-before: example-end segmented-reduce-argmax
|
|
1211
|
+
//!
|
|
1212
|
+
//! @endrst
|
|
1213
|
+
//!
|
|
1214
|
+
//! @tparam InputIteratorT
|
|
1215
|
+
//! **[inferred]** Random-access input iterator type for reading input items
|
|
1216
|
+
//! (of some type `T`) @iterator
|
|
1217
|
+
//!
|
|
1218
|
+
//! @tparam OutputIteratorT
|
|
1219
|
+
//! **[inferred]** Output iterator type for recording the reduced aggregate
|
|
1220
|
+
//! (having value type `KeyValuePair<int, T>`) @iterator
|
|
1221
|
+
//!
|
|
1222
|
+
//! @tparam BeginOffsetIteratorT
|
|
1223
|
+
//! **[inferred]** Random-access input iterator type for reading segment
|
|
1224
|
+
//! beginning offsets @iterator
|
|
1225
|
+
//!
|
|
1226
|
+
//! @tparam EndOffsetIteratorT
|
|
1227
|
+
//! **[inferred]** Random-access input iterator type for reading segment
|
|
1228
|
+
//! ending offsets @iterator
|
|
1229
|
+
//!
|
|
1230
|
+
//! @param[in] d_temp_storage
|
|
1231
|
+
//! Device-accessible allocation of temporary storage. When `nullptr`, the
|
|
1232
|
+
//! required allocation size is written to `temp_storage_bytes` and no work
|
|
1233
|
+
//! is done.
|
|
1234
|
+
//!
|
|
1235
|
+
//! @param[in,out] temp_storage_bytes
|
|
1236
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
1237
|
+
//!
|
|
1238
|
+
//! @param[in] d_in
|
|
1239
|
+
//! Pointer to the input sequence of data items
|
|
1240
|
+
//!
|
|
1241
|
+
//! @param[out] d_out
|
|
1242
|
+
//! Pointer to the output aggregate
|
|
1243
|
+
//!
|
|
1244
|
+
//! @param[in] num_segments
|
|
1245
|
+
//! The number of segments that comprise the segmented reduction data
|
|
1246
|
+
//!
|
|
1247
|
+
//! @param[in] d_begin_offsets
|
|
1248
|
+
//! @rst
|
|
1249
|
+
//! Random-access input iterator to the sequence of beginning offsets of
|
|
1250
|
+
//! length `num_segments`, such that ``d_begin_offsets[i]`` is the first
|
|
1251
|
+
//! element of the *i*\ :sup:`th` data segment in ``d_in``
|
|
1252
|
+
//! @endrst
|
|
1253
|
+
//!
|
|
1254
|
+
//! @param[in] d_end_offsets
|
|
1255
|
+
//! @rst
|
|
1256
|
+
//! Random-access input iterator to the sequence of ending offsets of length
|
|
1257
|
+
//! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
|
|
1258
|
+
//! the *i*\ :sup:`th` data segment in ``d_in``.
|
|
1259
|
+
//! If ``d_end_offsets[i] <= d_begin_offsets[i]``, the *i*\ :sup:`th` segment is considered empty.
|
|
1260
|
+
//! @endrst
|
|
1261
|
+
//!
|
|
1262
|
+
//! @param[in] stream
|
|
1263
|
+
//! @rst
|
|
1264
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
1265
|
+
//! @endrst
|
|
1266
|
+
template <typename InputIteratorT,
|
|
1267
|
+
typename OutputIteratorT,
|
|
1268
|
+
typename BeginOffsetIteratorT,
|
|
1269
|
+
typename EndOffsetIteratorT,
|
|
1270
|
+
typename = ::cuda::std::void_t<typename ::cuda::std::iterator_traits<BeginOffsetIteratorT>::value_type,
|
|
1271
|
+
typename ::cuda::std::iterator_traits<EndOffsetIteratorT>::value_type>>
|
|
1272
|
+
CUB_RUNTIME_FUNCTION static cudaError_t ArgMax(
|
|
1273
|
+
void* d_temp_storage,
|
|
1274
|
+
size_t& temp_storage_bytes,
|
|
1275
|
+
InputIteratorT d_in,
|
|
1276
|
+
OutputIteratorT d_out,
|
|
1277
|
+
::cuda::std::int64_t num_segments,
|
|
1278
|
+
BeginOffsetIteratorT d_begin_offsets,
|
|
1279
|
+
EndOffsetIteratorT d_end_offsets,
|
|
1280
|
+
cudaStream_t stream = 0)
|
|
1281
|
+
{
|
|
1282
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::ArgMax");
|
|
1283
|
+
|
|
1284
|
+
// Using common iterator value type is a breaking change, see:
|
|
1285
|
+
// https://github.com/NVIDIA/cccl/pull/414#discussion_r1330632615
|
|
1286
|
+
using OffsetT = int; // detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
|
|
1287
|
+
|
|
1288
|
+
using InputValueT = cub::detail::it_value_t<InputIteratorT>;
|
|
1289
|
+
using OutputTupleT = cub::detail::non_void_value_t<OutputIteratorT, KeyValuePair<OffsetT, InputValueT>>;
|
|
1290
|
+
using AccumT = OutputTupleT;
|
|
1291
|
+
using InitT = detail::reduce::empty_problem_init_t<AccumT>;
|
|
1292
|
+
using OutputValueT = typename OutputTupleT::Value;
|
|
1293
|
+
|
|
1294
|
+
// Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
|
|
1295
|
+
using ArgIndexInputIteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
|
|
1296
|
+
ArgIndexInputIteratorT d_indexed_in(d_in);
|
|
1297
|
+
|
|
1298
|
+
InitT initial_value{AccumT(1, ::cuda::std::numeric_limits<InputValueT>::lowest())};
|
|
1299
|
+
|
|
1300
|
+
static_assert(::cuda::std::is_integral_v<OffsetT>, "Offset iterator value type should be integral.");
|
|
1301
|
+
if constexpr (::cuda::std::is_integral_v<OffsetT>)
|
|
1302
|
+
{
|
|
1303
|
+
return DispatchSegmentedReduce<
|
|
1304
|
+
ArgIndexInputIteratorT,
|
|
1305
|
+
OutputIteratorT,
|
|
1306
|
+
BeginOffsetIteratorT,
|
|
1307
|
+
EndOffsetIteratorT,
|
|
1308
|
+
OffsetT,
|
|
1309
|
+
cub::ArgMax,
|
|
1310
|
+
InitT,
|
|
1311
|
+
AccumT>::Dispatch(d_temp_storage,
|
|
1312
|
+
temp_storage_bytes,
|
|
1313
|
+
d_indexed_in,
|
|
1314
|
+
d_out,
|
|
1315
|
+
num_segments,
|
|
1316
|
+
d_begin_offsets,
|
|
1317
|
+
d_end_offsets,
|
|
1318
|
+
cub::ArgMax{},
|
|
1319
|
+
initial_value,
|
|
1320
|
+
stream);
|
|
1321
|
+
}
|
|
1322
|
+
_CCCL_UNREACHABLE();
|
|
1323
|
+
}
|
|
1324
|
+
|
|
1325
|
+
//! @rst
|
|
1326
|
+
//! Finds the first device-wide maximum in each segment using the
|
|
1327
|
+
//! greater-than (``>``) operator, also returning the in-segment index of that item
|
|
1328
|
+
//!
|
|
1329
|
+
//! - The output value type of ``d_out`` is ``::cuda::std::pair<int, T>``
|
|
1330
|
+
//! (assuming the value type of ``d_in`` is ``T``)
|
|
1331
|
+
//!
|
|
1332
|
+
//! - The maximum of the *i*\ :sup:`th` segment is written to
|
|
1333
|
+
//! ``d_out[i].second`` and its offset in that segment is written to ``d_out[i].first``.
|
|
1334
|
+
//! - The ``{1, ::cuda::std::numeric_limits<T>::lowest()}`` tuple is produced for zero-length inputs
|
|
1335
|
+
//!
|
|
1336
|
+
//! Snippet
|
|
1337
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
1338
|
+
//!
|
|
1339
|
+
//! The code snippet below illustrates the argmax-reduction of a device vector
|
|
1340
|
+
//! of `int` data elements.
|
|
1341
|
+
//!
|
|
1342
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_segmented_reduce_api.cu
|
|
1343
|
+
//! :language: c++
|
|
1344
|
+
//! :dedent:
|
|
1345
|
+
//! :start-after: example-begin fixed-size-segmented-reduce-argmax
|
|
1346
|
+
//! :end-before: example-end fixed-size-segmented-reduce-argmax
|
|
1347
|
+
//!
|
|
1348
|
+
//! @endrst
|
|
1349
|
+
//!
|
|
1350
|
+
//! @tparam InputIteratorT
|
|
1351
|
+
//! **[inferred]** Random-access input iterator type for reading input items
|
|
1352
|
+
//! (of some type `T`) @iterator
|
|
1353
|
+
//!
|
|
1354
|
+
//! @tparam OutputIteratorT
|
|
1355
|
+
//! **[inferred]** Output iterator type for recording the reduced aggregate
|
|
1356
|
+
//! (having value type `cuda::std::pair<int, T>`) @iterator
|
|
1357
|
+
//!
|
|
1358
|
+
//! @param[in] d_temp_storage
|
|
1359
|
+
//! Device-accessible allocation of temporary storage. When `nullptr`, the
|
|
1360
|
+
//! required allocation size is written to `temp_storage_bytes` and no work
|
|
1361
|
+
//! is done.
|
|
1362
|
+
//!
|
|
1363
|
+
//! @param[in,out] temp_storage_bytes
|
|
1364
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
1365
|
+
//!
|
|
1366
|
+
//! @param[in] d_in
|
|
1367
|
+
//! Pointer to the input sequence of data items
|
|
1368
|
+
//!
|
|
1369
|
+
//! @param[out] d_out
|
|
1370
|
+
//! Pointer to the output aggregate
|
|
1371
|
+
//!
|
|
1372
|
+
//! @param[in] num_segments
|
|
1373
|
+
//! The number of segments that comprise the segmented reduction data
|
|
1374
|
+
//!
|
|
1375
|
+
//! @param[in] segment_size
|
|
1376
|
+
//! The fixed segment size of each segment
|
|
1377
|
+
//!
|
|
1378
|
+
//! @param[in] stream
|
|
1379
|
+
//! @rst
|
|
1380
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
1381
|
+
//! @endrst
|
|
1382
|
+
template <typename InputIteratorT, typename OutputIteratorT>
|
|
1383
|
+
CUB_RUNTIME_FUNCTION static cudaError_t ArgMax(
|
|
1384
|
+
void* d_temp_storage,
|
|
1385
|
+
size_t& temp_storage_bytes,
|
|
1386
|
+
InputIteratorT d_in,
|
|
1387
|
+
OutputIteratorT d_out,
|
|
1388
|
+
::cuda::std::int64_t num_segments,
|
|
1389
|
+
int segment_size,
|
|
1390
|
+
cudaStream_t stream = 0)
|
|
1391
|
+
{
|
|
1392
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::ArgMax");
|
|
1393
|
+
|
|
1394
|
+
// `offset_t` a.k.a `SegmentSizeT` is fixed to `int` type now, but later can be changed to accept
|
|
1395
|
+
// integral constant or larger integral types
|
|
1396
|
+
using input_t = int;
|
|
1397
|
+
|
|
1398
|
+
using input_value_t = detail::it_value_t<InputIteratorT>;
|
|
1399
|
+
using output_tuple_t = detail::non_void_value_t<OutputIteratorT, ::cuda::std::pair<input_t, input_value_t>>;
|
|
1400
|
+
using accum_t = output_tuple_t;
|
|
1401
|
+
using init_t = detail::reduce::empty_problem_init_t<accum_t>;
|
|
1402
|
+
using output_value_t = typename output_tuple_t::second_type;
|
|
1403
|
+
|
|
1404
|
+
// Wrapped input iterator to produce index-value <input_t, InputT> tuples
|
|
1405
|
+
auto d_indexed_in = THRUST_NS_QUALIFIER::make_transform_iterator(
|
|
1406
|
+
THRUST_NS_QUALIFIER::counting_iterator<::cuda::std::int64_t>{0},
|
|
1407
|
+
detail::reduce::generate_idx_value<InputIteratorT, output_value_t>(d_in, segment_size));
|
|
1408
|
+
using arg_index_input_iterator_t = decltype(d_indexed_in);
|
|
1409
|
+
|
|
1410
|
+
init_t initial_value{accum_t(1, ::cuda::std::numeric_limits<input_value_t>::lowest())};
|
|
1411
|
+
|
|
1412
|
+
return detail::reduce::DispatchFixedSizeSegmentedReduce<
|
|
1413
|
+
arg_index_input_iterator_t,
|
|
1414
|
+
OutputIteratorT,
|
|
1415
|
+
input_t,
|
|
1416
|
+
detail::arg_max,
|
|
1417
|
+
init_t,
|
|
1418
|
+
accum_t>::Dispatch(d_temp_storage,
|
|
1419
|
+
temp_storage_bytes,
|
|
1420
|
+
d_indexed_in,
|
|
1421
|
+
d_out,
|
|
1422
|
+
num_segments,
|
|
1423
|
+
segment_size,
|
|
1424
|
+
detail::arg_max(),
|
|
1425
|
+
initial_value,
|
|
1426
|
+
stream);
|
|
1427
|
+
}
|
|
1428
|
+
};
|
|
1429
|
+
|
|
1430
|
+
CUB_NAMESPACE_END
|