cuda-cccl 0.3.0__cp313-cp313-manylinux_2_24_aarch64.whl → 0.3.2__cp313-cp313-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/cooperative/__init__.py +7 -1
- cuda/cccl/cooperative/experimental/__init__.py +21 -5
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
- cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
- cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
- cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
- cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +4 -2
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
- cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
- cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
- cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
- cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
- cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +170 -260
- cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +6 -7
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +12 -29
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +2 -7
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
- cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
- cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
- cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
- cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
- cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
- cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
- cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
- cuda/cccl/headers/include/cuda/__cccl_config +1 -0
- cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
- cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
- cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
- cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
- cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +247 -323
- cuda/cccl/headers/include/cuda/__device/attributes.h +174 -123
- cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
- cuda/cccl/headers/include/cuda/__device/device_ref.h +27 -49
- cuda/cccl/headers/include/cuda/__device/physical_device.h +100 -96
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
- cuda/cccl/headers/include/cuda/__event/event.h +27 -26
- cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
- cuda/cccl/headers/include/cuda/__event/timed_event.h +10 -7
- cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
- cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
- cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
- cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
- cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +21 -137
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
- cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
- cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
- cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
- cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
- cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
- cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
- cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +9 -7
- cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -16
- cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
- cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
- cuda/cccl/headers/include/cuda/algorithm +1 -1
- cuda/cccl/headers/include/cuda/cmath +1 -0
- cuda/cccl/headers/include/cuda/devices +13 -0
- cuda/cccl/headers/include/cuda/iterator +1 -0
- cuda/cccl/headers/include/cuda/memory +1 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
- cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
- cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
- cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
- cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
- cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
- cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
- cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
- cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
- cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
- cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
- cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
- cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
- cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
- cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
- cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
- cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
- cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
- cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
- cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
- cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
- cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
- cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
- cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
- cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
- cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
- cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
- cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
- cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
- cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
- cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
- cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
- cuda/cccl/headers/include/cuda/std/bitset +1 -1
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
- cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
- cuda/cccl/headers/include/cuda/std/numbers +5 -0
- cuda/cccl/headers/include/cuda/std/string_view +155 -13
- cuda/cccl/headers/include/cuda/std/version +1 -4
- cuda/cccl/headers/include/cuda/stream_ref +5 -0
- cuda/cccl/headers/include/cuda/utility +1 -0
- cuda/cccl/headers/include/nv/target +7 -2
- cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
- cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
- cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
- cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
- cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
- cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
- cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
- cuda/cccl/headers/include/thrust/device_delete.h +18 -3
- cuda/cccl/headers/include/thrust/device_free.h +16 -3
- cuda/cccl/headers/include/thrust/device_new.h +29 -8
- cuda/cccl/headers/include/thrust/host_vector.h +1 -1
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
- cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
- cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
- cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
- cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
- cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
- cuda/cccl/parallel/experimental/__init__.py +21 -74
- cuda/compute/__init__.py +79 -0
- cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +43 -1
- cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +157 -8
- cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
- cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +112 -40
- cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +36 -15
- cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
- cuda/compute/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +2 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +36 -8
- cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +206 -1
- cuda/{cccl/parallel/experimental → compute}/numba_utils.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/typing.py +2 -0
- cuda/coop/__init__.py +8 -0
- cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
- cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
- cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
- cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
- cuda/coop/warp/__init__.py +9 -0
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +275 -276
- cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
- cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
- cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
- cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
- cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
- cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
- cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
- cuda/cccl/parallel/experimental/.gitignore +0 -4
- cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -47,9 +47,7 @@
|
|
|
47
47
|
|
|
48
48
|
CUB_NAMESPACE_BEGIN
|
|
49
49
|
|
|
50
|
-
namespace detail
|
|
51
|
-
{
|
|
52
|
-
namespace three_way_partition
|
|
50
|
+
namespace detail::three_way_partition
|
|
53
51
|
{
|
|
54
52
|
|
|
55
53
|
template <typename PolicyT, typename = void>
|
|
@@ -437,7 +435,6 @@ struct policy_hub
|
|
|
437
435
|
|
|
438
436
|
using MaxPolicy = Policy1000;
|
|
439
437
|
};
|
|
440
|
-
} // namespace three_way_partition
|
|
441
|
-
} // namespace detail
|
|
438
|
+
} // namespace detail::three_way_partition
|
|
442
439
|
|
|
443
440
|
CUB_NAMESPACE_END
|
|
@@ -113,11 +113,11 @@ CUB_DETAIL_POLICY_WRAPPER_DEFINE(
|
|
|
113
113
|
(max_items_per_thread, MaxItemsPerThread, int),
|
|
114
114
|
(not_a_vectorized_policy, NotAVectorizedPolicy, int) ) // TODO: remove with C++20
|
|
115
115
|
|
|
116
|
-
template <
|
|
117
|
-
struct vectorized_policy_t : prefetch_policy_t<
|
|
116
|
+
template <typename Tuning>
|
|
117
|
+
struct vectorized_policy_t : prefetch_policy_t<Tuning::block_threads>
|
|
118
118
|
{
|
|
119
|
-
static constexpr int items_per_thread_vectorized =
|
|
120
|
-
static constexpr int
|
|
119
|
+
static constexpr int items_per_thread_vectorized = Tuning::items_per_thread;
|
|
120
|
+
static constexpr int vec_size = Tuning::vec_size;
|
|
121
121
|
|
|
122
122
|
using not_a_vectorized_policy = void; // TODO: remove with C++20, shadows the variable in prefetch_policy_t
|
|
123
123
|
};
|
|
@@ -130,7 +130,7 @@ CUB_DETAIL_POLICY_WRAPPER_DEFINE(
|
|
|
130
130
|
(min_items_per_thread, MinItemsPerThread, int),
|
|
131
131
|
(max_items_per_thread, MaxItemsPerThread, int),
|
|
132
132
|
(items_per_thread_vectorized, ItemsPerThreadVectorized, int),
|
|
133
|
-
(
|
|
133
|
+
(vec_size, VecSize, int) )
|
|
134
134
|
|
|
135
135
|
template <int BlockThreads, int BulkCopyAlignment>
|
|
136
136
|
struct async_copy_policy_t
|
|
@@ -282,47 +282,6 @@ _CCCL_HOST_DEVICE constexpr int arch_to_min_bytes_in_flight(int sm_arch)
|
|
|
282
282
|
return 12 * 1024; // V100 and below
|
|
283
283
|
}
|
|
284
284
|
|
|
285
|
-
template <typename H, typename... Ts>
|
|
286
|
-
_CCCL_HOST_DEVICE constexpr bool all_nonzero_equal(H head, Ts... values)
|
|
287
|
-
{
|
|
288
|
-
size_t first = 0;
|
|
289
|
-
for (size_t v : ::cuda::std::array<H, 1 + sizeof...(Ts)>{head, values...})
|
|
290
|
-
{
|
|
291
|
-
if (v == 0)
|
|
292
|
-
{
|
|
293
|
-
continue;
|
|
294
|
-
}
|
|
295
|
-
if (first == 0)
|
|
296
|
-
{
|
|
297
|
-
first = v;
|
|
298
|
-
}
|
|
299
|
-
else if (v != first)
|
|
300
|
-
{
|
|
301
|
-
return false;
|
|
302
|
-
}
|
|
303
|
-
}
|
|
304
|
-
return true;
|
|
305
|
-
}
|
|
306
|
-
|
|
307
|
-
_CCCL_HOST_DEVICE constexpr bool all_nonzero_equal()
|
|
308
|
-
{
|
|
309
|
-
return true;
|
|
310
|
-
}
|
|
311
|
-
|
|
312
|
-
template <typename H, typename... Ts>
|
|
313
|
-
_CCCL_HOST_DEVICE constexpr auto first_nonzero_value(H head, Ts... values)
|
|
314
|
-
{
|
|
315
|
-
for (auto v : ::cuda::std::array<H, 1 + sizeof...(Ts)>{head, values...})
|
|
316
|
-
{
|
|
317
|
-
if (v != 0)
|
|
318
|
-
{
|
|
319
|
-
return v;
|
|
320
|
-
}
|
|
321
|
-
}
|
|
322
|
-
// we only reach here when all input are not contiguous and the output has a void value type
|
|
323
|
-
return H{1};
|
|
324
|
-
}
|
|
325
|
-
|
|
326
285
|
template <typename T>
|
|
327
286
|
inline constexpr size_t size_of = sizeof(T);
|
|
328
287
|
|
|
@@ -337,6 +296,47 @@ _CCCL_HOST_DEVICE static constexpr auto make_sizes_alignments()
|
|
|
337
296
|
{{sizeof(it_value_t<RandomAccessIteratorsIn>), alignof(it_value_t<RandomAccessIteratorsIn>)}...}};
|
|
338
297
|
}
|
|
339
298
|
|
|
299
|
+
template <int PtxVersion, int StoreSize, int... LoadSizes>
|
|
300
|
+
struct tuning_vec
|
|
301
|
+
{
|
|
302
|
+
// defaults from fill on RTX 5090, but can be changed
|
|
303
|
+
static constexpr int block_threads = 256;
|
|
304
|
+
static constexpr int vec_size = 4;
|
|
305
|
+
static constexpr int items_per_thread = 8;
|
|
306
|
+
};
|
|
307
|
+
|
|
308
|
+
// manually tuned fill on A100
|
|
309
|
+
template <int StoreSize>
|
|
310
|
+
struct tuning_vec<800, StoreSize>
|
|
311
|
+
{
|
|
312
|
+
static constexpr int block_threads = 256;
|
|
313
|
+
static constexpr int vec_size = ::cuda::std::max(8 / StoreSize, 1); // 64-bit instructions
|
|
314
|
+
static constexpr int items_per_thread = 8;
|
|
315
|
+
};
|
|
316
|
+
|
|
317
|
+
// manually tuned fill on H200
|
|
318
|
+
template <int StoreSize>
|
|
319
|
+
struct tuning_vec<900, StoreSize>
|
|
320
|
+
{
|
|
321
|
+
static constexpr int block_threads = StoreSize > 4 ? 128 : 256;
|
|
322
|
+
static constexpr int vec_size = ::cuda::std::max(8 / StoreSize, 1); // 64-bit instructions
|
|
323
|
+
static constexpr int items_per_thread = 16;
|
|
324
|
+
};
|
|
325
|
+
|
|
326
|
+
// manually tuned fill on B200, same as H200
|
|
327
|
+
template <int StoreSize>
|
|
328
|
+
struct tuning_vec<1000, StoreSize> : tuning_vec<900, StoreSize>
|
|
329
|
+
{};
|
|
330
|
+
|
|
331
|
+
// manually tuned fill on RTX 5090
|
|
332
|
+
template <int StoreSize>
|
|
333
|
+
struct tuning_vec<1200, StoreSize>
|
|
334
|
+
{
|
|
335
|
+
static constexpr int block_threads = 256;
|
|
336
|
+
static constexpr int vec_size = 4;
|
|
337
|
+
static constexpr int items_per_thread = 8;
|
|
338
|
+
};
|
|
339
|
+
|
|
340
340
|
template <bool RequiresStableAddress,
|
|
341
341
|
bool DenseOutput,
|
|
342
342
|
typename RandomAccessIteratorTupleIn,
|
|
@@ -367,29 +367,12 @@ struct policy_hub<RequiresStableAddress,
|
|
|
367
367
|
|| THRUST_NS_QUALIFIER::is_trivially_relocatable_v<it_value_t<RandomAccessIteratorsIn>>)
|
|
368
368
|
&& ...);
|
|
369
369
|
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
(
|
|
373
|
-
* THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorsIn>) ...);
|
|
374
|
-
static constexpr int load_store_word_size = 8; // TODO(bgruber): make this 16, and 32 on Blackwell+
|
|
375
|
-
// find the value type size of the first contiguous iterator. if there are no inputs, we take the size of the output
|
|
376
|
-
// value type
|
|
377
|
-
static constexpr int contiguous_value_type_size = first_nonzero_value(
|
|
378
|
-
(int{sizeof(it_value_t<RandomAccessIteratorsIn>)}
|
|
379
|
-
* THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorsIn>) ...,
|
|
380
|
-
int{size_of<it_value_t<RandomAccessIteratorOut>>});
|
|
381
|
-
static constexpr bool value_type_divides_load_store_size =
|
|
382
|
-
load_store_word_size % contiguous_value_type_size == 0; // implicitly checks that value_type_size <=
|
|
383
|
-
// load_store_word_size
|
|
384
|
-
static constexpr int target_bytes_per_thread =
|
|
385
|
-
no_input_streams ? 16 /* by experiment on RTX 5090 */ : 32 /* guestimate by gevtushenko for loading */;
|
|
386
|
-
static constexpr int items_per_thread_vec =
|
|
387
|
-
::cuda::round_up(target_bytes_per_thread, load_store_word_size) / contiguous_value_type_size;
|
|
388
|
-
using default_vectorized_policy_t = vectorized_policy_t<256, items_per_thread_vec, load_store_word_size>;
|
|
370
|
+
static constexpr bool all_value_types_have_power_of_two_size =
|
|
371
|
+
(::cuda::is_power_of_two(sizeof(it_value_t<RandomAccessIteratorsIn>)) && ...)
|
|
372
|
+
&& ::cuda::is_power_of_two(size_of<it_value_t<RandomAccessIteratorOut>>);
|
|
389
373
|
|
|
390
374
|
static constexpr bool fallback_to_prefetch =
|
|
391
|
-
RequiresStableAddress || !can_memcpy_contiguous_inputs || !
|
|
392
|
-
|| !value_type_divides_load_store_size || !DenseOutput;
|
|
375
|
+
RequiresStableAddress || !can_memcpy_contiguous_inputs || !all_value_types_have_power_of_two_size || !DenseOutput;
|
|
393
376
|
|
|
394
377
|
// TODO(bgruber): consider a separate kernel for just filling
|
|
395
378
|
|
|
@@ -398,12 +381,16 @@ struct policy_hub<RequiresStableAddress,
|
|
|
398
381
|
static constexpr int min_bif = arch_to_min_bytes_in_flight(300);
|
|
399
382
|
// TODO(bgruber): we don't need algo, because we can just detect the type of algo_policy
|
|
400
383
|
static constexpr auto algorithm = fallback_to_prefetch ? Algorithm::prefetch : Algorithm::vectorized;
|
|
401
|
-
using
|
|
384
|
+
using vec_policy_t = vectorized_policy_t<
|
|
385
|
+
tuning_vec<500, size_of<it_value_t<RandomAccessIteratorOut>>, sizeof(it_value_t<RandomAccessIteratorsIn>)...>>;
|
|
386
|
+
using algo_policy = ::cuda::std::_If<fallback_to_prefetch, prefetch_policy_t<256>, vec_policy_t>;
|
|
402
387
|
};
|
|
403
388
|
|
|
404
389
|
struct policy800 : ChainedPolicy<800, policy800, policy300>
|
|
405
390
|
{
|
|
406
391
|
private:
|
|
392
|
+
using vec_policy_t = vectorized_policy_t<
|
|
393
|
+
tuning_vec<800, size_of<it_value_t<RandomAccessIteratorOut>>, sizeof(it_value_t<RandomAccessIteratorsIn>)...>>;
|
|
407
394
|
static constexpr int block_threads = 256;
|
|
408
395
|
using async_policy = async_copy_policy_t<block_threads, ldgsts_size_and_align>;
|
|
409
396
|
// We cannot use the architecture-specific amount of SMEM here instead of max_smem_per_block, because this is not
|
|
@@ -427,13 +414,17 @@ struct policy_hub<RequiresStableAddress,
|
|
|
427
414
|
using algo_policy =
|
|
428
415
|
::cuda::std::_If<fallback_to_prefetch,
|
|
429
416
|
prefetch_policy_t<block_threads>,
|
|
430
|
-
::cuda::std::_If<fallback_to_vectorized,
|
|
417
|
+
::cuda::std::_If<fallback_to_vectorized, vec_policy_t, async_policy>>;
|
|
431
418
|
};
|
|
432
419
|
|
|
433
420
|
template <int AsyncBlockSize, int PtxVersion>
|
|
434
421
|
struct bulk_copy_policy_base
|
|
435
422
|
{
|
|
436
423
|
private:
|
|
424
|
+
using vec_policy_t =
|
|
425
|
+
vectorized_policy_t<tuning_vec<PtxVersion,
|
|
426
|
+
size_of<it_value_t<RandomAccessIteratorOut>>,
|
|
427
|
+
sizeof(it_value_t<RandomAccessIteratorsIn>)...>>;
|
|
437
428
|
static constexpr int alignment = bulk_copy_alignment(PtxVersion);
|
|
438
429
|
using async_policy = async_copy_policy_t<AsyncBlockSize, alignment>;
|
|
439
430
|
// We cannot use the architecture-specific amount of SMEM here instead of max_smem_per_block, because this is not
|
|
@@ -469,7 +460,7 @@ struct policy_hub<RequiresStableAddress,
|
|
|
469
460
|
using algo_policy =
|
|
470
461
|
::cuda::std::_If<fallback_to_prefetch,
|
|
471
462
|
prefetch_policy_t<256>,
|
|
472
|
-
::cuda::std::_If<fallback_to_vectorized,
|
|
463
|
+
::cuda::std::_If<fallback_to_vectorized, vec_policy_t, async_policy>>;
|
|
473
464
|
};
|
|
474
465
|
|
|
475
466
|
struct policy900
|
|
@@ -788,6 +788,16 @@ struct UniqueByKeyPolicyWrapper<StaticPolicyT,
|
|
|
788
788
|
{
|
|
789
789
|
return cub::detail::MakePolicyWrapper(typename StaticPolicyT::UniqueByKeyPolicyT());
|
|
790
790
|
}
|
|
791
|
+
|
|
792
|
+
#if defined(CUB_ENABLE_POLICY_PTX_JSON)
|
|
793
|
+
_CCCL_DEVICE static constexpr auto EncodedPolicy()
|
|
794
|
+
{
|
|
795
|
+
using namespace ptx_json;
|
|
796
|
+
return object<key<"UniqueByKeyPolicyT">() = UniqueByKey().EncodedPolicy(),
|
|
797
|
+
key<"DelayConstructor">() =
|
|
798
|
+
StaticPolicyT::UniqueByKeyPolicyT::detail::delay_constructor_t::EncodedConstructor()>();
|
|
799
|
+
}
|
|
800
|
+
#endif
|
|
791
801
|
};
|
|
792
802
|
|
|
793
803
|
template <typename PolicyT>
|
|
@@ -136,6 +136,7 @@ CUB_NAMESPACE_BEGIN
|
|
|
136
136
|
//! {
|
|
137
137
|
//! int array[4] = {1, 2, 3, 4};
|
|
138
138
|
//! int sum = cub::ThreadReduce(array, ::cuda::std::plus<>{}); // sum = 10
|
|
139
|
+
//! }
|
|
139
140
|
//!
|
|
140
141
|
//! @endrst
|
|
141
142
|
//!
|
|
@@ -437,10 +438,13 @@ template <typename Input, typename ReductionOp, typename ValueT, typename AccumT
|
|
|
437
438
|
"Input must support the subscript operator[] and have a compile-time size");
|
|
438
439
|
static_assert(has_binary_call_operator<ReductionOp, ValueT>::value,
|
|
439
440
|
"ReductionOp must have the binary call operator: operator(ValueT, ValueT)");
|
|
440
|
-
|
|
441
|
+
|
|
442
|
+
static constexpr auto length = static_size_v<Input>;
|
|
443
|
+
if constexpr (length == 1)
|
|
441
444
|
{
|
|
442
445
|
return static_cast<AccumT>(input[0]);
|
|
443
446
|
}
|
|
447
|
+
|
|
444
448
|
using PromT = ::cuda::std::_If<enable_min_max_promotion_v<ReductionOp, ValueT>, int, AccumT>;
|
|
445
449
|
// TODO: should be part of the tuning policy
|
|
446
450
|
if constexpr ((!is_simd_enabled_cuda_operator<ReductionOp, ValueT> && !is_simd_operator_v<ReductionOp>)
|
|
@@ -449,38 +453,41 @@ template <typename Input, typename ReductionOp, typename ValueT, typename AccumT
|
|
|
449
453
|
return ThreadReduceSequential<AccumT>(input, reduction_op);
|
|
450
454
|
}
|
|
451
455
|
|
|
452
|
-
constexpr
|
|
453
|
-
if constexpr (::cuda::std::is_same_v<Input, AccumT> && enable_sm90_simd_reduction_v<Input, ReductionOp, length>)
|
|
456
|
+
if constexpr (::cuda::std::is_same_v<ValueT, AccumT> && enable_sm90_simd_reduction_v<ValueT, ReductionOp, length>)
|
|
454
457
|
{
|
|
455
458
|
NV_IF_TARGET(NV_PROVIDES_SM_90, (return ThreadReduceSimd(input, reduction_op);))
|
|
456
459
|
}
|
|
457
460
|
|
|
458
|
-
if constexpr (::cuda::std::is_same_v<
|
|
461
|
+
if constexpr (::cuda::std::is_same_v<ValueT, AccumT> && enable_sm80_simd_reduction_v<ValueT, ReductionOp, length>)
|
|
459
462
|
{
|
|
460
463
|
NV_IF_TARGET(NV_PROVIDES_SM_80, (return ThreadReduceSimd(input, reduction_op);))
|
|
461
464
|
}
|
|
462
465
|
|
|
463
|
-
if constexpr (::cuda::std::is_same_v<
|
|
466
|
+
if constexpr (::cuda::std::is_same_v<ValueT, AccumT> && enable_sm70_simd_reduction_v<ValueT, ReductionOp, length>)
|
|
464
467
|
{
|
|
465
468
|
NV_IF_TARGET(NV_PROVIDES_SM_70, (return ThreadReduceSimd(input, reduction_op);))
|
|
466
469
|
}
|
|
467
470
|
|
|
468
|
-
if constexpr (
|
|
471
|
+
if constexpr (length >= 6)
|
|
469
472
|
{
|
|
470
|
-
//
|
|
471
|
-
if constexpr (
|
|
472
|
-
&& is_one_of_v<PromT, int32_t, uint32_t>)
|
|
473
|
-
// the compiler generates bad code for int8/uint8 and min/max for SM90
|
|
474
|
-
|| (is_cuda_minimum_maximum_v<ReductionOp, ValueT> && is_one_of_v<PromT, int8_t, uint8_t>) )
|
|
473
|
+
// apply SM90 min/max ternary reduction only if the input is natively int32/uint32
|
|
474
|
+
if constexpr (enable_ternary_reduction_sm90_v<ValueT, ReductionOp>)
|
|
475
475
|
{
|
|
476
|
-
|
|
476
|
+
// with the current tuning policies, SM90/int32/+ uses too many registers (TODO: fix tuning policy)
|
|
477
|
+
if constexpr ((is_one_of_v<ReductionOp, ::cuda::std::plus<>, ::cuda::std::plus<PromT>>
|
|
478
|
+
&& is_one_of_v<PromT, int32_t, uint32_t>)
|
|
479
|
+
// the compiler generates bad code for int8/uint8 and min/max for SM90
|
|
480
|
+
|| (is_cuda_minimum_maximum_v<ReductionOp, ValueT> && is_one_of_v<PromT, int8_t, uint8_t>) )
|
|
481
|
+
{
|
|
482
|
+
NV_IF_TARGET(NV_PROVIDES_SM_90, (return ThreadReduceSequential<PromT>(input, reduction_op);));
|
|
483
|
+
}
|
|
484
|
+
NV_IF_TARGET(NV_PROVIDES_SM_90, (return ThreadReduceTernaryTree<PromT>(input, reduction_op);));
|
|
477
485
|
}
|
|
478
|
-
NV_IF_TARGET(NV_PROVIDES_SM_90, (return ThreadReduceTernaryTree<PromT>(input, reduction_op);));
|
|
479
|
-
}
|
|
480
486
|
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
487
|
+
if constexpr (enable_ternary_reduction_sm50_v<ValueT, ReductionOp>)
|
|
488
|
+
{
|
|
489
|
+
NV_IF_TARGET(NV_PROVIDES_SM_50, (return ThreadReduceSequential<PromT>(input, reduction_op);));
|
|
490
|
+
}
|
|
484
491
|
}
|
|
485
492
|
|
|
486
493
|
return ThreadReduceBinaryTree<PromT>(input, reduction_op);
|
|
@@ -51,6 +51,7 @@
|
|
|
51
51
|
#include <cuda/__functional/maximum.h>
|
|
52
52
|
#include <cuda/__functional/minimum.h>
|
|
53
53
|
#include <cuda/__ptx/instructions/get_sreg.h>
|
|
54
|
+
#include <cuda/std/__bit/countr.h>
|
|
54
55
|
#include <cuda/std/__functional/operations.h>
|
|
55
56
|
#include <cuda/std/__type_traits/enable_if.h>
|
|
56
57
|
#include <cuda/std/__type_traits/integral_constant.h>
|
|
@@ -701,7 +702,7 @@ struct WarpReduceShfl
|
|
|
701
702
|
_CCCL_DEVICE _CCCL_FORCEINLINE T SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op)
|
|
702
703
|
{
|
|
703
704
|
// Get the start flags for each thread in the warp.
|
|
704
|
-
|
|
705
|
+
unsigned warp_flags = __ballot_sync(member_mask, flag);
|
|
705
706
|
|
|
706
707
|
// Convert to tail-segmented
|
|
707
708
|
if (HEAD_SEGMENTED)
|
|
@@ -722,7 +723,7 @@ struct WarpReduceShfl
|
|
|
722
723
|
warp_flags |= 1u << (LOGICAL_WARP_THREADS - 1);
|
|
723
724
|
|
|
724
725
|
// Find the next set flag
|
|
725
|
-
int last_lane =
|
|
726
|
+
int last_lane = ::cuda::std::countr_zero(warp_flags);
|
|
726
727
|
|
|
727
728
|
T output = input;
|
|
728
729
|
// Template-iterate reduction steps
|
|
@@ -49,6 +49,7 @@
|
|
|
49
49
|
#include <cub/util_type.cuh>
|
|
50
50
|
|
|
51
51
|
#include <cuda/__ptx/instructions/get_sreg.h>
|
|
52
|
+
#include <cuda/std/__bit/countr.h>
|
|
52
53
|
#include <cuda/std/__type_traits/integral_constant.h>
|
|
53
54
|
|
|
54
55
|
CUB_NAMESPACE_BEGIN
|
|
@@ -215,7 +216,7 @@ struct WarpReduceSmem
|
|
|
215
216
|
SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op, ::cuda::std::true_type /*has_ballot*/)
|
|
216
217
|
{
|
|
217
218
|
// Get the start flags for each thread in the warp.
|
|
218
|
-
|
|
219
|
+
unsigned warp_flags = __ballot_sync(member_mask, flag);
|
|
219
220
|
|
|
220
221
|
if (!HEAD_SEGMENTED)
|
|
221
222
|
{
|
|
@@ -232,7 +233,7 @@ struct WarpReduceSmem
|
|
|
232
233
|
}
|
|
233
234
|
|
|
234
235
|
// Find next flag
|
|
235
|
-
int next_flag =
|
|
236
|
+
int next_flag = ::cuda::std::countr_zero(warp_flags);
|
|
236
237
|
|
|
237
238
|
// Clip the next segment at the warp boundary if necessary
|
|
238
239
|
if (LOGICAL_WARP_THREADS != 32)
|
|
@@ -50,8 +50,8 @@
|
|
|
50
50
|
|
|
51
51
|
#include <cuda/__ptx/instructions/get_sreg.h>
|
|
52
52
|
#include <cuda/std/__algorithm/clamp.h>
|
|
53
|
-
#include <cuda/std/__algorithm/max.h>
|
|
54
53
|
#include <cuda/std/__bit/has_single_bit.h>
|
|
54
|
+
#include <cuda/std/__bit/integral.h>
|
|
55
55
|
#include <cuda/std/__functional/operations.h>
|
|
56
56
|
#include <cuda/std/__type_traits/integral_constant.h>
|
|
57
57
|
#include <cuda/std/__type_traits/is_integral.h>
|
|
@@ -630,7 +630,7 @@ struct WarpScanShfl
|
|
|
630
630
|
ballot = ballot & ::cuda::ptx::get_sreg_lanemask_le();
|
|
631
631
|
|
|
632
632
|
// Find index of first set bit
|
|
633
|
-
int segment_first_lane = ::cuda::std::
|
|
633
|
+
int segment_first_lane = ::cuda::std::__bit_log2(ballot);
|
|
634
634
|
|
|
635
635
|
// Iterate scan steps
|
|
636
636
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
@@ -191,8 +191,8 @@ enum WarpLoadAlgorithm
|
|
|
191
191
|
//!
|
|
192
192
|
//! // Load a segment of consecutive items that are blocked across threads
|
|
193
193
|
//! int thread_data[items_per_thread];
|
|
194
|
-
//! WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size,
|
|
195
|
-
//!
|
|
194
|
+
//! WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size, thread_data);
|
|
195
|
+
//! }
|
|
196
196
|
//!
|
|
197
197
|
//! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``.
|
|
198
198
|
//! The set of ``thread_data`` across the first logical warp of threads in those
|
|
@@ -484,8 +484,8 @@ public:
|
|
|
484
484
|
//!
|
|
485
485
|
//! // Load a segment of consecutive items that are blocked across threads
|
|
486
486
|
//! int thread_data[items_per_thread];
|
|
487
|
-
//! WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size,
|
|
488
|
-
//!
|
|
487
|
+
//! WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size, thread_data);
|
|
488
|
+
//! }
|
|
489
489
|
//!
|
|
490
490
|
//! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``,
|
|
491
491
|
//! The set of ``thread_data`` across the first logical warp of threads in those
|
|
@@ -533,9 +533,9 @@ public:
|
|
|
533
533
|
//!
|
|
534
534
|
//! // Load a segment of consecutive items that are blocked across threads
|
|
535
535
|
//! int thread_data[items_per_thread];
|
|
536
|
-
//! WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size,
|
|
537
|
-
//! thread_data,
|
|
536
|
+
//! WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size, thread_data,
|
|
538
537
|
//! valid_items);
|
|
538
|
+
//! }
|
|
539
539
|
//!
|
|
540
540
|
//! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...`` and ``valid_items`` is ``5``.
|
|
541
541
|
//! The set of ``thread_data`` across the first logical warp of threads in those threads will be:
|
|
@@ -105,6 +105,7 @@ CUB_NAMESPACE_BEGIN
|
|
|
105
105
|
//! // Return the warp-wide sums to each lane0 (threads 0, 32, 64, and 96)
|
|
106
106
|
//! int warp_id = threadIdx.x / 32;
|
|
107
107
|
//! int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data);
|
|
108
|
+
//! }
|
|
108
109
|
//!
|
|
109
110
|
//! Suppose the set of input ``thread_data`` across the block of threads is ``{0, 1, 2, 3, ..., 127}``.
|
|
110
111
|
//! The corresponding output ``aggregate`` in threads 0, 32, 64, and 96 will be
|
|
@@ -130,6 +131,8 @@ CUB_NAMESPACE_BEGIN
|
|
|
130
131
|
//! int thread_data = ...
|
|
131
132
|
//! // Return the warp-wide sum to lane0
|
|
132
133
|
//! int aggregate = WarpReduce(temp_storage).Sum(thread_data);
|
|
134
|
+
//! }
|
|
135
|
+
//! }
|
|
133
136
|
//!
|
|
134
137
|
//! Suppose the set of input ``thread_data`` across the warp of threads is ``{0, 1, 2, 3, ..., 31}``.
|
|
135
138
|
//! The corresponding output ``aggregate`` in thread0 will be ``496`` (and is undefined in other threads).
|
|
@@ -218,6 +221,7 @@ public:
|
|
|
218
221
|
//! // Return the warp-wide sums to each lane0
|
|
219
222
|
//! int warp_id = threadIdx.x / 32;
|
|
220
223
|
//! int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data);
|
|
224
|
+
//! }
|
|
221
225
|
//!
|
|
222
226
|
//! Suppose the set of input ``thread_data`` across the block of threads is ``{0, 1, 2, 3, ..., 127}``.
|
|
223
227
|
//! The corresponding output ``aggregate`` in threads 0, 32, 64, and 96 will ``496``, ``1520``, ``2544``, and
|
|
@@ -299,8 +303,8 @@ public:
|
|
|
299
303
|
//! thread_data = d_data[threadIdx.x];
|
|
300
304
|
//!
|
|
301
305
|
//! // Return the warp-wide sums to each lane0
|
|
302
|
-
//! int aggregate = WarpReduce(temp_storage).Sum(
|
|
303
|
-
//!
|
|
306
|
+
//! int aggregate = WarpReduce(temp_storage).Sum(thread_data, valid_items);
|
|
307
|
+
//! }
|
|
304
308
|
//!
|
|
305
309
|
//! Suppose the input ``d_data`` is ``{0, 1, 2, 3, 4, ...`` and ``valid_items`` is ``4``.
|
|
306
310
|
//! The corresponding output ``aggregate`` in *lane*\ :sub:`0` is ``6``
|
|
@@ -363,6 +367,7 @@ public:
|
|
|
363
367
|
//! // Return the warp-wide sums to each lane0
|
|
364
368
|
//! int aggregate = WarpReduce(temp_storage).HeadSegmentedSum(
|
|
365
369
|
//! thread_data, head_flag);
|
|
370
|
+
//! }
|
|
366
371
|
//!
|
|
367
372
|
//! Suppose the set of input ``thread_data`` and ``head_flag`` across the block of threads
|
|
368
373
|
//! is ``{0, 1, 2, 3, ..., 31`` and is ``{1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0``,
|
|
@@ -114,6 +114,7 @@ CUB_NAMESPACE_BEGIN
|
|
|
114
114
|
//! // Compute warp-wide prefix sums
|
|
115
115
|
//! int warp_id = threadIdx.x / 32;
|
|
116
116
|
//! WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data);
|
|
117
|
+
//! }
|
|
117
118
|
//!
|
|
118
119
|
//! Suppose the set of input ``thread_data`` across the block of threads is
|
|
119
120
|
//! ``{1, 1, 1, 1, ...}``. The corresponding output ``thread_data`` in each of the four warps of
|
|
@@ -143,6 +144,8 @@ CUB_NAMESPACE_BEGIN
|
|
|
143
144
|
//!
|
|
144
145
|
//! // Compute warp-wide prefix sums
|
|
145
146
|
//! WarpScan(temp_storage).ExclusiveSum(thread_data, thread_data);
|
|
147
|
+
//! }
|
|
148
|
+
//! }
|
|
146
149
|
//!
|
|
147
150
|
//! Suppose the set of input ``thread_data`` across the warp of threads is
|
|
148
151
|
//! ``{1, 1, 1, 1, ...}``. The corresponding output ``thread_data`` will be
|
|
@@ -248,6 +251,7 @@ public:
|
|
|
248
251
|
//! // Compute inclusive warp-wide prefix sums
|
|
249
252
|
//! int warp_id = threadIdx.x / 32;
|
|
250
253
|
//! WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data);
|
|
254
|
+
//! }
|
|
251
255
|
//!
|
|
252
256
|
//! Suppose the set of input ``thread_data`` across the block of threads is
|
|
253
257
|
//! ``{1, 1, 1, 1, ...}``. The corresponding output ``thread_data`` in each of the four warps
|
|
@@ -294,9 +298,8 @@ public:
|
|
|
294
298
|
//! // Compute inclusive warp-wide prefix sums
|
|
295
299
|
//! int warp_aggregate;
|
|
296
300
|
//! int warp_id = threadIdx.x / 32;
|
|
297
|
-
//! WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data,
|
|
298
|
-
//!
|
|
299
|
-
//! warp_aggregate);
|
|
301
|
+
//! WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data, warp_aggregate);
|
|
302
|
+
//! }
|
|
300
303
|
//!
|
|
301
304
|
//! Suppose the set of input ``thread_data`` across the block of threads is
|
|
302
305
|
//! ``{1, 1, 1, 1, ...}``. The corresponding output ``thread_data`` in each of the four warps
|
|
@@ -352,6 +355,7 @@ public:
|
|
|
352
355
|
//! // Compute exclusive warp-wide prefix sums
|
|
353
356
|
//! int warp_id = threadIdx.x / 32;
|
|
354
357
|
//! WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data);
|
|
358
|
+
//! }
|
|
355
359
|
//!
|
|
356
360
|
//! Suppose the set of input ``thread_data`` across the block of threads is
|
|
357
361
|
//! ``{1, 1, 1, 1, ...}``. The corresponding output ``thread_data`` in each of the four warps
|
|
@@ -201,6 +201,7 @@ enum WarpStoreAlgorithm
|
|
|
201
201
|
//!
|
|
202
202
|
//! // Store items to linear memory
|
|
203
203
|
//! WarpStoreT(temp_storage[warp_id]).Store(d_data + warp_id * tile_size, thread_data);
|
|
204
|
+
//! }
|
|
204
205
|
//!
|
|
205
206
|
//! Suppose the set of ``thread_data`` across the warp threads is
|
|
206
207
|
//! ``{ [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }``.
|
|
@@ -26,6 +26,7 @@
|
|
|
26
26
|
#if _CCCL_CUDA_COMPILATION()
|
|
27
27
|
# include <cuda/__ptx/instructions/get_sreg.h>
|
|
28
28
|
# include <cuda/__ptx/instructions/mbarrier_arrive.h>
|
|
29
|
+
# include <cuda/__ptx/instructions/mbarrier_wait.h>
|
|
29
30
|
# include <cuda/__ptx/ptx_dot_variants.h>
|
|
30
31
|
# include <cuda/__ptx/ptx_helper_functions.h>
|
|
31
32
|
#endif // _CCCL_CUDA_COMPILATION()
|
|
@@ -381,12 +382,30 @@ private:
|
|
|
381
382
|
public:
|
|
382
383
|
_CCCL_API inline void wait(arrival_token&& __phase) const
|
|
383
384
|
{
|
|
385
|
+
// no need to back off on a barrier in SMEM on SM90+, SYNCS unit is taking care of this
|
|
386
|
+
NV_IF_TARGET(NV_PROVIDES_SM_90,
|
|
387
|
+
(if (::cuda::device::is_object_from(__barrier, ::cuda::device::address_space::shared)) {
|
|
388
|
+
while (!::cuda::ptx::mbarrier_try_wait(
|
|
389
|
+
reinterpret_cast<uint64_t*>(const_cast<__barrier_base*>(&__barrier)), __phase))
|
|
390
|
+
;
|
|
391
|
+
return;
|
|
392
|
+
}))
|
|
393
|
+
// fallback implementation
|
|
384
394
|
::cuda::std::__cccl_thread_poll_with_backoff(
|
|
385
395
|
::cuda::std::__barrier_poll_tester_phase<barrier>(this, ::cuda::std::move(__phase)));
|
|
386
396
|
}
|
|
387
397
|
|
|
388
398
|
_CCCL_API inline void wait_parity(bool __phase_parity) const
|
|
389
399
|
{
|
|
400
|
+
// no need to back off on a barrier in SMEM on SM90+, SYNCS unit is taking care of this
|
|
401
|
+
NV_IF_TARGET(NV_PROVIDES_SM_90,
|
|
402
|
+
(if (::cuda::device::is_object_from(__barrier, ::cuda::device::address_space::shared)) {
|
|
403
|
+
while (!::cuda::ptx::mbarrier_try_wait_parity(
|
|
404
|
+
reinterpret_cast<uint64_t*>(const_cast<__barrier_base*>(&__barrier)), __phase_parity))
|
|
405
|
+
;
|
|
406
|
+
return;
|
|
407
|
+
}))
|
|
408
|
+
// fallback implementation
|
|
390
409
|
::cuda::std::__cccl_thread_poll_with_backoff(
|
|
391
410
|
::cuda::std::__barrier_poll_tester_parity<barrier>(this, __phase_parity));
|
|
392
411
|
}
|
|
@@ -23,6 +23,7 @@
|
|
|
23
23
|
#include <cuda/std/__cccl/exceptions.h> // IWYU pragma: export
|
|
24
24
|
#include <cuda/std/__cccl/execution_space.h> // IWYU pragma: export
|
|
25
25
|
#include <cuda/std/__cccl/extended_data_types.h> // IWYU pragma: export
|
|
26
|
+
#include <cuda/std/__cccl/host_std_lib.h> // IWYU pragma: export
|
|
26
27
|
#include <cuda/std/__cccl/os.h> // IWYU pragma: export
|
|
27
28
|
#include <cuda/std/__cccl/preprocessor.h> // IWYU pragma: export
|
|
28
29
|
#include <cuda/std/__cccl/ptx_isa.h> // IWYU pragma: export
|