cuda-cccl 0.3.0__cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.2__cp312-cp312-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/cooperative/__init__.py +7 -1
- cuda/cccl/cooperative/experimental/__init__.py +21 -5
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
- cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
- cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
- cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
- cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +4 -2
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
- cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
- cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
- cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
- cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
- cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +170 -260
- cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +6 -7
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +12 -29
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +2 -7
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
- cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
- cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
- cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
- cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
- cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
- cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
- cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
- cuda/cccl/headers/include/cuda/__cccl_config +1 -0
- cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
- cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
- cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
- cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
- cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +247 -323
- cuda/cccl/headers/include/cuda/__device/attributes.h +174 -123
- cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
- cuda/cccl/headers/include/cuda/__device/device_ref.h +27 -49
- cuda/cccl/headers/include/cuda/__device/physical_device.h +100 -96
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
- cuda/cccl/headers/include/cuda/__event/event.h +27 -26
- cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
- cuda/cccl/headers/include/cuda/__event/timed_event.h +10 -7
- cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
- cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
- cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
- cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
- cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +21 -137
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
- cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
- cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
- cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
- cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
- cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
- cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
- cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +9 -7
- cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -16
- cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
- cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
- cuda/cccl/headers/include/cuda/algorithm +1 -1
- cuda/cccl/headers/include/cuda/cmath +1 -0
- cuda/cccl/headers/include/cuda/devices +13 -0
- cuda/cccl/headers/include/cuda/iterator +1 -0
- cuda/cccl/headers/include/cuda/memory +1 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
- cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
- cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
- cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
- cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
- cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
- cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
- cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
- cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
- cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
- cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
- cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
- cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
- cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
- cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
- cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
- cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
- cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
- cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
- cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
- cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
- cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
- cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
- cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
- cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
- cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
- cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
- cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
- cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
- cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
- cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
- cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
- cuda/cccl/headers/include/cuda/std/bitset +1 -1
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
- cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
- cuda/cccl/headers/include/cuda/std/numbers +5 -0
- cuda/cccl/headers/include/cuda/std/string_view +155 -13
- cuda/cccl/headers/include/cuda/std/version +1 -4
- cuda/cccl/headers/include/cuda/stream_ref +5 -0
- cuda/cccl/headers/include/cuda/utility +1 -0
- cuda/cccl/headers/include/nv/target +7 -2
- cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
- cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
- cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
- cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
- cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
- cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
- cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
- cuda/cccl/headers/include/thrust/device_delete.h +18 -3
- cuda/cccl/headers/include/thrust/device_free.h +16 -3
- cuda/cccl/headers/include/thrust/device_new.h +29 -8
- cuda/cccl/headers/include/thrust/host_vector.h +1 -1
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
- cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
- cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
- cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
- cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
- cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
- cuda/cccl/parallel/experimental/__init__.py +21 -74
- cuda/compute/__init__.py +79 -0
- cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +43 -1
- cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +157 -8
- cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
- cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +112 -40
- cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +36 -15
- cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
- cuda/compute/cu12/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/cu13/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +2 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +36 -8
- cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +206 -1
- cuda/{cccl/parallel/experimental → compute}/numba_utils.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/typing.py +2 -0
- cuda/coop/__init__.py +8 -0
- cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
- cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
- cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
- cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
- cuda/coop/warp/__init__.py +9 -0
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +275 -276
- cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
- cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
- cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
- cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
- cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
- cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
- cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
- cuda/cccl/parallel/experimental/.gitignore +0 -4
- cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -174,10 +174,12 @@ CUB_NAMESPACE_BEGIN
|
|
|
174
174
|
//!
|
|
175
175
|
//! .. code-block:: python
|
|
176
176
|
//!
|
|
177
|
-
//!
|
|
177
|
+
//! from cuda import coop
|
|
178
|
+
//! from pynvjitlink import patch
|
|
179
|
+
//! patch.patch_numba_linker(lto=True)
|
|
178
180
|
//!
|
|
179
181
|
//! # Specialize radix sort for a 1D block of 128 threads owning 4 integer items each
|
|
180
|
-
//! block_radix_sort =
|
|
182
|
+
//! block_radix_sort = coop.block.radix_sort_keys(numba.int32, 128, 4)
|
|
181
183
|
//! temp_storage_bytes = block_radix_sort.temp_storage_bytes
|
|
182
184
|
//!
|
|
183
185
|
//! @cuda.jit(link=block_radix_sort.files)
|
|
@@ -190,6 +190,7 @@ enum BlockScanAlgorithm
|
|
|
190
190
|
//!
|
|
191
191
|
//! // Collectively compute the block-wide exclusive prefix sum
|
|
192
192
|
//! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
|
|
193
|
+
//! }
|
|
193
194
|
//!
|
|
194
195
|
//! Suppose the set of input ``thread_data`` across the block of threads is
|
|
195
196
|
//! ``{[1,1,1,1], [1,1,1,1], ..., [1,1,1,1]}``.
|
|
@@ -333,6 +334,7 @@ public:
|
|
|
333
334
|
//!
|
|
334
335
|
//! // Collectively compute the block-wide exclusive prefix sum
|
|
335
336
|
//! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
|
|
337
|
+
//! }
|
|
336
338
|
//!
|
|
337
339
|
//! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``.
|
|
338
340
|
//! The corresponding output ``thread_data`` in those threads will be ``0, 1, ..., 127``.
|
|
@@ -386,6 +388,7 @@ public:
|
|
|
386
388
|
//! // Collectively compute the block-wide exclusive prefix sum
|
|
387
389
|
//! int block_aggregate;
|
|
388
390
|
//! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate);
|
|
391
|
+
//! }
|
|
389
392
|
//!
|
|
390
393
|
//! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``.
|
|
391
394
|
//! The corresponding output ``thread_data`` in those threads will be ``0, 1, ..., 127``.
|
|
@@ -479,6 +482,7 @@ public:
|
|
|
479
482
|
//! // Store scanned items to output segment
|
|
480
483
|
//! d_data[block_offset + threadIdx.x] = thread_data;
|
|
481
484
|
//! }
|
|
485
|
+
//! }
|
|
482
486
|
//!
|
|
483
487
|
//! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``.
|
|
484
488
|
//! The corresponding output for the first segment will be ``0, 1, ..., 127``.
|
|
@@ -545,6 +549,7 @@ public:
|
|
|
545
549
|
//!
|
|
546
550
|
//! // Collectively compute the block-wide exclusive prefix sum
|
|
547
551
|
//! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
|
|
552
|
+
//! }
|
|
548
553
|
//!
|
|
549
554
|
//! Suppose the set of input ``thread_data`` across the block of threads is
|
|
550
555
|
//! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``.
|
|
@@ -606,6 +611,7 @@ public:
|
|
|
606
611
|
//! // Collectively compute the block-wide exclusive prefix sum
|
|
607
612
|
//! int block_aggregate;
|
|
608
613
|
//! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate);
|
|
614
|
+
//! }
|
|
609
615
|
//!
|
|
610
616
|
//! Suppose the set of input ``thread_data`` across the block of threads is
|
|
611
617
|
//! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``.
|
|
@@ -720,6 +726,7 @@ public:
|
|
|
720
726
|
//! BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
|
|
721
727
|
//! __syncthreads();
|
|
722
728
|
//! }
|
|
729
|
+
//! }
|
|
723
730
|
//!
|
|
724
731
|
//! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``.
|
|
725
732
|
//! The corresponding output for the first segment will be ``0, 1, 2, 3, ..., 510, 511``.
|
|
@@ -788,6 +795,7 @@ public:
|
|
|
788
795
|
//!
|
|
789
796
|
//! // Collectively compute the block-wide exclusive prefix max scan
|
|
790
797
|
//! BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cuda::maximum<>{});
|
|
798
|
+
//! }
|
|
791
799
|
//!
|
|
792
800
|
//! Suppose the set of input ``thread_data`` across the block of threads is ``0, -1, 2, -3, ..., 126, -127``.
|
|
793
801
|
//! The corresponding output ``thread_data`` in those threads will be ``INT_MIN, 0, 0, 2, ..., 124, 126``.
|
|
@@ -849,8 +857,9 @@ public:
|
|
|
849
857
|
//!
|
|
850
858
|
//! // Collectively compute the block-wide exclusive prefix max scan
|
|
851
859
|
//! int block_aggregate;
|
|
852
|
-
//! BlockScan(temp_storage).ExclusiveScan(
|
|
853
|
-
//!
|
|
860
|
+
//! BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data,
|
|
861
|
+
//! INT_MIN, cuda::maximum<>{}, block_aggregate);
|
|
862
|
+
//! }
|
|
854
863
|
//!
|
|
855
864
|
//! Suppose the set of input ``thread_data`` across the block of threads is ``0, -1, 2, -3, ..., 126, -127``.
|
|
856
865
|
//! The corresponding output ``thread_data`` in those threads will be ``INT_MIN, 0, 0, 2, ..., 124, 126``.
|
|
@@ -960,6 +969,7 @@ public:
|
|
|
960
969
|
//! // Store scanned items to output segment
|
|
961
970
|
//! d_data[block_offset + threadIdx.x] = thread_data;
|
|
962
971
|
//! }
|
|
972
|
+
//! }
|
|
963
973
|
//!
|
|
964
974
|
//! Suppose the input ``d_data`` is ``0, -1, 2, -3, 4, -5, ...``.
|
|
965
975
|
//! The corresponding output for the first segment will be ``INT_MIN, 0, 0, 2, ..., 124, 126``.
|
|
@@ -616,6 +616,7 @@ enum BlockStoreAlgorithm
|
|
|
616
616
|
//!
|
|
617
617
|
//! // Store items to linear memory
|
|
618
618
|
//! BlockStore(temp_storage).Store(d_data, thread_data);
|
|
619
|
+
//! }
|
|
619
620
|
//!
|
|
620
621
|
//! Suppose the set of ``thread_data`` across the block of threads is
|
|
621
622
|
//! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``.
|
|
@@ -1156,8 +1157,8 @@ public:
|
|
|
1156
1157
|
//! ...
|
|
1157
1158
|
//!
|
|
1158
1159
|
//! // Store items to linear memory
|
|
1159
|
-
//! int thread_data[4];
|
|
1160
1160
|
//! BlockStore(temp_storage).Store(d_data, thread_data);
|
|
1161
|
+
//! }
|
|
1161
1162
|
//!
|
|
1162
1163
|
//! Suppose the set of ``thread_data`` across the block of threads is
|
|
1163
1164
|
//! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``.
|
|
@@ -1208,8 +1209,8 @@ public:
|
|
|
1208
1209
|
//! ...
|
|
1209
1210
|
//!
|
|
1210
1211
|
//! // Store items to linear memory
|
|
1211
|
-
//! int thread_data[4];
|
|
1212
1212
|
//! BlockStore(temp_storage).Store(d_data, thread_data, valid_items);
|
|
1213
|
+
//! }
|
|
1213
1214
|
//!
|
|
1214
1215
|
//! Suppose the set of ``thread_data`` across the block of threads is
|
|
1215
1216
|
//! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }`` and ``valid_items`` is ``5``.
|
|
@@ -15,71 +15,76 @@
|
|
|
15
15
|
|
|
16
16
|
#include <cub/detail/fast_modulo_division.cuh> // fast_div_mod
|
|
17
17
|
|
|
18
|
+
#include <cuda/std/__mdspan/extents.h>
|
|
18
19
|
#include <cuda/std/__type_traits/make_unsigned.h>
|
|
19
20
|
#include <cuda/std/__utility/integer_sequence.h>
|
|
20
21
|
#include <cuda/std/array>
|
|
21
22
|
#include <cuda/std/cstddef>
|
|
22
|
-
#include <cuda/std/mdspan>
|
|
23
23
|
|
|
24
24
|
CUB_NAMESPACE_BEGIN
|
|
25
|
-
|
|
26
25
|
namespace detail
|
|
27
26
|
{
|
|
28
27
|
|
|
28
|
+
_CCCL_DIAG_PUSH
|
|
29
|
+
_CCCL_DIAG_SUPPRESS_MSVC(4702) // unreachable code (even if there are no branches!)
|
|
30
|
+
|
|
29
31
|
// Compute the submdspan size of a given rank
|
|
30
|
-
template <
|
|
31
|
-
[[nodiscard]]
|
|
32
|
-
|
|
32
|
+
template <typename IndexType, size_t... Extents>
|
|
33
|
+
[[nodiscard]] _CCCL_API constexpr ::cuda::std::make_unsigned_t<IndexType>
|
|
34
|
+
size_range(const ::cuda::std::extents<IndexType, Extents...>& ext, int start, int end)
|
|
33
35
|
{
|
|
36
|
+
_CCCL_ASSERT(start >= 0 && end <= static_cast<int>(ext.rank()), "invalid start or end");
|
|
34
37
|
::cuda::std::make_unsigned_t<IndexType> s = 1;
|
|
35
|
-
for (
|
|
38
|
+
for (auto i = start; i < end; i++)
|
|
36
39
|
{
|
|
37
40
|
s *= ext.extent(i);
|
|
38
41
|
}
|
|
39
42
|
return s;
|
|
40
43
|
}
|
|
41
44
|
|
|
42
|
-
//
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
45
|
+
_CCCL_DIAG_POP // MSVC(4702)
|
|
46
|
+
|
|
47
|
+
template <typename IndexType, size_t... Extents>
|
|
48
|
+
[[nodiscard]] _CCCL_API constexpr ::cuda::std::make_unsigned_t<IndexType>
|
|
49
|
+
size(const ::cuda::std::extents<IndexType, Extents...>& ext)
|
|
46
50
|
{
|
|
47
|
-
return ::
|
|
51
|
+
return cub::detail::size_range(ext, 0, static_cast<int>(ext.rank()));
|
|
48
52
|
}
|
|
49
53
|
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
[[nodiscard]] _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr ::cuda::std::make_unsigned_t<IndexType>
|
|
53
|
-
size(const ::cuda::std::extents<IndexType, Extents...>& ext)
|
|
54
|
+
template <bool IsLayoutRight, int Position, typename IndexType, size_t... E>
|
|
55
|
+
[[nodiscard]] _CCCL_API auto sub_size_fast_div_mod_impl(const ::cuda::std::extents<IndexType, E...>& ext)
|
|
54
56
|
{
|
|
55
|
-
|
|
57
|
+
using fast_mod_div_t = fast_div_mod<IndexType>;
|
|
58
|
+
constexpr auto start = IsLayoutRight ? Position + 1 : 0;
|
|
59
|
+
constexpr auto end = IsLayoutRight ? sizeof...(E) : Position;
|
|
60
|
+
return fast_mod_div_t(cub::detail::size_range(ext, start, end));
|
|
56
61
|
}
|
|
57
62
|
|
|
58
63
|
// precompute modulo/division for each submdspan size (by rank)
|
|
59
|
-
template <typename IndexType, size_t... E, size_t...
|
|
60
|
-
[[nodiscard]]
|
|
61
|
-
sub_sizes_fast_div_mod(const ::cuda::std::extents<IndexType, E...>& ext, ::cuda::std::index_sequence<
|
|
64
|
+
template <bool IsLayoutRight, typename IndexType, size_t... E, size_t... Positions>
|
|
65
|
+
[[nodiscard]] _CCCL_API auto
|
|
66
|
+
sub_sizes_fast_div_mod(const ::cuda::std::extents<IndexType, E...>& ext, ::cuda::std::index_sequence<Positions...> = {})
|
|
62
67
|
{
|
|
63
|
-
// deduction guides don't work with nvcc 11.x
|
|
64
68
|
using fast_mod_div_t = fast_div_mod<IndexType>;
|
|
65
|
-
|
|
69
|
+
using array_t = ::cuda::std::array<fast_mod_div_t, sizeof...(Positions)>;
|
|
70
|
+
return array_t{cub::detail::sub_size_fast_div_mod_impl<IsLayoutRight, Positions>(ext)...};
|
|
66
71
|
}
|
|
67
72
|
|
|
68
73
|
// precompute modulo/division for each mdspan extent
|
|
69
|
-
template <typename IndexType, size_t... E, size_t...
|
|
70
|
-
[[nodiscard]]
|
|
71
|
-
extents_fast_div_mod(const ::cuda::std::extents<IndexType, E...>& ext, ::cuda::std::index_sequence<
|
|
74
|
+
template <typename IndexType, size_t... E, size_t... Positions>
|
|
75
|
+
[[nodiscard]] _CCCL_API auto
|
|
76
|
+
extents_fast_div_mod(const ::cuda::std::extents<IndexType, E...>& ext, ::cuda::std::index_sequence<Positions...> = {})
|
|
72
77
|
{
|
|
73
78
|
using fast_mod_div_t = fast_div_mod<IndexType>;
|
|
74
|
-
|
|
79
|
+
using array_t = ::cuda::std::array<fast_mod_div_t, sizeof...(Positions)>;
|
|
80
|
+
return array_t{fast_mod_div_t(ext.extent(Positions))...};
|
|
75
81
|
}
|
|
76
82
|
|
|
77
83
|
// GCC <= 9 constexpr workaround: Extent must be passed as type only, even const Extent& doesn't work
|
|
78
|
-
template <
|
|
79
|
-
[[nodiscard]]
|
|
84
|
+
template <typename Extents>
|
|
85
|
+
[[nodiscard]] _CCCL_API constexpr bool are_extents_in_range_static(int start, int end)
|
|
80
86
|
{
|
|
81
|
-
|
|
82
|
-
for (index_type i = Rank; i < Extents::rank(); i++)
|
|
87
|
+
for (auto i = start; i < end; i++)
|
|
83
88
|
{
|
|
84
89
|
if (Extents::static_extent(i) == ::cuda::std::dynamic_extent)
|
|
85
90
|
{
|
|
@@ -106,5 +111,4 @@ template <typename MappingTypeLhs, typename MappingTypeRhs>
|
|
|
106
111
|
}
|
|
107
112
|
|
|
108
113
|
} // namespace detail
|
|
109
|
-
|
|
110
114
|
CUB_NAMESPACE_END
|
|
@@ -1,29 +1,5 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
*
|
|
4
|
-
* Redistribution and use in source and binary forms, with or without
|
|
5
|
-
* modification, are permitted provided that the following conditions are met:
|
|
6
|
-
* * Redistributions of source code must retain the above copyright
|
|
7
|
-
* notice, this list of conditions and the following disclaimer.
|
|
8
|
-
* * Redistributions in binary form must reproduce the above copyright
|
|
9
|
-
* notice, this list of conditions and the following disclaimer in the
|
|
10
|
-
* documentation and/or other materials provided with the distribution.
|
|
11
|
-
* * Neither the name of the NVIDIA CORPORATION nor the
|
|
12
|
-
* names of its contributors may be used to endorse or promote products
|
|
13
|
-
* derived from this software without specific prior written permission.
|
|
14
|
-
*
|
|
15
|
-
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
16
|
-
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
17
|
-
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
18
|
-
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
|
|
19
|
-
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
|
20
|
-
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
21
|
-
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
|
22
|
-
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
23
|
-
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
24
|
-
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
25
|
-
*
|
|
26
|
-
******************************************************************************/
|
|
1
|
+
// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
// SPDX-License-Identifier: BSD-3-Clause
|
|
27
3
|
|
|
28
4
|
#pragma once
|
|
29
5
|
|
|
@@ -41,24 +17,23 @@
|
|
|
41
17
|
#include <cub/util_namespace.cuh>
|
|
42
18
|
|
|
43
19
|
#include <thrust/detail/raw_reference_cast.h>
|
|
44
|
-
#include <thrust/distance.h>
|
|
45
20
|
#include <thrust/type_traits/is_contiguous_iterator.h>
|
|
46
21
|
#include <thrust/type_traits/unwrap_contiguous_iterator.h>
|
|
47
22
|
|
|
48
23
|
#include <cuda/__cmath/ceil_div.h>
|
|
24
|
+
#include <cuda/std/__concepts/concept_macros.h>
|
|
25
|
+
#include <cuda/std/__fwd/mdspan.h>
|
|
49
26
|
#include <cuda/std/__iterator/distance.h>
|
|
50
27
|
#include <cuda/std/__mdspan/extents.h>
|
|
28
|
+
#include <cuda/std/__mdspan/layout_left.h>
|
|
29
|
+
#include <cuda/std/__mdspan/layout_right.h>
|
|
51
30
|
#include <cuda/std/__memory/is_sufficiently_aligned.h>
|
|
52
31
|
#include <cuda/std/__type_traits/is_integral.h>
|
|
53
|
-
#include <cuda/std/__utility/integer_sequence.h>
|
|
54
32
|
#include <cuda/std/array>
|
|
55
33
|
|
|
56
34
|
CUB_NAMESPACE_BEGIN
|
|
57
35
|
|
|
58
|
-
namespace detail
|
|
59
|
-
{
|
|
60
|
-
|
|
61
|
-
namespace for_each
|
|
36
|
+
namespace detail::for_each
|
|
62
37
|
{
|
|
63
38
|
|
|
64
39
|
/**
|
|
@@ -122,8 +97,7 @@ struct op_wrapper_vectorized_t
|
|
|
122
97
|
}
|
|
123
98
|
};
|
|
124
99
|
|
|
125
|
-
} // namespace for_each
|
|
126
|
-
} // namespace detail
|
|
100
|
+
} // namespace detail::for_each
|
|
127
101
|
|
|
128
102
|
struct DeviceFor
|
|
129
103
|
{
|
|
@@ -568,6 +542,10 @@ public:
|
|
|
568
542
|
{
|
|
569
543
|
_CCCL_NVTX_RANGE_SCOPE("cub::DeviceFor::Bulk");
|
|
570
544
|
static_assert(::cuda::std::is_integral_v<ShapeT>, "ShapeT must be an integral type");
|
|
545
|
+
if (shape == 0)
|
|
546
|
+
{
|
|
547
|
+
return cudaSuccess;
|
|
548
|
+
}
|
|
571
549
|
using offset_t = ShapeT;
|
|
572
550
|
return detail::for_each::dispatch_t<offset_t, OpT>::dispatch(static_cast<offset_t>(shape), op, stream);
|
|
573
551
|
}
|
|
@@ -833,7 +811,8 @@ public:
|
|
|
833
811
|
//! Overview
|
|
834
812
|
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
835
813
|
//!
|
|
836
|
-
//! Iterate through a multi-dimensional extents into
|
|
814
|
+
//! Iterate through a multi-dimensional extents into a single linear index and a list of indices for each extent
|
|
815
|
+
//! dimension.
|
|
837
816
|
//!
|
|
838
817
|
//! - a single linear index that represents the current iteration
|
|
839
818
|
//! - indices of each extent dimension
|
|
@@ -899,8 +878,6 @@ public:
|
|
|
899
878
|
OpType op,
|
|
900
879
|
cudaStream_t stream = {})
|
|
901
880
|
{
|
|
902
|
-
// TODO: check dimensions overflows
|
|
903
|
-
// TODO: check tha arity of OpType is equal to sizeof...(ExtentsType)
|
|
904
881
|
if (d_temp_storage == nullptr)
|
|
905
882
|
{
|
|
906
883
|
temp_storage_bytes = 1;
|
|
@@ -967,19 +944,120 @@ public:
|
|
|
967
944
|
template <typename IndexType, size_t... Extents, typename OpType>
|
|
968
945
|
CUB_RUNTIME_FUNCTION static cudaError_t
|
|
969
946
|
ForEachInExtents(const ::cuda::std::extents<IndexType, Extents...>& extents, OpType op, cudaStream_t stream = {})
|
|
947
|
+
{
|
|
948
|
+
using extents_type = ::cuda::std::extents<IndexType, Extents...>;
|
|
949
|
+
return cub::DeviceFor::ForEachInLayout(::cuda::std::layout_right::mapping<extents_type>{extents}, op, stream);
|
|
950
|
+
}
|
|
951
|
+
|
|
952
|
+
/*********************************************************************************************************************
|
|
953
|
+
* ForEachInLayout
|
|
954
|
+
********************************************************************************************************************/
|
|
955
|
+
|
|
956
|
+
//! @rst
|
|
957
|
+
//! Overview
|
|
958
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
959
|
+
//!
|
|
960
|
+
//! Iterate through multi-dimensional extents using a specific mdspan layout, applying a function object for each
|
|
961
|
+
//! element, passing
|
|
962
|
+
//!
|
|
963
|
+
//! - a single linear index that represents the current iteration
|
|
964
|
+
//! - a list of indices containing the coordinates for each extent dimension
|
|
965
|
+
//!
|
|
966
|
+
//! The iteration order depends on the layout type:
|
|
967
|
+
//!
|
|
968
|
+
//! - ``layout_right``: Iterates in row-major order (rightmost index varies fastest)
|
|
969
|
+
//! - ``layout_left``: Iterates in column-major order (leftmost index varies fastest)
|
|
970
|
+
//!
|
|
971
|
+
//! - The return value of ``op``, if any, is ignored.
|
|
972
|
+
//!
|
|
973
|
+
//! A Simple Example
|
|
974
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
975
|
+
//!
|
|
976
|
+
//! The following code snippet demonstrates how to use ``ForEachInLayout`` to iterate through a 2D matrix in
|
|
977
|
+
//! column-major order using ``layout_left``.
|
|
978
|
+
//!
|
|
979
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_for_each_in_layout_api.cu
|
|
980
|
+
//! :language: c++
|
|
981
|
+
//! :dedent:
|
|
982
|
+
//! :start-after: example-begin for-each-in-layout-op
|
|
983
|
+
//! :end-before: example-end for-each-in-layout-op
|
|
984
|
+
//!
|
|
985
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_for_each_in_layout_api.cu
|
|
986
|
+
//! :language: c++
|
|
987
|
+
//! :dedent:
|
|
988
|
+
//! :start-after: example-begin for-each-in-layout-example
|
|
989
|
+
//! :end-before: example-end for-each-in-layout-example
|
|
990
|
+
//!
|
|
991
|
+
//! @endrst
|
|
992
|
+
//!
|
|
993
|
+
//! @tparam Layout
|
|
994
|
+
//! **[inferred]** The mdspan layout type, must be either ``cuda::std::layout_left`` or ``cuda::std::layout_right``
|
|
995
|
+
//!
|
|
996
|
+
//! @tparam IndexType
|
|
997
|
+
//! **[inferred]** An integral type that represents the extent index space
|
|
998
|
+
//!
|
|
999
|
+
//! @tparam Extents
|
|
1000
|
+
//! **[inferred]** The extent sizes for each rank index
|
|
1001
|
+
//!
|
|
1002
|
+
//! @tparam OpType
|
|
1003
|
+
//! **[inferred]** A function object with arity equal to the number of extents + 1 for the linear index (iteration).
|
|
1004
|
+
//! The first parameter is the linear index, followed by one parameter for each dimension coordinate.
|
|
1005
|
+
//!
|
|
1006
|
+
//! @param[in] layout
|
|
1007
|
+
//! Layout object that determines the iteration order (layout_left for column-major, layout_right for row-major)
|
|
1008
|
+
//!
|
|
1009
|
+
//! @param[in] extents
|
|
1010
|
+
//! Extents object that represents a multi-dimensional index space
|
|
1011
|
+
//!
|
|
1012
|
+
//! @param[in] op
|
|
1013
|
+
//! Function object to apply to each linear index (iteration) and multi-dimensional coordinates.
|
|
1014
|
+
//! Called as ``op(linear_index, coord_0, coord_1, ..., coord_n)``
|
|
1015
|
+
//!
|
|
1016
|
+
//! @param[in] stream
|
|
1017
|
+
//! CUDA stream to launch kernels within. Default stream is `nullptr`
|
|
1018
|
+
//!
|
|
1019
|
+
//! @return cudaError_t
|
|
1020
|
+
//! error status
|
|
1021
|
+
_CCCL_TEMPLATE(typename LayoutMapping, typename OpType)
|
|
1022
|
+
_CCCL_REQUIRES(::cuda::std::__is_any_mdspan_layout_mapping_left_or_right_v<LayoutMapping>)
|
|
1023
|
+
[[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t
|
|
1024
|
+
ForEachInLayout(const LayoutMapping& layout_mapping, OpType op, cudaStream_t stream = {})
|
|
970
1025
|
{
|
|
971
1026
|
using namespace cub::detail;
|
|
972
|
-
using extents_type = ::
|
|
1027
|
+
using extents_type = typename LayoutMapping::extents_type;
|
|
973
1028
|
using extent_index_type = typename extents_type::index_type;
|
|
974
1029
|
using fast_mod_array_t = ::cuda::std::array<fast_div_mod<extent_index_type>, extents_type::rank()>;
|
|
975
1030
|
_CCCL_NVTX_RANGE_SCOPE("cub::DeviceFor::ForEachInExtents");
|
|
976
1031
|
static constexpr auto seq = ::cuda::std::make_index_sequence<extents_type::rank()>{};
|
|
977
|
-
|
|
1032
|
+
constexpr bool is_layout_right = ::cuda::std::__is_any_mdspan_layout_mapping_right_v<LayoutMapping>;
|
|
1033
|
+
auto extents = layout_mapping.extents();
|
|
1034
|
+
fast_mod_array_t sub_sizes_div_array = cub::detail::sub_sizes_fast_div_mod<is_layout_right>(extents, seq);
|
|
978
1035
|
fast_mod_array_t extents_div_array = cub::detail::extents_fast_div_mod(extents, seq);
|
|
979
|
-
for_each::op_wrapper_extents_t<OpType, extents_type, fast_mod_array_t> op_wrapper{
|
|
1036
|
+
for_each::op_wrapper_extents_t<OpType, extents_type, is_layout_right, fast_mod_array_t> op_wrapper{
|
|
980
1037
|
op, extents, sub_sizes_div_array, extents_div_array};
|
|
981
1038
|
return Bulk(static_cast<implicit_prom_t<extent_index_type>>(cub::detail::size(extents)), op_wrapper, stream);
|
|
982
1039
|
}
|
|
1040
|
+
|
|
1041
|
+
#ifndef _CCCL_DOXYGEN_INVOKED
|
|
1042
|
+
|
|
1043
|
+
_CCCL_TEMPLATE(typename LayoutMapping, typename OpType)
|
|
1044
|
+
_CCCL_REQUIRES(::cuda::std::__is_any_mdspan_layout_mapping_left_or_right_v<LayoutMapping>)
|
|
1045
|
+
[[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t ForEachInLayout(
|
|
1046
|
+
void* d_temp_storage,
|
|
1047
|
+
size_t& temp_storage_bytes,
|
|
1048
|
+
const LayoutMapping& layout_mapping,
|
|
1049
|
+
OpType op,
|
|
1050
|
+
cudaStream_t stream = {})
|
|
1051
|
+
{
|
|
1052
|
+
if (d_temp_storage == nullptr)
|
|
1053
|
+
{
|
|
1054
|
+
temp_storage_bytes = 1;
|
|
1055
|
+
return cudaSuccess;
|
|
1056
|
+
}
|
|
1057
|
+
return ForEachInLayout(layout_mapping, op, stream);
|
|
1058
|
+
}
|
|
1059
|
+
|
|
1060
|
+
#endif // !_CCCL_DOXYGEN_INVOKED
|
|
983
1061
|
};
|
|
984
1062
|
|
|
985
1063
|
CUB_NAMESPACE_END
|
|
@@ -52,15 +52,15 @@
|
|
|
52
52
|
#include <cub/thread/thread_operators.cuh>
|
|
53
53
|
#include <cub/util_type.cuh>
|
|
54
54
|
|
|
55
|
-
#include <thrust/iterator/tabulate_output_iterator.h>
|
|
56
|
-
|
|
57
55
|
#include <cuda/__execution/determinism.h>
|
|
58
56
|
#include <cuda/__execution/require.h>
|
|
59
57
|
#include <cuda/__execution/tune.h>
|
|
60
58
|
#include <cuda/__functional/maximum.h>
|
|
61
59
|
#include <cuda/__functional/minimum.h>
|
|
60
|
+
#include <cuda/__iterator/tabulate_output_iterator.h>
|
|
62
61
|
#include <cuda/__memory_resource/get_memory_resource.h>
|
|
63
62
|
#include <cuda/__stream/get_stream.h>
|
|
63
|
+
#include <cuda/__stream/stream_ref.h>
|
|
64
64
|
#include <cuda/std/__execution/env.h>
|
|
65
65
|
#include <cuda/std/__functional/identity.h>
|
|
66
66
|
#include <cuda/std/__functional/invoke.h>
|
|
@@ -70,7 +70,6 @@
|
|
|
70
70
|
#include <cuda/std/__type_traits/is_same.h>
|
|
71
71
|
#include <cuda/std/cstdint>
|
|
72
72
|
#include <cuda/std/limits>
|
|
73
|
-
#include <cuda/stream_ref>
|
|
74
73
|
|
|
75
74
|
CUB_NAMESPACE_BEGIN
|
|
76
75
|
|
|
@@ -1215,7 +1214,7 @@ public:
|
|
|
1215
1214
|
OutputExtremumT initial_value{::cuda::std::numeric_limits<InputValueT>::max()};
|
|
1216
1215
|
|
|
1217
1216
|
// Tabulate output iterator that unzips the result and writes it to the user-provided output iterators
|
|
1218
|
-
auto out_it =
|
|
1217
|
+
auto out_it = ::cuda::make_tabulate_output_iterator(
|
|
1219
1218
|
detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_min_out, d_index_out});
|
|
1220
1219
|
|
|
1221
1220
|
return detail::reduce::dispatch_streaming_arg_reduce_t<
|
|
@@ -1341,7 +1340,7 @@ public:
|
|
|
1341
1340
|
OutputExtremumT initial_value{::cuda::std::numeric_limits<InputValueT>::max()};
|
|
1342
1341
|
|
|
1343
1342
|
// Tabulate output iterator that unzips the result and writes it to the user-provided output iterators
|
|
1344
|
-
auto out_it =
|
|
1343
|
+
auto out_it = ::cuda::make_tabulate_output_iterator(
|
|
1345
1344
|
detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_min_out, d_index_out});
|
|
1346
1345
|
|
|
1347
1346
|
// Query the required temporary storage size
|
|
@@ -1883,7 +1882,7 @@ public:
|
|
|
1883
1882
|
OutputExtremumT initial_value{::cuda::std::numeric_limits<InputValueT>::lowest()};
|
|
1884
1883
|
|
|
1885
1884
|
// Tabulate output iterator that unzips the result and writes it to the user-provided output iterators
|
|
1886
|
-
auto out_it =
|
|
1885
|
+
auto out_it = ::cuda::make_tabulate_output_iterator(
|
|
1887
1886
|
detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_max_out, d_index_out});
|
|
1888
1887
|
|
|
1889
1888
|
return detail::reduce::dispatch_streaming_arg_reduce_t<
|
|
@@ -2133,7 +2132,7 @@ public:
|
|
|
2133
2132
|
OutputExtremumT initial_value{::cuda::std::numeric_limits<InputValueT>::max()};
|
|
2134
2133
|
|
|
2135
2134
|
// Tabulate output iterator that unzips the result and writes it to the user-provided output iterators
|
|
2136
|
-
auto out_it =
|
|
2135
|
+
auto out_it = ::cuda::make_tabulate_output_iterator(
|
|
2137
2136
|
detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_max_out, d_index_out});
|
|
2138
2137
|
|
|
2139
2138
|
// Query the required temporary storage size
|