cuda-cccl 0.3.0__cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.2__cp312-cp312-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/cooperative/__init__.py +7 -1
- cuda/cccl/cooperative/experimental/__init__.py +21 -5
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
- cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
- cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
- cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
- cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +4 -2
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
- cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
- cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
- cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
- cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
- cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +170 -260
- cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +6 -7
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +12 -29
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +2 -7
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
- cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
- cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
- cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
- cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
- cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
- cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
- cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
- cuda/cccl/headers/include/cuda/__cccl_config +1 -0
- cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
- cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
- cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
- cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
- cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +247 -323
- cuda/cccl/headers/include/cuda/__device/attributes.h +174 -123
- cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
- cuda/cccl/headers/include/cuda/__device/device_ref.h +27 -49
- cuda/cccl/headers/include/cuda/__device/physical_device.h +100 -96
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
- cuda/cccl/headers/include/cuda/__event/event.h +27 -26
- cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
- cuda/cccl/headers/include/cuda/__event/timed_event.h +10 -7
- cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
- cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
- cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
- cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
- cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +21 -137
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
- cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
- cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
- cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
- cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
- cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
- cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
- cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +9 -7
- cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -16
- cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
- cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
- cuda/cccl/headers/include/cuda/algorithm +1 -1
- cuda/cccl/headers/include/cuda/cmath +1 -0
- cuda/cccl/headers/include/cuda/devices +13 -0
- cuda/cccl/headers/include/cuda/iterator +1 -0
- cuda/cccl/headers/include/cuda/memory +1 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
- cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
- cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
- cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
- cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
- cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
- cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
- cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
- cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
- cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
- cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
- cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
- cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
- cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
- cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
- cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
- cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
- cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
- cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
- cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
- cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
- cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
- cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
- cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
- cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
- cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
- cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
- cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
- cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
- cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
- cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
- cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
- cuda/cccl/headers/include/cuda/std/bitset +1 -1
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
- cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
- cuda/cccl/headers/include/cuda/std/numbers +5 -0
- cuda/cccl/headers/include/cuda/std/string_view +155 -13
- cuda/cccl/headers/include/cuda/std/version +1 -4
- cuda/cccl/headers/include/cuda/stream_ref +5 -0
- cuda/cccl/headers/include/cuda/utility +1 -0
- cuda/cccl/headers/include/nv/target +7 -2
- cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
- cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
- cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
- cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
- cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
- cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
- cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
- cuda/cccl/headers/include/thrust/device_delete.h +18 -3
- cuda/cccl/headers/include/thrust/device_free.h +16 -3
- cuda/cccl/headers/include/thrust/device_new.h +29 -8
- cuda/cccl/headers/include/thrust/host_vector.h +1 -1
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
- cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
- cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
- cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
- cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
- cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
- cuda/cccl/parallel/experimental/__init__.py +21 -74
- cuda/compute/__init__.py +79 -0
- cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +43 -1
- cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +157 -8
- cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
- cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +112 -40
- cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +36 -15
- cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
- cuda/compute/cu12/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/cu13/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +2 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +36 -8
- cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +206 -1
- cuda/{cccl/parallel/experimental → compute}/numba_utils.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/typing.py +2 -0
- cuda/coop/__init__.py +8 -0
- cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
- cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
- cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
- cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
- cuda/coop/warp/__init__.py +9 -0
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +275 -276
- cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
- cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
- cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
- cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
- cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
- cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
- cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
- cuda/cccl/parallel/experimental/.gitignore +0 -4
- cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -84,70 +84,6 @@ CUB_NAMESPACE_BEGIN
|
|
|
84
84
|
//! @endrst
|
|
85
85
|
struct DeviceSegmentedReduce
|
|
86
86
|
{
|
|
87
|
-
private:
|
|
88
|
-
template <typename InputIteratorT,
|
|
89
|
-
typename OutputIteratorT,
|
|
90
|
-
typename BeginOffsetIteratorT,
|
|
91
|
-
typename EndOffsetIteratorT,
|
|
92
|
-
typename OffsetT,
|
|
93
|
-
typename ReductionOpT,
|
|
94
|
-
typename InitT,
|
|
95
|
-
typename... Ts>
|
|
96
|
-
CUB_RUNTIME_FUNCTION static cudaError_t segmented_reduce(
|
|
97
|
-
::cuda::std::false_type,
|
|
98
|
-
void* d_temp_storage,
|
|
99
|
-
size_t& temp_storage_bytes,
|
|
100
|
-
InputIteratorT d_in,
|
|
101
|
-
OutputIteratorT d_out,
|
|
102
|
-
::cuda::std::int64_t num_segments,
|
|
103
|
-
BeginOffsetIteratorT d_begin_offsets,
|
|
104
|
-
EndOffsetIteratorT d_end_offsets,
|
|
105
|
-
ReductionOpT reduction_op,
|
|
106
|
-
InitT initial_value,
|
|
107
|
-
cudaStream_t stream);
|
|
108
|
-
|
|
109
|
-
template <typename InputIteratorT,
|
|
110
|
-
typename OutputIteratorT,
|
|
111
|
-
typename BeginOffsetIteratorT,
|
|
112
|
-
typename EndOffsetIteratorT,
|
|
113
|
-
typename OffsetT,
|
|
114
|
-
typename ReductionOpT,
|
|
115
|
-
typename InitT,
|
|
116
|
-
typename... Ts>
|
|
117
|
-
CUB_RUNTIME_FUNCTION static cudaError_t segmented_reduce(
|
|
118
|
-
::cuda::std::true_type,
|
|
119
|
-
void* d_temp_storage,
|
|
120
|
-
size_t& temp_storage_bytes,
|
|
121
|
-
InputIteratorT d_in,
|
|
122
|
-
OutputIteratorT d_out,
|
|
123
|
-
::cuda::std::int64_t num_segments,
|
|
124
|
-
BeginOffsetIteratorT d_begin_offsets,
|
|
125
|
-
EndOffsetIteratorT d_end_offsets,
|
|
126
|
-
ReductionOpT reduction_op,
|
|
127
|
-
InitT initial_value,
|
|
128
|
-
cudaStream_t stream)
|
|
129
|
-
{
|
|
130
|
-
return DispatchSegmentedReduce<
|
|
131
|
-
InputIteratorT,
|
|
132
|
-
OutputIteratorT,
|
|
133
|
-
BeginOffsetIteratorT,
|
|
134
|
-
EndOffsetIteratorT,
|
|
135
|
-
OffsetT,
|
|
136
|
-
ReductionOpT,
|
|
137
|
-
InitT,
|
|
138
|
-
Ts...>::Dispatch(d_temp_storage,
|
|
139
|
-
temp_storage_bytes,
|
|
140
|
-
d_in,
|
|
141
|
-
d_out,
|
|
142
|
-
num_segments,
|
|
143
|
-
d_begin_offsets,
|
|
144
|
-
d_end_offsets,
|
|
145
|
-
reduction_op,
|
|
146
|
-
initial_value,
|
|
147
|
-
stream);
|
|
148
|
-
}
|
|
149
|
-
|
|
150
|
-
public:
|
|
151
87
|
//! @rst
|
|
152
88
|
//! Computes a device-wide segmented reduction using the specified
|
|
153
89
|
//! binary ``reduction_op`` functor.
|
|
@@ -220,14 +156,14 @@ public:
|
|
|
220
156
|
//! @rst
|
|
221
157
|
//! Random-access input iterator to the sequence of beginning offsets of
|
|
222
158
|
//! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
|
|
223
|
-
//! element of the *i*\ :sup:`th` data segment in ``
|
|
159
|
+
//! element of the *i*\ :sup:`th` data segment in ``d_in``
|
|
224
160
|
//! @endrst
|
|
225
161
|
//!
|
|
226
162
|
//! @param[in] d_end_offsets
|
|
227
163
|
//! @rst
|
|
228
164
|
//! Random-access input iterator to the sequence of ending offsets of length
|
|
229
165
|
//! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
|
|
230
|
-
//! the *i*\ :sup:`th` data segment in ``
|
|
166
|
+
//! the *i*\ :sup:`th` data segment in ``d_in``.
|
|
231
167
|
//! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` segment is considered empty.
|
|
232
168
|
//! @endrst
|
|
233
169
|
//!
|
|
@@ -261,24 +197,29 @@ public:
|
|
|
261
197
|
{
|
|
262
198
|
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::Reduce");
|
|
263
199
|
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
200
|
+
using OffsetT = detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
|
|
201
|
+
static_assert(::cuda::std::is_integral_v<OffsetT>, "Offset iterator value type should be integral.");
|
|
202
|
+
if constexpr (::cuda::std::is_integral_v<OffsetT>)
|
|
203
|
+
{
|
|
204
|
+
return DispatchSegmentedReduce<
|
|
205
|
+
InputIteratorT,
|
|
206
|
+
OutputIteratorT,
|
|
207
|
+
BeginOffsetIteratorT,
|
|
208
|
+
EndOffsetIteratorT,
|
|
209
|
+
OffsetT,
|
|
210
|
+
ReductionOpT,
|
|
211
|
+
T>::Dispatch(d_temp_storage,
|
|
212
|
+
temp_storage_bytes,
|
|
213
|
+
d_in,
|
|
214
|
+
d_out,
|
|
215
|
+
num_segments,
|
|
216
|
+
d_begin_offsets,
|
|
217
|
+
d_end_offsets,
|
|
218
|
+
reduction_op,
|
|
219
|
+
initial_value, // zero-initialize
|
|
220
|
+
stream);
|
|
221
|
+
}
|
|
222
|
+
_CCCL_UNREACHABLE();
|
|
282
223
|
}
|
|
283
224
|
|
|
284
225
|
//! @rst
|
|
@@ -431,15 +372,14 @@ public:
|
|
|
431
372
|
//! @rst
|
|
432
373
|
//! Random-access input iterator to the sequence of beginning offsets of
|
|
433
374
|
//! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
|
|
434
|
-
//! element of the *i*\ :sup:`th` data segment in ``
|
|
435
|
-
//! ``d_values_*``
|
|
375
|
+
//! element of the *i*\ :sup:`th` data segment in ``d_in``
|
|
436
376
|
//! @endrst
|
|
437
377
|
//!
|
|
438
378
|
//! @param[in] d_end_offsets
|
|
439
379
|
//! @rst
|
|
440
380
|
//! Random-access input iterator to the sequence of ending offsets of length
|
|
441
381
|
//! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
|
|
442
|
-
//! the *i*\ :sup:`th` data segment in ``
|
|
382
|
+
//! the *i*\ :sup:`th` data segment in ``d_in``.
|
|
443
383
|
//! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` segment is considered empty.
|
|
444
384
|
//! @endrst
|
|
445
385
|
//!
|
|
@@ -465,32 +405,31 @@ public:
|
|
|
465
405
|
{
|
|
466
406
|
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::Sum");
|
|
467
407
|
|
|
468
|
-
// Integer type for global offsets
|
|
469
408
|
using OffsetT = detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
409
|
+
using OutputT = detail::non_void_value_t<OutputIteratorT, detail::it_value_t<InputIteratorT>>;
|
|
410
|
+
using init_t = OutputT;
|
|
411
|
+
static_assert(::cuda::std::is_integral_v<OffsetT>, "Offset iterator value type should be integral.");
|
|
412
|
+
if constexpr (::cuda::std::is_integral_v<OffsetT>)
|
|
413
|
+
{
|
|
414
|
+
return DispatchSegmentedReduce<
|
|
415
|
+
InputIteratorT,
|
|
416
|
+
OutputIteratorT,
|
|
417
|
+
BeginOffsetIteratorT,
|
|
418
|
+
EndOffsetIteratorT,
|
|
419
|
+
OffsetT,
|
|
420
|
+
::cuda::std::plus<>,
|
|
421
|
+
init_t>::Dispatch(d_temp_storage,
|
|
422
|
+
temp_storage_bytes,
|
|
423
|
+
d_in,
|
|
424
|
+
d_out,
|
|
425
|
+
num_segments,
|
|
426
|
+
d_begin_offsets,
|
|
427
|
+
d_end_offsets,
|
|
428
|
+
::cuda::std::plus<>{},
|
|
429
|
+
init_t{}, // zero-initialize
|
|
430
|
+
stream);
|
|
431
|
+
}
|
|
432
|
+
_CCCL_UNREACHABLE();
|
|
494
433
|
}
|
|
495
434
|
|
|
496
435
|
//! @rst
|
|
@@ -556,9 +495,7 @@ public:
|
|
|
556
495
|
// `offset_t` a.k.a `SegmentSizeT` is fixed to `int` type now, but later can be changed to accept
|
|
557
496
|
// integral constant or larger integral types
|
|
558
497
|
using offset_t = int;
|
|
559
|
-
|
|
560
|
-
// The output value type
|
|
561
|
-
using output_t = cub::detail::non_void_value_t<OutputIteratorT, cub::detail::it_value_t<InputIteratorT>>;
|
|
498
|
+
using output_t = detail::non_void_value_t<OutputIteratorT, detail::it_value_t<InputIteratorT>>;
|
|
562
499
|
|
|
563
500
|
return detail::reduce::
|
|
564
501
|
DispatchFixedSizeSegmentedReduce<InputIteratorT, OutputIteratorT, offset_t, ::cuda::std::plus<>, output_t>::Dispatch(
|
|
@@ -640,14 +577,14 @@ public:
|
|
|
640
577
|
//! @rst
|
|
641
578
|
//! Random-access input iterator to the sequence of beginning offsets of
|
|
642
579
|
//! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
|
|
643
|
-
//! element of the *i*\ :sup:`th` data segment in ``
|
|
580
|
+
//! element of the *i*\ :sup:`th` data segment in ``d_in``
|
|
644
581
|
//! @endrst
|
|
645
582
|
//!
|
|
646
583
|
//! @param[in] d_end_offsets
|
|
647
584
|
//! @rst
|
|
648
585
|
//! Random-access input iterator to the sequence of ending offsets of length
|
|
649
586
|
//! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
|
|
650
|
-
//! the *i*\ :sup:`th` data segment in ``
|
|
587
|
+
//! the *i*\ :sup:`th` data segment in ``d_in``.
|
|
651
588
|
//! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` segment is considered empty.
|
|
652
589
|
//! @endrst
|
|
653
590
|
//!
|
|
@@ -673,32 +610,31 @@ public:
|
|
|
673
610
|
{
|
|
674
611
|
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::Min");
|
|
675
612
|
|
|
676
|
-
// Integer type for global offsets
|
|
677
613
|
using OffsetT = detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
614
|
+
using InputT = detail::it_value_t<InputIteratorT>;
|
|
615
|
+
using init_t = InputT;
|
|
616
|
+
static_assert(::cuda::std::is_integral_v<OffsetT>, "Offset iterator value type should be integral.");
|
|
617
|
+
if constexpr (::cuda::std::is_integral_v<OffsetT>)
|
|
618
|
+
{
|
|
619
|
+
return DispatchSegmentedReduce<
|
|
620
|
+
InputIteratorT,
|
|
621
|
+
OutputIteratorT,
|
|
622
|
+
BeginOffsetIteratorT,
|
|
623
|
+
EndOffsetIteratorT,
|
|
624
|
+
OffsetT,
|
|
625
|
+
::cuda::minimum<>,
|
|
626
|
+
init_t>::Dispatch(d_temp_storage,
|
|
627
|
+
temp_storage_bytes,
|
|
628
|
+
d_in,
|
|
629
|
+
d_out,
|
|
630
|
+
num_segments,
|
|
631
|
+
d_begin_offsets,
|
|
632
|
+
d_end_offsets,
|
|
633
|
+
::cuda::minimum<>{},
|
|
634
|
+
::cuda::std::numeric_limits<init_t>::max(),
|
|
635
|
+
stream);
|
|
636
|
+
}
|
|
637
|
+
_CCCL_UNREACHABLE();
|
|
702
638
|
}
|
|
703
639
|
|
|
704
640
|
//! @rst
|
|
@@ -769,9 +705,7 @@ public:
|
|
|
769
705
|
// `offset_t` a.k.a `SegmentSizeT` is fixed to `int` type now, but later can be changed to accept
|
|
770
706
|
// integral constant or larger integral types
|
|
771
707
|
using offset_t = int;
|
|
772
|
-
|
|
773
|
-
// The input value type
|
|
774
|
-
using input_t = cub::detail::it_value_t<InputIteratorT>;
|
|
708
|
+
using input_t = detail::it_value_t<InputIteratorT>;
|
|
775
709
|
|
|
776
710
|
return detail::reduce::
|
|
777
711
|
DispatchFixedSizeSegmentedReduce<InputIteratorT, OutputIteratorT, offset_t, ::cuda::minimum<>, input_t>::Dispatch(
|
|
@@ -857,14 +791,14 @@ public:
|
|
|
857
791
|
//! @rst
|
|
858
792
|
//! Random-access input iterator to the sequence of beginning offsets of
|
|
859
793
|
//! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
|
|
860
|
-
//! element of the *i*\ :sup:`th` data segment in ``
|
|
794
|
+
//! element of the *i*\ :sup:`th` data segment in ``d_in``
|
|
861
795
|
//! @endrst
|
|
862
796
|
//!
|
|
863
797
|
//! @param[in] d_end_offsets
|
|
864
798
|
//! @rst
|
|
865
799
|
//! Random-access input iterator to the sequence of ending offsets of length
|
|
866
800
|
//! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
|
|
867
|
-
//! the *i*\ :sup:`th` data segment in ``
|
|
801
|
+
//! the *i*\ :sup:`th` data segment in ``d_in``.
|
|
868
802
|
//! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` segment is considered empty.
|
|
869
803
|
//! @endrst
|
|
870
804
|
//!
|
|
@@ -890,54 +824,45 @@ public:
|
|
|
890
824
|
{
|
|
891
825
|
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::ArgMin");
|
|
892
826
|
|
|
893
|
-
// Integer type for global offsets
|
|
894
827
|
// Using common iterator value type is a breaking change, see:
|
|
895
828
|
// https://github.com/NVIDIA/cccl/pull/414#discussion_r1330632615
|
|
896
829
|
using OffsetT = int; // detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
|
|
897
830
|
|
|
898
|
-
|
|
899
|
-
using
|
|
900
|
-
|
|
901
|
-
// The output tuple type
|
|
902
|
-
using OutputTupleT = cub::detail::non_void_value_t<OutputIteratorT, KeyValuePair<OffsetT, InputValueT>>;
|
|
903
|
-
|
|
904
|
-
// The output value type
|
|
831
|
+
using InputValueT = detail::it_value_t<InputIteratorT>;
|
|
832
|
+
using OutputTupleT = detail::non_void_value_t<OutputIteratorT, KeyValuePair<OffsetT, InputValueT>>;
|
|
905
833
|
using OutputValueT = typename OutputTupleT::Value;
|
|
906
|
-
|
|
907
|
-
using
|
|
908
|
-
|
|
909
|
-
using InitT = detail::reduce::empty_problem_init_t<AccumT>;
|
|
834
|
+
using AccumT = OutputTupleT;
|
|
835
|
+
using InitT = detail::reduce::empty_problem_init_t<AccumT>;
|
|
910
836
|
|
|
911
837
|
// Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
|
|
912
838
|
using ArgIndexInputIteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
|
|
913
|
-
|
|
914
839
|
ArgIndexInputIteratorT d_indexed_in(d_in);
|
|
915
840
|
|
|
916
|
-
// Initial value
|
|
917
841
|
InitT initial_value{AccumT(1, ::cuda::std::numeric_limits<InputValueT>::max())};
|
|
918
842
|
|
|
919
|
-
|
|
920
|
-
|
|
921
|
-
|
|
922
|
-
|
|
923
|
-
|
|
924
|
-
|
|
925
|
-
|
|
926
|
-
|
|
927
|
-
|
|
928
|
-
|
|
929
|
-
|
|
930
|
-
|
|
931
|
-
|
|
932
|
-
|
|
933
|
-
|
|
934
|
-
|
|
935
|
-
|
|
936
|
-
|
|
937
|
-
|
|
938
|
-
|
|
939
|
-
|
|
940
|
-
|
|
843
|
+
static_assert(::cuda::std::is_integral_v<OffsetT>, "Offset iterator value type should be integral.");
|
|
844
|
+
if constexpr (::cuda::std::is_integral_v<OffsetT>)
|
|
845
|
+
{
|
|
846
|
+
return DispatchSegmentedReduce<
|
|
847
|
+
ArgIndexInputIteratorT,
|
|
848
|
+
OutputIteratorT,
|
|
849
|
+
BeginOffsetIteratorT,
|
|
850
|
+
EndOffsetIteratorT,
|
|
851
|
+
OffsetT,
|
|
852
|
+
cub::ArgMin,
|
|
853
|
+
InitT,
|
|
854
|
+
AccumT>::Dispatch(d_temp_storage,
|
|
855
|
+
temp_storage_bytes,
|
|
856
|
+
d_indexed_in,
|
|
857
|
+
d_out,
|
|
858
|
+
num_segments,
|
|
859
|
+
d_begin_offsets,
|
|
860
|
+
d_end_offsets,
|
|
861
|
+
cub::ArgMin{},
|
|
862
|
+
initial_value,
|
|
863
|
+
stream);
|
|
864
|
+
}
|
|
865
|
+
_CCCL_UNREACHABLE();
|
|
941
866
|
}
|
|
942
867
|
|
|
943
868
|
//! @rst
|
|
@@ -1111,14 +1036,14 @@ public:
|
|
|
1111
1036
|
//! @rst
|
|
1112
1037
|
//! Random-access input iterator to the sequence of beginning offsets of
|
|
1113
1038
|
//! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
|
|
1114
|
-
//! element of the *i*\ :sup:`th` data segment in ``
|
|
1039
|
+
//! element of the *i*\ :sup:`th` data segment in ``d_in``
|
|
1115
1040
|
//! @endrst
|
|
1116
1041
|
//!
|
|
1117
1042
|
//! @param[in] d_end_offsets
|
|
1118
1043
|
//! @rst
|
|
1119
1044
|
//! Random-access input iterator to the sequence of ending offsets of length
|
|
1120
1045
|
//! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
|
|
1121
|
-
//! the *i*\ :sup:`th` data segment in ``
|
|
1046
|
+
//! the *i*\ :sup:`th` data segment in ``d_in``.
|
|
1122
1047
|
//! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` segment is considered empty.
|
|
1123
1048
|
//! @endrst
|
|
1124
1049
|
//!
|
|
@@ -1144,27 +1069,32 @@ public:
|
|
|
1144
1069
|
{
|
|
1145
1070
|
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::Max");
|
|
1146
1071
|
|
|
1147
|
-
// Integer type for global offsets
|
|
1148
1072
|
using OffsetT = detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
|
|
1149
|
-
|
|
1150
|
-
|
|
1151
|
-
|
|
1152
|
-
|
|
1153
|
-
|
|
1154
|
-
|
|
1155
|
-
|
|
1156
|
-
|
|
1157
|
-
|
|
1158
|
-
|
|
1159
|
-
|
|
1160
|
-
|
|
1161
|
-
|
|
1162
|
-
|
|
1163
|
-
|
|
1164
|
-
|
|
1165
|
-
|
|
1166
|
-
|
|
1167
|
-
|
|
1073
|
+
using InputT = cub::detail::it_value_t<InputIteratorT>;
|
|
1074
|
+
using init_t = InputT;
|
|
1075
|
+
|
|
1076
|
+
static_assert(::cuda::std::is_integral_v<OffsetT>, "Offset iterator value type should be integral.");
|
|
1077
|
+
if constexpr (::cuda::std::is_integral_v<OffsetT>)
|
|
1078
|
+
{
|
|
1079
|
+
return DispatchSegmentedReduce<
|
|
1080
|
+
InputIteratorT,
|
|
1081
|
+
OutputIteratorT,
|
|
1082
|
+
BeginOffsetIteratorT,
|
|
1083
|
+
EndOffsetIteratorT,
|
|
1084
|
+
OffsetT,
|
|
1085
|
+
::cuda::maximum<>,
|
|
1086
|
+
init_t>::Dispatch(d_temp_storage,
|
|
1087
|
+
temp_storage_bytes,
|
|
1088
|
+
d_in,
|
|
1089
|
+
d_out,
|
|
1090
|
+
num_segments,
|
|
1091
|
+
d_begin_offsets,
|
|
1092
|
+
d_end_offsets,
|
|
1093
|
+
::cuda::maximum<>{},
|
|
1094
|
+
::cuda::std::numeric_limits<init_t>::lowest(),
|
|
1095
|
+
stream);
|
|
1096
|
+
}
|
|
1097
|
+
_CCCL_UNREACHABLE();
|
|
1168
1098
|
}
|
|
1169
1099
|
|
|
1170
1100
|
//! @rst
|
|
@@ -1229,9 +1159,7 @@ public:
|
|
|
1229
1159
|
// `offset_t` a.k.a `SegmentSizeT` is fixed to `int` type now, but later can be changed to accept
|
|
1230
1160
|
// integral constant or larger integral types
|
|
1231
1161
|
using offset_t = int;
|
|
1232
|
-
|
|
1233
|
-
// The input value type
|
|
1234
|
-
using input_t = cub::detail::it_value_t<InputIteratorT>;
|
|
1162
|
+
using input_t = detail::it_value_t<InputIteratorT>;
|
|
1235
1163
|
|
|
1236
1164
|
return detail::reduce::
|
|
1237
1165
|
DispatchFixedSizeSegmentedReduce<InputIteratorT, OutputIteratorT, offset_t, ::cuda::maximum<>, input_t>::Dispatch(
|
|
@@ -1320,14 +1248,14 @@ public:
|
|
|
1320
1248
|
//! @rst
|
|
1321
1249
|
//! Random-access input iterator to the sequence of beginning offsets of
|
|
1322
1250
|
//! length `num_segments`, such that ``d_begin_offsets[i]`` is the first
|
|
1323
|
-
//! element of the *i*\ :sup:`th` data segment in ``
|
|
1251
|
+
//! element of the *i*\ :sup:`th` data segment in ``d_in``
|
|
1324
1252
|
//! @endrst
|
|
1325
1253
|
//!
|
|
1326
1254
|
//! @param[in] d_end_offsets
|
|
1327
1255
|
//! @rst
|
|
1328
1256
|
//! Random-access input iterator to the sequence of ending offsets of length
|
|
1329
1257
|
//! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
|
|
1330
|
-
//! the *i*\ :sup:`th` data segment in ``
|
|
1258
|
+
//! the *i*\ :sup:`th` data segment in ``d_in``.
|
|
1331
1259
|
//! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` segment is considered empty.
|
|
1332
1260
|
//! @endrst
|
|
1333
1261
|
//!
|
|
@@ -1353,54 +1281,45 @@ public:
|
|
|
1353
1281
|
{
|
|
1354
1282
|
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::ArgMax");
|
|
1355
1283
|
|
|
1356
|
-
// Integer type for global offsets
|
|
1357
1284
|
// Using common iterator value type is a breaking change, see:
|
|
1358
1285
|
// https://github.com/NVIDIA/cccl/pull/414#discussion_r1330632615
|
|
1359
1286
|
using OffsetT = int; // detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
|
|
1360
1287
|
|
|
1361
|
-
|
|
1362
|
-
using InputValueT = cub::detail::it_value_t<InputIteratorT>;
|
|
1363
|
-
|
|
1364
|
-
// The output tuple type
|
|
1288
|
+
using InputValueT = cub::detail::it_value_t<InputIteratorT>;
|
|
1365
1289
|
using OutputTupleT = cub::detail::non_void_value_t<OutputIteratorT, KeyValuePair<OffsetT, InputValueT>>;
|
|
1366
|
-
|
|
1367
|
-
using
|
|
1368
|
-
|
|
1369
|
-
using InitT = detail::reduce::empty_problem_init_t<AccumT>;
|
|
1370
|
-
|
|
1371
|
-
// The output value type
|
|
1290
|
+
using AccumT = OutputTupleT;
|
|
1291
|
+
using InitT = detail::reduce::empty_problem_init_t<AccumT>;
|
|
1372
1292
|
using OutputValueT = typename OutputTupleT::Value;
|
|
1373
1293
|
|
|
1374
1294
|
// Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
|
|
1375
1295
|
using ArgIndexInputIteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
|
|
1376
|
-
|
|
1377
1296
|
ArgIndexInputIteratorT d_indexed_in(d_in);
|
|
1378
1297
|
|
|
1379
|
-
// Initial value
|
|
1380
1298
|
InitT initial_value{AccumT(1, ::cuda::std::numeric_limits<InputValueT>::lowest())};
|
|
1381
1299
|
|
|
1382
|
-
|
|
1383
|
-
|
|
1384
|
-
|
|
1385
|
-
|
|
1386
|
-
|
|
1387
|
-
|
|
1388
|
-
|
|
1389
|
-
|
|
1390
|
-
|
|
1391
|
-
|
|
1392
|
-
|
|
1393
|
-
|
|
1394
|
-
|
|
1395
|
-
|
|
1396
|
-
|
|
1397
|
-
|
|
1398
|
-
|
|
1399
|
-
|
|
1400
|
-
|
|
1401
|
-
|
|
1402
|
-
|
|
1403
|
-
|
|
1300
|
+
static_assert(::cuda::std::is_integral_v<OffsetT>, "Offset iterator value type should be integral.");
|
|
1301
|
+
if constexpr (::cuda::std::is_integral_v<OffsetT>)
|
|
1302
|
+
{
|
|
1303
|
+
return DispatchSegmentedReduce<
|
|
1304
|
+
ArgIndexInputIteratorT,
|
|
1305
|
+
OutputIteratorT,
|
|
1306
|
+
BeginOffsetIteratorT,
|
|
1307
|
+
EndOffsetIteratorT,
|
|
1308
|
+
OffsetT,
|
|
1309
|
+
cub::ArgMax,
|
|
1310
|
+
InitT,
|
|
1311
|
+
AccumT>::Dispatch(d_temp_storage,
|
|
1312
|
+
temp_storage_bytes,
|
|
1313
|
+
d_indexed_in,
|
|
1314
|
+
d_out,
|
|
1315
|
+
num_segments,
|
|
1316
|
+
d_begin_offsets,
|
|
1317
|
+
d_end_offsets,
|
|
1318
|
+
cub::ArgMax{},
|
|
1319
|
+
initial_value,
|
|
1320
|
+
stream);
|
|
1321
|
+
}
|
|
1322
|
+
_CCCL_UNREACHABLE();
|
|
1404
1323
|
}
|
|
1405
1324
|
|
|
1406
1325
|
//! @rst
|
|
@@ -1476,34 +1395,25 @@ public:
|
|
|
1476
1395
|
// integral constant or larger integral types
|
|
1477
1396
|
using input_t = int;
|
|
1478
1397
|
|
|
1479
|
-
|
|
1480
|
-
using
|
|
1481
|
-
|
|
1482
|
-
|
|
1483
|
-
using output_tuple_t = cub::detail::non_void_value_t<OutputIteratorT, ::cuda::std::pair<input_t, input_value_t>>;
|
|
1484
|
-
|
|
1485
|
-
using accum_t = output_tuple_t;
|
|
1486
|
-
|
|
1487
|
-
using init_t = detail::reduce::empty_problem_init_t<accum_t>;
|
|
1488
|
-
|
|
1489
|
-
// The output value type
|
|
1398
|
+
using input_value_t = detail::it_value_t<InputIteratorT>;
|
|
1399
|
+
using output_tuple_t = detail::non_void_value_t<OutputIteratorT, ::cuda::std::pair<input_t, input_value_t>>;
|
|
1400
|
+
using accum_t = output_tuple_t;
|
|
1401
|
+
using init_t = detail::reduce::empty_problem_init_t<accum_t>;
|
|
1490
1402
|
using output_value_t = typename output_tuple_t::second_type;
|
|
1491
1403
|
|
|
1492
1404
|
// Wrapped input iterator to produce index-value <input_t, InputT> tuples
|
|
1493
1405
|
auto d_indexed_in = THRUST_NS_QUALIFIER::make_transform_iterator(
|
|
1494
1406
|
THRUST_NS_QUALIFIER::counting_iterator<::cuda::std::int64_t>{0},
|
|
1495
1407
|
detail::reduce::generate_idx_value<InputIteratorT, output_value_t>(d_in, segment_size));
|
|
1496
|
-
|
|
1497
1408
|
using arg_index_input_iterator_t = decltype(d_indexed_in);
|
|
1498
1409
|
|
|
1499
|
-
// Initial value
|
|
1500
1410
|
init_t initial_value{accum_t(1, ::cuda::std::numeric_limits<input_value_t>::lowest())};
|
|
1501
1411
|
|
|
1502
1412
|
return detail::reduce::DispatchFixedSizeSegmentedReduce<
|
|
1503
1413
|
arg_index_input_iterator_t,
|
|
1504
1414
|
OutputIteratorT,
|
|
1505
1415
|
input_t,
|
|
1506
|
-
|
|
1416
|
+
detail::arg_max,
|
|
1507
1417
|
init_t,
|
|
1508
1418
|
accum_t>::Dispatch(d_temp_storage,
|
|
1509
1419
|
temp_storage_bytes,
|
|
@@ -1511,7 +1421,7 @@ public:
|
|
|
1511
1421
|
d_out,
|
|
1512
1422
|
num_segments,
|
|
1513
1423
|
segment_size,
|
|
1514
|
-
|
|
1424
|
+
detail::arg_max(),
|
|
1515
1425
|
initial_value,
|
|
1516
1426
|
stream);
|
|
1517
1427
|
}
|