cuda-cccl 0.3.0__cp310-cp310-manylinux_2_24_aarch64.whl → 0.3.2__cp310-cp310-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/cooperative/__init__.py +7 -1
- cuda/cccl/cooperative/experimental/__init__.py +21 -5
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
- cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
- cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
- cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
- cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +4 -2
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
- cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
- cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
- cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
- cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
- cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +170 -260
- cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +6 -7
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +12 -29
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +2 -7
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
- cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
- cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
- cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
- cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
- cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
- cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
- cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
- cuda/cccl/headers/include/cuda/__cccl_config +1 -0
- cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
- cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
- cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
- cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
- cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +247 -323
- cuda/cccl/headers/include/cuda/__device/attributes.h +174 -123
- cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
- cuda/cccl/headers/include/cuda/__device/device_ref.h +27 -49
- cuda/cccl/headers/include/cuda/__device/physical_device.h +100 -96
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
- cuda/cccl/headers/include/cuda/__event/event.h +27 -26
- cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
- cuda/cccl/headers/include/cuda/__event/timed_event.h +10 -7
- cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
- cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
- cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
- cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
- cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +21 -137
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
- cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
- cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
- cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
- cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
- cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
- cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
- cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +9 -7
- cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -16
- cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
- cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
- cuda/cccl/headers/include/cuda/algorithm +1 -1
- cuda/cccl/headers/include/cuda/cmath +1 -0
- cuda/cccl/headers/include/cuda/devices +13 -0
- cuda/cccl/headers/include/cuda/iterator +1 -0
- cuda/cccl/headers/include/cuda/memory +1 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
- cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
- cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
- cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
- cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
- cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
- cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
- cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
- cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
- cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
- cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
- cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
- cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
- cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
- cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
- cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
- cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
- cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
- cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
- cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
- cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
- cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
- cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
- cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
- cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
- cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
- cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
- cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
- cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
- cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
- cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
- cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
- cuda/cccl/headers/include/cuda/std/bitset +1 -1
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
- cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
- cuda/cccl/headers/include/cuda/std/numbers +5 -0
- cuda/cccl/headers/include/cuda/std/string_view +155 -13
- cuda/cccl/headers/include/cuda/std/version +1 -4
- cuda/cccl/headers/include/cuda/stream_ref +5 -0
- cuda/cccl/headers/include/cuda/utility +1 -0
- cuda/cccl/headers/include/nv/target +7 -2
- cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
- cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
- cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
- cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
- cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
- cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
- cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
- cuda/cccl/headers/include/thrust/device_delete.h +18 -3
- cuda/cccl/headers/include/thrust/device_free.h +16 -3
- cuda/cccl/headers/include/thrust/device_new.h +29 -8
- cuda/cccl/headers/include/thrust/host_vector.h +1 -1
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
- cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
- cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
- cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
- cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
- cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
- cuda/cccl/parallel/experimental/__init__.py +21 -74
- cuda/compute/__init__.py +79 -0
- cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +43 -1
- cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +157 -8
- cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
- cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +112 -40
- cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +36 -15
- cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
- cuda/compute/cu12/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/cu13/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +2 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +36 -8
- cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +206 -1
- cuda/{cccl/parallel/experimental → compute}/numba_utils.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/typing.py +2 -0
- cuda/coop/__init__.py +8 -0
- cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
- cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
- cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
- cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
- cuda/coop/warp/__init__.py +9 -0
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +275 -276
- cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
- cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
- cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
- cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
- cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
- cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
- cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
- cuda/cccl/parallel/experimental/.gitignore +0 -4
- cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -18,6 +18,8 @@
|
|
|
18
18
|
#include <cub/util_namespace.cuh>
|
|
19
19
|
|
|
20
20
|
#include <cuda/__functional/address_stability.h>
|
|
21
|
+
#include <cuda/__stream/get_stream.h>
|
|
22
|
+
#include <cuda/std/__execution/env.h>
|
|
21
23
|
#include <cuda/std/tuple>
|
|
22
24
|
|
|
23
25
|
CUB_NAMESPACE_BEGIN
|
|
@@ -49,13 +51,20 @@ CUB_NAMESPACE_BEGIN
|
|
|
49
51
|
struct DeviceTransform
|
|
50
52
|
{
|
|
51
53
|
private:
|
|
52
|
-
template <typename... RandomAccessIteratorsIn,
|
|
54
|
+
template <typename... RandomAccessIteratorsIn,
|
|
55
|
+
typename RandomAccessIteratorOut,
|
|
56
|
+
typename NumItemsT,
|
|
57
|
+
typename Predicate,
|
|
58
|
+
typename TransformOp,
|
|
59
|
+
typename StableAddress = cuda::std::false_type>
|
|
53
60
|
CUB_RUNTIME_FUNCTION static cudaError_t TransformInternal(
|
|
54
61
|
::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
|
|
55
62
|
RandomAccessIteratorOut output,
|
|
56
63
|
NumItemsT num_items,
|
|
64
|
+
Predicate predicate,
|
|
57
65
|
TransformOp transform_op,
|
|
58
|
-
cudaStream_t stream
|
|
66
|
+
cudaStream_t stream,
|
|
67
|
+
StableAddress = {})
|
|
59
68
|
{
|
|
60
69
|
using choose_offset_t = detail::choose_signed_offset<NumItemsT>;
|
|
61
70
|
using offset_t = typename choose_offset_t::type;
|
|
@@ -66,18 +75,28 @@ private:
|
|
|
66
75
|
return error;
|
|
67
76
|
}
|
|
68
77
|
|
|
69
|
-
return detail::transform::dispatch_t<
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
78
|
+
return detail::transform::dispatch_t < StableAddress::value
|
|
79
|
+
? detail::transform::requires_stable_address::yes
|
|
80
|
+
: detail::transform::requires_stable_address::no,
|
|
81
|
+
offset_t, ::cuda::std::tuple<RandomAccessIteratorsIn...>, RandomAccessIteratorOut, Predicate,
|
|
82
|
+
TransformOp > ::dispatch(
|
|
83
|
+
::cuda::std::move(inputs),
|
|
84
|
+
::cuda::std::move(output),
|
|
85
|
+
num_items,
|
|
86
|
+
::cuda::std::move(predicate),
|
|
87
|
+
::cuda::std::move(transform_op),
|
|
88
|
+
stream);
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
template <typename Env>
|
|
92
|
+
CUB_RUNTIME_FUNCTION static auto get_stream(Env env) -> cudaStream_t
|
|
93
|
+
{
|
|
94
|
+
return ::cuda::std::execution::__query_or(env, ::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}}).get();
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
CUB_RUNTIME_FUNCTION static auto get_stream(cudaStream_t stream) -> cudaStream_t
|
|
98
|
+
{
|
|
99
|
+
return stream;
|
|
81
100
|
}
|
|
82
101
|
|
|
83
102
|
public:
|
|
@@ -108,18 +127,28 @@ public:
|
|
|
108
127
|
//! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value
|
|
109
128
|
//! types must be convertible to the parameters of the function object's call operator. The return type of the call
|
|
110
129
|
//! operator must be assignable to the dereferenced output iterator.
|
|
111
|
-
//! @param
|
|
112
|
-
|
|
130
|
+
//! @param env Execution environment, or cudaStream_t. Default is ``cuda::std::execution::env{}``, which will run on
|
|
131
|
+
//! stream\ :sub:`0`
|
|
132
|
+
template <typename... RandomAccessIteratorsIn,
|
|
133
|
+
typename RandomAccessIteratorOut,
|
|
134
|
+
typename NumItemsT,
|
|
135
|
+
typename TransformOp,
|
|
136
|
+
typename Env = ::cuda::std::execution::env<>>
|
|
113
137
|
CUB_RUNTIME_FUNCTION static cudaError_t Transform(
|
|
114
138
|
::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
|
|
115
139
|
RandomAccessIteratorOut output,
|
|
116
140
|
NumItemsT num_items,
|
|
117
141
|
TransformOp transform_op,
|
|
118
|
-
|
|
142
|
+
Env env = {})
|
|
119
143
|
{
|
|
120
144
|
_CCCL_NVTX_RANGE_SCOPE("cub::DeviceTransform::Transform");
|
|
121
145
|
return TransformInternal(
|
|
122
|
-
::cuda::std::move(inputs),
|
|
146
|
+
::cuda::std::move(inputs),
|
|
147
|
+
::cuda::std::move(output),
|
|
148
|
+
num_items,
|
|
149
|
+
detail::transform::always_true_predicate{},
|
|
150
|
+
::cuda::std::move(transform_op),
|
|
151
|
+
get_stream(env));
|
|
123
152
|
}
|
|
124
153
|
|
|
125
154
|
#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
|
|
@@ -160,21 +189,26 @@ public:
|
|
|
160
189
|
//! @param transform_op A unary function object. The input iterator's value type must be convertible to the parameter
|
|
161
190
|
//! of the function object's call operator. The return type of the call operator must be assignable to the
|
|
162
191
|
//! dereferenced output iterator.
|
|
163
|
-
//! @param
|
|
164
|
-
|
|
192
|
+
//! @param env Execution environment, or cudaStream_t. Default is ``cuda::std::execution::env{}``, which will run on
|
|
193
|
+
//! stream\ :sub:`0`
|
|
194
|
+
template <typename RandomAccessIteratorIn,
|
|
195
|
+
typename RandomAccessIteratorOut,
|
|
196
|
+
typename NumItemsT,
|
|
197
|
+
typename TransformOp,
|
|
198
|
+
typename Env = ::cuda::std::execution::env<>>
|
|
165
199
|
CUB_RUNTIME_FUNCTION static cudaError_t Transform(
|
|
166
200
|
RandomAccessIteratorIn input,
|
|
167
201
|
RandomAccessIteratorOut output,
|
|
168
202
|
NumItemsT num_items,
|
|
169
203
|
TransformOp transform_op,
|
|
170
|
-
|
|
204
|
+
Env env = {})
|
|
171
205
|
{
|
|
172
206
|
return Transform(
|
|
173
207
|
::cuda::std::make_tuple(::cuda::std::move(input)),
|
|
174
208
|
::cuda::std::move(output),
|
|
175
209
|
num_items,
|
|
176
210
|
::cuda::std::move(transform_op),
|
|
177
|
-
|
|
211
|
+
::cuda::std::move(env));
|
|
178
212
|
}
|
|
179
213
|
|
|
180
214
|
#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
|
|
@@ -215,10 +249,14 @@ public:
|
|
|
215
249
|
//! @param num_items The number of elements to write to the output sequence.
|
|
216
250
|
//! @param generator A nullary function object. The return type of the call operator must be assignable to the
|
|
217
251
|
//! dereferenced output iterator.
|
|
218
|
-
//! @param
|
|
219
|
-
|
|
252
|
+
//! @param env Execution environment, or cudaStream_t. Default is ``cuda::std::execution::env{}``, which will run on
|
|
253
|
+
//! stream\ :sub:`0`
|
|
254
|
+
template <typename RandomAccessIteratorOut,
|
|
255
|
+
typename NumItemsT,
|
|
256
|
+
typename Generator,
|
|
257
|
+
typename Env = ::cuda::std::execution::env<>>
|
|
220
258
|
CUB_RUNTIME_FUNCTION static cudaError_t
|
|
221
|
-
Generate(RandomAccessIteratorOut output, NumItemsT num_items, Generator generator,
|
|
259
|
+
Generate(RandomAccessIteratorOut output, NumItemsT num_items, Generator generator, Env env = {})
|
|
222
260
|
{
|
|
223
261
|
static_assert(::cuda::std::is_invocable_v<Generator>, "The passed generator must be a nullary function object");
|
|
224
262
|
static_assert(
|
|
@@ -228,7 +266,12 @@ public:
|
|
|
228
266
|
|
|
229
267
|
_CCCL_NVTX_RANGE_SCOPE("cub::DeviceTransform::Generate");
|
|
230
268
|
return TransformInternal(
|
|
231
|
-
::cuda::std::make_tuple(),
|
|
269
|
+
::cuda::std::make_tuple(),
|
|
270
|
+
::cuda::std::move(output),
|
|
271
|
+
num_items,
|
|
272
|
+
detail::transform::always_true_predicate{},
|
|
273
|
+
::cuda::std::move(generator),
|
|
274
|
+
get_stream(env));
|
|
232
275
|
}
|
|
233
276
|
|
|
234
277
|
#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
|
|
@@ -262,10 +305,14 @@ public:
|
|
|
262
305
|
//! @param output An iterator to the output sequence where num_items results are written to.
|
|
263
306
|
//! @param num_items The number of elements to write to the output sequence.
|
|
264
307
|
//! @param value The value to write. Must be assignable to the dereferenced output iterator.
|
|
265
|
-
//! @param
|
|
266
|
-
|
|
308
|
+
//! @param env Execution environment, or cudaStream_t. Default is ``cuda::std::execution::env{}``, which will run on
|
|
309
|
+
//! stream\ :sub:`0`
|
|
310
|
+
template <typename RandomAccessIteratorOut,
|
|
311
|
+
typename NumItemsT,
|
|
312
|
+
typename Value,
|
|
313
|
+
typename Env = ::cuda::std::execution::env<>>
|
|
267
314
|
CUB_RUNTIME_FUNCTION static cudaError_t
|
|
268
|
-
Fill(RandomAccessIteratorOut output, NumItemsT num_items, Value value,
|
|
315
|
+
Fill(RandomAccessIteratorOut output, NumItemsT num_items, Value value, Env env = {})
|
|
269
316
|
{
|
|
270
317
|
static_assert(::cuda::std::is_assignable_v<detail::it_reference_t<RandomAccessIteratorOut>, Value>,
|
|
271
318
|
"The passed value must be assignable to the dereferenced output iterator");
|
|
@@ -275,8 +322,9 @@ public:
|
|
|
275
322
|
::cuda::std::make_tuple(),
|
|
276
323
|
::cuda::std::move(output),
|
|
277
324
|
num_items,
|
|
325
|
+
detail::transform::always_true_predicate{},
|
|
278
326
|
detail::__return_constant<Value>{::cuda::std::move(value)},
|
|
279
|
-
|
|
327
|
+
get_stream(env));
|
|
280
328
|
}
|
|
281
329
|
|
|
282
330
|
#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
|
|
@@ -296,8 +344,7 @@ public:
|
|
|
296
344
|
return cudaSuccess;
|
|
297
345
|
}
|
|
298
346
|
|
|
299
|
-
return
|
|
300
|
-
::cuda::std::move(output), num_items, detail::__return_constant<Value>{::cuda::std::move(value)}, stream);
|
|
347
|
+
return Fill(::cuda::std::move(output), num_items, ::cuda::std::move(value), stream);
|
|
301
348
|
}
|
|
302
349
|
#endif // _CCCL_DOXYGEN_INVOKED
|
|
303
350
|
|
|
@@ -333,43 +380,30 @@ public:
|
|
|
333
380
|
//! types must be convertible to the parameters of the function object's call operator. The return type of the call
|
|
334
381
|
//! operator must be assignable to the dereferenced output iterator. Will only be invoked if \p predicate returns
|
|
335
382
|
//! true.
|
|
336
|
-
//! @param
|
|
383
|
+
//! @param env Execution environment, or cudaStream_t. Default is ``cuda::std::execution::env{}``, which will run on
|
|
384
|
+
//! stream\ :sub:`0`
|
|
337
385
|
template <typename... RandomAccessIteratorsIn,
|
|
338
386
|
typename RandomAccessIteratorOut,
|
|
339
387
|
typename NumItemsT,
|
|
340
388
|
typename Predicate,
|
|
341
|
-
typename TransformOp
|
|
389
|
+
typename TransformOp,
|
|
390
|
+
typename Env = ::cuda::std::execution::env<>>
|
|
342
391
|
CUB_RUNTIME_FUNCTION static cudaError_t TransformIf(
|
|
343
392
|
::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
|
|
344
393
|
RandomAccessIteratorOut output,
|
|
345
394
|
NumItemsT num_items,
|
|
346
395
|
Predicate predicate,
|
|
347
396
|
TransformOp transform_op,
|
|
348
|
-
|
|
397
|
+
Env env = {})
|
|
349
398
|
{
|
|
350
399
|
_CCCL_NVTX_RANGE_SCOPE("cub::DeviceTransform::TransformIf");
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
return error;
|
|
359
|
-
}
|
|
360
|
-
|
|
361
|
-
return detail::transform::dispatch_t<
|
|
362
|
-
detail::transform::requires_stable_address::no,
|
|
363
|
-
offset_t,
|
|
364
|
-
::cuda::std::tuple<RandomAccessIteratorsIn...>,
|
|
365
|
-
RandomAccessIteratorOut,
|
|
366
|
-
Predicate,
|
|
367
|
-
TransformOp>::dispatch(::cuda::std::move(inputs),
|
|
368
|
-
::cuda::std::move(output),
|
|
369
|
-
num_items,
|
|
370
|
-
::cuda::std::move(predicate),
|
|
371
|
-
::cuda::std::move(transform_op),
|
|
372
|
-
stream);
|
|
400
|
+
return TransformInternal(
|
|
401
|
+
::cuda::std::move(inputs),
|
|
402
|
+
::cuda::std::move(output),
|
|
403
|
+
num_items,
|
|
404
|
+
::cuda::std::move(predicate),
|
|
405
|
+
::cuda::std::move(transform_op),
|
|
406
|
+
get_stream(env));
|
|
373
407
|
}
|
|
374
408
|
|
|
375
409
|
#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
|
|
@@ -435,19 +469,21 @@ public:
|
|
|
435
469
|
//! @param transform_op A unary function object. The input iterator's value type must be convertible to the
|
|
436
470
|
//! parameter of the function object's call operator. The return type of the call operator must be assignable to the
|
|
437
471
|
//! dereferenced output iterator. Will only be invoked if \p predicate returns true.
|
|
438
|
-
//! @param
|
|
472
|
+
//! @param env Execution environment, or cudaStream_t. Default is ``cuda::std::execution::env{}``, which will run on
|
|
473
|
+
//! stream\ :sub:`0`
|
|
439
474
|
template <typename RandomAccessIteratorIn,
|
|
440
475
|
typename RandomAccessIteratorOut,
|
|
441
476
|
typename NumItemsT,
|
|
442
477
|
typename Predicate,
|
|
443
|
-
typename TransformOp
|
|
478
|
+
typename TransformOp,
|
|
479
|
+
typename Env = ::cuda::std::execution::env<>>
|
|
444
480
|
CUB_RUNTIME_FUNCTION static cudaError_t TransformIf(
|
|
445
481
|
RandomAccessIteratorIn input,
|
|
446
482
|
RandomAccessIteratorOut output,
|
|
447
483
|
NumItemsT num_items,
|
|
448
484
|
Predicate predicate,
|
|
449
485
|
TransformOp transform_op,
|
|
450
|
-
|
|
486
|
+
Env env = {})
|
|
451
487
|
{
|
|
452
488
|
return TransformIf(
|
|
453
489
|
::cuda::std::make_tuple(::cuda::std::move(input)),
|
|
@@ -455,7 +491,7 @@ public:
|
|
|
455
491
|
num_items,
|
|
456
492
|
::cuda::std::move(predicate),
|
|
457
493
|
::cuda::std::move(transform_op),
|
|
458
|
-
|
|
494
|
+
get_stream(env));
|
|
459
495
|
}
|
|
460
496
|
|
|
461
497
|
#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
|
|
@@ -518,39 +554,29 @@ public:
|
|
|
518
554
|
//! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value
|
|
519
555
|
//! types must be convertible to the parameters of the function object's call operator. The return type of the call
|
|
520
556
|
//! operator must be assignable to the dereferenced output iterator.
|
|
521
|
-
//! @param
|
|
522
|
-
|
|
557
|
+
//! @param env Execution environment, or cudaStream_t. Default is ``cuda::std::execution::env{}``, which will run on
|
|
558
|
+
//! stream\ :sub:`0`
|
|
559
|
+
template <typename... RandomAccessIteratorsIn,
|
|
560
|
+
typename RandomAccessIteratorOut,
|
|
561
|
+
typename NumItemsT,
|
|
562
|
+
typename TransformOp,
|
|
563
|
+
typename Env = ::cuda::std::execution::env<>>
|
|
523
564
|
CUB_RUNTIME_FUNCTION static cudaError_t TransformStableArgumentAddresses(
|
|
524
565
|
::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
|
|
525
566
|
RandomAccessIteratorOut output,
|
|
526
567
|
NumItemsT num_items,
|
|
527
568
|
TransformOp transform_op,
|
|
528
|
-
|
|
569
|
+
Env env = {})
|
|
529
570
|
{
|
|
530
571
|
_CCCL_NVTX_RANGE_SCOPE("cub::DeviceTransform::TransformStableArgumentAddresses");
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
return error;
|
|
540
|
-
}
|
|
541
|
-
|
|
542
|
-
return detail::transform::dispatch_t<
|
|
543
|
-
detail::transform::requires_stable_address::yes,
|
|
544
|
-
offset_t,
|
|
545
|
-
::cuda::std::tuple<RandomAccessIteratorsIn...>,
|
|
546
|
-
RandomAccessIteratorOut,
|
|
547
|
-
detail::transform::always_true_predicate,
|
|
548
|
-
TransformOp>::dispatch(::cuda::std::move(inputs),
|
|
549
|
-
::cuda::std::move(output),
|
|
550
|
-
num_items,
|
|
551
|
-
detail::transform::always_true_predicate{},
|
|
552
|
-
::cuda::std::move(transform_op),
|
|
553
|
-
stream);
|
|
572
|
+
return TransformInternal(
|
|
573
|
+
::cuda::std::move(inputs),
|
|
574
|
+
::cuda::std::move(output),
|
|
575
|
+
num_items,
|
|
576
|
+
detail::transform::always_true_predicate{},
|
|
577
|
+
::cuda::std::move(transform_op),
|
|
578
|
+
get_stream(env),
|
|
579
|
+
::cuda::std::true_type{});
|
|
554
580
|
}
|
|
555
581
|
|
|
556
582
|
#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
|
|
@@ -590,21 +616,26 @@ public:
|
|
|
590
616
|
//! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value
|
|
591
617
|
//! types must be convertible to the parameters of the function object's call operator. The return type of the call
|
|
592
618
|
//! operator must be assignable to the dereferenced output iterator.
|
|
593
|
-
//! @param
|
|
594
|
-
|
|
619
|
+
//! @param env Execution environment, or cudaStream_t. Default is ``cuda::std::execution::env{}``, which will run on
|
|
620
|
+
//! stream\ :sub:`0`
|
|
621
|
+
template <typename RandomAccessIteratorIn,
|
|
622
|
+
typename RandomAccessIteratorOut,
|
|
623
|
+
typename NumItemsT,
|
|
624
|
+
typename TransformOp,
|
|
625
|
+
typename Env = ::cuda::std::execution::env<>>
|
|
595
626
|
CUB_RUNTIME_FUNCTION static cudaError_t TransformStableArgumentAddresses(
|
|
596
627
|
RandomAccessIteratorIn input,
|
|
597
628
|
RandomAccessIteratorOut output,
|
|
598
629
|
NumItemsT num_items,
|
|
599
630
|
TransformOp transform_op,
|
|
600
|
-
|
|
631
|
+
Env env = {})
|
|
601
632
|
{
|
|
602
633
|
return TransformStableArgumentAddresses(
|
|
603
634
|
::cuda::std::make_tuple(::cuda::std::move(input)),
|
|
604
635
|
::cuda::std::move(output),
|
|
605
636
|
num_items,
|
|
606
637
|
::cuda::std::move(transform_op),
|
|
607
|
-
|
|
638
|
+
get_stream(env));
|
|
608
639
|
}
|
|
609
640
|
|
|
610
641
|
#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
|
|
@@ -122,9 +122,8 @@ __launch_bounds__(
|
|
|
122
122
|
{
|
|
123
123
|
// the merge agent loads keys into a local array of KeyIt1::value_type, on which the comparisons are performed
|
|
124
124
|
using key_t = it_value_t<KeyIt1>;
|
|
125
|
-
static_assert(::cuda::std::
|
|
126
|
-
|
|
127
|
-
static_assert(::cuda::std::is_convertible_v<typename ::cuda::std::__invoke_of<CompareOp, key_t, key_t>::type, bool>,
|
|
125
|
+
static_assert(::cuda::std::is_invocable_v<CompareOp, key_t, key_t>, "Comparison operator cannot compare two keys");
|
|
126
|
+
static_assert(::cuda::std::is_convertible_v<::cuda::std::invoke_result_t<CompareOp, key_t, key_t>, bool>,
|
|
128
127
|
"Comparison operator must be convertible to bool");
|
|
129
128
|
|
|
130
129
|
using MergeAgent = typename choose_merge_agent<
|
|
@@ -144,11 +143,11 @@ __launch_bounds__(
|
|
|
144
143
|
auto& temp_storage = vsmem_helper_t::get_temp_storage(shared_temp_storage, global_temp_storage);
|
|
145
144
|
MergeAgent{
|
|
146
145
|
temp_storage.Alias(),
|
|
147
|
-
|
|
148
|
-
|
|
146
|
+
keys1,
|
|
147
|
+
items1,
|
|
149
148
|
num_keys1,
|
|
150
|
-
|
|
151
|
-
|
|
149
|
+
keys2,
|
|
150
|
+
items2,
|
|
152
151
|
num_keys2,
|
|
153
152
|
keys_result,
|
|
154
153
|
items_result,
|
|
@@ -44,7 +44,6 @@
|
|
|
44
44
|
# pragma system_header
|
|
45
45
|
#endif // no system header
|
|
46
46
|
|
|
47
|
-
#include <cub/device/dispatch/dispatch_advance_iterators.cuh>
|
|
48
47
|
#include <cub/device/dispatch/kernels/radix_sort.cuh>
|
|
49
48
|
#include <cub/device/dispatch/tuning/tuning_radix_sort.cuh>
|
|
50
49
|
#include <cub/util_debug.cuh>
|
|
@@ -1379,14 +1378,6 @@ struct DispatchSegmentedRadixSort
|
|
|
1379
1378
|
// Number of radix sort invocations until all segments have been processed
|
|
1380
1379
|
const auto num_invocations = ::cuda::ceil_div(num_segments, max_num_segments_per_invocation);
|
|
1381
1380
|
|
|
1382
|
-
// If d_begin_offsets and d_end_offsets do not support operator+ then we can't have more than
|
|
1383
|
-
// max_num_segments_per_invocation segments per invocation
|
|
1384
|
-
if (num_invocations > 1
|
|
1385
|
-
&& !detail::all_iterators_support_add_assign_operator(::cuda::std::int64_t{}, d_begin_offsets, d_end_offsets))
|
|
1386
|
-
{
|
|
1387
|
-
return cudaErrorInvalidValue;
|
|
1388
|
-
}
|
|
1389
|
-
|
|
1390
1381
|
BeginOffsetIteratorT begin_offsets_current_it = d_begin_offsets;
|
|
1391
1382
|
EndOffsetIteratorT end_offsets_current_it = d_end_offsets;
|
|
1392
1383
|
|
|
@@ -1435,8 +1426,8 @@ struct DispatchSegmentedRadixSort
|
|
|
1435
1426
|
|
|
1436
1427
|
if (invocation_index + 1 < num_invocations)
|
|
1437
1428
|
{
|
|
1438
|
-
|
|
1439
|
-
|
|
1429
|
+
begin_offsets_current_it += num_current_segments;
|
|
1430
|
+
end_offsets_current_it += num_current_segments;
|
|
1440
1431
|
}
|
|
1441
1432
|
|
|
1442
1433
|
// Sync the stream if specified to flush runtime errors
|
|
@@ -46,7 +46,6 @@
|
|
|
46
46
|
|
|
47
47
|
#include <cub/detail/launcher/cuda_runtime.cuh>
|
|
48
48
|
#include <cub/detail/type_traits.cuh> // for cub::detail::invoke_result_t
|
|
49
|
-
#include <cub/device/dispatch/dispatch_advance_iterators.cuh>
|
|
50
49
|
#include <cub/device/dispatch/kernels/reduce.cuh>
|
|
51
50
|
#include <cub/device/dispatch/kernels/segmented_reduce.cuh>
|
|
52
51
|
#include <cub/device/dispatch/tuning/tuning_reduce.cuh>
|
|
@@ -791,7 +790,7 @@ struct DispatchSegmentedReduce
|
|
|
791
790
|
* Function type of cub::DeviceSegmentedReduceKernel
|
|
792
791
|
*
|
|
793
792
|
* @param[in] segmented_reduce_kernel
|
|
794
|
-
* Kernel function pointer to
|
|
793
|
+
* Kernel function pointer to instantiation of
|
|
795
794
|
* cub::DeviceSegmentedReduceKernel
|
|
796
795
|
*/
|
|
797
796
|
template <typename ActivePolicyT, typename DeviceSegmentedReduceKernelT>
|
|
@@ -810,7 +809,8 @@ struct DispatchSegmentedReduce
|
|
|
810
809
|
return cudaSuccess;
|
|
811
810
|
}
|
|
812
811
|
|
|
813
|
-
// Init kernel configuration
|
|
812
|
+
// Init kernel configuration (computes kernel occupancy)
|
|
813
|
+
// maybe only used inside CUB_DEBUG_LOG code sections
|
|
814
814
|
[[maybe_unused]] detail::KernelConfig segmented_reduce_config;
|
|
815
815
|
error =
|
|
816
816
|
CubDebug(segmented_reduce_config.Init(segmented_reduce_kernel, policy.SegmentedReduce(), launcher_factory));
|
|
@@ -823,17 +823,6 @@ struct DispatchSegmentedReduce
|
|
|
823
823
|
static_cast<::cuda::std::int64_t>(::cuda::std::numeric_limits<::cuda::std::int32_t>::max());
|
|
824
824
|
const ::cuda::std::int64_t num_invocations = ::cuda::ceil_div(num_segments, num_segments_per_invocation);
|
|
825
825
|
|
|
826
|
-
// If we need multiple passes over the segments but the iterators do not support the + operator, we cannot use the
|
|
827
|
-
// streaming approach and have to fail, returning cudaErrorInvalidValue. This is because c.parallel passes
|
|
828
|
-
// indirect_arg_t as the iterator type, which does not support the + operator.
|
|
829
|
-
// TODO (elstehle): Remove this check once https://github.com/NVIDIA/cccl/issues/4148 is resolved.
|
|
830
|
-
if (num_invocations > 1
|
|
831
|
-
&& !detail::all_iterators_support_add_assign_operator(
|
|
832
|
-
::cuda::std::int64_t{}, d_out, d_begin_offsets, d_end_offsets))
|
|
833
|
-
{
|
|
834
|
-
return cudaErrorInvalidValue;
|
|
835
|
-
}
|
|
836
|
-
|
|
837
826
|
for (::cuda::std::int64_t invocation_index = 0; invocation_index < num_invocations; invocation_index++)
|
|
838
827
|
{
|
|
839
828
|
const auto current_seg_offset = invocation_index * num_segments_per_invocation;
|
|
@@ -851,7 +840,7 @@ struct DispatchSegmentedReduce
|
|
|
851
840
|
segmented_reduce_config.sm_occupancy);
|
|
852
841
|
#endif // CUB_DEBUG_LOG
|
|
853
842
|
|
|
854
|
-
// Invoke
|
|
843
|
+
// Invoke DeviceSegmentedReduceKernel
|
|
855
844
|
launcher_factory(
|
|
856
845
|
static_cast<::cuda::std::uint32_t>(num_current_segments), policy.SegmentedReduce().BlockThreads(), 0, stream)
|
|
857
846
|
.doit(segmented_reduce_kernel, d_in, d_out, d_begin_offsets, d_end_offsets, reduction_op, init);
|
|
@@ -865,9 +854,9 @@ struct DispatchSegmentedReduce
|
|
|
865
854
|
|
|
866
855
|
if (invocation_index + 1 < num_invocations)
|
|
867
856
|
{
|
|
868
|
-
|
|
869
|
-
|
|
870
|
-
|
|
857
|
+
d_out += num_current_segments;
|
|
858
|
+
d_begin_offsets += num_current_segments;
|
|
859
|
+
d_end_offsets += num_current_segments;
|
|
871
860
|
}
|
|
872
861
|
|
|
873
862
|
// Sync the stream if specified to flush runtime errors
|
|
@@ -1182,15 +1171,6 @@ struct DispatchFixedSizeSegmentedReduce
|
|
|
1182
1171
|
|
|
1183
1172
|
const ::cuda::std::int64_t num_invocations = ::cuda::ceil_div(num_segments, num_segments_per_invocation);
|
|
1184
1173
|
|
|
1185
|
-
// If we need multiple passes over the segments but the iterators do not support the + operator, we cannot use the
|
|
1186
|
-
// streaming approach and have to fail, returning cudaErrorInvalidValue. This is because c.parallel passes
|
|
1187
|
-
// indirect_arg_t as the iterator type, which does not support the + operator.
|
|
1188
|
-
// TODO (srinivas/elstehle): Remove this check once https://github.com/NVIDIA/cccl/issues/4148 is resolved.
|
|
1189
|
-
if (num_invocations > 1 && !detail::all_iterators_support_plus_operator(::cuda::std::int64_t{}, d_in, d_out))
|
|
1190
|
-
{
|
|
1191
|
-
return cudaErrorInvalidValue;
|
|
1192
|
-
}
|
|
1193
|
-
|
|
1194
1174
|
cudaError error = cudaSuccess;
|
|
1195
1175
|
for (::cuda::std::int64_t invocation_index = 0; invocation_index < num_invocations; invocation_index++)
|
|
1196
1176
|
{
|
|
@@ -1204,13 +1184,16 @@ struct DispatchFixedSizeSegmentedReduce
|
|
|
1204
1184
|
launcher_factory(
|
|
1205
1185
|
static_cast<::cuda::std::int32_t>(num_current_blocks), ActivePolicyT::ReducePolicy::BLOCK_THREADS, 0, stream)
|
|
1206
1186
|
.doit(fixed_size_segmented_reduce_kernel,
|
|
1207
|
-
|
|
1208
|
-
|
|
1187
|
+
d_in,
|
|
1188
|
+
d_out,
|
|
1209
1189
|
segment_size,
|
|
1210
1190
|
static_cast<::cuda::std::int32_t>(num_current_segments),
|
|
1211
1191
|
reduction_op,
|
|
1212
1192
|
init);
|
|
1213
1193
|
|
|
1194
|
+
d_in += num_segments_per_invocation * segment_size;
|
|
1195
|
+
d_out += num_segments_per_invocation;
|
|
1196
|
+
|
|
1214
1197
|
error = CubDebug(cudaPeekAtLastError());
|
|
1215
1198
|
if (cudaSuccess != error)
|
|
1216
1199
|
{
|
|
@@ -77,7 +77,7 @@ namespace rfa
|
|
|
77
77
|
{
|
|
78
78
|
|
|
79
79
|
template <typename Invocable, typename InputT>
|
|
80
|
-
using transformed_input_t = ::cuda::std::decay_t
|
|
80
|
+
using transformed_input_t = ::cuda::std::decay_t<::cuda::std::invoke_result_t<Invocable, InputT>>;
|
|
81
81
|
|
|
82
82
|
template <typename InitT, typename InputIteratorT, typename TransformOpT>
|
|
83
83
|
using accum_t =
|
|
@@ -328,11 +328,6 @@ struct DispatchReduceDeterministic
|
|
|
328
328
|
// Alias the allocation for the privatized per-block reductions
|
|
329
329
|
deterministic_accum_t* d_block_reductions = (deterministic_accum_t*) allocations[0];
|
|
330
330
|
|
|
331
|
-
if (num_chunks > 1 && !detail::all_iterators_support_add_assign_operator(::cuda::std::int32_t{}, d_in))
|
|
332
|
-
{
|
|
333
|
-
return cudaErrorInvalidValue;
|
|
334
|
-
}
|
|
335
|
-
|
|
336
331
|
auto d_chunk_block_reductions = d_block_reductions;
|
|
337
332
|
for (int chunk_index = 0; chunk_index < num_chunks; chunk_index++)
|
|
338
333
|
{
|
|
@@ -372,7 +367,7 @@ struct DispatchReduceDeterministic
|
|
|
372
367
|
|
|
373
368
|
if (chunk_index + 1 < num_chunks)
|
|
374
369
|
{
|
|
375
|
-
|
|
370
|
+
d_in += num_current_items;
|
|
376
371
|
d_chunk_block_reductions += current_grid_size;
|
|
377
372
|
}
|
|
378
373
|
|
|
@@ -20,7 +20,6 @@
|
|
|
20
20
|
|
|
21
21
|
#include <cub/detail/launcher/cuda_runtime.cuh>
|
|
22
22
|
#include <cub/detail/type_traits.cuh> // for cub::detail::invoke_result_t
|
|
23
|
-
#include <cub/device/dispatch/dispatch_advance_iterators.cuh>
|
|
24
23
|
#include <cub/device/dispatch/kernels/reduce.cuh>
|
|
25
24
|
#include <cub/device/dispatch/tuning/tuning_reduce.cuh>
|
|
26
25
|
#include <cub/grid/grid_even_share.cuh>
|
|
@@ -40,7 +40,6 @@
|
|
|
40
40
|
#include <cub/detail/device_double_buffer.cuh>
|
|
41
41
|
#include <cub/detail/temporary_storage.cuh>
|
|
42
42
|
#include <cub/device/device_partition.cuh>
|
|
43
|
-
#include <cub/device/dispatch/dispatch_advance_iterators.cuh>
|
|
44
43
|
#include <cub/device/dispatch/kernels/segmented_sort.cuh>
|
|
45
44
|
#include <cub/device/dispatch/tuning/tuning_segmented_sort.cuh>
|
|
46
45
|
#include <cub/util_debug.cuh>
|
|
@@ -764,8 +763,8 @@ private:
|
|
|
764
763
|
BeginOffsetIteratorT current_begin_offset = d_begin_offsets;
|
|
765
764
|
EndOffsetIteratorT current_end_offset = d_end_offsets;
|
|
766
765
|
|
|
767
|
-
|
|
768
|
-
|
|
766
|
+
current_begin_offset += current_seg_offset;
|
|
767
|
+
current_end_offset += current_seg_offset;
|
|
769
768
|
|
|
770
769
|
auto medium_indices_iterator =
|
|
771
770
|
::cuda::std::make_reverse_iterator(large_and_medium_segments_indices.get() + current_num_segments);
|
|
@@ -18,8 +18,8 @@
|
|
|
18
18
|
|
|
19
19
|
#include <thrust/iterator/constant_iterator.h>
|
|
20
20
|
#include <thrust/iterator/iterator_adaptor.h>
|
|
21
|
-
#include <thrust/iterator/tabulate_output_iterator.h>
|
|
22
21
|
|
|
22
|
+
#include <cuda/__iterator/tabulate_output_iterator.h>
|
|
23
23
|
#include <cuda/std/__functional/identity.h>
|
|
24
24
|
#include <cuda/std/__utility/swap.h>
|
|
25
25
|
#include <cuda/std/limits>
|
|
@@ -217,8 +217,7 @@ struct dispatch_streaming_arg_reduce_t
|
|
|
217
217
|
|
|
218
218
|
// The output iterator that implements the logic to accumulate per-partition result to a global aggregate and,
|
|
219
219
|
// eventually, write to the user-provided output iterators
|
|
220
|
-
using accumulating_transform_out_it_t =
|
|
221
|
-
THRUST_NS_QUALIFIER::tabulate_output_iterator<accumulating_transform_output_op_t>;
|
|
220
|
+
using accumulating_transform_out_it_t = ::cuda::tabulate_output_iterator<accumulating_transform_output_op_t>;
|
|
222
221
|
|
|
223
222
|
// Empty problem initialization type
|
|
224
223
|
using empty_problem_init_t = empty_problem_init_t<per_partition_accum_t>;
|
|
@@ -270,7 +269,7 @@ struct dispatch_streaming_arg_reduce_t
|
|
|
270
269
|
nullptr,
|
|
271
270
|
allocation_sizes[0],
|
|
272
271
|
d_indexed_offset_in,
|
|
273
|
-
|
|
272
|
+
::cuda::make_tabulate_output_iterator(accumulating_out_op),
|
|
274
273
|
static_cast<PerPartitionOffsetT>(largest_partition_size),
|
|
275
274
|
reduce_op,
|
|
276
275
|
initial_value,
|
|
@@ -315,7 +314,7 @@ struct dispatch_streaming_arg_reduce_t
|
|
|
315
314
|
d_temp_storage,
|
|
316
315
|
temp_storage_bytes,
|
|
317
316
|
d_indexed_offset_in,
|
|
318
|
-
|
|
317
|
+
::cuda::make_tabulate_output_iterator(accumulating_out_op),
|
|
319
318
|
static_cast<PerPartitionOffsetT>(current_num_items),
|
|
320
319
|
reduce_op,
|
|
321
320
|
initial_value,
|
|
@@ -23,7 +23,6 @@
|
|
|
23
23
|
#include <cub/util_type.cuh>
|
|
24
24
|
|
|
25
25
|
#include <thrust/iterator/offset_iterator.h>
|
|
26
|
-
#include <thrust/iterator/tabulate_output_iterator.h>
|
|
27
26
|
#include <thrust/iterator/transform_iterator.h>
|
|
28
27
|
#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
|
|
29
28
|
|