cuda-cccl 0.3.0__cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.2__cp312-cp312-manylinux_2_24_aarch64.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/cooperative/__init__.py +7 -1
- cuda/cccl/cooperative/experimental/__init__.py +21 -5
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
- cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
- cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
- cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
- cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +4 -2
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
- cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
- cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
- cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
- cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
- cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +170 -260
- cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +6 -7
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +12 -29
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +2 -7
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
- cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
- cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
- cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
- cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
- cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
- cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
- cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
- cuda/cccl/headers/include/cuda/__cccl_config +1 -0
- cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
- cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
- cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
- cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
- cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +247 -323
- cuda/cccl/headers/include/cuda/__device/attributes.h +174 -123
- cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
- cuda/cccl/headers/include/cuda/__device/device_ref.h +27 -49
- cuda/cccl/headers/include/cuda/__device/physical_device.h +100 -96
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
- cuda/cccl/headers/include/cuda/__event/event.h +27 -26
- cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
- cuda/cccl/headers/include/cuda/__event/timed_event.h +10 -7
- cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
- cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
- cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
- cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
- cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +21 -137
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
- cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
- cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
- cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
- cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
- cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
- cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
- cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +9 -7
- cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -16
- cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
- cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
- cuda/cccl/headers/include/cuda/algorithm +1 -1
- cuda/cccl/headers/include/cuda/cmath +1 -0
- cuda/cccl/headers/include/cuda/devices +13 -0
- cuda/cccl/headers/include/cuda/iterator +1 -0
- cuda/cccl/headers/include/cuda/memory +1 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
- cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
- cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
- cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
- cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
- cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
- cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
- cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
- cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
- cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
- cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
- cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
- cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
- cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
- cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
- cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
- cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
- cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
- cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
- cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
- cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
- cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
- cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
- cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
- cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
- cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
- cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
- cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
- cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
- cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
- cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
- cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
- cuda/cccl/headers/include/cuda/std/bitset +1 -1
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
- cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
- cuda/cccl/headers/include/cuda/std/numbers +5 -0
- cuda/cccl/headers/include/cuda/std/string_view +155 -13
- cuda/cccl/headers/include/cuda/std/version +1 -4
- cuda/cccl/headers/include/cuda/stream_ref +5 -0
- cuda/cccl/headers/include/cuda/utility +1 -0
- cuda/cccl/headers/include/nv/target +7 -2
- cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
- cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
- cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
- cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
- cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
- cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
- cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
- cuda/cccl/headers/include/thrust/device_delete.h +18 -3
- cuda/cccl/headers/include/thrust/device_free.h +16 -3
- cuda/cccl/headers/include/thrust/device_new.h +29 -8
- cuda/cccl/headers/include/thrust/host_vector.h +1 -1
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
- cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
- cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
- cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
- cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
- cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
- cuda/cccl/parallel/experimental/__init__.py +21 -74
- cuda/compute/__init__.py +79 -0
- cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +43 -1
- cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +157 -8
- cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
- cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +112 -40
- cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +36 -15
- cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
- cuda/compute/cu12/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/cu13/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +2 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +36 -8
- cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +206 -1
- cuda/{cccl/parallel/experimental → compute}/numba_utils.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/typing.py +2 -0
- cuda/coop/__init__.py +8 -0
- cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
- cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
- cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
- cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
- cuda/coop/warp/__init__.py +9 -0
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +275 -276
- cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
- cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
- cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
- cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
- cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
- cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
- cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
- cuda/cccl/parallel/experimental/.gitignore +0 -4
- cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -387,15 +387,13 @@ struct DispatchTopK
|
|
|
387
387
|
return error;
|
|
388
388
|
}
|
|
389
389
|
|
|
390
|
-
_CubLog("Invoking topk_kernel
|
|
390
|
+
_CubLog("Invoking topk_kernel<<<%d, %d, 0, "
|
|
391
391
|
"%lld>>>(), %d items per thread, %d SM occupancy\n",
|
|
392
|
-
topk_grid_size.x,
|
|
393
|
-
topk_grid_size.y,
|
|
394
|
-
topk_grid_size.z,
|
|
392
|
+
topk_grid_size,
|
|
395
393
|
block_threads,
|
|
396
394
|
(long long) stream,
|
|
397
395
|
items_per_thread,
|
|
398
|
-
|
|
396
|
+
main_kernel_blocks_per_sm);
|
|
399
397
|
}
|
|
400
398
|
#endif // CUB_DEBUG_LOG
|
|
401
399
|
|
|
@@ -109,8 +109,9 @@ struct TransformKernelSource<Offset,
|
|
|
109
109
|
return detail::transform::make_aligned_base_ptr_kernel_arg(it, align);
|
|
110
110
|
}
|
|
111
111
|
|
|
112
|
+
private:
|
|
112
113
|
template <typename T>
|
|
113
|
-
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static auto IsPointerAligned(T it, [[maybe_unused]] int alignment)
|
|
114
|
+
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static auto is_pointer_aligned(T it, [[maybe_unused]] int alignment)
|
|
114
115
|
{
|
|
115
116
|
if constexpr (THRUST_NS_QUALIFIER::is_contiguous_iterator_v<decltype(it)>)
|
|
116
117
|
{
|
|
@@ -121,6 +122,14 @@ struct TransformKernelSource<Offset,
|
|
|
121
122
|
return true; // fancy iterators are aligned, since the vectorized kernel chooses a different code path
|
|
122
123
|
}
|
|
123
124
|
}
|
|
125
|
+
|
|
126
|
+
public:
|
|
127
|
+
CUB_RUNTIME_FUNCTION constexpr static bool
|
|
128
|
+
CanVectorize(int vec_size, const RandomAccessIteratorOut& out, const RandomAccessIteratorsIn&... in)
|
|
129
|
+
{
|
|
130
|
+
return is_pointer_aligned(out, sizeof(it_value_t<RandomAccessIteratorOut>) * vec_size)
|
|
131
|
+
&& (is_pointer_aligned(in, sizeof(it_value_t<RandomAccessIteratorsIn>) * vec_size) && ...);
|
|
132
|
+
}
|
|
124
133
|
};
|
|
125
134
|
|
|
126
135
|
enum class requires_stable_address
|
|
@@ -384,7 +393,7 @@ struct dispatch_t<StableAddress,
|
|
|
384
393
|
}
|
|
385
394
|
|
|
386
395
|
CUB_DEFINE_SFINAE_GETTER(items_per_thread_no_input, prefetch, ItemsPerThreadNoInput)
|
|
387
|
-
CUB_DEFINE_SFINAE_GETTER(
|
|
396
|
+
CUB_DEFINE_SFINAE_GETTER(vec_size, vectorized, VecSize)
|
|
388
397
|
CUB_DEFINE_SFINAE_GETTER(items_per_thread_vectorized, vectorized, ItemsPerThreadVectorized)
|
|
389
398
|
|
|
390
399
|
#undef CUB_DEFINE_SFINAE_GETTER
|
|
@@ -441,9 +450,8 @@ struct dispatch_t<StableAddress,
|
|
|
441
450
|
// the policy already handles the compile-time checks if we can vectorize. Do the remaining alignment check here
|
|
442
451
|
if CUB_DETAIL_CONSTEXPR_ISH (Algorithm::vectorized == wrapped_policy.Algorithm())
|
|
443
452
|
{
|
|
444
|
-
const int
|
|
445
|
-
can_vectorize
|
|
446
|
-
&& kernel_source.IsPointerAligned(out, alignment);
|
|
453
|
+
const int vs = vec_size(wrapped_policy.AlgorithmPolicy());
|
|
454
|
+
can_vectorize = kernel_source.CanVectorize(vs, out, ::cuda::std::get<Is>(in)...);
|
|
447
455
|
}
|
|
448
456
|
|
|
449
457
|
int ipt = 0;
|
|
@@ -14,19 +14,17 @@
|
|
|
14
14
|
#endif // no system header
|
|
15
15
|
|
|
16
16
|
#include <cub/agent/agent_for.cuh>
|
|
17
|
-
#include <cub/detail/fast_modulo_division.cuh> // fast_div_mod
|
|
18
17
|
#include <cub/detail/mdspan_utils.cuh> // is_sub_size_static
|
|
19
18
|
#include <cub/detail/type_traits.cuh> // implicit_prom_t
|
|
20
19
|
|
|
21
|
-
#include <cuda/std/__fwd/span.h>
|
|
22
20
|
#include <cuda/std/__type_traits/enable_if.h>
|
|
23
21
|
#include <cuda/std/__type_traits/integral_constant.h>
|
|
24
22
|
#include <cuda/std/__type_traits/is_convertible.h>
|
|
25
23
|
#include <cuda/std/__type_traits/is_reference.h>
|
|
26
24
|
#include <cuda/std/__type_traits/is_trivially_constructible.h>
|
|
27
|
-
#include <cuda/std/__type_traits/
|
|
25
|
+
#include <cuda/std/__type_traits/is_trivially_copy_assignable.h>
|
|
28
26
|
#include <cuda/std/__type_traits/is_trivially_destructible.h>
|
|
29
|
-
#include <cuda/std/__type_traits/
|
|
27
|
+
#include <cuda/std/__type_traits/is_trivially_move_assignable.h>
|
|
30
28
|
#include <cuda/std/__type_traits/make_unsigned.h>
|
|
31
29
|
#include <cuda/std/__utility/integer_sequence.h>
|
|
32
30
|
#include <cuda/std/cstddef> // size_t
|
|
@@ -140,16 +138,21 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::for_policy_t::block_threads) //
|
|
|
140
138
|
* ForEachInExtents
|
|
141
139
|
**********************************************************************************************************************/
|
|
142
140
|
|
|
143
|
-
//
|
|
144
|
-
|
|
145
|
-
|
|
141
|
+
// Retrieves the extent (dimension size) at a specific position in a multi-dimensional array
|
|
142
|
+
//
|
|
143
|
+
// This function efficiently returns the extent at the given position, optimizing for static extents by returning
|
|
144
|
+
// compile-time constants when possible. For dynamic extents, it returns the precomputed value to avoid runtime
|
|
145
|
+
// computation overhead.
|
|
146
|
+
template <int Position, typename ExtentType, typename FastDivModType>
|
|
147
|
+
_CCCL_DEVICE_API auto extent_at(ExtentType extents, FastDivModType dynamic_extent)
|
|
146
148
|
{
|
|
147
|
-
if constexpr (ExtentType::static_extent(
|
|
149
|
+
if constexpr (ExtentType::static_extent(Position) != ::cuda::std::dynamic_extent)
|
|
148
150
|
{
|
|
149
151
|
using extent_index_type = typename ExtentType::index_type;
|
|
150
152
|
using index_type = implicit_prom_t<extent_index_type>;
|
|
151
153
|
using unsigned_index_type = ::cuda::std::make_unsigned_t<index_type>;
|
|
152
|
-
|
|
154
|
+
constexpr auto extent = extents.static_extent(Position);
|
|
155
|
+
return static_cast<unsigned_index_type>(extent);
|
|
153
156
|
}
|
|
154
157
|
else
|
|
155
158
|
{
|
|
@@ -157,17 +160,22 @@ _CCCL_DEVICE _CCCL_FORCEINLINE auto extent_at(ExtentType extents, FastDivModType
|
|
|
157
160
|
}
|
|
158
161
|
}
|
|
159
162
|
|
|
160
|
-
//
|
|
161
|
-
//
|
|
162
|
-
|
|
163
|
-
|
|
163
|
+
// Computes the product of extents in a specified range for multi-dimensional indexing.
|
|
164
|
+
// This function calculates the product of all extent dimensions from Start (inclusive) to End (exclusive).
|
|
165
|
+
//
|
|
166
|
+
// Performance characteristics:
|
|
167
|
+
// - Static extents in range: Product computed at compile-time, zero runtime cost
|
|
168
|
+
// - Dynamic extents present: Returns precomputed value, avoiding runtime multiplication
|
|
169
|
+
template <int Start, int End, typename ExtentType, typename FastDivModType>
|
|
170
|
+
_CCCL_DEVICE_API auto get_extents_sub_size(ExtentType extents, FastDivModType extent_sub_size)
|
|
164
171
|
{
|
|
165
|
-
if constexpr (cub::detail::
|
|
172
|
+
if constexpr (cub::detail::are_extents_in_range_static<ExtentType>(Start, End))
|
|
166
173
|
{
|
|
167
174
|
using extent_index_type = typename ExtentType::index_type;
|
|
168
175
|
using index_type = implicit_prom_t<extent_index_type>;
|
|
169
176
|
using unsigned_index_type = ::cuda::std::make_unsigned_t<index_type>;
|
|
170
|
-
|
|
177
|
+
auto sub_size = cub::detail::size_range(extents, Start, End);
|
|
178
|
+
return static_cast<unsigned_index_type>(sub_size);
|
|
171
179
|
}
|
|
172
180
|
else
|
|
173
181
|
{
|
|
@@ -175,49 +183,76 @@ _CCCL_DEVICE _CCCL_FORCEINLINE auto get_extents_sub_size(ExtentType extents, Fas
|
|
|
175
183
|
}
|
|
176
184
|
}
|
|
177
185
|
|
|
178
|
-
|
|
179
|
-
|
|
186
|
+
// Converts a linear index to a multi-dimensional coordinate at a specific position.
|
|
187
|
+
//
|
|
188
|
+
// This function performs the mathematical conversion from a linear (flat) index to the coordinate value at a specific
|
|
189
|
+
// position in a multi-dimensional array. It supports both row-major (layout_right) and column-major (layout_left)
|
|
190
|
+
// memory layouts, which affects the indexing calculation order.
|
|
191
|
+
//
|
|
192
|
+
// The mathematical formulation depends on the layout:
|
|
193
|
+
// - Right layout (row-major): index_i = (index / product(extent[j] for j in [i+1, rank-1])) % extent[i]
|
|
194
|
+
// - Left layout (column-major): index_i = (index / product(extent[j] for j in [0, i])) % extent[i]
|
|
195
|
+
//
|
|
196
|
+
// This function leverages precomputed fast division and modulo operations to minimize runtime arithmetic overhead.
|
|
197
|
+
template <bool IsLayoutRight, int Position, typename IndexType, typename ExtentType, typename FastDivModType>
|
|
198
|
+
_CCCL_DEVICE_API auto
|
|
180
199
|
coordinate_at(IndexType index, ExtentType extents, FastDivModType extent_sub_size, FastDivModType dynamic_extent)
|
|
181
200
|
{
|
|
182
201
|
using cub::detail::for_each::extent_at;
|
|
183
202
|
using cub::detail::for_each::get_extents_sub_size;
|
|
184
203
|
using extent_index_type = typename ExtentType::index_type;
|
|
185
|
-
|
|
186
|
-
|
|
204
|
+
constexpr auto start = IsLayoutRight ? Position + 1 : 0;
|
|
205
|
+
constexpr auto end = IsLayoutRight ? ExtentType::rank() : Position;
|
|
206
|
+
return static_cast<extent_index_type>((index / get_extents_sub_size<start, end>(extents, extent_sub_size))
|
|
207
|
+
% extent_at<Position>(extents, dynamic_extent));
|
|
187
208
|
}
|
|
188
209
|
|
|
189
|
-
|
|
210
|
+
// Function object wrapper for applying operations with multi-dimensional coordinate conversion.
|
|
211
|
+
//
|
|
212
|
+
// The wrapped operation will be called with signature: `op(linear_index, coord_0, coord_1, ..., coord_n)`
|
|
213
|
+
// where the number of coordinate parameters matches the rank of the extents object.
|
|
214
|
+
//
|
|
215
|
+
// This wrapper is used internally by DeviceFor::ForEachInLayout/ForEachInExtents
|
|
216
|
+
template <typename OpT, typename ExtentsType, bool IsLayoutRight, typename FastDivModArrayT>
|
|
190
217
|
struct op_wrapper_extents_t
|
|
191
218
|
{
|
|
192
|
-
OpT op;
|
|
193
|
-
|
|
194
|
-
FastDivModArrayT sub_sizes_div_array;
|
|
195
|
-
FastDivModArrayT extents_mod_array;
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
219
|
+
OpT op; ///< The user-provided operation to be called with coordinates
|
|
220
|
+
ExtentsType extents; ///< The multi-dimensional extents defining array dimensions
|
|
221
|
+
FastDivModArrayT sub_sizes_div_array; ///< Precomputed fast division values for extent sub-products
|
|
222
|
+
FastDivModArrayT extents_mod_array; ///< Precomputed fast modulo values for individual extents
|
|
223
|
+
|
|
224
|
+
// Internal implementation that converts linear index to coordinates and calls the user operation
|
|
225
|
+
template <typename IndexType, size_t... Positions>
|
|
226
|
+
_CCCL_DEVICE_API void impl(IndexType i, ::cuda::std::index_sequence<Positions...>)
|
|
199
227
|
{
|
|
200
228
|
using cub::detail::for_each::coordinate_at;
|
|
201
|
-
op(i,
|
|
229
|
+
op(i,
|
|
230
|
+
coordinate_at<IsLayoutRight, Positions>(
|
|
231
|
+
i, extents, sub_sizes_div_array[Positions], extents_mod_array[Positions])...);
|
|
202
232
|
}
|
|
203
233
|
|
|
204
|
-
|
|
205
|
-
|
|
234
|
+
// Internal implementation that converts linear index to coordinates and calls the user operation
|
|
235
|
+
template <typename IndexType, size_t... Positions>
|
|
236
|
+
_CCCL_DEVICE_API void impl(IndexType i, ::cuda::std::index_sequence<Positions...>) const
|
|
206
237
|
{
|
|
207
238
|
using cub::detail::for_each::coordinate_at;
|
|
208
|
-
op(i,
|
|
239
|
+
op(i,
|
|
240
|
+
coordinate_at<IsLayoutRight, Positions>(
|
|
241
|
+
i, extents, sub_sizes_div_array[Positions], extents_mod_array[Positions])...);
|
|
209
242
|
}
|
|
210
243
|
|
|
211
|
-
|
|
212
|
-
|
|
244
|
+
// Function call operator that processes a linear index by converting it to multi-dimensional coordinates
|
|
245
|
+
template <typename IndexType>
|
|
246
|
+
_CCCL_DEVICE_API void operator()(IndexType i)
|
|
213
247
|
{
|
|
214
|
-
impl(i, ::cuda::std::make_index_sequence<
|
|
248
|
+
impl(i, ::cuda::std::make_index_sequence<ExtentsType::rank()>{});
|
|
215
249
|
}
|
|
216
250
|
|
|
217
|
-
|
|
218
|
-
|
|
251
|
+
// Function call operator that processes a linear index by converting it to multi-dimensional coordinates
|
|
252
|
+
template <typename IndexType>
|
|
253
|
+
_CCCL_DEVICE_API void operator()(IndexType i) const
|
|
219
254
|
{
|
|
220
|
-
impl(i, ::cuda::std::make_index_sequence<
|
|
255
|
+
impl(i, ::cuda::std::make_index_sequence<ExtentsType::rank()>{});
|
|
221
256
|
}
|
|
222
257
|
};
|
|
223
258
|
|
|
@@ -47,9 +47,7 @@
|
|
|
47
47
|
|
|
48
48
|
CUB_NAMESPACE_BEGIN
|
|
49
49
|
|
|
50
|
-
namespace detail
|
|
51
|
-
{
|
|
52
|
-
namespace reduce
|
|
50
|
+
namespace detail::reduce
|
|
53
51
|
{
|
|
54
52
|
|
|
55
53
|
/**
|
|
@@ -580,7 +578,6 @@ CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(int(
|
|
|
580
578
|
}
|
|
581
579
|
}
|
|
582
580
|
|
|
583
|
-
} // namespace reduce
|
|
584
|
-
} // namespace detail
|
|
581
|
+
} // namespace detail::reduce
|
|
585
582
|
|
|
586
583
|
CUB_NAMESPACE_END
|
|
@@ -42,9 +42,7 @@
|
|
|
42
42
|
|
|
43
43
|
CUB_NAMESPACE_BEGIN
|
|
44
44
|
|
|
45
|
-
namespace detail
|
|
46
|
-
{
|
|
47
|
-
namespace scan
|
|
45
|
+
namespace detail::scan
|
|
48
46
|
{
|
|
49
47
|
|
|
50
48
|
/******************************************************************************
|
|
@@ -186,7 +184,6 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS))
|
|
|
186
184
|
AgentScanT(temp_storage, d_in, d_out, scan_op, real_init_value).ConsumeRange(num_items, tile_state, start_tile);
|
|
187
185
|
}
|
|
188
186
|
|
|
189
|
-
} // namespace scan
|
|
190
|
-
} // namespace detail
|
|
187
|
+
} // namespace detail::scan
|
|
191
188
|
|
|
192
189
|
CUB_NAMESPACE_END
|
|
@@ -43,9 +43,7 @@
|
|
|
43
43
|
|
|
44
44
|
CUB_NAMESPACE_BEGIN
|
|
45
45
|
|
|
46
|
-
namespace detail
|
|
47
|
-
{
|
|
48
|
-
namespace reduce
|
|
46
|
+
namespace detail::reduce
|
|
49
47
|
{
|
|
50
48
|
|
|
51
49
|
/// Normalize input iterator to segment offset
|
|
@@ -318,7 +316,6 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)
|
|
|
318
316
|
}
|
|
319
317
|
}
|
|
320
318
|
|
|
321
|
-
} // namespace reduce
|
|
322
|
-
} // namespace detail
|
|
319
|
+
} // namespace detail::reduce
|
|
323
320
|
|
|
324
321
|
CUB_NAMESPACE_END
|
|
@@ -217,6 +217,7 @@ _CCCL_DEVICE void transform_kernel_vectorized(
|
|
|
217
217
|
{
|
|
218
218
|
constexpr int block_dim = VectorizedPolicy::block_threads;
|
|
219
219
|
constexpr int items_per_thread = VectorizedPolicy::items_per_thread_vectorized;
|
|
220
|
+
constexpr int vec_size = VectorizedPolicy::vec_size;
|
|
220
221
|
_CCCL_ASSERT(!can_vectorize || (items_per_thread == num_elem_per_thread_prefetch), "");
|
|
221
222
|
constexpr int tile_size = block_dim * items_per_thread;
|
|
222
223
|
const Offset offset = static_cast<Offset>(blockIdx.x) * tile_size;
|
|
@@ -241,23 +242,13 @@ _CCCL_DEVICE void transform_kernel_vectorized(
|
|
|
241
242
|
out += offset;
|
|
242
243
|
}
|
|
243
244
|
|
|
244
|
-
|
|
245
|
-
using load_store_t = decltype(load_store_type<load_store_size>());
|
|
246
|
-
using output_t = it_value_t<RandomAccessIteratorOut>;
|
|
245
|
+
using output_t = it_value_t<RandomAccessIteratorOut>;
|
|
247
246
|
using result_t = ::cuda::std::decay_t<::cuda::std::invoke_result_t<F, const it_value_t<RandomAccessIteratorsIn>&...>>;
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
(sizeof(it_value_t<RandomAccessIteratorsIn>)
|
|
251
|
-
* THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorsIn>) ...,
|
|
252
|
-
size_of<output_t>)};
|
|
253
|
-
constexpr int load_store_count = (items_per_thread * element_size) / load_store_size;
|
|
247
|
+
constexpr int load_store_count = items_per_thread / vec_size;
|
|
248
|
+
static_assert(items_per_thread % vec_size == 0, "The items per thread must be a multiple of the vector size");
|
|
254
249
|
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
constexpr bool can_vectorize_store =
|
|
259
|
-
THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorOut>
|
|
260
|
-
&& THRUST_NS_QUALIFIER::is_trivially_relocatable_v<output_t> && size_of<output_t> == element_size;
|
|
250
|
+
constexpr bool can_vectorize_store = THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorOut>
|
|
251
|
+
&& THRUST_NS_QUALIFIER::is_trivially_relocatable_v<output_t>;
|
|
261
252
|
|
|
262
253
|
// if we can vectorize, we convert f's return type to the output type right away, so we can reinterpret later
|
|
263
254
|
using THRUST_NS_QUALIFIER::cuda_cub::core::detail::uninitialized_array;
|
|
@@ -266,10 +257,15 @@ _CCCL_DEVICE void transform_kernel_vectorized(
|
|
|
266
257
|
auto provide_array = [&](auto... inputs) {
|
|
267
258
|
// load inputs
|
|
268
259
|
[[maybe_unused]] auto load_tile = [](auto in, auto& input) {
|
|
260
|
+
using it_t = decltype(in);
|
|
261
|
+
using value_t = it_value_t<it_t>;
|
|
269
262
|
if constexpr (THRUST_NS_QUALIFIER::is_contiguous_iterator_v<decltype(in)>)
|
|
270
263
|
{
|
|
271
|
-
|
|
272
|
-
|
|
264
|
+
// TODO(bgruber): we could add a max_load_store_size to the policy to avoid huge load types and huge alignment
|
|
265
|
+
// requirements
|
|
266
|
+
using load_t = decltype(load_store_type<sizeof(value_t) * vec_size>());
|
|
267
|
+
auto in_vec = reinterpret_cast<const load_t*>(in) + threadIdx.x;
|
|
268
|
+
auto input_vec = reinterpret_cast<load_t*>(input.data());
|
|
273
269
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
274
270
|
for (int i = 0; i < load_store_count; ++i)
|
|
275
271
|
{
|
|
@@ -278,15 +274,14 @@ _CCCL_DEVICE void transform_kernel_vectorized(
|
|
|
278
274
|
}
|
|
279
275
|
else
|
|
280
276
|
{
|
|
281
|
-
|
|
282
|
-
in += threadIdx.x * elems;
|
|
277
|
+
in += threadIdx.x * vec_size;
|
|
283
278
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
284
279
|
for (int i = 0; i < load_store_count; ++i)
|
|
285
280
|
{
|
|
286
281
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
287
|
-
for (int j = 0; j <
|
|
282
|
+
for (int j = 0; j < vec_size; ++j)
|
|
288
283
|
{
|
|
289
|
-
input[i *
|
|
284
|
+
input[i * vec_size + j] = in[i * vec_size * VectorizedPolicy::block_threads + j];
|
|
290
285
|
}
|
|
291
286
|
}
|
|
292
287
|
}
|
|
@@ -310,8 +305,9 @@ _CCCL_DEVICE void transform_kernel_vectorized(
|
|
|
310
305
|
if constexpr (can_vectorize_store)
|
|
311
306
|
{
|
|
312
307
|
// vector path
|
|
313
|
-
|
|
314
|
-
auto
|
|
308
|
+
using store_t = decltype(load_store_type<sizeof(output_t) * vec_size>());
|
|
309
|
+
auto output_vec = reinterpret_cast<const store_t*>(output.data());
|
|
310
|
+
auto out_vec = reinterpret_cast<store_t*>(out) + threadIdx.x;
|
|
315
311
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
316
312
|
for (int i = 0; i < load_store_count; ++i)
|
|
317
313
|
{
|
|
@@ -321,15 +317,14 @@ _CCCL_DEVICE void transform_kernel_vectorized(
|
|
|
321
317
|
else
|
|
322
318
|
{
|
|
323
319
|
// serial path
|
|
324
|
-
|
|
325
|
-
out += threadIdx.x * elems;
|
|
320
|
+
out += threadIdx.x * vec_size;
|
|
326
321
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
327
322
|
for (int i = 0; i < load_store_count; ++i)
|
|
328
323
|
{
|
|
329
324
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
330
|
-
for (int j = 0; j <
|
|
325
|
+
for (int j = 0; j < vec_size; ++j)
|
|
331
326
|
{
|
|
332
|
-
out[i *
|
|
327
|
+
out[i * vec_size * VectorizedPolicy::block_threads + j] = output[i * vec_size + j];
|
|
333
328
|
}
|
|
334
329
|
}
|
|
335
330
|
}
|
|
@@ -43,9 +43,7 @@
|
|
|
43
43
|
|
|
44
44
|
CUB_NAMESPACE_BEGIN
|
|
45
45
|
|
|
46
|
-
namespace detail
|
|
47
|
-
{
|
|
48
|
-
namespace adjacent_difference
|
|
46
|
+
namespace detail::adjacent_difference
|
|
49
47
|
{
|
|
50
48
|
template <typename InputIteratorT, bool MayAlias>
|
|
51
49
|
struct policy_hub
|
|
@@ -64,7 +62,6 @@ struct policy_hub
|
|
|
64
62
|
|
|
65
63
|
using MaxPolicy = Policy500;
|
|
66
64
|
};
|
|
67
|
-
} // namespace adjacent_difference
|
|
68
|
-
} // namespace detail
|
|
65
|
+
} // namespace detail::adjacent_difference
|
|
69
66
|
|
|
70
67
|
CUB_NAMESPACE_END
|
|
@@ -43,9 +43,7 @@
|
|
|
43
43
|
|
|
44
44
|
CUB_NAMESPACE_BEGIN
|
|
45
45
|
|
|
46
|
-
namespace detail
|
|
47
|
-
{
|
|
48
|
-
namespace batch_memcpy
|
|
46
|
+
namespace detail::batch_memcpy
|
|
49
47
|
{
|
|
50
48
|
/**
|
|
51
49
|
* Parameterizable tuning policy type for AgentBatchMemcpy
|
|
@@ -115,7 +113,6 @@ struct policy_hub
|
|
|
115
113
|
|
|
116
114
|
using MaxPolicy = Policy700;
|
|
117
115
|
};
|
|
118
|
-
} // namespace batch_memcpy
|
|
119
|
-
} // namespace detail
|
|
116
|
+
} // namespace detail::batch_memcpy
|
|
120
117
|
|
|
121
118
|
CUB_NAMESPACE_END
|
|
@@ -42,9 +42,7 @@
|
|
|
42
42
|
|
|
43
43
|
CUB_NAMESPACE_BEGIN
|
|
44
44
|
|
|
45
|
-
namespace detail
|
|
46
|
-
{
|
|
47
|
-
namespace for_each
|
|
45
|
+
namespace detail::for_each
|
|
48
46
|
{
|
|
49
47
|
|
|
50
48
|
struct policy_hub_t
|
|
@@ -57,7 +55,6 @@ struct policy_hub_t
|
|
|
57
55
|
using MaxPolicy = policy_500_t;
|
|
58
56
|
};
|
|
59
57
|
|
|
60
|
-
} // namespace for_each
|
|
61
|
-
} // namespace detail
|
|
58
|
+
} // namespace detail::for_each
|
|
62
59
|
|
|
63
60
|
CUB_NAMESPACE_END
|
|
@@ -46,9 +46,7 @@
|
|
|
46
46
|
|
|
47
47
|
CUB_NAMESPACE_BEGIN
|
|
48
48
|
|
|
49
|
-
namespace detail
|
|
50
|
-
{
|
|
51
|
-
namespace histogram
|
|
49
|
+
namespace detail::histogram
|
|
52
50
|
{
|
|
53
51
|
enum class primitive_sample
|
|
54
52
|
{
|
|
@@ -272,7 +270,6 @@ struct policy_hub
|
|
|
272
270
|
|
|
273
271
|
using MaxPolicy = Policy1000;
|
|
274
272
|
};
|
|
275
|
-
} // namespace histogram
|
|
276
|
-
} // namespace detail
|
|
273
|
+
} // namespace detail::histogram
|
|
277
274
|
|
|
278
275
|
CUB_NAMESPACE_END
|
|
@@ -42,9 +42,7 @@
|
|
|
42
42
|
|
|
43
43
|
CUB_NAMESPACE_BEGIN
|
|
44
44
|
|
|
45
|
-
namespace detail
|
|
46
|
-
{
|
|
47
|
-
namespace merge
|
|
45
|
+
namespace detail::merge
|
|
48
46
|
{
|
|
49
47
|
template <typename KeyT, typename ValueT>
|
|
50
48
|
struct policy_hub
|
|
@@ -73,7 +71,6 @@ struct policy_hub
|
|
|
73
71
|
|
|
74
72
|
using max_policy = policy600;
|
|
75
73
|
};
|
|
76
|
-
} // namespace merge
|
|
77
|
-
} // namespace detail
|
|
74
|
+
} // namespace detail::merge
|
|
78
75
|
|
|
79
76
|
CUB_NAMESPACE_END
|
|
@@ -62,6 +62,14 @@ struct MergeSortPolicyWrapper<StaticPolicyT, ::cuda::std::void_t<decltype(Static
|
|
|
62
62
|
{}
|
|
63
63
|
|
|
64
64
|
CUB_DEFINE_SUB_POLICY_GETTER(MergeSort);
|
|
65
|
+
|
|
66
|
+
#if defined(CUB_ENABLE_POLICY_PTX_JSON)
|
|
67
|
+
_CCCL_DEVICE static constexpr auto EncodedPolicy()
|
|
68
|
+
{
|
|
69
|
+
using namespace ptx_json;
|
|
70
|
+
return object<key<"MergeSortPolicy">() = MergeSort().EncodedPolicy()>();
|
|
71
|
+
}
|
|
72
|
+
#endif
|
|
65
73
|
};
|
|
66
74
|
|
|
67
75
|
template <typename PolicyT>
|
|
@@ -46,9 +46,7 @@
|
|
|
46
46
|
|
|
47
47
|
CUB_NAMESPACE_BEGIN
|
|
48
48
|
|
|
49
|
-
namespace detail
|
|
50
|
-
{
|
|
51
|
-
namespace radix
|
|
49
|
+
namespace detail::radix
|
|
52
50
|
{
|
|
53
51
|
// sm90 default
|
|
54
52
|
template <size_t KeySize, size_t ValueSize, size_t OffsetSize>
|
|
@@ -1062,7 +1060,6 @@ struct policy_hub
|
|
|
1062
1060
|
using MaxPolicy = Policy1000;
|
|
1063
1061
|
};
|
|
1064
1062
|
|
|
1065
|
-
} // namespace radix
|
|
1066
|
-
} // namespace detail
|
|
1063
|
+
} // namespace detail::radix
|
|
1067
1064
|
|
|
1068
1065
|
CUB_NAMESPACE_END
|
|
@@ -50,9 +50,7 @@
|
|
|
50
50
|
|
|
51
51
|
CUB_NAMESPACE_BEGIN
|
|
52
52
|
|
|
53
|
-
namespace detail
|
|
54
|
-
{
|
|
55
|
-
namespace reduce_by_key
|
|
53
|
+
namespace detail::reduce_by_key
|
|
56
54
|
{
|
|
57
55
|
enum class primitive_key
|
|
58
56
|
{
|
|
@@ -939,7 +937,6 @@ struct policy_hub
|
|
|
939
937
|
};
|
|
940
938
|
using MaxPolicy = Policy1000;
|
|
941
939
|
};
|
|
942
|
-
} // namespace reduce_by_key
|
|
943
|
-
} // namespace detail
|
|
940
|
+
} // namespace detail::reduce_by_key
|
|
944
941
|
|
|
945
942
|
CUB_NAMESPACE_END
|
|
@@ -52,9 +52,7 @@
|
|
|
52
52
|
|
|
53
53
|
CUB_NAMESPACE_BEGIN
|
|
54
54
|
|
|
55
|
-
namespace detail
|
|
56
|
-
{
|
|
57
|
-
namespace rle
|
|
55
|
+
namespace detail::rle
|
|
58
56
|
{
|
|
59
57
|
enum class primitive_key
|
|
60
58
|
{
|
|
@@ -670,7 +668,6 @@ struct policy_hub
|
|
|
670
668
|
using MaxPolicy = Policy1000;
|
|
671
669
|
};
|
|
672
670
|
} // namespace non_trivial_runs
|
|
673
|
-
} // namespace rle
|
|
674
|
-
} // namespace detail
|
|
671
|
+
} // namespace detail::rle
|
|
675
672
|
|
|
676
673
|
CUB_NAMESPACE_END
|
|
@@ -53,9 +53,7 @@
|
|
|
53
53
|
|
|
54
54
|
CUB_NAMESPACE_BEGIN
|
|
55
55
|
|
|
56
|
-
namespace detail
|
|
57
|
-
{
|
|
58
|
-
namespace scan
|
|
56
|
+
namespace detail::scan
|
|
59
57
|
{
|
|
60
58
|
enum class keep_rejects
|
|
61
59
|
{
|
|
@@ -615,7 +613,6 @@ struct policy_hub
|
|
|
615
613
|
|
|
616
614
|
using MaxPolicy = Policy1000;
|
|
617
615
|
};
|
|
618
|
-
} // namespace scan
|
|
619
|
-
} // namespace detail
|
|
616
|
+
} // namespace detail::scan
|
|
620
617
|
|
|
621
618
|
CUB_NAMESPACE_END
|
|
@@ -49,9 +49,7 @@
|
|
|
49
49
|
|
|
50
50
|
CUB_NAMESPACE_BEGIN
|
|
51
51
|
|
|
52
|
-
namespace detail
|
|
53
|
-
{
|
|
54
|
-
namespace scan_by_key
|
|
52
|
+
namespace detail::scan_by_key
|
|
55
53
|
{
|
|
56
54
|
enum class primitive_accum
|
|
57
55
|
{
|
|
@@ -1007,7 +1005,6 @@ struct policy_hub
|
|
|
1007
1005
|
|
|
1008
1006
|
using MaxPolicy = Policy1000;
|
|
1009
1007
|
};
|
|
1010
|
-
} // namespace scan_by_key
|
|
1011
|
-
} // namespace detail
|
|
1008
|
+
} // namespace detail::scan_by_key
|
|
1012
1009
|
|
|
1013
1010
|
CUB_NAMESPACE_END
|
|
@@ -43,9 +43,7 @@
|
|
|
43
43
|
|
|
44
44
|
CUB_NAMESPACE_BEGIN
|
|
45
45
|
|
|
46
|
-
namespace detail
|
|
47
|
-
{
|
|
48
|
-
namespace segmented_sort
|
|
46
|
+
namespace detail::segmented_sort
|
|
49
47
|
{
|
|
50
48
|
|
|
51
49
|
template <typename PolicyT, typename = void>
|
|
@@ -395,7 +393,6 @@ struct policy_hub
|
|
|
395
393
|
|
|
396
394
|
using MaxPolicy = Policy860;
|
|
397
395
|
};
|
|
398
|
-
} // namespace segmented_sort
|
|
399
|
-
} // namespace detail
|
|
396
|
+
} // namespace detail::segmented_sort
|
|
400
397
|
|
|
401
398
|
CUB_NAMESPACE_END
|