cuda-cccl 0.2.1__cp312-cp312-manylinux_2_26_x86_64.whl → 0.3.0__cp312-cp312-manylinux_2_26_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/cooperative/experimental/_common.py +3 -1
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +68 -62
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +23 -0
- cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +2 -18
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +23 -0
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +22 -14
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +14 -0
- cuda/cccl/headers/include/cub/agent/agent_topk.cuh +764 -0
- cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +94 -13
- cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
- cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +20 -6
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +0 -2
- cuda/cccl/headers/include/cub/cub.cuh +8 -0
- cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +13 -32
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +61 -0
- cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +3 -3
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +18 -26
- cuda/cccl/headers/include/cub/detail/ptx-json/value.h +3 -2
- cuda/cccl/headers/include/cub/device/device_copy.cuh +116 -27
- cuda/cccl/headers/include/cub/device/device_for.cuh +2 -10
- cuda/cccl/headers/include/cub/device/device_histogram.cuh +8 -8
- cuda/cccl/headers/include/cub/device/device_partition.cuh +5 -1
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +775 -163
- cuda/cccl/headers/include/cub/device/device_scan.cuh +306 -0
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1 -0
- cuda/cccl/headers/include/cub/device/device_topk.cuh +511 -0
- cuda/cccl/headers/include/cub/device/device_transform.cuh +11 -2
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +43 -44
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +1 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +321 -262
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +100 -171
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +629 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +8 -1
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +29 -24
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +4 -10
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +57 -10
- cuda/cccl/headers/include/cub/device/dispatch/kernels/three_way_partition.cuh +80 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +39 -15
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +3 -15
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +10 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +203 -51
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +36 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +85 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +55 -19
- cuda/cccl/headers/include/cub/util_device.cuh +72 -51
- cuda/cccl/headers/include/cub/util_ptx.cuh +8 -8
- cuda/cccl/headers/include/cub/util_type.cuh +15 -20
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +63 -10
- cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +3 -1
- cuda/cccl/headers/include/cuda/__device/all_devices.h +3 -6
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +3 -3
- cuda/cccl/headers/include/cuda/__device/attributes.h +7 -7
- cuda/cccl/headers/include/cuda/__device/device_ref.h +12 -10
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +260 -30
- cuda/cccl/headers/include/cuda/__event/event.h +7 -8
- cuda/cccl/headers/include/cuda/__event/event_ref.h +4 -5
- cuda/cccl/headers/include/cuda/__event/timed_event.h +4 -5
- cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +97 -52
- cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +5 -6
- cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +5 -0
- cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +5 -0
- cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +6 -4
- cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +5 -0
- cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +6 -1
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +28 -27
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +39 -33
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +18 -11
- cuda/cccl/headers/include/cuda/__iterator/zip_function.h +10 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +83 -44
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +14 -10
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +1 -1
- cuda/cccl/headers/include/cuda/__memory/address_space.h +28 -12
- cuda/cccl/headers/include/cuda/__memory/check_address.h +34 -29
- cuda/cccl/headers/include/cuda/__runtime/types.h +41 -0
- cuda/cccl/headers/include/cuda/__stream/stream.h +2 -3
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +17 -12
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +161 -92
- cuda/cccl/headers/include/cuda/__utility/__basic_any/storage.h +3 -2
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +2 -2
- cuda/cccl/headers/include/cuda/pipeline +2 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +0 -6
- cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +3 -3
- cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +3 -3
- cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +2 -3
- cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +2 -3
- cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +6 -8
- cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +3 -3
- cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +4 -4
- cuda/cccl/headers/include/cuda/std/__atomic/order.h +1 -1
- cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +1 -1
- cuda/cccl/headers/include/cuda/std/__cccl/assert.h +18 -7
- cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +4 -115
- cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +8 -5
- cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +22 -3
- cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +4 -4
- cuda/cccl/headers/include/cuda/std/__cccl/os.h +6 -0
- cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +2 -0
- cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +25 -0
- cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +54 -0
- cuda/cccl/headers/include/cuda/std/__chrono/day.h +162 -0
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +503 -0
- cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +55 -0
- cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +46 -0
- cuda/cccl/headers/include/cuda/std/__chrono/month.h +187 -0
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +60 -0
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +80 -0
- cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +258 -0
- cuda/cccl/headers/include/cuda/std/__chrono/year.h +186 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +10 -5
- cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +44 -17
- cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +67 -0
- cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +1 -4
- cuda/cccl/headers/include/cuda/std/__cstring/memcpy.h +12 -9
- cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +12 -12
- cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +1 -8
- cuda/cccl/headers/include/cuda/std/__expected/expected.h +31 -38
- cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +15 -12
- cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +3 -0
- cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +4 -4
- cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +2 -2
- cuda/cccl/headers/include/cuda/std/__functional/bind.h +2 -2
- cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +4 -4
- cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +2 -2
- cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +2 -2
- cuda/cccl/headers/include/cuda/std/__functional/function.h +10 -11
- cuda/cccl/headers/include/cuda/std/__functional/hash.h +5 -6
- cuda/cccl/headers/include/cuda/std/__functional/identity.h +4 -8
- cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +2 -4
- cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +16 -18
- cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +2 -3
- cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +2 -3
- cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +3 -3
- cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +2 -2
- cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +16 -25
- cuda/cccl/headers/include/cuda/std/__iterator/advance.h +3 -3
- cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +3 -3
- cuda/cccl/headers/include/cuda/std/__iterator/distance.h +1 -1
- cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +21 -28
- cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +1 -1
- cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +28 -39
- cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +3 -4
- cuda/cccl/headers/include/cuda/std/__iterator/next.h +2 -2
- cuda/cccl/headers/include/cuda/std/__iterator/prev.h +2 -3
- cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +12 -41
- cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +2 -2
- cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +3 -4
- cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +31 -31
- cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +45 -45
- cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +3 -2
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +1 -1
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +1 -1
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +1 -1
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +2 -1
- cuda/cccl/headers/include/cuda/std/__memory/addressof.h +5 -2
- cuda/cccl/headers/include/cuda/std/__memory/allocator.h +30 -30
- cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +1 -1
- cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +66 -86
- cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +2 -2
- cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +3 -3
- cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +5 -2
- cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +30 -45
- cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +8 -12
- cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +21 -23
- cuda/cccl/headers/include/cuda/std/__new/launder.h +4 -0
- cuda/cccl/headers/include/cuda/std/__optional/hash.h +2 -2
- cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +2 -1
- cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +2 -1
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +1 -1
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +1 -1
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +1 -1
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +1 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +4 -13
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +12 -22
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +9 -18
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +2 -2
- cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +3 -4
- cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +2 -2
- cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +7 -8
- cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +4 -13
- cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +1 -1
- cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +2 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +3 -5
- cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +1 -1
- cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +3 -44
- cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +2 -28
- cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +9 -5
- cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +3 -3
- cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +1 -4
- cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +3 -34
- cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +3 -29
- cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +2 -16
- cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +1 -1
- cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +4 -21
- cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +1 -1
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +1 -1
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +3 -3
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +4 -24
- cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +3 -24
- cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +13 -9
- cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +3 -18
- cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +21 -20
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +3 -17
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +4 -31
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +3 -42
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +5 -19
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +3 -19
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +3 -17
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +2 -15
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +13 -28
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +2 -17
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +2 -16
- cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +3 -18
- cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +1 -1
- cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +1 -1
- cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +4 -3
- cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +2 -2
- cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +2 -16
- cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +2 -2
- cuda/cccl/headers/include/cuda/std/__utility/declval.h +17 -4
- cuda/cccl/headers/include/cuda/std/__utility/exchange.h +1 -1
- cuda/cccl/headers/include/cuda/std/__utility/forward.h +1 -1
- cuda/cccl/headers/include/cuda/std/__utility/in_place.h +29 -0
- cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +2 -2
- cuda/cccl/headers/include/cuda/std/__utility/move.h +1 -1
- cuda/cccl/headers/include/cuda/std/__utility/pair.h +8 -9
- cuda/cccl/headers/include/cuda/std/array +2 -2
- cuda/cccl/headers/include/cuda/std/atomic +20 -28
- cuda/cccl/headers/include/cuda/std/bitset +1 -1
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +1 -32
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +3 -3
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +16 -1137
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +12 -12
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +4 -4
- cuda/cccl/headers/include/cuda/std/inplace_vector +9 -9
- cuda/cccl/headers/include/cuda/std/ratio +3 -4
- cuda/cccl/headers/include/cuda/std/version +2 -4
- cuda/cccl/headers/include/thrust/advance.h +6 -8
- cuda/cccl/headers/include/thrust/detail/execution_policy.h +61 -21
- cuda/cccl/headers/include/thrust/detail/internal_functional.h +37 -2
- cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +1 -1
- cuda/cccl/headers/include/thrust/detail/reference.h +10 -16
- cuda/cccl/headers/include/thrust/detail/seq.h +37 -25
- cuda/cccl/headers/include/thrust/detail/vector_base.h +2 -4
- cuda/cccl/headers/include/thrust/detail/vector_base.inl +2 -2
- cuda/cccl/headers/include/thrust/distance.h +3 -3
- cuda/cccl/headers/include/thrust/execution_policy.h +202 -335
- cuda/cccl/headers/include/thrust/functional.h +1 -2
- cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +6 -1
- cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +1 -1
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +24 -23
- cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +66 -44
- cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +4 -99
- cuda/cccl/headers/include/thrust/system/cuda/config.h +7 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +151 -40
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +199 -48
- cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +2 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +2 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +15 -13
- cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +22 -19
- cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +12 -42
- cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +16 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +30 -30
- cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +0 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +8 -22
- cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +29 -15
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +7 -5
- cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +3 -27
- cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +10 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +16 -35
- cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +75 -61
- cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +4 -99
- cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +68 -51
- cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +2 -2
- cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +4 -99
- cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +15 -48
- cuda/cccl/parallel/experimental/__init__.py +4 -0
- cuda/cccl/parallel/experimental/_bindings.py +38 -15
- cuda/cccl/parallel/experimental/_bindings.pyi +28 -0
- cuda/cccl/parallel/experimental/_bindings_impl.pyx +176 -9
- cuda/cccl/parallel/experimental/_cccl_interop.py +3 -3
- cuda/cccl/parallel/experimental/algorithms/__init__.py +4 -0
- cuda/cccl/parallel/experimental/algorithms/_reduce.py +0 -2
- cuda/cccl/parallel/experimental/algorithms/_scan.py +0 -2
- cuda/cccl/parallel/experimental/algorithms/_three_way_partition.py +261 -0
- cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-312-x86_64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-312-x86_64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/cu13/cccl/libcccl.c.parallel.so +0 -0
- {cuda_cccl-0.2.1.dist-info → cuda_cccl-0.3.0.dist-info}/METADATA +2 -3
- {cuda_cccl-0.2.1.dist-info → cuda_cccl-0.3.0.dist-info}/RECORD +313 -303
- cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +0 -72
- cuda/cccl/headers/include/thrust/detail/util/align.h +0 -59
- cuda/cccl/headers/include/thrust/system/cpp/detail/par.h +0 -59
- cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_cross_system.h +0 -204
- cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_device_to_device.h +0 -92
- cuda/cccl/headers/include/thrust/system/cuda/detail/par.h +0 -237
- cuda/cccl/headers/include/thrust/system/cuda/detail/par_to_seq.h +0 -95
- cuda/cccl/headers/include/thrust/system/omp/detail/par.h +0 -62
- cuda/cccl/headers/include/thrust/system/tbb/detail/par.h +0 -62
- {cuda_cccl-0.2.1.dist-info → cuda_cccl-0.3.0.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.2.1.dist-info → cuda_cccl-0.3.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -58,7 +58,9 @@ def make_binary_tempfile(content: bytes, suffix: str) -> BinaryIO:
|
|
|
58
58
|
|
|
59
59
|
:return: A binary file-like object representing the temporary file.
|
|
60
60
|
"""
|
|
61
|
-
tmp = tempfile.NamedTemporaryFile(
|
|
61
|
+
tmp = tempfile.NamedTemporaryFile(
|
|
62
|
+
mode="w+b", suffix=suffix, buffering=0, delete=False
|
|
63
|
+
)
|
|
62
64
|
tmp.write(content)
|
|
63
65
|
return tmp
|
|
64
66
|
|
|
@@ -25,22 +25,15 @@
|
|
|
25
25
|
#include <cuda/std/__algorithm/min.h>
|
|
26
26
|
|
|
27
27
|
CUB_NAMESPACE_BEGIN
|
|
28
|
-
namespace detail
|
|
28
|
+
namespace detail::merge
|
|
29
29
|
{
|
|
30
|
-
|
|
31
|
-
{
|
|
32
|
-
template <int ThreadsPerBlock,
|
|
33
|
-
int ItemsPerThread,
|
|
34
|
-
BlockLoadAlgorithm LoadAlgorithm,
|
|
35
|
-
CacheLoadModifier LoadCacheModifier,
|
|
36
|
-
BlockStoreAlgorithm StoreAlgorithm>
|
|
30
|
+
template <int ThreadsPerBlock, int ItemsPerThread, CacheLoadModifier LoadCacheModifier, BlockStoreAlgorithm StoreAlgorithm>
|
|
37
31
|
struct agent_policy_t
|
|
38
32
|
{
|
|
39
33
|
// do not change data member names, policy_wrapper_t depends on it
|
|
40
34
|
static constexpr int BLOCK_THREADS = ThreadsPerBlock;
|
|
41
35
|
static constexpr int ITEMS_PER_THREAD = ItemsPerThread;
|
|
42
36
|
static constexpr int ITEMS_PER_TILE = BLOCK_THREADS * ITEMS_PER_THREAD;
|
|
43
|
-
static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = LoadAlgorithm;
|
|
44
37
|
static constexpr CacheLoadModifier LOAD_MODIFIER = LoadCacheModifier;
|
|
45
38
|
static constexpr BlockStoreAlgorithm STORE_ALGORITHM = StoreAlgorithm;
|
|
46
39
|
};
|
|
@@ -68,34 +61,27 @@ struct agent_t
|
|
|
68
61
|
using items_load_it1 = try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, ItemsIt1>;
|
|
69
62
|
using items_load_it2 = try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, ItemsIt2>;
|
|
70
63
|
|
|
71
|
-
using block_load_keys1 = typename BlockLoadType<Policy, keys_load_it1>::type;
|
|
72
|
-
using block_load_keys2 = typename BlockLoadType<Policy, keys_load_it2>::type;
|
|
73
|
-
using block_load_items1 = typename BlockLoadType<Policy, items_load_it1>::type;
|
|
74
|
-
using block_load_items2 = typename BlockLoadType<Policy, items_load_it2>::type;
|
|
75
|
-
|
|
76
64
|
using block_store_keys = typename BlockStoreType<Policy, KeysOutputIt, key_type>::type;
|
|
77
65
|
using block_store_items = typename BlockStoreType<Policy, ItemsOutputIt, item_type>::type;
|
|
78
66
|
|
|
67
|
+
static constexpr int items_per_thread = Policy::ITEMS_PER_THREAD;
|
|
68
|
+
static constexpr int threads_per_block = Policy::BLOCK_THREADS;
|
|
69
|
+
static constexpr int items_per_tile = Policy::ITEMS_PER_TILE;
|
|
70
|
+
|
|
79
71
|
union temp_storages
|
|
80
72
|
{
|
|
81
|
-
typename block_load_keys1::TempStorage load_keys1;
|
|
82
|
-
typename block_load_keys2::TempStorage load_keys2;
|
|
83
|
-
typename block_load_items1::TempStorage load_items1;
|
|
84
|
-
typename block_load_items2::TempStorage load_items2;
|
|
85
73
|
typename block_store_keys::TempStorage store_keys;
|
|
86
74
|
typename block_store_items::TempStorage store_items;
|
|
87
75
|
|
|
88
|
-
|
|
89
|
-
|
|
76
|
+
// We could change SerialMerge to avoid reading one item out of bounds and drop the + 1 here. But that would
|
|
77
|
+
// introduce more branches (about 10% slower on 2^16 problem sizes on RTX 5090 in a first attempt)
|
|
78
|
+
key_type keys_shared[items_per_tile + 1];
|
|
79
|
+
item_type items_shared[items_per_tile + 1];
|
|
90
80
|
};
|
|
91
81
|
|
|
92
82
|
struct TempStorage : Uninitialized<temp_storages>
|
|
93
83
|
{};
|
|
94
84
|
|
|
95
|
-
static constexpr int items_per_thread = Policy::ITEMS_PER_THREAD;
|
|
96
|
-
static constexpr int threads_per_block = Policy::BLOCK_THREADS;
|
|
97
|
-
static constexpr Offset items_per_tile = Policy::ITEMS_PER_TILE;
|
|
98
|
-
|
|
99
85
|
// Per thread data
|
|
100
86
|
temp_storages& storage;
|
|
101
87
|
keys_load_it1 keys1_in;
|
|
@@ -107,61 +93,86 @@ struct agent_t
|
|
|
107
93
|
KeysOutputIt keys_out;
|
|
108
94
|
ItemsOutputIt items_out;
|
|
109
95
|
CompareOp compare_op;
|
|
110
|
-
Offset*
|
|
96
|
+
Offset* key1_beg_offsets;
|
|
111
97
|
|
|
112
98
|
template <bool IsFullTile>
|
|
113
99
|
_CCCL_DEVICE _CCCL_FORCEINLINE void consume_tile(Offset tile_idx, Offset tile_base, int num_remaining)
|
|
114
100
|
{
|
|
115
|
-
const Offset partition_beg = merge_partitions[tile_idx + 0];
|
|
116
|
-
const Offset partition_end = merge_partitions[tile_idx + 1];
|
|
117
|
-
|
|
118
101
|
const Offset diag0 = items_per_tile * tile_idx;
|
|
119
|
-
|
|
102
|
+
Offset diag1 = diag0 + items_per_tile;
|
|
103
|
+
if constexpr (IsFullTile)
|
|
104
|
+
{
|
|
105
|
+
_CCCL_ASSERT(diag1 <= keys1_count + keys2_count, "");
|
|
106
|
+
}
|
|
107
|
+
else
|
|
108
|
+
{
|
|
109
|
+
diag1 = keys1_count + keys2_count;
|
|
110
|
+
}
|
|
120
111
|
|
|
121
112
|
// compute bounding box for keys1 & keys2
|
|
122
|
-
const Offset keys1_beg =
|
|
123
|
-
const Offset keys1_end =
|
|
113
|
+
const Offset keys1_beg = key1_beg_offsets[tile_idx + 0];
|
|
114
|
+
const Offset keys1_end = key1_beg_offsets[tile_idx + 1];
|
|
124
115
|
const Offset keys2_beg = diag0 - keys1_beg;
|
|
125
116
|
const Offset keys2_end = diag1 - keys1_end;
|
|
126
117
|
|
|
127
118
|
// number of keys per tile
|
|
128
|
-
const int
|
|
129
|
-
const int
|
|
119
|
+
const int keys1_count_tile = static_cast<int>(keys1_end - keys1_beg);
|
|
120
|
+
const int keys2_count_tile = static_cast<int>(keys2_end - keys2_beg);
|
|
121
|
+
if constexpr (IsFullTile)
|
|
122
|
+
{
|
|
123
|
+
_CCCL_ASSERT(keys1_count_tile + keys2_count_tile == items_per_tile, "");
|
|
124
|
+
}
|
|
125
|
+
else
|
|
126
|
+
{
|
|
127
|
+
_CCCL_ASSERT(keys1_count_tile + keys2_count_tile == num_remaining, "");
|
|
128
|
+
}
|
|
130
129
|
|
|
131
130
|
key_type keys_loc[items_per_thread];
|
|
132
131
|
merge_sort::gmem_to_reg<threads_per_block, IsFullTile>(
|
|
133
|
-
keys_loc, keys1_in + keys1_beg, keys2_in + keys2_beg,
|
|
132
|
+
keys_loc, keys1_in + keys1_beg, keys2_in + keys2_beg, keys1_count_tile, keys2_count_tile);
|
|
134
133
|
merge_sort::reg_to_shared<threads_per_block>(&storage.keys_shared[0], keys_loc);
|
|
135
134
|
__syncthreads();
|
|
136
135
|
|
|
137
|
-
//
|
|
136
|
+
// now find the merge path for each thread.
|
|
138
137
|
// we can use int type here, because the number of items in shared memory is limited
|
|
139
|
-
|
|
138
|
+
int diag0_thread = items_per_thread * static_cast<int>(threadIdx.x);
|
|
139
|
+
if constexpr (IsFullTile)
|
|
140
|
+
{
|
|
141
|
+
_CCCL_ASSERT(num_remaining == items_per_tile, "");
|
|
142
|
+
_CCCL_ASSERT(diag0_thread < num_remaining, "");
|
|
143
|
+
}
|
|
144
|
+
else
|
|
145
|
+
{ // for partial tiles, clamp the thread diagonal to the valid items
|
|
146
|
+
diag0_thread = (::cuda::std::min) (diag0_thread, num_remaining);
|
|
147
|
+
}
|
|
140
148
|
|
|
141
|
-
const int
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
149
|
+
const int keys1_beg_thread = MergePath(
|
|
150
|
+
&storage.keys_shared[0],
|
|
151
|
+
&storage.keys_shared[keys1_count_tile],
|
|
152
|
+
keys1_count_tile,
|
|
153
|
+
keys2_count_tile,
|
|
154
|
+
diag0_thread,
|
|
155
|
+
compare_op);
|
|
156
|
+
const int keys2_beg_thread = diag0_thread - keys1_beg_thread;
|
|
146
157
|
|
|
147
|
-
const int
|
|
148
|
-
const int
|
|
158
|
+
const int keys1_count_thread = keys1_count_tile - keys1_beg_thread;
|
|
159
|
+
const int keys2_count_thread = keys2_count_tile - keys2_beg_thread;
|
|
149
160
|
|
|
150
161
|
// perform serial merge
|
|
151
162
|
int indices[items_per_thread];
|
|
152
|
-
|
|
163
|
+
SerialMerge(
|
|
153
164
|
&storage.keys_shared[0],
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
165
|
+
keys1_beg_thread,
|
|
166
|
+
keys2_beg_thread + keys1_count_tile,
|
|
167
|
+
keys1_count_thread,
|
|
168
|
+
keys2_count_thread,
|
|
158
169
|
keys_loc,
|
|
159
170
|
indices,
|
|
160
171
|
compare_op);
|
|
161
|
-
__syncthreads();
|
|
162
172
|
|
|
163
173
|
// write keys
|
|
164
|
-
|
|
174
|
+
__syncthreads(); // sync after reading from SMEM so the block store can use SMEM again
|
|
175
|
+
if constexpr (IsFullTile)
|
|
165
176
|
{
|
|
166
177
|
block_store_keys{storage.store_keys}.Store(keys_out + tile_base, keys_loc);
|
|
167
178
|
}
|
|
@@ -176,9 +187,8 @@ struct agent_t
|
|
|
176
187
|
{
|
|
177
188
|
item_type items_loc[items_per_thread];
|
|
178
189
|
merge_sort::gmem_to_reg<threads_per_block, IsFullTile>(
|
|
179
|
-
items_loc, items1_in + keys1_beg, items2_in + keys2_beg,
|
|
180
|
-
__syncthreads(); // block_store_keys above uses
|
|
181
|
-
// to it
|
|
190
|
+
items_loc, items1_in + keys1_beg, items2_in + keys2_beg, keys1_count_tile, keys2_count_tile);
|
|
191
|
+
__syncthreads(); // block_store_keys above uses SMEM, so make sure all threads are done before we write to it
|
|
182
192
|
merge_sort::reg_to_shared<threads_per_block>(&storage.items_shared[0], items_loc);
|
|
183
193
|
__syncthreads();
|
|
184
194
|
|
|
@@ -191,7 +201,7 @@ struct agent_t
|
|
|
191
201
|
__syncthreads();
|
|
192
202
|
|
|
193
203
|
// write from reg to gmem
|
|
194
|
-
if (IsFullTile)
|
|
204
|
+
if constexpr (IsFullTile)
|
|
195
205
|
{
|
|
196
206
|
block_store_items{storage.store_items}.Store(items_out + tile_base, items_loc);
|
|
197
207
|
}
|
|
@@ -204,23 +214,19 @@ struct agent_t
|
|
|
204
214
|
|
|
205
215
|
_CCCL_DEVICE _CCCL_FORCEINLINE void operator()()
|
|
206
216
|
{
|
|
207
|
-
|
|
208
|
-
// TODO(bgruber): is the above still true?
|
|
209
|
-
const int tile_idx = static_cast<int>(blockIdx.x);
|
|
217
|
+
const Offset tile_idx = blockIdx.x;
|
|
210
218
|
const Offset tile_base = tile_idx * items_per_tile;
|
|
211
|
-
// TODO(bgruber): random mixing of int and Offset
|
|
212
219
|
const int items_in_tile =
|
|
213
220
|
static_cast<int>((::cuda::std::min) (static_cast<Offset>(items_per_tile), keys1_count + keys2_count - tile_base));
|
|
214
221
|
if (items_in_tile == items_per_tile)
|
|
215
222
|
{
|
|
216
|
-
consume_tile
|
|
223
|
+
consume_tile</* IsFullTile */ true>(tile_idx, tile_base, items_per_tile);
|
|
217
224
|
}
|
|
218
225
|
else
|
|
219
226
|
{
|
|
220
|
-
consume_tile
|
|
227
|
+
consume_tile</* IsFullTile */ false>(tile_idx, tile_base, items_in_tile);
|
|
221
228
|
}
|
|
222
229
|
}
|
|
223
230
|
};
|
|
224
|
-
} // namespace merge
|
|
225
|
-
} // namespace detail
|
|
231
|
+
} // namespace detail::merge
|
|
226
232
|
CUB_NAMESPACE_END
|
|
@@ -51,6 +51,7 @@
|
|
|
51
51
|
#include <cub/block/radix_rank_sort_operations.cuh>
|
|
52
52
|
#include <cub/iterator/cache_modified_input_iterator.cuh>
|
|
53
53
|
#include <cub/thread/thread_load.cuh>
|
|
54
|
+
#include <cub/util_device.cuh>
|
|
54
55
|
#include <cub/util_type.cuh>
|
|
55
56
|
|
|
56
57
|
#include <cuda/std/cstdint>
|
|
@@ -119,6 +120,28 @@ struct AgentRadixSortDownsweepPolicy : ScalingType
|
|
|
119
120
|
static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM;
|
|
120
121
|
};
|
|
121
122
|
|
|
123
|
+
#if defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
|
|
124
|
+
namespace detail
|
|
125
|
+
{
|
|
126
|
+
// Only define this when needed.
|
|
127
|
+
// Because of overload woes, this depends on C++20 concepts. util_device.h checks that concepts are available when
|
|
128
|
+
// either runtime policies or PTX JSON information are enabled, so if they are, this is always valid. The generic
|
|
129
|
+
// version is always defined, and that's the only one needed for regular CUB operations.
|
|
130
|
+
//
|
|
131
|
+
// TODO: enable this unconditionally once concepts are always available
|
|
132
|
+
CUB_DETAIL_POLICY_WRAPPER_DEFINE(
|
|
133
|
+
RadixSortDownsweepAgentPolicy,
|
|
134
|
+
(GenericAgentPolicy),
|
|
135
|
+
(BLOCK_THREADS, BlockThreads, int),
|
|
136
|
+
(ITEMS_PER_THREAD, ItemsPerThread, int),
|
|
137
|
+
(RADIX_BITS, RadixBits, int),
|
|
138
|
+
(LOAD_ALGORITHM, LoadAlgorithm, cub::BlockLoadAlgorithm),
|
|
139
|
+
(LOAD_MODIFIER, LoadModifier, cub::CacheLoadModifier),
|
|
140
|
+
(RANK_ALGORITHM, RankAlgorithm, cub::RadixRankAlgorithm),
|
|
141
|
+
(SCAN_ALGORITHM, ScanAlgorithm, cub::BlockScanAlgorithm))
|
|
142
|
+
} // namespace detail
|
|
143
|
+
#endif // defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
|
|
144
|
+
|
|
122
145
|
/******************************************************************************
|
|
123
146
|
* Thread block abstractions
|
|
124
147
|
******************************************************************************/
|
|
@@ -50,10 +50,10 @@
|
|
|
50
50
|
#include <cub/util_device.cuh>
|
|
51
51
|
#include <cub/util_type.cuh>
|
|
52
52
|
|
|
53
|
-
#include <cuda/__memory/is_aligned.h>
|
|
54
53
|
#include <cuda/std/__algorithm/min.h>
|
|
55
54
|
#include <cuda/std/__functional/identity.h>
|
|
56
55
|
#include <cuda/std/__functional/operations.h>
|
|
56
|
+
#include <cuda/std/__memory/is_sufficiently_aligned.h>
|
|
57
57
|
#include <cuda/std/__type_traits/conditional.h>
|
|
58
58
|
#include <cuda/std/__type_traits/is_pointer.h>
|
|
59
59
|
|
|
@@ -175,9 +175,6 @@ namespace detail::reduce
|
|
|
175
175
|
* @tparam InputIteratorT
|
|
176
176
|
* Random-access iterator type for input
|
|
177
177
|
*
|
|
178
|
-
* @tparam OutputIteratorT
|
|
179
|
-
* Random-access iterator type for output
|
|
180
|
-
*
|
|
181
178
|
* @tparam OffsetT
|
|
182
179
|
* Signed integer type for global offsets
|
|
183
180
|
*
|
|
@@ -202,7 +199,6 @@ namespace detail::reduce
|
|
|
202
199
|
*/
|
|
203
200
|
template <typename AgentReducePolicy,
|
|
204
201
|
typename InputIteratorT,
|
|
205
|
-
typename OutputIteratorT,
|
|
206
202
|
typename OffsetT,
|
|
207
203
|
typename ReductionOp,
|
|
208
204
|
typename AccumT,
|
|
@@ -274,7 +270,7 @@ struct AgentReduceImpl
|
|
|
274
270
|
{
|
|
275
271
|
if constexpr (AttemptVectorization)
|
|
276
272
|
{
|
|
277
|
-
return ::cuda::
|
|
273
|
+
return ::cuda::std::is_sufficiently_aligned<alignof(VectorT)>(d_in);
|
|
278
274
|
}
|
|
279
275
|
else
|
|
280
276
|
{
|
|
@@ -506,9 +502,6 @@ private:
|
|
|
506
502
|
* @tparam InputIteratorT
|
|
507
503
|
* Random-access iterator type for input
|
|
508
504
|
*
|
|
509
|
-
* @tparam OutputIteratorT
|
|
510
|
-
* Random-access iterator type for output
|
|
511
|
-
*
|
|
512
505
|
* @tparam OffsetT
|
|
513
506
|
* Signed integer type for global offsets
|
|
514
507
|
*
|
|
@@ -524,7 +517,6 @@ private:
|
|
|
524
517
|
*/
|
|
525
518
|
template <typename AgentReducePolicy,
|
|
526
519
|
typename InputIteratorT,
|
|
527
|
-
typename OutputIteratorT,
|
|
528
520
|
typename OffsetT,
|
|
529
521
|
typename ReductionOp,
|
|
530
522
|
typename AccumT,
|
|
@@ -532,7 +524,6 @@ template <typename AgentReducePolicy,
|
|
|
532
524
|
struct AgentReduce
|
|
533
525
|
: AgentReduceImpl<AgentReducePolicy,
|
|
534
526
|
InputIteratorT,
|
|
535
|
-
OutputIteratorT,
|
|
536
527
|
OffsetT,
|
|
537
528
|
ReductionOp,
|
|
538
529
|
AccumT,
|
|
@@ -543,7 +534,6 @@ struct AgentReduce
|
|
|
543
534
|
using base_t =
|
|
544
535
|
AgentReduceImpl<AgentReducePolicy,
|
|
545
536
|
InputIteratorT,
|
|
546
|
-
OutputIteratorT,
|
|
547
537
|
OffsetT,
|
|
548
538
|
ReductionOp,
|
|
549
539
|
AccumT,
|
|
@@ -574,9 +564,6 @@ struct AgentReduce
|
|
|
574
564
|
* @tparam InputIteratorT
|
|
575
565
|
* Random-access iterator type for input
|
|
576
566
|
*
|
|
577
|
-
* @tparam OutputIteratorT
|
|
578
|
-
* Random-access iterator type for output
|
|
579
|
-
*
|
|
580
567
|
* @tparam OffsetT
|
|
581
568
|
* Signed integer type for global offsets
|
|
582
569
|
*
|
|
@@ -592,7 +579,6 @@ struct AgentReduce
|
|
|
592
579
|
*/
|
|
593
580
|
template <typename AgentReducePolicy,
|
|
594
581
|
typename InputIteratorT,
|
|
595
|
-
typename OutputIteratorT,
|
|
596
582
|
typename OffsetT,
|
|
597
583
|
typename ReductionOp,
|
|
598
584
|
typename AccumT,
|
|
@@ -600,7 +586,6 @@ template <typename AgentReducePolicy,
|
|
|
600
586
|
struct AgentWarpReduce
|
|
601
587
|
: AgentReduceImpl<AgentReducePolicy,
|
|
602
588
|
InputIteratorT,
|
|
603
|
-
OutputIteratorT,
|
|
604
589
|
OffsetT,
|
|
605
590
|
ReductionOp,
|
|
606
591
|
AccumT,
|
|
@@ -612,7 +597,6 @@ struct AgentWarpReduce
|
|
|
612
597
|
using base_t =
|
|
613
598
|
AgentReduceImpl<AgentReducePolicy,
|
|
614
599
|
InputIteratorT,
|
|
615
|
-
OutputIteratorT,
|
|
616
600
|
OffsetT,
|
|
617
601
|
ReductionOp,
|
|
618
602
|
AccumT,
|
|
@@ -47,7 +47,9 @@
|
|
|
47
47
|
#include <cub/block/block_load.cuh>
|
|
48
48
|
#include <cub/block/block_scan.cuh>
|
|
49
49
|
#include <cub/block/block_store.cuh>
|
|
50
|
+
#include <cub/grid/grid_queue.cuh>
|
|
50
51
|
#include <cub/iterator/cache_modified_input_iterator.cuh>
|
|
52
|
+
#include <cub/util_device.cuh>
|
|
51
53
|
|
|
52
54
|
#include <cuda/std/__type_traits/conditional.h>
|
|
53
55
|
#include <cuda/std/__type_traits/is_pointer.h>
|
|
@@ -110,6 +112,27 @@ struct AgentScanPolicy : ScalingType
|
|
|
110
112
|
};
|
|
111
113
|
};
|
|
112
114
|
|
|
115
|
+
#if defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
|
|
116
|
+
namespace detail
|
|
117
|
+
{
|
|
118
|
+
// Only define this when needed.
|
|
119
|
+
// Because of overload woes, this depends on C++20 concepts. util_device.h checks that concepts are available when
|
|
120
|
+
// either runtime policies or PTX JSON information are enabled, so if they are, this is always valid. The generic
|
|
121
|
+
// version is always defined, and that's the only one needed for regular CUB operations.
|
|
122
|
+
//
|
|
123
|
+
// TODO: enable this unconditionally once concepts are always available
|
|
124
|
+
CUB_DETAIL_POLICY_WRAPPER_DEFINE(
|
|
125
|
+
ScanAgentPolicy,
|
|
126
|
+
(GenericAgentPolicy),
|
|
127
|
+
(BLOCK_THREADS, BlockThreads, int),
|
|
128
|
+
(ITEMS_PER_THREAD, ItemsPerThread, int),
|
|
129
|
+
(LOAD_ALGORITHM, LoadAlgorithm, cub::BlockLoadAlgorithm),
|
|
130
|
+
(LOAD_MODIFIER, LoadModifier, cub::CacheLoadModifier),
|
|
131
|
+
(STORE_ALGORITHM, StoreAlgorithm, cub::BlockStoreAlgorithm),
|
|
132
|
+
(SCAN_ALGORITHM, ScanAlgorithm, cub::BlockScanAlgorithm))
|
|
133
|
+
} // namespace detail
|
|
134
|
+
#endif // defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
|
|
135
|
+
|
|
113
136
|
/******************************************************************************
|
|
114
137
|
* Thread block abstractions
|
|
115
138
|
******************************************************************************/
|
|
@@ -48,33 +48,41 @@
|
|
|
48
48
|
|
|
49
49
|
CUB_NAMESPACE_BEGIN
|
|
50
50
|
|
|
51
|
-
template <int
|
|
51
|
+
template <int BLOCK_THREADS_ARG,
|
|
52
|
+
int WARP_THREADS_ARG,
|
|
52
53
|
int ITEMS_PER_THREAD_ARG,
|
|
53
54
|
cub::WarpLoadAlgorithm LOAD_ALGORITHM_ARG = cub::WARP_LOAD_DIRECT,
|
|
54
55
|
cub::CacheLoadModifier LOAD_MODIFIER_ARG = cub::LOAD_LDG,
|
|
55
56
|
cub::WarpStoreAlgorithm STORE_ALGORITHM_ARG = cub::WARP_STORE_DIRECT>
|
|
56
57
|
struct AgentSubWarpMergeSortPolicy
|
|
57
58
|
{
|
|
58
|
-
static constexpr int
|
|
59
|
-
static constexpr int
|
|
60
|
-
static constexpr int
|
|
59
|
+
static constexpr int BLOCK_THREADS = BLOCK_THREADS_ARG;
|
|
60
|
+
static constexpr int WARP_THREADS = WARP_THREADS_ARG;
|
|
61
|
+
static constexpr int ITEMS_PER_THREAD = ITEMS_PER_THREAD_ARG;
|
|
62
|
+
static constexpr int ITEMS_PER_TILE = WARP_THREADS * ITEMS_PER_THREAD;
|
|
63
|
+
static constexpr int SEGMENTS_PER_BLOCK = BLOCK_THREADS / WARP_THREADS;
|
|
61
64
|
|
|
62
65
|
static constexpr cub::WarpLoadAlgorithm LOAD_ALGORITHM = LOAD_ALGORITHM_ARG;
|
|
63
66
|
static constexpr cub::CacheLoadModifier LOAD_MODIFIER = LOAD_MODIFIER_ARG;
|
|
64
67
|
static constexpr cub::WarpStoreAlgorithm STORE_ALGORITHM = STORE_ALGORITHM_ARG;
|
|
65
68
|
};
|
|
66
69
|
|
|
67
|
-
|
|
68
|
-
|
|
70
|
+
#if defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
|
|
71
|
+
namespace detail
|
|
69
72
|
{
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
73
|
+
CUB_DETAIL_POLICY_WRAPPER_DEFINE(
|
|
74
|
+
SubWarpMergeSortAgentPolicy,
|
|
75
|
+
(GenericAgentPolicy),
|
|
76
|
+
(BLOCK_THREADS, BlockThreads, int),
|
|
77
|
+
(WARP_THREADS, WarpThreads, int),
|
|
78
|
+
(ITEMS_PER_THREAD, ItemsPerThread, int),
|
|
79
|
+
(ITEMS_PER_TILE, ItemsPerTile, int),
|
|
80
|
+
(SEGMENTS_PER_BLOCK, SegmentsPerBlock, int),
|
|
81
|
+
(LOAD_ALGORITHM, LoadAlgorithm, cub::WarpLoadAlgorithm),
|
|
82
|
+
(LOAD_MODIFIER, LoadModifier, cub::CacheLoadModifier),
|
|
83
|
+
(STORE_ALGORITHM, StoreAlgorithm, cub::WarpStoreAlgorithm))
|
|
84
|
+
} // namespace detail
|
|
85
|
+
#endif // defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
|
|
78
86
|
|
|
79
87
|
namespace detail
|
|
80
88
|
{
|
|
@@ -44,6 +44,7 @@
|
|
|
44
44
|
#include <cub/block/block_scan.cuh>
|
|
45
45
|
#include <cub/block/block_store.cuh>
|
|
46
46
|
#include <cub/iterator/cache_modified_input_iterator.cuh>
|
|
47
|
+
#include <cub/util_device.cuh>
|
|
47
48
|
|
|
48
49
|
#include <cuda/std/__functional/operations.h>
|
|
49
50
|
#include <cuda/std/__type_traits/conditional.h>
|
|
@@ -76,9 +77,22 @@ struct AgentThreeWayPartitionPolicy
|
|
|
76
77
|
};
|
|
77
78
|
};
|
|
78
79
|
|
|
80
|
+
#if defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
|
|
79
81
|
namespace detail
|
|
80
82
|
{
|
|
83
|
+
CUB_DETAIL_POLICY_WRAPPER_DEFINE(
|
|
84
|
+
ThreeWayPartitionAgentPolicy,
|
|
85
|
+
(GenericAgentPolicy),
|
|
86
|
+
(BLOCK_THREADS, BlockThreads, int),
|
|
87
|
+
(ITEMS_PER_THREAD, ItemsPerThread, int),
|
|
88
|
+
(LOAD_ALGORITHM, LoadAlgorithm, cub::BlockLoadAlgorithm),
|
|
89
|
+
(LOAD_MODIFIER, LoadModifier, cub::CacheLoadModifier),
|
|
90
|
+
(SCAN_ALGORITHM, ScanAlgorithm, cub::BlockScanAlgorithm))
|
|
91
|
+
} // namespace detail
|
|
92
|
+
#endif // defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
|
|
81
93
|
|
|
94
|
+
namespace detail
|
|
95
|
+
{
|
|
82
96
|
namespace three_way_partition
|
|
83
97
|
{
|
|
84
98
|
|