cuda-cccl 0.2.1__cp313-cp313-manylinux_2_26_x86_64.whl → 0.3.1__cp313-cp313-manylinux_2_26_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/cooperative/__init__.py +7 -1
- cuda/cccl/cooperative/experimental/__init__.py +21 -5
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +88 -80
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +25 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +2 -18
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +27 -0
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +24 -19
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +14 -3
- cuda/cccl/headers/include/cub/agent/agent_topk.cuh +764 -0
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
- cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +94 -13
- cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
- cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +20 -6
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +3 -2
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2 -2
- cuda/cccl/headers/include/cub/cub.cuh +8 -0
- cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +13 -32
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +62 -0
- cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +3 -3
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +18 -26
- cuda/cccl/headers/include/cub/detail/ptx-json/value.h +3 -2
- cuda/cccl/headers/include/cub/device/device_copy.cuh +116 -27
- cuda/cccl/headers/include/cub/device/device_for.cuh +2 -10
- cuda/cccl/headers/include/cub/device/device_histogram.cuh +8 -8
- cuda/cccl/headers/include/cub/device/device_partition.cuh +5 -1
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +775 -163
- cuda/cccl/headers/include/cub/device/device_scan.cuh +306 -0
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +158 -246
- cuda/cccl/headers/include/cub/device/device_topk.cuh +511 -0
- cuda/cccl/headers/include/cub/device/device_transform.cuh +11 -2
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +47 -48
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +9 -27
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -6
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +320 -262
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +100 -171
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +629 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +8 -1
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +31 -29
- cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +6 -15
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +57 -10
- cuda/cccl/headers/include/cub/device/dispatch/kernels/three_way_partition.cuh +80 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +39 -15
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +5 -20
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +12 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +204 -55
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +37 -4
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +85 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +55 -19
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
- cuda/cccl/headers/include/cub/util_device.cuh +72 -51
- cuda/cccl/headers/include/cub/util_ptx.cuh +8 -8
- cuda/cccl/headers/include/cub/util_type.cuh +15 -20
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
- cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +64 -11
- cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
- cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +3 -1
- cuda/cccl/headers/include/cuda/__device/all_devices.h +47 -147
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +51 -49
- cuda/cccl/headers/include/cuda/__device/attributes.h +177 -127
- cuda/cccl/headers/include/cuda/__device/device_ref.h +38 -48
- cuda/cccl/headers/include/cuda/__device/physical_device.h +120 -91
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +365 -33
- cuda/cccl/headers/include/cuda/__event/event.h +8 -8
- cuda/cccl/headers/include/cuda/__event/event_ref.h +4 -5
- cuda/cccl/headers/include/cuda/__event/timed_event.h +5 -5
- cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +97 -52
- cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +5 -6
- cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
- cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
- cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +5 -0
- cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +5 -0
- cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +6 -4
- cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +5 -0
- cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +6 -1
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +28 -27
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +39 -33
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +18 -11
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +158 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_function.h +10 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +36 -109
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +593 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +14 -10
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +1 -1
- cuda/cccl/headers/include/cuda/__memory/address_space.h +28 -12
- cuda/cccl/headers/include/cuda/__memory/check_address.h +34 -29
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +4 -3
- cuda/cccl/headers/include/cuda/__runtime/types.h +41 -0
- cuda/cccl/headers/include/cuda/__stream/stream.h +2 -3
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -12
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +161 -92
- cuda/cccl/headers/include/cuda/__utility/__basic_any/storage.h +3 -2
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +2 -2
- cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
- cuda/cccl/headers/include/cuda/algorithm +1 -1
- cuda/cccl/headers/include/cuda/devices +10 -0
- cuda/cccl/headers/include/cuda/iterator +1 -0
- cuda/cccl/headers/include/cuda/pipeline +2 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +0 -6
- cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +3 -3
- cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +3 -3
- cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +2 -3
- cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +2 -3
- cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +6 -8
- cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +3 -3
- cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +4 -4
- cuda/cccl/headers/include/cuda/std/__atomic/order.h +1 -1
- cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +1 -1
- cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
- cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
- cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
- cuda/cccl/headers/include/cuda/std/__cccl/assert.h +18 -7
- cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +4 -115
- cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +8 -5
- cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +22 -3
- cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +4 -4
- cuda/cccl/headers/include/cuda/std/__cccl/os.h +6 -0
- cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +2 -0
- cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +25 -0
- cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +54 -0
- cuda/cccl/headers/include/cuda/std/__chrono/day.h +162 -0
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +503 -0
- cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +55 -0
- cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +46 -0
- cuda/cccl/headers/include/cuda/std/__chrono/month.h +187 -0
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +60 -0
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +80 -0
- cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +258 -0
- cuda/cccl/headers/include/cuda/std/__chrono/year.h +186 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +10 -5
- cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +44 -17
- cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +67 -0
- cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +1 -4
- cuda/cccl/headers/include/cuda/std/__cstring/memcpy.h +12 -9
- cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +12 -12
- cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +1 -8
- cuda/cccl/headers/include/cuda/std/__expected/expected.h +31 -38
- cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +15 -12
- cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +3 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
- cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +4 -4
- cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +2 -2
- cuda/cccl/headers/include/cuda/std/__functional/bind.h +2 -2
- cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +4 -4
- cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +2 -2
- cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +2 -2
- cuda/cccl/headers/include/cuda/std/__functional/function.h +10 -11
- cuda/cccl/headers/include/cuda/std/__functional/hash.h +5 -6
- cuda/cccl/headers/include/cuda/std/__functional/identity.h +4 -8
- cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +2 -4
- cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +16 -18
- cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +2 -3
- cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +2 -3
- cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +3 -3
- cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +2 -2
- cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +16 -25
- cuda/cccl/headers/include/cuda/std/__iterator/advance.h +3 -3
- cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +3 -3
- cuda/cccl/headers/include/cuda/std/__iterator/distance.h +1 -1
- cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +21 -28
- cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +1 -1
- cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +28 -39
- cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +3 -4
- cuda/cccl/headers/include/cuda/std/__iterator/next.h +2 -2
- cuda/cccl/headers/include/cuda/std/__iterator/prev.h +2 -3
- cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +12 -41
- cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +2 -2
- cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +3 -4
- cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +31 -31
- cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +45 -45
- cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +3 -2
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +1 -1
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +1 -1
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +1 -1
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +2 -1
- cuda/cccl/headers/include/cuda/std/__memory/addressof.h +5 -2
- cuda/cccl/headers/include/cuda/std/__memory/allocator.h +30 -30
- cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +1 -1
- cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +66 -86
- cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +2 -2
- cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +3 -3
- cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +5 -2
- cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +30 -45
- cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +8 -12
- cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +21 -23
- cuda/cccl/headers/include/cuda/std/__new/launder.h +4 -0
- cuda/cccl/headers/include/cuda/std/__optional/hash.h +2 -2
- cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +2 -1
- cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +2 -1
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +1 -1
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +1 -1
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +1 -1
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +1 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +4 -13
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +16 -22
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +13 -18
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +2 -2
- cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +3 -4
- cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +2 -2
- cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +7 -8
- cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +4 -13
- cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +1 -1
- cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +2 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +3 -5
- cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +1 -1
- cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +3 -44
- cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +2 -28
- cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +9 -5
- cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +3 -3
- cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +1 -4
- cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +3 -34
- cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +3 -29
- cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +2 -16
- cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +1 -1
- cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +4 -21
- cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +1 -1
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +1 -1
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +3 -3
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +4 -24
- cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +3 -24
- cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +13 -9
- cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +3 -18
- cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +21 -20
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +3 -17
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +4 -31
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +3 -42
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +5 -19
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +3 -19
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +3 -17
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +2 -15
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +13 -28
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +2 -17
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +2 -16
- cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +3 -18
- cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +1 -1
- cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +1 -1
- cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +4 -3
- cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +2 -2
- cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +2 -16
- cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +2 -2
- cuda/cccl/headers/include/cuda/std/__utility/declval.h +17 -4
- cuda/cccl/headers/include/cuda/std/__utility/exchange.h +1 -1
- cuda/cccl/headers/include/cuda/std/__utility/forward.h +1 -1
- cuda/cccl/headers/include/cuda/std/__utility/in_place.h +29 -0
- cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +2 -2
- cuda/cccl/headers/include/cuda/std/__utility/move.h +1 -1
- cuda/cccl/headers/include/cuda/std/__utility/pair.h +8 -9
- cuda/cccl/headers/include/cuda/std/array +2 -2
- cuda/cccl/headers/include/cuda/std/atomic +20 -28
- cuda/cccl/headers/include/cuda/std/bitset +1 -1
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +1 -32
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +3 -3
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +16 -1137
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +12 -12
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +4 -4
- cuda/cccl/headers/include/cuda/std/inplace_vector +9 -9
- cuda/cccl/headers/include/cuda/std/ratio +3 -4
- cuda/cccl/headers/include/cuda/std/string_view +12 -5
- cuda/cccl/headers/include/cuda/std/version +3 -8
- cuda/cccl/headers/include/thrust/advance.h +6 -8
- cuda/cccl/headers/include/thrust/detail/execution_policy.h +61 -21
- cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
- cuda/cccl/headers/include/thrust/detail/internal_functional.h +37 -2
- cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +1 -1
- cuda/cccl/headers/include/thrust/detail/reference.h +10 -16
- cuda/cccl/headers/include/thrust/detail/seq.h +37 -25
- cuda/cccl/headers/include/thrust/detail/vector_base.h +2 -4
- cuda/cccl/headers/include/thrust/detail/vector_base.inl +2 -2
- cuda/cccl/headers/include/thrust/distance.h +3 -3
- cuda/cccl/headers/include/thrust/execution_policy.h +202 -335
- cuda/cccl/headers/include/thrust/functional.h +1 -2
- cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +6 -1
- cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +1 -1
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +35 -23
- cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +66 -44
- cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +4 -99
- cuda/cccl/headers/include/thrust/system/cuda/config.h +7 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +182 -38
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +199 -48
- cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +2 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +2 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +15 -13
- cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +22 -19
- cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +12 -42
- cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +16 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +30 -30
- cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +0 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +8 -22
- cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +29 -15
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +7 -5
- cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +3 -27
- cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +10 -2
- cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +16 -35
- cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +75 -61
- cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +4 -99
- cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +68 -51
- cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +2 -2
- cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +4 -99
- cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +15 -48
- cuda/cccl/parallel/experimental/__init__.py +21 -70
- cuda/compute/__init__.py +77 -0
- cuda/compute/_bindings.py +79 -0
- cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +28 -0
- cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +177 -10
- cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +3 -3
- cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +4 -0
- cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
- cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -4
- cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +4 -6
- cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
- cuda/compute/algorithms/_three_way_partition.py +261 -0
- cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +4 -4
- cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
- cuda/compute/cu12/_bindings_impl.cpython-313-x86_64-linux-gnu.so +0 -0
- cuda/compute/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/cu13/_bindings_impl.cpython-313-x86_64-linux-gnu.so +0 -0
- cuda/compute/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +8 -8
- cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
- cuda/coop/__init__.py +8 -0
- cuda/{cccl/cooperative/experimental → coop}/_common.py +3 -1
- cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
- cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
- cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
- cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
- cuda/coop/warp/__init__.py +9 -0
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
- {cuda_cccl-0.2.1.dist-info → cuda_cccl-0.3.1.dist-info}/METADATA +2 -3
- {cuda_cccl-0.2.1.dist-info → cuda_cccl-0.3.1.dist-info}/RECORD +401 -388
- cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
- cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +0 -72
- cuda/cccl/headers/include/thrust/detail/util/align.h +0 -59
- cuda/cccl/headers/include/thrust/system/cpp/detail/par.h +0 -59
- cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_cross_system.h +0 -204
- cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_device_to_device.h +0 -92
- cuda/cccl/headers/include/thrust/system/cuda/detail/par.h +0 -237
- cuda/cccl/headers/include/thrust/system/cuda/detail/par_to_seq.h +0 -95
- cuda/cccl/headers/include/thrust/system/omp/detail/par.h +0 -62
- cuda/cccl/headers/include/thrust/system/tbb/detail/par.h +0 -62
- cuda/cccl/parallel/experimental/.gitignore +0 -4
- cuda/cccl/parallel/experimental/_bindings.py +0 -56
- cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-313-x86_64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-313-x86_64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/cu13/cccl/libcccl.c.parallel.so +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/numba_utils.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/typing.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
- {cuda_cccl-0.2.1.dist-info → cuda_cccl-0.3.1.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.2.1.dist-info → cuda_cccl-0.3.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,3 +1,9 @@
|
|
|
1
1
|
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
|
|
2
2
|
#
|
|
3
|
-
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
4
|
+
|
|
5
|
+
from . import experimental
|
|
6
|
+
|
|
7
|
+
__all__ = [
|
|
8
|
+
"experimental",
|
|
9
|
+
]
|
|
@@ -1,8 +1,24 @@
|
|
|
1
|
-
# Copyright (c)
|
|
1
|
+
# Copyright (c) 2025, NVIDIA CORPORATION.
|
|
2
2
|
#
|
|
3
|
-
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
# alias for backwards compatibility
|
|
16
|
+
|
|
17
|
+
from warnings import warn
|
|
4
18
|
|
|
5
|
-
from cuda.
|
|
6
|
-
from cuda.cccl.cooperative.experimental._types import StatefulFunction
|
|
19
|
+
from cuda.coop import * # noqa: F403
|
|
7
20
|
|
|
8
|
-
|
|
21
|
+
warn(
|
|
22
|
+
"The module cuda.cccl.cooperative.experimental is deprecated. Use cuda.coop instead.",
|
|
23
|
+
FutureWarning,
|
|
24
|
+
)
|
|
@@ -64,9 +64,7 @@ struct AgentAdjacentDifferencePolicy
|
|
|
64
64
|
static constexpr cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;
|
|
65
65
|
};
|
|
66
66
|
|
|
67
|
-
namespace detail
|
|
68
|
-
{
|
|
69
|
-
namespace adjacent_difference
|
|
67
|
+
namespace detail::adjacent_difference
|
|
70
68
|
{
|
|
71
69
|
|
|
72
70
|
template <typename Policy,
|
|
@@ -256,7 +254,6 @@ struct AgentDifferenceInit
|
|
|
256
254
|
}
|
|
257
255
|
};
|
|
258
256
|
|
|
259
|
-
} // namespace adjacent_difference
|
|
260
|
-
} // namespace detail
|
|
257
|
+
} // namespace detail::adjacent_difference
|
|
261
258
|
|
|
262
259
|
CUB_NAMESPACE_END
|
|
@@ -62,9 +62,7 @@
|
|
|
62
62
|
|
|
63
63
|
CUB_NAMESPACE_BEGIN
|
|
64
64
|
|
|
65
|
-
namespace detail
|
|
66
|
-
{
|
|
67
|
-
namespace batch_memcpy
|
|
65
|
+
namespace detail::batch_memcpy
|
|
68
66
|
{
|
|
69
67
|
template <bool PTR_IS_FOUR_BYTE_ALIGNED>
|
|
70
68
|
_CCCL_FORCEINLINE _CCCL_DEVICE void
|
|
@@ -1179,7 +1177,6 @@ private:
|
|
|
1179
1177
|
// buffers
|
|
1180
1178
|
BLevBlockOffsetTileState blev_block_scan_state;
|
|
1181
1179
|
};
|
|
1182
|
-
} // namespace batch_memcpy
|
|
1183
|
-
} // namespace detail
|
|
1180
|
+
} // namespace detail::batch_memcpy
|
|
1184
1181
|
|
|
1185
1182
|
CUB_NAMESPACE_END
|
|
@@ -42,9 +42,7 @@
|
|
|
42
42
|
|
|
43
43
|
CUB_NAMESPACE_BEGIN
|
|
44
44
|
|
|
45
|
-
namespace detail
|
|
46
|
-
{
|
|
47
|
-
namespace for_each
|
|
45
|
+
namespace detail::for_each
|
|
48
46
|
{
|
|
49
47
|
|
|
50
48
|
template <int BlockThreads, int ItemsPerThread>
|
|
@@ -78,7 +76,6 @@ struct agent_block_striped_t
|
|
|
78
76
|
}
|
|
79
77
|
};
|
|
80
78
|
|
|
81
|
-
} // namespace for_each
|
|
82
|
-
} // namespace detail
|
|
79
|
+
} // namespace detail::for_each
|
|
83
80
|
|
|
84
81
|
CUB_NAMESPACE_END
|
|
@@ -25,22 +25,15 @@
|
|
|
25
25
|
#include <cuda/std/__algorithm/min.h>
|
|
26
26
|
|
|
27
27
|
CUB_NAMESPACE_BEGIN
|
|
28
|
-
namespace detail
|
|
28
|
+
namespace detail::merge
|
|
29
29
|
{
|
|
30
|
-
|
|
31
|
-
{
|
|
32
|
-
template <int ThreadsPerBlock,
|
|
33
|
-
int ItemsPerThread,
|
|
34
|
-
BlockLoadAlgorithm LoadAlgorithm,
|
|
35
|
-
CacheLoadModifier LoadCacheModifier,
|
|
36
|
-
BlockStoreAlgorithm StoreAlgorithm>
|
|
30
|
+
template <int ThreadsPerBlock, int ItemsPerThread, CacheLoadModifier LoadCacheModifier, BlockStoreAlgorithm StoreAlgorithm>
|
|
37
31
|
struct agent_policy_t
|
|
38
32
|
{
|
|
39
33
|
// do not change data member names, policy_wrapper_t depends on it
|
|
40
34
|
static constexpr int BLOCK_THREADS = ThreadsPerBlock;
|
|
41
35
|
static constexpr int ITEMS_PER_THREAD = ItemsPerThread;
|
|
42
36
|
static constexpr int ITEMS_PER_TILE = BLOCK_THREADS * ITEMS_PER_THREAD;
|
|
43
|
-
static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = LoadAlgorithm;
|
|
44
37
|
static constexpr CacheLoadModifier LOAD_MODIFIER = LoadCacheModifier;
|
|
45
38
|
static constexpr BlockStoreAlgorithm STORE_ALGORITHM = StoreAlgorithm;
|
|
46
39
|
};
|
|
@@ -60,108 +53,124 @@ struct agent_t
|
|
|
60
53
|
using policy = Policy;
|
|
61
54
|
|
|
62
55
|
// key and value type are taken from the first input sequence (consistent with old Thrust behavior)
|
|
63
|
-
using key_type
|
|
64
|
-
using item_type
|
|
65
|
-
|
|
66
|
-
using keys_load_it1 = try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, KeysIt1>;
|
|
67
|
-
using keys_load_it2 = try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, KeysIt2>;
|
|
68
|
-
using items_load_it1 = try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, ItemsIt1>;
|
|
69
|
-
using items_load_it2 = try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, ItemsIt2>;
|
|
70
|
-
|
|
71
|
-
using block_load_keys1 = typename BlockLoadType<Policy, keys_load_it1>::type;
|
|
72
|
-
using block_load_keys2 = typename BlockLoadType<Policy, keys_load_it2>::type;
|
|
73
|
-
using block_load_items1 = typename BlockLoadType<Policy, items_load_it1>::type;
|
|
74
|
-
using block_load_items2 = typename BlockLoadType<Policy, items_load_it2>::type;
|
|
75
|
-
|
|
56
|
+
using key_type = it_value_t<KeysIt1>;
|
|
57
|
+
using item_type = it_value_t<ItemsIt1>;
|
|
76
58
|
using block_store_keys = typename BlockStoreType<Policy, KeysOutputIt, key_type>::type;
|
|
77
59
|
using block_store_items = typename BlockStoreType<Policy, ItemsOutputIt, item_type>::type;
|
|
78
60
|
|
|
61
|
+
static constexpr int items_per_thread = Policy::ITEMS_PER_THREAD;
|
|
62
|
+
static constexpr int threads_per_block = Policy::BLOCK_THREADS;
|
|
63
|
+
static constexpr int items_per_tile = Policy::ITEMS_PER_TILE;
|
|
64
|
+
|
|
79
65
|
union temp_storages
|
|
80
66
|
{
|
|
81
|
-
typename block_load_keys1::TempStorage load_keys1;
|
|
82
|
-
typename block_load_keys2::TempStorage load_keys2;
|
|
83
|
-
typename block_load_items1::TempStorage load_items1;
|
|
84
|
-
typename block_load_items2::TempStorage load_items2;
|
|
85
67
|
typename block_store_keys::TempStorage store_keys;
|
|
86
68
|
typename block_store_items::TempStorage store_items;
|
|
87
69
|
|
|
88
|
-
|
|
89
|
-
|
|
70
|
+
// We could change SerialMerge to avoid reading one item out of bounds and drop the + 1 here. But that would
|
|
71
|
+
// introduce more branches (about 10% slower on 2^16 problem sizes on RTX 5090 in a first attempt)
|
|
72
|
+
key_type keys_shared[items_per_tile + 1];
|
|
73
|
+
item_type items_shared[items_per_tile + 1];
|
|
90
74
|
};
|
|
91
75
|
|
|
92
76
|
struct TempStorage : Uninitialized<temp_storages>
|
|
93
77
|
{};
|
|
94
78
|
|
|
95
|
-
static constexpr int items_per_thread = Policy::ITEMS_PER_THREAD;
|
|
96
|
-
static constexpr int threads_per_block = Policy::BLOCK_THREADS;
|
|
97
|
-
static constexpr Offset items_per_tile = Policy::ITEMS_PER_TILE;
|
|
98
|
-
|
|
99
79
|
// Per thread data
|
|
100
80
|
temp_storages& storage;
|
|
101
|
-
|
|
102
|
-
|
|
81
|
+
KeysIt1 keys1_in;
|
|
82
|
+
ItemsIt1 items1_in;
|
|
103
83
|
Offset keys1_count;
|
|
104
|
-
|
|
105
|
-
|
|
84
|
+
KeysIt2 keys2_in;
|
|
85
|
+
ItemsIt2 items2_in;
|
|
106
86
|
Offset keys2_count;
|
|
107
87
|
KeysOutputIt keys_out;
|
|
108
88
|
ItemsOutputIt items_out;
|
|
109
89
|
CompareOp compare_op;
|
|
110
|
-
Offset*
|
|
90
|
+
Offset* key1_beg_offsets;
|
|
111
91
|
|
|
112
92
|
template <bool IsFullTile>
|
|
113
93
|
_CCCL_DEVICE _CCCL_FORCEINLINE void consume_tile(Offset tile_idx, Offset tile_base, int num_remaining)
|
|
114
94
|
{
|
|
115
|
-
const Offset partition_beg = merge_partitions[tile_idx + 0];
|
|
116
|
-
const Offset partition_end = merge_partitions[tile_idx + 1];
|
|
117
|
-
|
|
118
95
|
const Offset diag0 = items_per_tile * tile_idx;
|
|
119
|
-
|
|
96
|
+
Offset diag1 = diag0 + items_per_tile;
|
|
97
|
+
if constexpr (IsFullTile)
|
|
98
|
+
{
|
|
99
|
+
_CCCL_ASSERT(diag1 <= keys1_count + keys2_count, "");
|
|
100
|
+
}
|
|
101
|
+
else
|
|
102
|
+
{
|
|
103
|
+
diag1 = keys1_count + keys2_count;
|
|
104
|
+
}
|
|
120
105
|
|
|
121
106
|
// compute bounding box for keys1 & keys2
|
|
122
|
-
const Offset keys1_beg =
|
|
123
|
-
const Offset keys1_end =
|
|
107
|
+
const Offset keys1_beg = key1_beg_offsets[tile_idx + 0];
|
|
108
|
+
const Offset keys1_end = key1_beg_offsets[tile_idx + 1];
|
|
124
109
|
const Offset keys2_beg = diag0 - keys1_beg;
|
|
125
110
|
const Offset keys2_end = diag1 - keys1_end;
|
|
126
111
|
|
|
127
112
|
// number of keys per tile
|
|
128
|
-
const int
|
|
129
|
-
const int
|
|
113
|
+
const int keys1_count_tile = static_cast<int>(keys1_end - keys1_beg);
|
|
114
|
+
const int keys2_count_tile = static_cast<int>(keys2_end - keys2_beg);
|
|
115
|
+
if constexpr (IsFullTile)
|
|
116
|
+
{
|
|
117
|
+
_CCCL_ASSERT(keys1_count_tile + keys2_count_tile == items_per_tile, "");
|
|
118
|
+
}
|
|
119
|
+
else
|
|
120
|
+
{
|
|
121
|
+
_CCCL_ASSERT(keys1_count_tile + keys2_count_tile == num_remaining, "");
|
|
122
|
+
}
|
|
130
123
|
|
|
131
124
|
key_type keys_loc[items_per_thread];
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
125
|
+
{
|
|
126
|
+
auto keys1_in_cm = try_make_cache_modified_iterator<Policy::LOAD_MODIFIER>(keys1_in);
|
|
127
|
+
auto keys2_in_cm = try_make_cache_modified_iterator<Policy::LOAD_MODIFIER>(keys2_in);
|
|
128
|
+
merge_sort::gmem_to_reg<threads_per_block, IsFullTile>(
|
|
129
|
+
keys_loc, keys1_in_cm + keys1_beg, keys2_in_cm + keys2_beg, keys1_count_tile, keys2_count_tile);
|
|
130
|
+
merge_sort::reg_to_shared<threads_per_block>(&storage.keys_shared[0], keys_loc);
|
|
131
|
+
__syncthreads();
|
|
132
|
+
}
|
|
136
133
|
|
|
137
|
-
//
|
|
134
|
+
// now find the merge path for each thread.
|
|
138
135
|
// we can use int type here, because the number of items in shared memory is limited
|
|
139
|
-
|
|
136
|
+
int diag0_thread = items_per_thread * static_cast<int>(threadIdx.x);
|
|
137
|
+
if constexpr (IsFullTile)
|
|
138
|
+
{
|
|
139
|
+
_CCCL_ASSERT(num_remaining == items_per_tile, "");
|
|
140
|
+
_CCCL_ASSERT(diag0_thread < num_remaining, "");
|
|
141
|
+
}
|
|
142
|
+
else
|
|
143
|
+
{ // for partial tiles, clamp the thread diagonal to the valid items
|
|
144
|
+
diag0_thread = (::cuda::std::min) (diag0_thread, num_remaining);
|
|
145
|
+
}
|
|
140
146
|
|
|
141
|
-
const int
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
147
|
+
const int keys1_beg_thread = MergePath(
|
|
148
|
+
&storage.keys_shared[0],
|
|
149
|
+
&storage.keys_shared[keys1_count_tile],
|
|
150
|
+
keys1_count_tile,
|
|
151
|
+
keys2_count_tile,
|
|
152
|
+
diag0_thread,
|
|
153
|
+
compare_op);
|
|
154
|
+
const int keys2_beg_thread = diag0_thread - keys1_beg_thread;
|
|
146
155
|
|
|
147
|
-
const int
|
|
148
|
-
const int
|
|
156
|
+
const int keys1_count_thread = keys1_count_tile - keys1_beg_thread;
|
|
157
|
+
const int keys2_count_thread = keys2_count_tile - keys2_beg_thread;
|
|
149
158
|
|
|
150
159
|
// perform serial merge
|
|
151
160
|
int indices[items_per_thread];
|
|
152
|
-
|
|
161
|
+
SerialMerge(
|
|
153
162
|
&storage.keys_shared[0],
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
163
|
+
keys1_beg_thread,
|
|
164
|
+
keys2_beg_thread + keys1_count_tile,
|
|
165
|
+
keys1_count_thread,
|
|
166
|
+
keys2_count_thread,
|
|
158
167
|
keys_loc,
|
|
159
168
|
indices,
|
|
160
169
|
compare_op);
|
|
161
|
-
__syncthreads();
|
|
162
170
|
|
|
163
171
|
// write keys
|
|
164
|
-
|
|
172
|
+
__syncthreads(); // sync after the SMEM reads above so that block store can reuse SMEM
|
|
173
|
+
if constexpr (IsFullTile)
|
|
165
174
|
{
|
|
166
175
|
block_store_keys{storage.store_keys}.Store(keys_out + tile_base, keys_loc);
|
|
167
176
|
}
|
|
@@ -175,12 +184,15 @@ struct agent_t
|
|
|
175
184
|
if constexpr (have_items)
|
|
176
185
|
{
|
|
177
186
|
item_type items_loc[items_per_thread];
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
187
|
+
{
|
|
188
|
+
auto items1_in_cm = try_make_cache_modified_iterator<Policy::LOAD_MODIFIER>(items1_in);
|
|
189
|
+
auto items2_in_cm = try_make_cache_modified_iterator<Policy::LOAD_MODIFIER>(items2_in);
|
|
190
|
+
merge_sort::gmem_to_reg<threads_per_block, IsFullTile>(
|
|
191
|
+
items_loc, items1_in_cm + keys1_beg, items2_in_cm + keys2_beg, keys1_count_tile, keys2_count_tile);
|
|
192
|
+
__syncthreads(); // block_store_keys above uses SMEM, so make sure all threads are done before we write to it
|
|
193
|
+
merge_sort::reg_to_shared<threads_per_block>(&storage.items_shared[0], items_loc);
|
|
194
|
+
__syncthreads();
|
|
195
|
+
}
|
|
184
196
|
|
|
185
197
|
// gather items from shared mem
|
|
186
198
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
@@ -191,7 +203,7 @@ struct agent_t
|
|
|
191
203
|
__syncthreads();
|
|
192
204
|
|
|
193
205
|
// write from reg to gmem
|
|
194
|
-
if (IsFullTile)
|
|
206
|
+
if constexpr (IsFullTile)
|
|
195
207
|
{
|
|
196
208
|
block_store_items{storage.store_items}.Store(items_out + tile_base, items_loc);
|
|
197
209
|
}
|
|
@@ -204,23 +216,19 @@ struct agent_t
|
|
|
204
216
|
|
|
205
217
|
_CCCL_DEVICE _CCCL_FORCEINLINE void operator()()
|
|
206
218
|
{
|
|
207
|
-
|
|
208
|
-
// TODO(bgruber): is the above still true?
|
|
209
|
-
const int tile_idx = static_cast<int>(blockIdx.x);
|
|
219
|
+
const Offset tile_idx = blockIdx.x;
|
|
210
220
|
const Offset tile_base = tile_idx * items_per_tile;
|
|
211
|
-
// TODO(bgruber): random mixing of int and Offset
|
|
212
221
|
const int items_in_tile =
|
|
213
222
|
static_cast<int>((::cuda::std::min) (static_cast<Offset>(items_per_tile), keys1_count + keys2_count - tile_base));
|
|
214
223
|
if (items_in_tile == items_per_tile)
|
|
215
224
|
{
|
|
216
|
-
consume_tile
|
|
225
|
+
consume_tile</* IsFullTile */ true>(tile_idx, tile_base, items_per_tile);
|
|
217
226
|
}
|
|
218
227
|
else
|
|
219
228
|
{
|
|
220
|
-
consume_tile
|
|
229
|
+
consume_tile</* IsFullTile */ false>(tile_idx, tile_base, items_in_tile);
|
|
221
230
|
}
|
|
222
231
|
}
|
|
223
232
|
};
|
|
224
|
-
} // namespace merge
|
|
225
|
-
} // namespace detail
|
|
233
|
+
} // namespace detail::merge
|
|
226
234
|
CUB_NAMESPACE_END
|
|
@@ -66,9 +66,28 @@ struct AgentMergeSortPolicy
|
|
|
66
66
|
static constexpr cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;
|
|
67
67
|
};
|
|
68
68
|
|
|
69
|
+
#if defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
|
|
69
70
|
namespace detail
|
|
70
71
|
{
|
|
71
|
-
|
|
72
|
+
// Only define this when needed.
|
|
73
|
+
// Because of overload woes, this depends on C++20 concepts. util_device.cuh checks that concepts are available when
|
|
74
|
+
// either runtime policies or PTX JSON information are enabled, so if they are, this is always valid. The generic
|
|
75
|
+
// version is always defined, and that's the only one needed for regular CUB operations.
|
|
76
|
+
//
|
|
77
|
+
// TODO: enable this unconditionally once concepts are always available
|
|
78
|
+
CUB_DETAIL_POLICY_WRAPPER_DEFINE(
|
|
79
|
+
MergeSortAgentPolicy,
|
|
80
|
+
(GenericAgentPolicy),
|
|
81
|
+
(BLOCK_THREADS, BlockThreads, int),
|
|
82
|
+
(ITEMS_PER_THREAD, ItemsPerThread, int),
|
|
83
|
+
(ITEMS_PER_TILE, ItemsPerTile, int),
|
|
84
|
+
(LOAD_ALGORITHM, LoadAlgorithm, cub::BlockLoadAlgorithm),
|
|
85
|
+
(LOAD_MODIFIER, LoadModifier, cub::CacheLoadModifier),
|
|
86
|
+
(STORE_ALGORITHM, StoreAlgorithm, cub::BlockStoreAlgorithm))
|
|
87
|
+
} // namespace detail
|
|
88
|
+
#endif // defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
|
|
89
|
+
|
|
90
|
+
namespace detail::merge_sort
|
|
72
91
|
{
|
|
73
92
|
|
|
74
93
|
template <typename Policy,
|
|
@@ -724,7 +743,6 @@ struct AgentMerge
|
|
|
724
743
|
}
|
|
725
744
|
};
|
|
726
745
|
|
|
727
|
-
} // namespace merge_sort
|
|
728
|
-
} // namespace detail
|
|
746
|
+
} // namespace detail::merge_sort
|
|
729
747
|
|
|
730
748
|
CUB_NAMESPACE_END
|
|
@@ -51,6 +51,7 @@
|
|
|
51
51
|
#include <cub/block/radix_rank_sort_operations.cuh>
|
|
52
52
|
#include <cub/iterator/cache_modified_input_iterator.cuh>
|
|
53
53
|
#include <cub/thread/thread_load.cuh>
|
|
54
|
+
#include <cub/util_device.cuh>
|
|
54
55
|
#include <cub/util_type.cuh>
|
|
55
56
|
|
|
56
57
|
#include <cuda/std/cstdint>
|
|
@@ -119,13 +120,33 @@ struct AgentRadixSortDownsweepPolicy : ScalingType
|
|
|
119
120
|
static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM;
|
|
120
121
|
};
|
|
121
122
|
|
|
123
|
+
#if defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
|
|
124
|
+
namespace detail
|
|
125
|
+
{
|
|
126
|
+
// Only define this when needed.
|
|
127
|
+
// Because of overload woes, this depends on C++20 concepts. util_device.cuh checks that concepts are available when
|
|
128
|
+
// either runtime policies or PTX JSON information are enabled, so if they are, this is always valid. The generic
|
|
129
|
+
// version is always defined, and that's the only one needed for regular CUB operations.
|
|
130
|
+
//
|
|
131
|
+
// TODO: enable this unconditionally once concepts are always available
|
|
132
|
+
CUB_DETAIL_POLICY_WRAPPER_DEFINE(
|
|
133
|
+
RadixSortDownsweepAgentPolicy,
|
|
134
|
+
(GenericAgentPolicy),
|
|
135
|
+
(BLOCK_THREADS, BlockThreads, int),
|
|
136
|
+
(ITEMS_PER_THREAD, ItemsPerThread, int),
|
|
137
|
+
(RADIX_BITS, RadixBits, int),
|
|
138
|
+
(LOAD_ALGORITHM, LoadAlgorithm, cub::BlockLoadAlgorithm),
|
|
139
|
+
(LOAD_MODIFIER, LoadModifier, cub::CacheLoadModifier),
|
|
140
|
+
(RANK_ALGORITHM, RankAlgorithm, cub::RadixRankAlgorithm),
|
|
141
|
+
(SCAN_ALGORITHM, ScanAlgorithm, cub::BlockScanAlgorithm))
|
|
142
|
+
} // namespace detail
|
|
143
|
+
#endif // defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
|
|
144
|
+
|
|
122
145
|
/******************************************************************************
|
|
123
146
|
* Thread block abstractions
|
|
124
147
|
******************************************************************************/
|
|
125
148
|
|
|
126
|
-
namespace detail
|
|
127
|
-
{
|
|
128
|
-
namespace radix_sort
|
|
149
|
+
namespace detail::radix_sort
|
|
129
150
|
{
|
|
130
151
|
|
|
131
152
|
/**
|
|
@@ -760,7 +781,6 @@ struct AgentRadixSortDownsweep
|
|
|
760
781
|
}
|
|
761
782
|
};
|
|
762
783
|
|
|
763
|
-
} // namespace radix_sort
|
|
764
|
-
} // namespace detail
|
|
784
|
+
} // namespace detail::radix_sort
|
|
765
785
|
|
|
766
786
|
CUB_NAMESPACE_END
|
|
@@ -85,9 +85,7 @@ struct AgentRadixSortExclusiveSumPolicy
|
|
|
85
85
|
};
|
|
86
86
|
};
|
|
87
87
|
|
|
88
|
-
namespace detail
|
|
89
|
-
{
|
|
90
|
-
namespace radix_sort
|
|
88
|
+
namespace detail::radix_sort
|
|
91
89
|
{
|
|
92
90
|
|
|
93
91
|
template <typename AgentRadixSortHistogramPolicy,
|
|
@@ -283,7 +281,6 @@ struct AgentRadixSortHistogram
|
|
|
283
281
|
}
|
|
284
282
|
};
|
|
285
283
|
|
|
286
|
-
} // namespace radix_sort
|
|
287
|
-
} // namespace detail
|
|
284
|
+
} // namespace detail::radix_sort
|
|
288
285
|
|
|
289
286
|
CUB_NAMESPACE_END
|
|
@@ -100,9 +100,7 @@ struct AgentRadixSortOnesweepPolicy : ScalingType
|
|
|
100
100
|
static constexpr RadixSortStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;
|
|
101
101
|
};
|
|
102
102
|
|
|
103
|
-
namespace detail
|
|
104
|
-
{
|
|
105
|
-
namespace radix_sort
|
|
103
|
+
namespace detail::radix_sort
|
|
106
104
|
{
|
|
107
105
|
|
|
108
106
|
template <typename AgentRadixSortOnesweepPolicy,
|
|
@@ -700,7 +698,6 @@ struct AgentRadixSortOnesweep
|
|
|
700
698
|
}
|
|
701
699
|
};
|
|
702
700
|
|
|
703
|
-
} // namespace radix_sort
|
|
704
|
-
} // namespace detail
|
|
701
|
+
} // namespace detail::radix_sort
|
|
705
702
|
|
|
706
703
|
CUB_NAMESPACE_END
|
|
@@ -103,9 +103,7 @@ struct AgentRadixSortUpsweepPolicy : ScalingType
|
|
|
103
103
|
* Thread block abstractions
|
|
104
104
|
******************************************************************************/
|
|
105
105
|
|
|
106
|
-
namespace detail
|
|
107
|
-
{
|
|
108
|
-
namespace radix_sort
|
|
106
|
+
namespace detail::radix_sort
|
|
109
107
|
{
|
|
110
108
|
|
|
111
109
|
/**
|
|
@@ -552,7 +550,6 @@ struct AgentRadixSortUpsweep
|
|
|
552
550
|
}
|
|
553
551
|
};
|
|
554
552
|
|
|
555
|
-
} // namespace radix_sort
|
|
556
|
-
} // namespace detail
|
|
553
|
+
} // namespace detail::radix_sort
|
|
557
554
|
|
|
558
555
|
CUB_NAMESPACE_END
|
|
@@ -50,10 +50,10 @@
|
|
|
50
50
|
#include <cub/util_device.cuh>
|
|
51
51
|
#include <cub/util_type.cuh>
|
|
52
52
|
|
|
53
|
-
#include <cuda/__memory/is_aligned.h>
|
|
54
53
|
#include <cuda/std/__algorithm/min.h>
|
|
55
54
|
#include <cuda/std/__functional/identity.h>
|
|
56
55
|
#include <cuda/std/__functional/operations.h>
|
|
56
|
+
#include <cuda/std/__memory/is_sufficiently_aligned.h>
|
|
57
57
|
#include <cuda/std/__type_traits/conditional.h>
|
|
58
58
|
#include <cuda/std/__type_traits/is_pointer.h>
|
|
59
59
|
|
|
@@ -175,9 +175,6 @@ namespace detail::reduce
|
|
|
175
175
|
* @tparam InputIteratorT
|
|
176
176
|
* Random-access iterator type for input
|
|
177
177
|
*
|
|
178
|
-
* @tparam OutputIteratorT
|
|
179
|
-
* Random-access iterator type for output
|
|
180
|
-
*
|
|
181
178
|
* @tparam OffsetT
|
|
182
179
|
* Signed integer type for global offsets
|
|
183
180
|
*
|
|
@@ -202,7 +199,6 @@ namespace detail::reduce
|
|
|
202
199
|
*/
|
|
203
200
|
template <typename AgentReducePolicy,
|
|
204
201
|
typename InputIteratorT,
|
|
205
|
-
typename OutputIteratorT,
|
|
206
202
|
typename OffsetT,
|
|
207
203
|
typename ReductionOp,
|
|
208
204
|
typename AccumT,
|
|
@@ -274,7 +270,7 @@ struct AgentReduceImpl
|
|
|
274
270
|
{
|
|
275
271
|
if constexpr (AttemptVectorization)
|
|
276
272
|
{
|
|
277
|
-
return ::cuda::
|
|
273
|
+
return ::cuda::std::is_sufficiently_aligned<alignof(VectorT)>(d_in);
|
|
278
274
|
}
|
|
279
275
|
else
|
|
280
276
|
{
|
|
@@ -506,9 +502,6 @@ private:
|
|
|
506
502
|
* @tparam InputIteratorT
|
|
507
503
|
* Random-access iterator type for input
|
|
508
504
|
*
|
|
509
|
-
* @tparam OutputIteratorT
|
|
510
|
-
* Random-access iterator type for output
|
|
511
|
-
*
|
|
512
505
|
* @tparam OffsetT
|
|
513
506
|
* Signed integer type for global offsets
|
|
514
507
|
*
|
|
@@ -524,7 +517,6 @@ private:
|
|
|
524
517
|
*/
|
|
525
518
|
template <typename AgentReducePolicy,
|
|
526
519
|
typename InputIteratorT,
|
|
527
|
-
typename OutputIteratorT,
|
|
528
520
|
typename OffsetT,
|
|
529
521
|
typename ReductionOp,
|
|
530
522
|
typename AccumT,
|
|
@@ -532,7 +524,6 @@ template <typename AgentReducePolicy,
|
|
|
532
524
|
struct AgentReduce
|
|
533
525
|
: AgentReduceImpl<AgentReducePolicy,
|
|
534
526
|
InputIteratorT,
|
|
535
|
-
OutputIteratorT,
|
|
536
527
|
OffsetT,
|
|
537
528
|
ReductionOp,
|
|
538
529
|
AccumT,
|
|
@@ -543,7 +534,6 @@ struct AgentReduce
|
|
|
543
534
|
using base_t =
|
|
544
535
|
AgentReduceImpl<AgentReducePolicy,
|
|
545
536
|
InputIteratorT,
|
|
546
|
-
OutputIteratorT,
|
|
547
537
|
OffsetT,
|
|
548
538
|
ReductionOp,
|
|
549
539
|
AccumT,
|
|
@@ -574,9 +564,6 @@ struct AgentReduce
|
|
|
574
564
|
* @tparam InputIteratorT
|
|
575
565
|
* Random-access iterator type for input
|
|
576
566
|
*
|
|
577
|
-
* @tparam OutputIteratorT
|
|
578
|
-
* Random-access iterator type for output
|
|
579
|
-
*
|
|
580
567
|
* @tparam OffsetT
|
|
581
568
|
* Signed integer type for global offsets
|
|
582
569
|
*
|
|
@@ -592,7 +579,6 @@ struct AgentReduce
|
|
|
592
579
|
*/
|
|
593
580
|
template <typename AgentReducePolicy,
|
|
594
581
|
typename InputIteratorT,
|
|
595
|
-
typename OutputIteratorT,
|
|
596
582
|
typename OffsetT,
|
|
597
583
|
typename ReductionOp,
|
|
598
584
|
typename AccumT,
|
|
@@ -600,7 +586,6 @@ template <typename AgentReducePolicy,
|
|
|
600
586
|
struct AgentWarpReduce
|
|
601
587
|
: AgentReduceImpl<AgentReducePolicy,
|
|
602
588
|
InputIteratorT,
|
|
603
|
-
OutputIteratorT,
|
|
604
589
|
OffsetT,
|
|
605
590
|
ReductionOp,
|
|
606
591
|
AccumT,
|
|
@@ -612,7 +597,6 @@ struct AgentWarpReduce
|
|
|
612
597
|
using base_t =
|
|
613
598
|
AgentReduceImpl<AgentReducePolicy,
|
|
614
599
|
InputIteratorT,
|
|
615
|
-
OutputIteratorT,
|
|
616
600
|
OffsetT,
|
|
617
601
|
ReductionOp,
|
|
618
602
|
AccumT,
|
|
@@ -134,9 +134,7 @@ struct AgentRlePolicy
|
|
|
134
134
|
* Thread block abstractions
|
|
135
135
|
******************************************************************************/
|
|
136
136
|
|
|
137
|
-
namespace detail
|
|
138
|
-
{
|
|
139
|
-
namespace rle
|
|
137
|
+
namespace detail::rle
|
|
140
138
|
{
|
|
141
139
|
|
|
142
140
|
/**
|
|
@@ -1121,7 +1119,6 @@ struct AgentRle
|
|
|
1121
1119
|
}
|
|
1122
1120
|
};
|
|
1123
1121
|
|
|
1124
|
-
} // namespace rle
|
|
1125
|
-
} // namespace detail
|
|
1122
|
+
} // namespace detail::rle
|
|
1126
1123
|
|
|
1127
1124
|
CUB_NAMESPACE_END
|