cuda-cccl 0.3.1__cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.2__cp312-cp312-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
- cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
- cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
- cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
- cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1 -0
- cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
- cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
- cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
- cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
- cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +12 -13
- cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +2 -3
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +4 -3
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
- cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
- cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
- cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
- cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
- cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
- cuda/cccl/headers/include/cuda/__cccl_config +1 -0
- cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
- cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
- cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
- cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +239 -317
- cuda/cccl/headers/include/cuda/__device/attributes.h +4 -3
- cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
- cuda/cccl/headers/include/cuda/__device/device_ref.h +0 -10
- cuda/cccl/headers/include/cuda/__device/physical_device.h +1 -26
- cuda/cccl/headers/include/cuda/__event/event.h +26 -26
- cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
- cuda/cccl/headers/include/cuda/__event/timed_event.h +9 -7
- cuda/cccl/headers/include/cuda/__fwd/devices.h +4 -4
- cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
- cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
- cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +2 -12
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +15 -19
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +59 -60
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
- cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
- cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
- cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
- cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
- cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
- cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
- cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +5 -4
- cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +17 -16
- cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
- cuda/cccl/headers/include/cuda/cmath +1 -0
- cuda/cccl/headers/include/cuda/devices +3 -0
- cuda/cccl/headers/include/cuda/memory +1 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
- cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
- cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
- cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
- cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
- cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
- cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
- cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
- cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
- cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
- cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
- cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
- cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
- cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
- cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
- cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
- cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
- cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
- cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
- cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
- cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
- cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
- cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
- cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
- cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
- cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
- cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
- cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
- cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
- cuda/cccl/headers/include/cuda/std/bitset +1 -1
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
- cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
- cuda/cccl/headers/include/cuda/std/numbers +5 -0
- cuda/cccl/headers/include/cuda/std/string_view +146 -11
- cuda/cccl/headers/include/cuda/stream_ref +5 -0
- cuda/cccl/headers/include/cuda/utility +1 -0
- cuda/cccl/headers/include/nv/target +7 -2
- cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
- cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
- cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
- cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
- cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
- cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
- cuda/cccl/headers/include/thrust/device_delete.h +18 -3
- cuda/cccl/headers/include/thrust/device_free.h +16 -3
- cuda/cccl/headers/include/thrust/device_new.h +29 -8
- cuda/cccl/headers/include/thrust/host_vector.h +1 -1
- cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
- cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
- cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
- cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
- cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
- cuda/compute/__init__.py +2 -0
- cuda/compute/_bindings.pyi +43 -1
- cuda/compute/_bindings_impl.pyx +156 -7
- cuda/compute/algorithms/_scan.py +108 -36
- cuda/compute/algorithms/_transform.py +32 -11
- cuda/compute/cu12/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
- cuda/compute/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/cu13/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
- cuda/compute/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/iterators/__init__.py +2 -0
- cuda/compute/iterators/_factories.py +28 -0
- cuda/compute/iterators/_iterators.py +206 -1
- cuda/compute/numba_utils.py +2 -2
- cuda/compute/typing.py +2 -0
- {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
- {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +171 -175
- cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
- cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
- cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
- cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
- cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
- cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
- {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -111,10 +111,9 @@ CUB_NAMESPACE_BEGIN
|
|
|
111
111
|
//! // Collectively compute adjacent_difference
|
|
112
112
|
//! int result[4];
|
|
113
113
|
//!
|
|
114
|
-
//! BlockAdjacentDifferenceT(temp_storage).SubtractLeft(
|
|
115
|
-
//!
|
|
116
|
-
//!
|
|
117
|
-
//! CustomDifference());
|
|
114
|
+
//! BlockAdjacentDifferenceT(temp_storage).SubtractLeft(thread_data, result,
|
|
115
|
+
//! CustomDifference());
|
|
116
|
+
//! }
|
|
118
117
|
//!
|
|
119
118
|
//! Suppose the set of input `thread_data` across the block of threads is
|
|
120
119
|
//! ``{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }``.
|
|
@@ -283,10 +282,9 @@ public:
|
|
|
283
282
|
//! ...
|
|
284
283
|
//!
|
|
285
284
|
//! // Collectively compute adjacent_difference
|
|
286
|
-
//! BlockAdjacentDifferenceT(temp_storage).SubtractLeft(
|
|
287
|
-
//!
|
|
288
|
-
//!
|
|
289
|
-
//! CustomDifference());
|
|
285
|
+
//! BlockAdjacentDifferenceT(temp_storage).SubtractLeft(thread_data, thread_data,
|
|
286
|
+
//! CustomDifference());
|
|
287
|
+
//! }
|
|
290
288
|
//!
|
|
291
289
|
//! Suppose the set of input ``thread_data`` across the block of threads is
|
|
292
290
|
//! ``{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }``.
|
|
@@ -96,6 +96,7 @@ CUB_NAMESPACE_BEGIN
|
|
|
96
96
|
//! // Collectively compute head flags for discontinuities in the segment
|
|
97
97
|
//! int head_flags[4];
|
|
98
98
|
//! BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
|
|
99
|
+
//! }
|
|
99
100
|
//!
|
|
100
101
|
//! Suppose the set of input ``thread_data`` across the block of threads is
|
|
101
102
|
//! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }``.
|
|
@@ -387,6 +388,7 @@ public:
|
|
|
387
388
|
//! // Collectively compute head flags for discontinuities in the segment
|
|
388
389
|
//! int head_flags[4];
|
|
389
390
|
//! BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
|
|
391
|
+
//! }
|
|
390
392
|
//!
|
|
391
393
|
//! Suppose the set of input ``thread_data`` across the block of threads is
|
|
392
394
|
//! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }``.
|
|
@@ -463,8 +465,9 @@ public:
|
|
|
463
465
|
//!
|
|
464
466
|
//! // Collectively compute head flags for discontinuities in the segment
|
|
465
467
|
//! int head_flags[4];
|
|
466
|
-
//! BlockDiscontinuity(temp_storage).FlagHeads(
|
|
467
|
-
//!
|
|
468
|
+
//! BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data,
|
|
469
|
+
//! cub::Inequality(), tile_predecessor_item);
|
|
470
|
+
//! }
|
|
468
471
|
//!
|
|
469
472
|
//! Suppose the set of input ``thread_data`` across the block of threads is
|
|
470
473
|
//! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }``,
|
|
@@ -549,6 +552,7 @@ public:
|
|
|
549
552
|
//! // Collectively compute tail flags for discontinuities in the segment
|
|
550
553
|
//! int tail_flags[4];
|
|
551
554
|
//! BlockDiscontinuity(temp_storage).FlagTails(tail_flags, thread_data, cub::Inequality());
|
|
555
|
+
//! }
|
|
552
556
|
//!
|
|
553
557
|
//! Suppose the set of input ``thread_data`` across the block of threads is
|
|
554
558
|
//! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``.
|
|
@@ -640,8 +644,9 @@ public:
|
|
|
640
644
|
//!
|
|
641
645
|
//! // Collectively compute tail flags for discontinuities in the segment
|
|
642
646
|
//! int tail_flags[4];
|
|
643
|
-
//! BlockDiscontinuity(temp_storage).FlagTails(
|
|
644
|
-
//!
|
|
647
|
+
//! BlockDiscontinuity(temp_storage).FlagTails(tail_flags, thread_data,
|
|
648
|
+
//! cub::Inequality(), tile_successor_item);
|
|
649
|
+
//! }
|
|
645
650
|
//!
|
|
646
651
|
//! Suppose the set of input ``thread_data`` across the block of threads is
|
|
647
652
|
//! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``
|
|
@@ -742,8 +747,9 @@ public:
|
|
|
742
747
|
//! // Collectively compute head and flags for discontinuities in the segment
|
|
743
748
|
//! int head_flags[4];
|
|
744
749
|
//! int tail_flags[4];
|
|
745
|
-
//! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
|
|
746
|
-
//!
|
|
750
|
+
//! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(head_flags, tail_flags, thread_data,
|
|
751
|
+
//! cub::Inequality());
|
|
752
|
+
//! }
|
|
747
753
|
//!
|
|
748
754
|
//! Suppose the set of input ``thread_data`` across the block of threads is
|
|
749
755
|
//! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``
|
|
@@ -864,8 +870,10 @@ public:
|
|
|
864
870
|
//! // Collectively compute head and flags for discontinuities in the segment
|
|
865
871
|
//! int head_flags[4];
|
|
866
872
|
//! int tail_flags[4];
|
|
867
|
-
//! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
|
|
868
|
-
//!
|
|
873
|
+
//! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(head_flags, tail_flags,
|
|
874
|
+
//! tile_successor_item, thread_data,
|
|
875
|
+
//! cub::Inequality());
|
|
876
|
+
//! }
|
|
869
877
|
//!
|
|
870
878
|
//! Suppose the set of input ``thread_data`` across the block of threads is
|
|
871
879
|
//! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``
|
|
@@ -997,9 +1005,10 @@ public:
|
|
|
997
1005
|
//! // Collectively compute head and flags for discontinuities in the segment
|
|
998
1006
|
//! int head_flags[4];
|
|
999
1007
|
//! int tail_flags[4];
|
|
1000
|
-
//! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
|
|
1001
|
-
//!
|
|
1002
|
-
//!
|
|
1008
|
+
//! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(head_flags, tile_predecessor_item,
|
|
1009
|
+
//! tail_flags, tile_successor_item,
|
|
1010
|
+
//! thread_data, cub::Inequality());
|
|
1011
|
+
//! }
|
|
1003
1012
|
//!
|
|
1004
1013
|
//! Suppose the set of input ``thread_data`` across the block of threads is
|
|
1005
1014
|
//! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``,
|
|
@@ -1126,9 +1135,10 @@ public:
|
|
|
1126
1135
|
//! // Collectively compute head and flags for discontinuities in the segment
|
|
1127
1136
|
//! int head_flags[4];
|
|
1128
1137
|
//! int tail_flags[4];
|
|
1129
|
-
//! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
|
|
1130
|
-
//!
|
|
1131
|
-
//!
|
|
1138
|
+
//! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(head_flags, tile_predecessor_item,
|
|
1139
|
+
//! tail_flags, tile_successor_item,
|
|
1140
|
+
//! thread_data, cub::Inequality());
|
|
1141
|
+
//! }
|
|
1132
1142
|
//!
|
|
1133
1143
|
//! Suppose the set of input ``thread_data`` across the block of threads is
|
|
1134
1144
|
//! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``,
|
|
@@ -101,6 +101,7 @@ CUB_NAMESPACE_BEGIN
|
|
|
101
101
|
//!
|
|
102
102
|
//! // Collectively exchange data into a blocked arrangement across threads
|
|
103
103
|
//! BlockExchange(temp_storage).StripedToBlocked(thread_data);
|
|
104
|
+
//! }
|
|
104
105
|
//!
|
|
105
106
|
//! Suppose the set of striped input ``thread_data`` across the block of threads is ``{ [0,128,256,384],
|
|
106
107
|
//! [1,129,257,385], ..., [127,255,383,511] }``. The corresponding output ``thread_data`` in those threads will be
|
|
@@ -883,6 +884,7 @@ public:
|
|
|
883
884
|
//!
|
|
884
885
|
//! // Collectively exchange data into a blocked arrangement across threads
|
|
885
886
|
//! BlockExchange(temp_storage).StripedToBlocked(thread_data, thread_data);
|
|
887
|
+
//! }
|
|
886
888
|
//!
|
|
887
889
|
//! Suppose the set of striped input ``thread_data`` across the block of threads is ``{ [0,128,256,384],
|
|
888
890
|
//! [1,129,257,385], ..., [127,255,383,511] }`` after loading from device-accessible memory. The corresponding output
|
|
@@ -933,6 +935,7 @@ public:
|
|
|
933
935
|
//!
|
|
934
936
|
//! // Store data striped across block threads into an ordered tile
|
|
935
937
|
//! cub::StoreDirectStriped<STORE_DEFAULT, 128>(threadIdx.x, d_data, thread_data);
|
|
938
|
+
//! }
|
|
936
939
|
//!
|
|
937
940
|
//! Suppose the set of blocked input ``thread_data`` across the block of threads is ``{ [0,1,2,3], [4,5,6,7],
|
|
938
941
|
//! [8,9,10,11], ..., [508,509,510,511] }``. The corresponding output ``thread_data`` in those threads will be
|
|
@@ -983,6 +986,7 @@ public:
|
|
|
983
986
|
//!
|
|
984
987
|
//! // Collectively exchange data into a blocked arrangement across threads
|
|
985
988
|
//! BlockExchange(temp_storage).WarpStripedToBlocked(thread_data);
|
|
989
|
+
//! }
|
|
986
990
|
//!
|
|
987
991
|
//! Suppose the set of warp-striped input ``thread_data`` across the block of threads is ``{ [0,32,64,96],
|
|
988
992
|
//! [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }`` after loading from device-accessible memory. (The first 128
|
|
@@ -1037,6 +1041,7 @@ public:
|
|
|
1037
1041
|
//!
|
|
1038
1042
|
//! // Store data striped across warp threads into an ordered tile
|
|
1039
1043
|
//! cub::StoreDirectStriped<STORE_DEFAULT, 128>(threadIdx.x, d_data, thread_data);
|
|
1044
|
+
//! }
|
|
1040
1045
|
//!
|
|
1041
1046
|
//! Suppose the set of blocked input ``thread_data`` across the block of threads is ``{ [0,1,2,3], [4,5,6,7],
|
|
1042
1047
|
//! [8,9,10,11], ..., [508,509,510,511] }``. The corresponding output ``thread_data`` in those threads will be
|
|
@@ -140,6 +140,7 @@ enum BlockHistogramAlgorithm
|
|
|
140
140
|
//!
|
|
141
141
|
//! // Compute the block-wide histogram
|
|
142
142
|
//! BlockHistogram(temp_storage).Histogram(data, smem_histogram);
|
|
143
|
+
//! }
|
|
143
144
|
//!
|
|
144
145
|
//! Performance and Usage Considerations
|
|
145
146
|
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
@@ -281,6 +282,7 @@ public:
|
|
|
281
282
|
//!
|
|
282
283
|
//! // Update the block-wide histogram
|
|
283
284
|
//! BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram);
|
|
285
|
+
//! }
|
|
284
286
|
//!
|
|
285
287
|
//! @endrst
|
|
286
288
|
//!
|
|
@@ -338,6 +340,7 @@ public:
|
|
|
338
340
|
//!
|
|
339
341
|
//! // Compute the block-wide histogram
|
|
340
342
|
//! BlockHistogram(temp_storage).Histogram(thread_samples, smem_histogram);
|
|
343
|
+
//! }
|
|
341
344
|
//!
|
|
342
345
|
//! @endrst
|
|
343
346
|
//!
|
|
@@ -399,6 +402,7 @@ public:
|
|
|
399
402
|
//!
|
|
400
403
|
//! // Update the block-wide histogram
|
|
401
404
|
//! BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram);
|
|
405
|
+
//! }
|
|
402
406
|
//!
|
|
403
407
|
//! @endrst
|
|
404
408
|
//!
|
|
@@ -771,6 +771,7 @@ enum BlockLoadAlgorithm
|
|
|
771
771
|
//! // Load a segment of consecutive items that are blocked across threads
|
|
772
772
|
//! int thread_data[4];
|
|
773
773
|
//! BlockLoad(temp_storage).Load(d_data, thread_data);
|
|
774
|
+
//! }
|
|
774
775
|
//!
|
|
775
776
|
//! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``. The set of ``thread_data`` across the block of threads in
|
|
776
777
|
//! those threads will be ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``.
|
|
@@ -1123,6 +1124,7 @@ public:
|
|
|
1123
1124
|
//! // Load a segment of consecutive items that are blocked across threads
|
|
1124
1125
|
//! int thread_data[4];
|
|
1125
1126
|
//! BlockLoad(temp_storage).Load(d_data, thread_data);
|
|
1127
|
+
//! }
|
|
1126
1128
|
//!
|
|
1127
1129
|
//! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``. The set of ``thread_data`` across the block of threads
|
|
1128
1130
|
//! in those threads will be ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``.
|
|
@@ -1170,6 +1172,7 @@ public:
|
|
|
1170
1172
|
//! // Load a segment of consecutive items that are blocked across threads
|
|
1171
1173
|
//! int thread_data[4];
|
|
1172
1174
|
//! BlockLoad(temp_storage).Load(d_data, thread_data, block_items_end);
|
|
1175
|
+
//! }
|
|
1173
1176
|
//!
|
|
1174
1177
|
//! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, 6...`` and ``block_items_end`` is ``5``. The set of
|
|
1175
1178
|
//! ``thread_data`` across the block of threads in those threads will be ``{ [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }``,
|
|
@@ -1222,6 +1225,7 @@ public:
|
|
|
1222
1225
|
//! // Load a segment of consecutive items that are blocked across threads
|
|
1223
1226
|
//! int thread_data[4];
|
|
1224
1227
|
//! BlockLoad(temp_storage).Load(d_data, thread_data, block_items_end, -1);
|
|
1228
|
+
//! }
|
|
1225
1229
|
//!
|
|
1226
1230
|
//! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, 6...``, ``block_items_end`` is ``5``, and the out-of-bounds
|
|
1227
1231
|
//! default is ``-1``. The set of ``thread_data`` across the block of threads in those threads will be
|
|
@@ -169,6 +169,7 @@ struct warp_in_block_matcher_t<Bits, 0, PartialWarpId>
|
|
|
169
169
|
//! block_radix_rank(temp_storage).RankKeys(keys, ranks, extractor);
|
|
170
170
|
//!
|
|
171
171
|
//! ...
|
|
172
|
+
//! }
|
|
172
173
|
//!
|
|
173
174
|
//! Suppose the set of input ``keys`` across the block of threads is ``{ [16,10], [9,11] }``.
|
|
174
175
|
//! The corresponding output ``ranks`` in those threads will be ``{ [3,1], [0,2] }``.
|
|
@@ -190,6 +190,7 @@ enum BlockScanAlgorithm
|
|
|
190
190
|
//!
|
|
191
191
|
//! // Collectively compute the block-wide exclusive prefix sum
|
|
192
192
|
//! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
|
|
193
|
+
//! }
|
|
193
194
|
//!
|
|
194
195
|
//! Suppose the set of input ``thread_data`` across the block of threads is
|
|
195
196
|
//! ``{[1,1,1,1], [1,1,1,1], ..., [1,1,1,1]}``.
|
|
@@ -333,6 +334,7 @@ public:
|
|
|
333
334
|
//!
|
|
334
335
|
//! // Collectively compute the block-wide exclusive prefix sum
|
|
335
336
|
//! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
|
|
337
|
+
//! }
|
|
336
338
|
//!
|
|
337
339
|
//! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``.
|
|
338
340
|
//! The corresponding output ``thread_data`` in those threads will be ``0, 1, ..., 127``.
|
|
@@ -386,6 +388,7 @@ public:
|
|
|
386
388
|
//! // Collectively compute the block-wide exclusive prefix sum
|
|
387
389
|
//! int block_aggregate;
|
|
388
390
|
//! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate);
|
|
391
|
+
//! }
|
|
389
392
|
//!
|
|
390
393
|
//! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``.
|
|
391
394
|
//! The corresponding output ``thread_data`` in those threads will be ``0, 1, ..., 127``.
|
|
@@ -479,6 +482,7 @@ public:
|
|
|
479
482
|
//! // Store scanned items to output segment
|
|
480
483
|
//! d_data[block_offset + threadIdx.x] = thread_data;
|
|
481
484
|
//! }
|
|
485
|
+
//! }
|
|
482
486
|
//!
|
|
483
487
|
//! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``.
|
|
484
488
|
//! The corresponding output for the first segment will be ``0, 1, ..., 127``.
|
|
@@ -545,6 +549,7 @@ public:
|
|
|
545
549
|
//!
|
|
546
550
|
//! // Collectively compute the block-wide exclusive prefix sum
|
|
547
551
|
//! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
|
|
552
|
+
//! }
|
|
548
553
|
//!
|
|
549
554
|
//! Suppose the set of input ``thread_data`` across the block of threads is
|
|
550
555
|
//! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``.
|
|
@@ -606,6 +611,7 @@ public:
|
|
|
606
611
|
//! // Collectively compute the block-wide exclusive prefix sum
|
|
607
612
|
//! int block_aggregate;
|
|
608
613
|
//! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate);
|
|
614
|
+
//! }
|
|
609
615
|
//!
|
|
610
616
|
//! Suppose the set of input ``thread_data`` across the block of threads is
|
|
611
617
|
//! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``.
|
|
@@ -720,6 +726,7 @@ public:
|
|
|
720
726
|
//! BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
|
|
721
727
|
//! __syncthreads();
|
|
722
728
|
//! }
|
|
729
|
+
//! }
|
|
723
730
|
//!
|
|
724
731
|
//! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``.
|
|
725
732
|
//! The corresponding output for the first segment will be ``0, 1, 2, 3, ..., 510, 511``.
|
|
@@ -788,6 +795,7 @@ public:
|
|
|
788
795
|
//!
|
|
789
796
|
//! // Collectively compute the block-wide exclusive prefix max scan
|
|
790
797
|
//! BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cuda::maximum<>{});
|
|
798
|
+
//! }
|
|
791
799
|
//!
|
|
792
800
|
//! Suppose the set of input ``thread_data`` across the block of threads is ``0, -1, 2, -3, ..., 126, -127``.
|
|
793
801
|
//! The corresponding output ``thread_data`` in those threads will be ``INT_MIN, 0, 0, 2, ..., 124, 126``.
|
|
@@ -849,8 +857,9 @@ public:
|
|
|
849
857
|
//!
|
|
850
858
|
//! // Collectively compute the block-wide exclusive prefix max scan
|
|
851
859
|
//! int block_aggregate;
|
|
852
|
-
//! BlockScan(temp_storage).ExclusiveScan(
|
|
853
|
-
//!
|
|
860
|
+
//! BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data,
|
|
861
|
+
//! INT_MIN, cuda::maximum<>{}, block_aggregate);
|
|
862
|
+
//! }
|
|
854
863
|
//!
|
|
855
864
|
//! Suppose the set of input ``thread_data`` across the block of threads is ``0, -1, 2, -3, ..., 126, -127``.
|
|
856
865
|
//! The corresponding output ``thread_data`` in those threads will be ``INT_MIN, 0, 0, 2, ..., 124, 126``.
|
|
@@ -960,6 +969,7 @@ public:
|
|
|
960
969
|
//! // Store scanned items to output segment
|
|
961
970
|
//! d_data[block_offset + threadIdx.x] = thread_data;
|
|
962
971
|
//! }
|
|
972
|
+
//! }
|
|
963
973
|
//!
|
|
964
974
|
//! Suppose the input ``d_data`` is ``0, -1, 2, -3, 4, -5, ...``.
|
|
965
975
|
//! The corresponding output for the first segment will be ``INT_MIN, 0, 0, 2, ..., 124, 126``.
|
|
@@ -616,6 +616,7 @@ enum BlockStoreAlgorithm
|
|
|
616
616
|
//!
|
|
617
617
|
//! // Store items to linear memory
|
|
618
618
|
//! BlockStore(temp_storage).Store(d_data, thread_data);
|
|
619
|
+
//! }
|
|
619
620
|
//!
|
|
620
621
|
//! Suppose the set of ``thread_data`` across the block of threads is
|
|
621
622
|
//! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``.
|
|
@@ -1156,8 +1157,8 @@ public:
|
|
|
1156
1157
|
//! ...
|
|
1157
1158
|
//!
|
|
1158
1159
|
//! // Store items to linear memory
|
|
1159
|
-
//! int thread_data[4];
|
|
1160
1160
|
//! BlockStore(temp_storage).Store(d_data, thread_data);
|
|
1161
|
+
//! }
|
|
1161
1162
|
//!
|
|
1162
1163
|
//! Suppose the set of ``thread_data`` across the block of threads is
|
|
1163
1164
|
//! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``.
|
|
@@ -1208,8 +1209,8 @@ public:
|
|
|
1208
1209
|
//! ...
|
|
1209
1210
|
//!
|
|
1210
1211
|
//! // Store items to linear memory
|
|
1211
|
-
//! int thread_data[4];
|
|
1212
1212
|
//! BlockStore(temp_storage).Store(d_data, thread_data, valid_items);
|
|
1213
|
+
//! }
|
|
1213
1214
|
//!
|
|
1214
1215
|
//! Suppose the set of ``thread_data`` across the block of threads is
|
|
1215
1216
|
//! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }`` and ``valid_items`` is ``5``.
|
|
@@ -15,71 +15,76 @@
|
|
|
15
15
|
|
|
16
16
|
#include <cub/detail/fast_modulo_division.cuh> // fast_div_mod
|
|
17
17
|
|
|
18
|
+
#include <cuda/std/__mdspan/extents.h>
|
|
18
19
|
#include <cuda/std/__type_traits/make_unsigned.h>
|
|
19
20
|
#include <cuda/std/__utility/integer_sequence.h>
|
|
20
21
|
#include <cuda/std/array>
|
|
21
22
|
#include <cuda/std/cstddef>
|
|
22
|
-
#include <cuda/std/mdspan>
|
|
23
23
|
|
|
24
24
|
CUB_NAMESPACE_BEGIN
|
|
25
|
-
|
|
26
25
|
namespace detail
|
|
27
26
|
{
|
|
28
27
|
|
|
28
|
+
_CCCL_DIAG_PUSH
|
|
29
|
+
_CCCL_DIAG_SUPPRESS_MSVC(4702) // unreachable code (even if there are no branches!)
|
|
30
|
+
|
|
29
31
|
// Compute the submdspan size of a given rank
|
|
30
|
-
template <
|
|
31
|
-
[[nodiscard]]
|
|
32
|
-
|
|
32
|
+
template <typename IndexType, size_t... Extents>
|
|
33
|
+
[[nodiscard]] _CCCL_API constexpr ::cuda::std::make_unsigned_t<IndexType>
|
|
34
|
+
size_range(const ::cuda::std::extents<IndexType, Extents...>& ext, int start, int end)
|
|
33
35
|
{
|
|
36
|
+
_CCCL_ASSERT(start >= 0 && end <= static_cast<int>(ext.rank()), "invalid start or end");
|
|
34
37
|
::cuda::std::make_unsigned_t<IndexType> s = 1;
|
|
35
|
-
for (
|
|
38
|
+
for (auto i = start; i < end; i++)
|
|
36
39
|
{
|
|
37
40
|
s *= ext.extent(i);
|
|
38
41
|
}
|
|
39
42
|
return s;
|
|
40
43
|
}
|
|
41
44
|
|
|
42
|
-
//
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
45
|
+
_CCCL_DIAG_POP // MSVC(4702)
|
|
46
|
+
|
|
47
|
+
template <typename IndexType, size_t... Extents>
|
|
48
|
+
[[nodiscard]] _CCCL_API constexpr ::cuda::std::make_unsigned_t<IndexType>
|
|
49
|
+
size(const ::cuda::std::extents<IndexType, Extents...>& ext)
|
|
46
50
|
{
|
|
47
|
-
return ::
|
|
51
|
+
return cub::detail::size_range(ext, 0, static_cast<int>(ext.rank()));
|
|
48
52
|
}
|
|
49
53
|
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
[[nodiscard]] _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr ::cuda::std::make_unsigned_t<IndexType>
|
|
53
|
-
size(const ::cuda::std::extents<IndexType, Extents...>& ext)
|
|
54
|
+
template <bool IsLayoutRight, int Position, typename IndexType, size_t... E>
|
|
55
|
+
[[nodiscard]] _CCCL_API auto sub_size_fast_div_mod_impl(const ::cuda::std::extents<IndexType, E...>& ext)
|
|
54
56
|
{
|
|
55
|
-
|
|
57
|
+
using fast_mod_div_t = fast_div_mod<IndexType>;
|
|
58
|
+
constexpr auto start = IsLayoutRight ? Position + 1 : 0;
|
|
59
|
+
constexpr auto end = IsLayoutRight ? sizeof...(E) : Position;
|
|
60
|
+
return fast_mod_div_t(cub::detail::size_range(ext, start, end));
|
|
56
61
|
}
|
|
57
62
|
|
|
58
63
|
// precompute modulo/division for each submdspan size (by rank)
|
|
59
|
-
template <typename IndexType, size_t... E, size_t...
|
|
60
|
-
[[nodiscard]]
|
|
61
|
-
sub_sizes_fast_div_mod(const ::cuda::std::extents<IndexType, E...>& ext, ::cuda::std::index_sequence<
|
|
64
|
+
template <bool IsLayoutRight, typename IndexType, size_t... E, size_t... Positions>
|
|
65
|
+
[[nodiscard]] _CCCL_API auto
|
|
66
|
+
sub_sizes_fast_div_mod(const ::cuda::std::extents<IndexType, E...>& ext, ::cuda::std::index_sequence<Positions...> = {})
|
|
62
67
|
{
|
|
63
|
-
// deduction guides don't work with nvcc 11.x
|
|
64
68
|
using fast_mod_div_t = fast_div_mod<IndexType>;
|
|
65
|
-
|
|
69
|
+
using array_t = ::cuda::std::array<fast_mod_div_t, sizeof...(Positions)>;
|
|
70
|
+
return array_t{cub::detail::sub_size_fast_div_mod_impl<IsLayoutRight, Positions>(ext)...};
|
|
66
71
|
}
|
|
67
72
|
|
|
68
73
|
// precompute modulo/division for each mdspan extent
|
|
69
|
-
template <typename IndexType, size_t... E, size_t...
|
|
70
|
-
[[nodiscard]]
|
|
71
|
-
extents_fast_div_mod(const ::cuda::std::extents<IndexType, E...>& ext, ::cuda::std::index_sequence<
|
|
74
|
+
template <typename IndexType, size_t... E, size_t... Positions>
|
|
75
|
+
[[nodiscard]] _CCCL_API auto
|
|
76
|
+
extents_fast_div_mod(const ::cuda::std::extents<IndexType, E...>& ext, ::cuda::std::index_sequence<Positions...> = {})
|
|
72
77
|
{
|
|
73
78
|
using fast_mod_div_t = fast_div_mod<IndexType>;
|
|
74
|
-
|
|
79
|
+
using array_t = ::cuda::std::array<fast_mod_div_t, sizeof...(Positions)>;
|
|
80
|
+
return array_t{fast_mod_div_t(ext.extent(Positions))...};
|
|
75
81
|
}
|
|
76
82
|
|
|
77
83
|
// GCC <= 9 constexpr workaround: Extent must be passed as type only, even const Extent& doesn't work
|
|
78
|
-
template <
|
|
79
|
-
[[nodiscard]]
|
|
84
|
+
template <typename Extents>
|
|
85
|
+
[[nodiscard]] _CCCL_API constexpr bool are_extents_in_range_static(int start, int end)
|
|
80
86
|
{
|
|
81
|
-
|
|
82
|
-
for (index_type i = Rank; i < Extents::rank(); i++)
|
|
87
|
+
for (auto i = start; i < end; i++)
|
|
83
88
|
{
|
|
84
89
|
if (Extents::static_extent(i) == ::cuda::std::dynamic_extent)
|
|
85
90
|
{
|
|
@@ -106,5 +111,4 @@ template <typename MappingTypeLhs, typename MappingTypeRhs>
|
|
|
106
111
|
}
|
|
107
112
|
|
|
108
113
|
} // namespace detail
|
|
109
|
-
|
|
110
114
|
CUB_NAMESPACE_END
|