cuda-cccl 0.3.1__cp311-cp311-manylinux_2_24_aarch64.whl → 0.3.2__cp311-cp311-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
- cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
- cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
- cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
- cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1 -0
- cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
- cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
- cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
- cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
- cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +12 -13
- cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +2 -3
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +4 -3
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
- cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
- cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
- cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
- cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
- cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
- cuda/cccl/headers/include/cuda/__cccl_config +1 -0
- cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
- cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
- cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
- cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +239 -317
- cuda/cccl/headers/include/cuda/__device/attributes.h +4 -3
- cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
- cuda/cccl/headers/include/cuda/__device/device_ref.h +0 -10
- cuda/cccl/headers/include/cuda/__device/physical_device.h +1 -26
- cuda/cccl/headers/include/cuda/__event/event.h +26 -26
- cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
- cuda/cccl/headers/include/cuda/__event/timed_event.h +9 -7
- cuda/cccl/headers/include/cuda/__fwd/devices.h +4 -4
- cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
- cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
- cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +2 -12
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +15 -19
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +59 -60
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
- cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
- cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
- cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
- cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
- cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
- cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
- cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +5 -4
- cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +17 -16
- cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
- cuda/cccl/headers/include/cuda/cmath +1 -0
- cuda/cccl/headers/include/cuda/devices +3 -0
- cuda/cccl/headers/include/cuda/memory +1 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
- cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
- cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
- cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
- cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
- cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
- cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
- cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
- cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
- cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
- cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
- cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
- cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
- cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
- cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
- cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
- cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
- cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
- cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
- cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
- cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
- cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
- cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
- cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
- cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
- cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
- cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
- cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
- cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
- cuda/cccl/headers/include/cuda/std/bitset +1 -1
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
- cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
- cuda/cccl/headers/include/cuda/std/numbers +5 -0
- cuda/cccl/headers/include/cuda/std/string_view +146 -11
- cuda/cccl/headers/include/cuda/stream_ref +5 -0
- cuda/cccl/headers/include/cuda/utility +1 -0
- cuda/cccl/headers/include/nv/target +7 -2
- cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
- cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
- cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
- cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
- cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
- cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
- cuda/cccl/headers/include/thrust/device_delete.h +18 -3
- cuda/cccl/headers/include/thrust/device_free.h +16 -3
- cuda/cccl/headers/include/thrust/device_new.h +29 -8
- cuda/cccl/headers/include/thrust/host_vector.h +1 -1
- cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
- cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
- cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
- cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
- cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
- cuda/compute/__init__.py +2 -0
- cuda/compute/_bindings.pyi +43 -1
- cuda/compute/_bindings_impl.pyx +156 -7
- cuda/compute/algorithms/_scan.py +108 -36
- cuda/compute/algorithms/_transform.py +32 -11
- cuda/compute/cu12/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
- cuda/compute/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/cu13/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
- cuda/compute/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/iterators/__init__.py +2 -0
- cuda/compute/iterators/_factories.py +28 -0
- cuda/compute/iterators/_iterators.py +206 -1
- cuda/compute/numba_utils.py +2 -2
- cuda/compute/typing.py +2 -0
- {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
- {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +171 -175
- cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
- cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
- cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
- cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
- cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
- cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
- {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,29 +1,5 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
*
|
|
4
|
-
* Redistribution and use in source and binary forms, with or without
|
|
5
|
-
* modification, are permitted provided that the following conditions are met:
|
|
6
|
-
* * Redistributions of source code must retain the above copyright
|
|
7
|
-
* notice, this list of conditions and the following disclaimer.
|
|
8
|
-
* * Redistributions in binary form must reproduce the above copyright
|
|
9
|
-
* notice, this list of conditions and the following disclaimer in the
|
|
10
|
-
* documentation and/or other materials provided with the distribution.
|
|
11
|
-
* * Neither the name of the NVIDIA CORPORATION nor the
|
|
12
|
-
* names of its contributors may be used to endorse or promote products
|
|
13
|
-
* derived from this software without specific prior written permission.
|
|
14
|
-
*
|
|
15
|
-
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
16
|
-
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
17
|
-
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
18
|
-
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
|
|
19
|
-
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
|
20
|
-
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
21
|
-
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
|
22
|
-
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
23
|
-
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
24
|
-
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
25
|
-
*
|
|
26
|
-
******************************************************************************/
|
|
1
|
+
// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
// SPDX-License-Identifier: BSD-3-Clause
|
|
27
3
|
|
|
28
4
|
#pragma once
|
|
29
5
|
|
|
@@ -41,24 +17,23 @@
|
|
|
41
17
|
#include <cub/util_namespace.cuh>
|
|
42
18
|
|
|
43
19
|
#include <thrust/detail/raw_reference_cast.h>
|
|
44
|
-
#include <thrust/distance.h>
|
|
45
20
|
#include <thrust/type_traits/is_contiguous_iterator.h>
|
|
46
21
|
#include <thrust/type_traits/unwrap_contiguous_iterator.h>
|
|
47
22
|
|
|
48
23
|
#include <cuda/__cmath/ceil_div.h>
|
|
24
|
+
#include <cuda/std/__concepts/concept_macros.h>
|
|
25
|
+
#include <cuda/std/__fwd/mdspan.h>
|
|
49
26
|
#include <cuda/std/__iterator/distance.h>
|
|
50
27
|
#include <cuda/std/__mdspan/extents.h>
|
|
28
|
+
#include <cuda/std/__mdspan/layout_left.h>
|
|
29
|
+
#include <cuda/std/__mdspan/layout_right.h>
|
|
51
30
|
#include <cuda/std/__memory/is_sufficiently_aligned.h>
|
|
52
31
|
#include <cuda/std/__type_traits/is_integral.h>
|
|
53
|
-
#include <cuda/std/__utility/integer_sequence.h>
|
|
54
32
|
#include <cuda/std/array>
|
|
55
33
|
|
|
56
34
|
CUB_NAMESPACE_BEGIN
|
|
57
35
|
|
|
58
|
-
namespace detail
|
|
59
|
-
{
|
|
60
|
-
|
|
61
|
-
namespace for_each
|
|
36
|
+
namespace detail::for_each
|
|
62
37
|
{
|
|
63
38
|
|
|
64
39
|
/**
|
|
@@ -122,8 +97,7 @@ struct op_wrapper_vectorized_t
|
|
|
122
97
|
}
|
|
123
98
|
};
|
|
124
99
|
|
|
125
|
-
} // namespace for_each
|
|
126
|
-
} // namespace detail
|
|
100
|
+
} // namespace detail::for_each
|
|
127
101
|
|
|
128
102
|
struct DeviceFor
|
|
129
103
|
{
|
|
@@ -568,6 +542,10 @@ public:
|
|
|
568
542
|
{
|
|
569
543
|
_CCCL_NVTX_RANGE_SCOPE("cub::DeviceFor::Bulk");
|
|
570
544
|
static_assert(::cuda::std::is_integral_v<ShapeT>, "ShapeT must be an integral type");
|
|
545
|
+
if (shape == 0)
|
|
546
|
+
{
|
|
547
|
+
return cudaSuccess;
|
|
548
|
+
}
|
|
571
549
|
using offset_t = ShapeT;
|
|
572
550
|
return detail::for_each::dispatch_t<offset_t, OpT>::dispatch(static_cast<offset_t>(shape), op, stream);
|
|
573
551
|
}
|
|
@@ -833,7 +811,8 @@ public:
|
|
|
833
811
|
//! Overview
|
|
834
812
|
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
835
813
|
//!
|
|
836
|
-
//! Iterate through a multi-dimensional extents into
|
|
814
|
+
//! Iterate through multi-dimensional extents, producing a single linear index and a list of indices for each extent
|
|
815
|
+
//! dimension.
|
|
837
816
|
//!
|
|
838
817
|
//! - a single linear index that represents the current iteration
|
|
839
818
|
//! - indices of each extent dimension
|
|
@@ -899,8 +878,6 @@ public:
|
|
|
899
878
|
OpType op,
|
|
900
879
|
cudaStream_t stream = {})
|
|
901
880
|
{
|
|
902
|
-
// TODO: check dimensions overflows
|
|
903
|
-
// TODO: check tha arity of OpType is equal to sizeof...(ExtentsType)
|
|
904
881
|
if (d_temp_storage == nullptr)
|
|
905
882
|
{
|
|
906
883
|
temp_storage_bytes = 1;
|
|
@@ -967,19 +944,120 @@ public:
|
|
|
967
944
|
template <typename IndexType, size_t... Extents, typename OpType>
|
|
968
945
|
CUB_RUNTIME_FUNCTION static cudaError_t
|
|
969
946
|
ForEachInExtents(const ::cuda::std::extents<IndexType, Extents...>& extents, OpType op, cudaStream_t stream = {})
|
|
947
|
+
{
|
|
948
|
+
using extents_type = ::cuda::std::extents<IndexType, Extents...>;
|
|
949
|
+
return cub::DeviceFor::ForEachInLayout(::cuda::std::layout_right::mapping<extents_type>{extents}, op, stream);
|
|
950
|
+
}
|
|
951
|
+
|
|
952
|
+
/*********************************************************************************************************************
|
|
953
|
+
* ForEachInLayout
|
|
954
|
+
********************************************************************************************************************/
|
|
955
|
+
|
|
956
|
+
//! @rst
|
|
957
|
+
//! Overview
|
|
958
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
959
|
+
//!
|
|
960
|
+
//! Iterate through multi-dimensional extents using a specific mdspan layout, applying a function object for each
|
|
961
|
+
//! element, passing
|
|
962
|
+
//!
|
|
963
|
+
//! - a single linear index that represents the current iteration
|
|
964
|
+
//! - a list of indices containing the coordinates for each extent dimension
|
|
965
|
+
//!
|
|
966
|
+
//! The iteration order depends on the layout type:
|
|
967
|
+
//!
|
|
968
|
+
//! - ``layout_right``: Iterates in row-major order (rightmost index varies fastest)
|
|
969
|
+
//! - ``layout_left``: Iterates in column-major order (leftmost index varies fastest)
|
|
970
|
+
//!
|
|
971
|
+
//! - The return value of ``op``, if any, is ignored.
|
|
972
|
+
//!
|
|
973
|
+
//! A Simple Example
|
|
974
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
975
|
+
//!
|
|
976
|
+
//! The following code snippet demonstrates how to use ``ForEachInLayout`` to iterate through a 2D matrix in
|
|
977
|
+
//! column-major order using ``layout_left``.
|
|
978
|
+
//!
|
|
979
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_for_each_in_layout_api.cu
|
|
980
|
+
//! :language: c++
|
|
981
|
+
//! :dedent:
|
|
982
|
+
//! :start-after: example-begin for-each-in-layout-op
|
|
983
|
+
//! :end-before: example-end for-each-in-layout-op
|
|
984
|
+
//!
|
|
985
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_for_each_in_layout_api.cu
|
|
986
|
+
//! :language: c++
|
|
987
|
+
//! :dedent:
|
|
988
|
+
//! :start-after: example-begin for-each-in-layout-example
|
|
989
|
+
//! :end-before: example-end for-each-in-layout-example
|
|
990
|
+
//!
|
|
991
|
+
//! @endrst
|
|
992
|
+
//!
|
|
993
|
+
//! @tparam Layout
|
|
994
|
+
//! **[inferred]** The mdspan layout type, must be either ``cuda::std::layout_left`` or ``cuda::std::layout_right``
|
|
995
|
+
//!
|
|
996
|
+
//! @tparam IndexType
|
|
997
|
+
//! **[inferred]** An integral type that represents the extent index space
|
|
998
|
+
//!
|
|
999
|
+
//! @tparam Extents
|
|
1000
|
+
//! **[inferred]** The extent sizes for each rank index
|
|
1001
|
+
//!
|
|
1002
|
+
//! @tparam OpType
|
|
1003
|
+
//! **[inferred]** A function object with arity equal to the number of extents + 1 for the linear index (iteration).
|
|
1004
|
+
//! The first parameter is the linear index, followed by one parameter for each dimension coordinate.
|
|
1005
|
+
//!
|
|
1006
|
+
//! @param[in] layout
|
|
1007
|
+
//! Layout object that determines the iteration order (layout_left for column-major, layout_right for row-major)
|
|
1008
|
+
//!
|
|
1009
|
+
//! @param[in] extents
|
|
1010
|
+
//! Extents object that represents a multi-dimensional index space
|
|
1011
|
+
//!
|
|
1012
|
+
//! @param[in] op
|
|
1013
|
+
//! Function object to apply to each linear index (iteration) and multi-dimensional coordinates.
|
|
1014
|
+
//! Called as ``op(linear_index, coord_0, coord_1, ..., coord_n)``
|
|
1015
|
+
//!
|
|
1016
|
+
//! @param[in] stream
|
|
1017
|
+
//! CUDA stream to launch kernels within. Default stream is `nullptr`
|
|
1018
|
+
//!
|
|
1019
|
+
//! @return cudaError_t
|
|
1020
|
+
//! error status
|
|
1021
|
+
_CCCL_TEMPLATE(typename LayoutMapping, typename OpType)
|
|
1022
|
+
_CCCL_REQUIRES(::cuda::std::__is_any_mdspan_layout_mapping_left_or_right_v<LayoutMapping>)
|
|
1023
|
+
[[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t
|
|
1024
|
+
ForEachInLayout(const LayoutMapping& layout_mapping, OpType op, cudaStream_t stream = {})
|
|
970
1025
|
{
|
|
971
1026
|
using namespace cub::detail;
|
|
972
|
-
using extents_type = ::
|
|
1027
|
+
using extents_type = typename LayoutMapping::extents_type;
|
|
973
1028
|
using extent_index_type = typename extents_type::index_type;
|
|
974
1029
|
using fast_mod_array_t = ::cuda::std::array<fast_div_mod<extent_index_type>, extents_type::rank()>;
|
|
975
1030
|
_CCCL_NVTX_RANGE_SCOPE("cub::DeviceFor::ForEachInExtents");
|
|
976
1031
|
static constexpr auto seq = ::cuda::std::make_index_sequence<extents_type::rank()>{};
|
|
977
|
-
|
|
1032
|
+
constexpr bool is_layout_right = ::cuda::std::__is_any_mdspan_layout_mapping_right_v<LayoutMapping>;
|
|
1033
|
+
auto extents = layout_mapping.extents();
|
|
1034
|
+
fast_mod_array_t sub_sizes_div_array = cub::detail::sub_sizes_fast_div_mod<is_layout_right>(extents, seq);
|
|
978
1035
|
fast_mod_array_t extents_div_array = cub::detail::extents_fast_div_mod(extents, seq);
|
|
979
|
-
for_each::op_wrapper_extents_t<OpType, extents_type, fast_mod_array_t> op_wrapper{
|
|
1036
|
+
for_each::op_wrapper_extents_t<OpType, extents_type, is_layout_right, fast_mod_array_t> op_wrapper{
|
|
980
1037
|
op, extents, sub_sizes_div_array, extents_div_array};
|
|
981
1038
|
return Bulk(static_cast<implicit_prom_t<extent_index_type>>(cub::detail::size(extents)), op_wrapper, stream);
|
|
982
1039
|
}
|
|
1040
|
+
|
|
1041
|
+
#ifndef _CCCL_DOXYGEN_INVOKED
|
|
1042
|
+
|
|
1043
|
+
_CCCL_TEMPLATE(typename LayoutMapping, typename OpType)
|
|
1044
|
+
_CCCL_REQUIRES(::cuda::std::__is_any_mdspan_layout_mapping_left_or_right_v<LayoutMapping>)
|
|
1045
|
+
[[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t ForEachInLayout(
|
|
1046
|
+
void* d_temp_storage,
|
|
1047
|
+
size_t& temp_storage_bytes,
|
|
1048
|
+
const LayoutMapping& layout_mapping,
|
|
1049
|
+
OpType op,
|
|
1050
|
+
cudaStream_t stream = {})
|
|
1051
|
+
{
|
|
1052
|
+
if (d_temp_storage == nullptr)
|
|
1053
|
+
{
|
|
1054
|
+
temp_storage_bytes = 1;
|
|
1055
|
+
return cudaSuccess;
|
|
1056
|
+
}
|
|
1057
|
+
return ForEachInLayout(layout_mapping, op, stream);
|
|
1058
|
+
}
|
|
1059
|
+
|
|
1060
|
+
#endif // !_CCCL_DOXYGEN_INVOKED
|
|
983
1061
|
};
|
|
984
1062
|
|
|
985
1063
|
CUB_NAMESPACE_END
|
|
@@ -52,15 +52,15 @@
|
|
|
52
52
|
#include <cub/thread/thread_operators.cuh>
|
|
53
53
|
#include <cub/util_type.cuh>
|
|
54
54
|
|
|
55
|
-
#include <thrust/iterator/tabulate_output_iterator.h>
|
|
56
|
-
|
|
57
55
|
#include <cuda/__execution/determinism.h>
|
|
58
56
|
#include <cuda/__execution/require.h>
|
|
59
57
|
#include <cuda/__execution/tune.h>
|
|
60
58
|
#include <cuda/__functional/maximum.h>
|
|
61
59
|
#include <cuda/__functional/minimum.h>
|
|
60
|
+
#include <cuda/__iterator/tabulate_output_iterator.h>
|
|
62
61
|
#include <cuda/__memory_resource/get_memory_resource.h>
|
|
63
62
|
#include <cuda/__stream/get_stream.h>
|
|
63
|
+
#include <cuda/__stream/stream_ref.h>
|
|
64
64
|
#include <cuda/std/__execution/env.h>
|
|
65
65
|
#include <cuda/std/__functional/identity.h>
|
|
66
66
|
#include <cuda/std/__functional/invoke.h>
|
|
@@ -70,7 +70,6 @@
|
|
|
70
70
|
#include <cuda/std/__type_traits/is_same.h>
|
|
71
71
|
#include <cuda/std/cstdint>
|
|
72
72
|
#include <cuda/std/limits>
|
|
73
|
-
#include <cuda/stream_ref>
|
|
74
73
|
|
|
75
74
|
CUB_NAMESPACE_BEGIN
|
|
76
75
|
|
|
@@ -1215,7 +1214,7 @@ public:
|
|
|
1215
1214
|
OutputExtremumT initial_value{::cuda::std::numeric_limits<InputValueT>::max()};
|
|
1216
1215
|
|
|
1217
1216
|
// Tabulate output iterator that unzips the result and writes it to the user-provided output iterators
|
|
1218
|
-
auto out_it =
|
|
1217
|
+
auto out_it = ::cuda::make_tabulate_output_iterator(
|
|
1219
1218
|
detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_min_out, d_index_out});
|
|
1220
1219
|
|
|
1221
1220
|
return detail::reduce::dispatch_streaming_arg_reduce_t<
|
|
@@ -1341,7 +1340,7 @@ public:
|
|
|
1341
1340
|
OutputExtremumT initial_value{::cuda::std::numeric_limits<InputValueT>::max()};
|
|
1342
1341
|
|
|
1343
1342
|
// Tabulate output iterator that unzips the result and writes it to the user-provided output iterators
|
|
1344
|
-
auto out_it =
|
|
1343
|
+
auto out_it = ::cuda::make_tabulate_output_iterator(
|
|
1345
1344
|
detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_min_out, d_index_out});
|
|
1346
1345
|
|
|
1347
1346
|
// Query the required temporary storage size
|
|
@@ -1883,7 +1882,7 @@ public:
|
|
|
1883
1882
|
OutputExtremumT initial_value{::cuda::std::numeric_limits<InputValueT>::lowest()};
|
|
1884
1883
|
|
|
1885
1884
|
// Tabulate output iterator that unzips the result and writes it to the user-provided output iterators
|
|
1886
|
-
auto out_it =
|
|
1885
|
+
auto out_it = ::cuda::make_tabulate_output_iterator(
|
|
1887
1886
|
detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_max_out, d_index_out});
|
|
1888
1887
|
|
|
1889
1888
|
return detail::reduce::dispatch_streaming_arg_reduce_t<
|
|
@@ -2133,7 +2132,7 @@ public:
|
|
|
2133
2132
|
OutputExtremumT initial_value{::cuda::std::numeric_limits<InputValueT>::max()};
|
|
2134
2133
|
|
|
2135
2134
|
// Tabulate output iterator that unzips the result and writes it to the user-provided output iterators
|
|
2136
|
-
auto out_it =
|
|
2135
|
+
auto out_it = ::cuda::make_tabulate_output_iterator(
|
|
2137
2136
|
detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_max_out, d_index_out});
|
|
2138
2137
|
|
|
2139
2138
|
// Query the required temporary storage size
|
|
@@ -156,14 +156,14 @@ struct DeviceSegmentedReduce
|
|
|
156
156
|
//! @rst
|
|
157
157
|
//! Random-access input iterator to the sequence of beginning offsets of
|
|
158
158
|
//! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
|
|
159
|
-
//! element of the *i*\ :sup:`th` data segment in ``
|
|
159
|
+
//! element of the *i*\ :sup:`th` data segment in ``d_in``
|
|
160
160
|
//! @endrst
|
|
161
161
|
//!
|
|
162
162
|
//! @param[in] d_end_offsets
|
|
163
163
|
//! @rst
|
|
164
164
|
//! Random-access input iterator to the sequence of ending offsets of length
|
|
165
165
|
//! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
|
|
166
|
-
//! the *i*\ :sup:`th` data segment in ``
|
|
166
|
+
//! the *i*\ :sup:`th` data segment in ``d_in``.
|
|
167
167
|
//! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` segment is considered empty.
|
|
168
168
|
//! @endrst
|
|
169
169
|
//!
|
|
@@ -372,15 +372,14 @@ struct DeviceSegmentedReduce
|
|
|
372
372
|
//! @rst
|
|
373
373
|
//! Random-access input iterator to the sequence of beginning offsets of
|
|
374
374
|
//! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
|
|
375
|
-
//! element of the *i*\ :sup:`th` data segment in ``
|
|
376
|
-
//! ``d_values_*``
|
|
375
|
+
//! element of the *i*\ :sup:`th` data segment in ``d_in``
|
|
377
376
|
//! @endrst
|
|
378
377
|
//!
|
|
379
378
|
//! @param[in] d_end_offsets
|
|
380
379
|
//! @rst
|
|
381
380
|
//! Random-access input iterator to the sequence of ending offsets of length
|
|
382
381
|
//! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
|
|
383
|
-
//! the *i*\ :sup:`th` data segment in ``
|
|
382
|
+
//! the *i*\ :sup:`th` data segment in ``d_in``.
|
|
384
383
|
//! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` segment is considered empty.
|
|
385
384
|
//! @endrst
|
|
386
385
|
//!
|
|
@@ -578,14 +577,14 @@ struct DeviceSegmentedReduce
|
|
|
578
577
|
//! @rst
|
|
579
578
|
//! Random-access input iterator to the sequence of beginning offsets of
|
|
580
579
|
//! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
|
|
581
|
-
//! element of the *i*\ :sup:`th` data segment in ``
|
|
580
|
+
//! element of the *i*\ :sup:`th` data segment in ``d_in``
|
|
582
581
|
//! @endrst
|
|
583
582
|
//!
|
|
584
583
|
//! @param[in] d_end_offsets
|
|
585
584
|
//! @rst
|
|
586
585
|
//! Random-access input iterator to the sequence of ending offsets of length
|
|
587
586
|
//! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
|
|
588
|
-
//! the *i*\ :sup:`th` data segment in ``
|
|
587
|
+
//! the *i*\ :sup:`th` data segment in ``d_in``.
|
|
589
588
|
//! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` segment is considered empty.
|
|
590
589
|
//! @endrst
|
|
591
590
|
//!
|
|
@@ -792,14 +791,14 @@ struct DeviceSegmentedReduce
|
|
|
792
791
|
//! @rst
|
|
793
792
|
//! Random-access input iterator to the sequence of beginning offsets of
|
|
794
793
|
//! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
|
|
795
|
-
//! element of the *i*\ :sup:`th` data segment in ``
|
|
794
|
+
//! element of the *i*\ :sup:`th` data segment in ``d_in``
|
|
796
795
|
//! @endrst
|
|
797
796
|
//!
|
|
798
797
|
//! @param[in] d_end_offsets
|
|
799
798
|
//! @rst
|
|
800
799
|
//! Random-access input iterator to the sequence of ending offsets of length
|
|
801
800
|
//! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
|
|
802
|
-
//! the *i*\ :sup:`th` data segment in ``
|
|
801
|
+
//! the *i*\ :sup:`th` data segment in ``d_in``.
|
|
803
802
|
//! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` segment is considered empty.
|
|
804
803
|
//! @endrst
|
|
805
804
|
//!
|
|
@@ -1037,14 +1036,14 @@ struct DeviceSegmentedReduce
|
|
|
1037
1036
|
//! @rst
|
|
1038
1037
|
//! Random-access input iterator to the sequence of beginning offsets of
|
|
1039
1038
|
//! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
|
|
1040
|
-
//! element of the *i*\ :sup:`th` data segment in ``
|
|
1039
|
+
//! element of the *i*\ :sup:`th` data segment in ``d_in``
|
|
1041
1040
|
//! @endrst
|
|
1042
1041
|
//!
|
|
1043
1042
|
//! @param[in] d_end_offsets
|
|
1044
1043
|
//! @rst
|
|
1045
1044
|
//! Random-access input iterator to the sequence of ending offsets of length
|
|
1046
1045
|
//! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
|
|
1047
|
-
//! the *i*\ :sup:`th` data segment in ``
|
|
1046
|
+
//! the *i*\ :sup:`th` data segment in ``d_in``.
|
|
1048
1047
|
//! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` segment is considered empty.
|
|
1049
1048
|
//! @endrst
|
|
1050
1049
|
//!
|
|
@@ -1249,14 +1248,14 @@ struct DeviceSegmentedReduce
|
|
|
1249
1248
|
//! @rst
|
|
1250
1249
|
//! Random-access input iterator to the sequence of beginning offsets of
|
|
1251
1250
|
//! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
|
|
1252
|
-
//! element of the *i*\ :sup:`th` data segment in ``
|
|
1251
|
+
//! element of the *i*\ :sup:`th` data segment in ``d_in``
|
|
1253
1252
|
//! @endrst
|
|
1254
1253
|
//!
|
|
1255
1254
|
//! @param[in] d_end_offsets
|
|
1256
1255
|
//! @rst
|
|
1257
1256
|
//! Random-access input iterator to the sequence of ending offsets of length
|
|
1258
1257
|
//! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
|
|
1259
|
-
//! the *i*\ :sup:`th` data segment in ``
|
|
1258
|
+
//! the *i*\ :sup:`th` data segment in ``d_in``.
|
|
1260
1259
|
//! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` segment is considered empty.
|
|
1261
1260
|
//! @endrst
|
|
1262
1261
|
//!
|