cuda-cccl 0.3.1__cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.2__cp312-cp312-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
- cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
- cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
- cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
- cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1 -0
- cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
- cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
- cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
- cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
- cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +12 -13
- cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +2 -3
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +4 -3
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
- cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
- cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
- cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
- cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
- cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
- cuda/cccl/headers/include/cuda/__cccl_config +1 -0
- cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
- cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
- cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
- cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +239 -317
- cuda/cccl/headers/include/cuda/__device/attributes.h +4 -3
- cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
- cuda/cccl/headers/include/cuda/__device/device_ref.h +0 -10
- cuda/cccl/headers/include/cuda/__device/physical_device.h +1 -26
- cuda/cccl/headers/include/cuda/__event/event.h +26 -26
- cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
- cuda/cccl/headers/include/cuda/__event/timed_event.h +9 -7
- cuda/cccl/headers/include/cuda/__fwd/devices.h +4 -4
- cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
- cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
- cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +2 -12
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +15 -19
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +59 -60
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
- cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
- cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
- cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
- cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
- cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
- cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
- cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +5 -4
- cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +17 -16
- cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
- cuda/cccl/headers/include/cuda/cmath +1 -0
- cuda/cccl/headers/include/cuda/devices +3 -0
- cuda/cccl/headers/include/cuda/memory +1 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
- cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
- cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
- cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
- cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
- cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
- cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
- cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
- cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
- cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
- cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
- cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
- cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
- cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
- cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
- cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
- cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
- cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
- cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
- cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
- cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
- cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
- cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
- cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
- cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
- cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
- cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
- cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
- cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
- cuda/cccl/headers/include/cuda/std/bitset +1 -1
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
- cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
- cuda/cccl/headers/include/cuda/std/numbers +5 -0
- cuda/cccl/headers/include/cuda/std/string_view +146 -11
- cuda/cccl/headers/include/cuda/stream_ref +5 -0
- cuda/cccl/headers/include/cuda/utility +1 -0
- cuda/cccl/headers/include/nv/target +7 -2
- cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
- cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
- cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
- cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
- cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
- cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
- cuda/cccl/headers/include/thrust/device_delete.h +18 -3
- cuda/cccl/headers/include/thrust/device_free.h +16 -3
- cuda/cccl/headers/include/thrust/device_new.h +29 -8
- cuda/cccl/headers/include/thrust/host_vector.h +1 -1
- cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
- cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
- cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
- cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
- cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
- cuda/compute/__init__.py +2 -0
- cuda/compute/_bindings.pyi +43 -1
- cuda/compute/_bindings_impl.pyx +156 -7
- cuda/compute/algorithms/_scan.py +108 -36
- cuda/compute/algorithms/_transform.py +32 -11
- cuda/compute/cu12/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
- cuda/compute/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/cu13/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
- cuda/compute/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/iterators/__init__.py +2 -0
- cuda/compute/iterators/_factories.py +28 -0
- cuda/compute/iterators/_iterators.py +206 -1
- cuda/compute/numba_utils.py +2 -2
- cuda/compute/typing.py +2 -0
- {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
- {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +171 -175
- cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
- cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
- cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
- cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
- cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
- cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
- {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -18,6 +18,8 @@
|
|
|
18
18
|
#include <cub/util_namespace.cuh>
|
|
19
19
|
|
|
20
20
|
#include <cuda/__functional/address_stability.h>
|
|
21
|
+
#include <cuda/__stream/get_stream.h>
|
|
22
|
+
#include <cuda/std/__execution/env.h>
|
|
21
23
|
#include <cuda/std/tuple>
|
|
22
24
|
|
|
23
25
|
CUB_NAMESPACE_BEGIN
|
|
@@ -49,13 +51,20 @@ CUB_NAMESPACE_BEGIN
|
|
|
49
51
|
struct DeviceTransform
|
|
50
52
|
{
|
|
51
53
|
private:
|
|
52
|
-
template <typename... RandomAccessIteratorsIn,
|
|
54
|
+
template <typename... RandomAccessIteratorsIn,
|
|
55
|
+
typename RandomAccessIteratorOut,
|
|
56
|
+
typename NumItemsT,
|
|
57
|
+
typename Predicate,
|
|
58
|
+
typename TransformOp,
|
|
59
|
+
typename StableAddress = cuda::std::false_type>
|
|
53
60
|
CUB_RUNTIME_FUNCTION static cudaError_t TransformInternal(
|
|
54
61
|
::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
|
|
55
62
|
RandomAccessIteratorOut output,
|
|
56
63
|
NumItemsT num_items,
|
|
64
|
+
Predicate predicate,
|
|
57
65
|
TransformOp transform_op,
|
|
58
|
-
cudaStream_t stream
|
|
66
|
+
cudaStream_t stream,
|
|
67
|
+
StableAddress = {})
|
|
59
68
|
{
|
|
60
69
|
using choose_offset_t = detail::choose_signed_offset<NumItemsT>;
|
|
61
70
|
using offset_t = typename choose_offset_t::type;
|
|
@@ -66,18 +75,28 @@ private:
|
|
|
66
75
|
return error;
|
|
67
76
|
}
|
|
68
77
|
|
|
69
|
-
return detail::transform::dispatch_t<
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
78
|
+
return detail::transform::dispatch_t < StableAddress::value
|
|
79
|
+
? detail::transform::requires_stable_address::yes
|
|
80
|
+
: detail::transform::requires_stable_address::no,
|
|
81
|
+
offset_t, ::cuda::std::tuple<RandomAccessIteratorsIn...>, RandomAccessIteratorOut, Predicate,
|
|
82
|
+
TransformOp > ::dispatch(
|
|
83
|
+
::cuda::std::move(inputs),
|
|
84
|
+
::cuda::std::move(output),
|
|
85
|
+
num_items,
|
|
86
|
+
::cuda::std::move(predicate),
|
|
87
|
+
::cuda::std::move(transform_op),
|
|
88
|
+
stream);
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
template <typename Env>
|
|
92
|
+
CUB_RUNTIME_FUNCTION static auto get_stream(Env env) -> cudaStream_t
|
|
93
|
+
{
|
|
94
|
+
return ::cuda::std::execution::__query_or(env, ::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}}).get();
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
CUB_RUNTIME_FUNCTION static auto get_stream(cudaStream_t stream) -> cudaStream_t
|
|
98
|
+
{
|
|
99
|
+
return stream;
|
|
81
100
|
}
|
|
82
101
|
|
|
83
102
|
public:
|
|
@@ -108,18 +127,28 @@ public:
|
|
|
108
127
|
//! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value
|
|
109
128
|
//! types must be convertible to the parameters of the function object's call operator. The return type of the call
|
|
110
129
|
//! operator must be assignable to the dereferenced output iterator.
|
|
111
|
-
//! @param
|
|
112
|
-
|
|
130
|
+
//! @param env Execution environment, or cudaStream_t. Default is ``cuda::std::execution::env{}``, which will run on
|
|
131
|
+
//! stream\ :sub:`0`
|
|
132
|
+
template <typename... RandomAccessIteratorsIn,
|
|
133
|
+
typename RandomAccessIteratorOut,
|
|
134
|
+
typename NumItemsT,
|
|
135
|
+
typename TransformOp,
|
|
136
|
+
typename Env = ::cuda::std::execution::env<>>
|
|
113
137
|
CUB_RUNTIME_FUNCTION static cudaError_t Transform(
|
|
114
138
|
::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
|
|
115
139
|
RandomAccessIteratorOut output,
|
|
116
140
|
NumItemsT num_items,
|
|
117
141
|
TransformOp transform_op,
|
|
118
|
-
|
|
142
|
+
Env env = {})
|
|
119
143
|
{
|
|
120
144
|
_CCCL_NVTX_RANGE_SCOPE("cub::DeviceTransform::Transform");
|
|
121
145
|
return TransformInternal(
|
|
122
|
-
::cuda::std::move(inputs),
|
|
146
|
+
::cuda::std::move(inputs),
|
|
147
|
+
::cuda::std::move(output),
|
|
148
|
+
num_items,
|
|
149
|
+
detail::transform::always_true_predicate{},
|
|
150
|
+
::cuda::std::move(transform_op),
|
|
151
|
+
get_stream(env));
|
|
123
152
|
}
|
|
124
153
|
|
|
125
154
|
#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
|
|
@@ -160,21 +189,26 @@ public:
|
|
|
160
189
|
//! @param transform_op A unary function object. The input iterator's value type must be convertible to the parameter
|
|
161
190
|
//! of the function object's call operator. The return type of the call operator must be assignable to the
|
|
162
191
|
//! dereferenced output iterator.
|
|
163
|
-
//! @param
|
|
164
|
-
|
|
192
|
+
//! @param env Execution environment, or cudaStream_t. Default is ``cuda::std::execution::env{}``, which will run on
|
|
193
|
+
//! stream\ :sub:`0`
|
|
194
|
+
template <typename RandomAccessIteratorIn,
|
|
195
|
+
typename RandomAccessIteratorOut,
|
|
196
|
+
typename NumItemsT,
|
|
197
|
+
typename TransformOp,
|
|
198
|
+
typename Env = ::cuda::std::execution::env<>>
|
|
165
199
|
CUB_RUNTIME_FUNCTION static cudaError_t Transform(
|
|
166
200
|
RandomAccessIteratorIn input,
|
|
167
201
|
RandomAccessIteratorOut output,
|
|
168
202
|
NumItemsT num_items,
|
|
169
203
|
TransformOp transform_op,
|
|
170
|
-
|
|
204
|
+
Env env = {})
|
|
171
205
|
{
|
|
172
206
|
return Transform(
|
|
173
207
|
::cuda::std::make_tuple(::cuda::std::move(input)),
|
|
174
208
|
::cuda::std::move(output),
|
|
175
209
|
num_items,
|
|
176
210
|
::cuda::std::move(transform_op),
|
|
177
|
-
|
|
211
|
+
::cuda::std::move(env));
|
|
178
212
|
}
|
|
179
213
|
|
|
180
214
|
#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
|
|
@@ -215,10 +249,14 @@ public:
|
|
|
215
249
|
//! @param num_items The number of elements to write to the output sequence.
|
|
216
250
|
//! @param generator A nullary function object. The return type of the call operator must be assignable to the
|
|
217
251
|
//! dereferenced output iterator.
|
|
218
|
-
//! @param
|
|
219
|
-
|
|
252
|
+
//! @param env Execution environment, or cudaStream_t. Default is ``cuda::std::execution::env{}``, which will run on
|
|
253
|
+
//! stream\ :sub:`0`
|
|
254
|
+
template <typename RandomAccessIteratorOut,
|
|
255
|
+
typename NumItemsT,
|
|
256
|
+
typename Generator,
|
|
257
|
+
typename Env = ::cuda::std::execution::env<>>
|
|
220
258
|
CUB_RUNTIME_FUNCTION static cudaError_t
|
|
221
|
-
Generate(RandomAccessIteratorOut output, NumItemsT num_items, Generator generator,
|
|
259
|
+
Generate(RandomAccessIteratorOut output, NumItemsT num_items, Generator generator, Env env = {})
|
|
222
260
|
{
|
|
223
261
|
static_assert(::cuda::std::is_invocable_v<Generator>, "The passed generator must be a nullary function object");
|
|
224
262
|
static_assert(
|
|
@@ -228,7 +266,12 @@ public:
|
|
|
228
266
|
|
|
229
267
|
_CCCL_NVTX_RANGE_SCOPE("cub::DeviceTransform::Generate");
|
|
230
268
|
return TransformInternal(
|
|
231
|
-
::cuda::std::make_tuple(),
|
|
269
|
+
::cuda::std::make_tuple(),
|
|
270
|
+
::cuda::std::move(output),
|
|
271
|
+
num_items,
|
|
272
|
+
detail::transform::always_true_predicate{},
|
|
273
|
+
::cuda::std::move(generator),
|
|
274
|
+
get_stream(env));
|
|
232
275
|
}
|
|
233
276
|
|
|
234
277
|
#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
|
|
@@ -262,10 +305,14 @@ public:
|
|
|
262
305
|
//! @param output An iterator to the output sequence where num_items results are written to.
|
|
263
306
|
//! @param num_items The number of elements to write to the output sequence.
|
|
264
307
|
//! @param value The value to write. Must be assignable to the dereferenced output iterator.
|
|
265
|
-
//! @param
|
|
266
|
-
|
|
308
|
+
//! @param env Execution environment, or cudaStream_t. Default is ``cuda::std::execution::env{}``, which will run on
|
|
309
|
+
//! stream\ :sub:`0`
|
|
310
|
+
template <typename RandomAccessIteratorOut,
|
|
311
|
+
typename NumItemsT,
|
|
312
|
+
typename Value,
|
|
313
|
+
typename Env = ::cuda::std::execution::env<>>
|
|
267
314
|
CUB_RUNTIME_FUNCTION static cudaError_t
|
|
268
|
-
Fill(RandomAccessIteratorOut output, NumItemsT num_items, Value value,
|
|
315
|
+
Fill(RandomAccessIteratorOut output, NumItemsT num_items, Value value, Env env = {})
|
|
269
316
|
{
|
|
270
317
|
static_assert(::cuda::std::is_assignable_v<detail::it_reference_t<RandomAccessIteratorOut>, Value>,
|
|
271
318
|
"The passed value must be assignable to the dereferenced output iterator");
|
|
@@ -275,8 +322,9 @@ public:
|
|
|
275
322
|
::cuda::std::make_tuple(),
|
|
276
323
|
::cuda::std::move(output),
|
|
277
324
|
num_items,
|
|
325
|
+
detail::transform::always_true_predicate{},
|
|
278
326
|
detail::__return_constant<Value>{::cuda::std::move(value)},
|
|
279
|
-
|
|
327
|
+
get_stream(env));
|
|
280
328
|
}
|
|
281
329
|
|
|
282
330
|
#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
|
|
@@ -296,8 +344,7 @@ public:
|
|
|
296
344
|
return cudaSuccess;
|
|
297
345
|
}
|
|
298
346
|
|
|
299
|
-
return
|
|
300
|
-
::cuda::std::move(output), num_items, detail::__return_constant<Value>{::cuda::std::move(value)}, stream);
|
|
347
|
+
return Fill(::cuda::std::move(output), num_items, ::cuda::std::move(value), stream);
|
|
301
348
|
}
|
|
302
349
|
#endif // _CCCL_DOXYGEN_INVOKED
|
|
303
350
|
|
|
@@ -333,43 +380,30 @@ public:
|
|
|
333
380
|
//! types must be convertible to the parameters of the function object's call operator. The return type of the call
|
|
334
381
|
//! operator must be assignable to the dereferenced output iterator. Will only be invoked if \p predicate returns
|
|
335
382
|
//! true.
|
|
336
|
-
//! @param
|
|
383
|
+
//! @param env Execution environment, or cudaStream_t. Default is ``cuda::std::execution::env{}``, which will run on
|
|
384
|
+
//! stream\ :sub:`0`
|
|
337
385
|
template <typename... RandomAccessIteratorsIn,
|
|
338
386
|
typename RandomAccessIteratorOut,
|
|
339
387
|
typename NumItemsT,
|
|
340
388
|
typename Predicate,
|
|
341
|
-
typename TransformOp
|
|
389
|
+
typename TransformOp,
|
|
390
|
+
typename Env = ::cuda::std::execution::env<>>
|
|
342
391
|
CUB_RUNTIME_FUNCTION static cudaError_t TransformIf(
|
|
343
392
|
::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
|
|
344
393
|
RandomAccessIteratorOut output,
|
|
345
394
|
NumItemsT num_items,
|
|
346
395
|
Predicate predicate,
|
|
347
396
|
TransformOp transform_op,
|
|
348
|
-
|
|
397
|
+
Env env = {})
|
|
349
398
|
{
|
|
350
399
|
_CCCL_NVTX_RANGE_SCOPE("cub::DeviceTransform::TransformIf");
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
return error;
|
|
359
|
-
}
|
|
360
|
-
|
|
361
|
-
return detail::transform::dispatch_t<
|
|
362
|
-
detail::transform::requires_stable_address::no,
|
|
363
|
-
offset_t,
|
|
364
|
-
::cuda::std::tuple<RandomAccessIteratorsIn...>,
|
|
365
|
-
RandomAccessIteratorOut,
|
|
366
|
-
Predicate,
|
|
367
|
-
TransformOp>::dispatch(::cuda::std::move(inputs),
|
|
368
|
-
::cuda::std::move(output),
|
|
369
|
-
num_items,
|
|
370
|
-
::cuda::std::move(predicate),
|
|
371
|
-
::cuda::std::move(transform_op),
|
|
372
|
-
stream);
|
|
400
|
+
return TransformInternal(
|
|
401
|
+
::cuda::std::move(inputs),
|
|
402
|
+
::cuda::std::move(output),
|
|
403
|
+
num_items,
|
|
404
|
+
::cuda::std::move(predicate),
|
|
405
|
+
::cuda::std::move(transform_op),
|
|
406
|
+
get_stream(env));
|
|
373
407
|
}
|
|
374
408
|
|
|
375
409
|
#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
|
|
@@ -435,19 +469,21 @@ public:
|
|
|
435
469
|
//! @param transform_op A unary function object. The input iterator's value type must be convertible to the
|
|
436
470
|
//! parameter of the function object's call operator. The return type of the call operator must be assignable to the
|
|
437
471
|
//! dereferenced output iterator. Will only be invoked if \p predicate returns true.
|
|
438
|
-
//! @param
|
|
472
|
+
//! @param env Execution environment, or cudaStream_t. Default is ``cuda::std::execution::env{}``, which will run on
|
|
473
|
+
//! stream\ :sub:`0`
|
|
439
474
|
template <typename RandomAccessIteratorIn,
|
|
440
475
|
typename RandomAccessIteratorOut,
|
|
441
476
|
typename NumItemsT,
|
|
442
477
|
typename Predicate,
|
|
443
|
-
typename TransformOp
|
|
478
|
+
typename TransformOp,
|
|
479
|
+
typename Env = ::cuda::std::execution::env<>>
|
|
444
480
|
CUB_RUNTIME_FUNCTION static cudaError_t TransformIf(
|
|
445
481
|
RandomAccessIteratorIn input,
|
|
446
482
|
RandomAccessIteratorOut output,
|
|
447
483
|
NumItemsT num_items,
|
|
448
484
|
Predicate predicate,
|
|
449
485
|
TransformOp transform_op,
|
|
450
|
-
|
|
486
|
+
Env env = {})
|
|
451
487
|
{
|
|
452
488
|
return TransformIf(
|
|
453
489
|
::cuda::std::make_tuple(::cuda::std::move(input)),
|
|
@@ -455,7 +491,7 @@ public:
|
|
|
455
491
|
num_items,
|
|
456
492
|
::cuda::std::move(predicate),
|
|
457
493
|
::cuda::std::move(transform_op),
|
|
458
|
-
|
|
494
|
+
get_stream(env));
|
|
459
495
|
}
|
|
460
496
|
|
|
461
497
|
#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
|
|
@@ -518,39 +554,29 @@ public:
|
|
|
518
554
|
//! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value
|
|
519
555
|
//! types must be convertible to the parameters of the function object's call operator. The return type of the call
|
|
520
556
|
//! operator must be assignable to the dereferenced output iterator.
|
|
521
|
-
//! @param
|
|
522
|
-
|
|
557
|
+
//! @param env Execution environment, or cudaStream_t. Default is ``cuda::std::execution::env{}``, which will run on
|
|
558
|
+
//! stream\ :sub:`0`
|
|
559
|
+
template <typename... RandomAccessIteratorsIn,
|
|
560
|
+
typename RandomAccessIteratorOut,
|
|
561
|
+
typename NumItemsT,
|
|
562
|
+
typename TransformOp,
|
|
563
|
+
typename Env = ::cuda::std::execution::env<>>
|
|
523
564
|
CUB_RUNTIME_FUNCTION static cudaError_t TransformStableArgumentAddresses(
|
|
524
565
|
::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
|
|
525
566
|
RandomAccessIteratorOut output,
|
|
526
567
|
NumItemsT num_items,
|
|
527
568
|
TransformOp transform_op,
|
|
528
|
-
|
|
569
|
+
Env env = {})
|
|
529
570
|
{
|
|
530
571
|
_CCCL_NVTX_RANGE_SCOPE("cub::DeviceTransform::TransformStableArgumentAddresses");
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
return error;
|
|
540
|
-
}
|
|
541
|
-
|
|
542
|
-
return detail::transform::dispatch_t<
|
|
543
|
-
detail::transform::requires_stable_address::yes,
|
|
544
|
-
offset_t,
|
|
545
|
-
::cuda::std::tuple<RandomAccessIteratorsIn...>,
|
|
546
|
-
RandomAccessIteratorOut,
|
|
547
|
-
detail::transform::always_true_predicate,
|
|
548
|
-
TransformOp>::dispatch(::cuda::std::move(inputs),
|
|
549
|
-
::cuda::std::move(output),
|
|
550
|
-
num_items,
|
|
551
|
-
detail::transform::always_true_predicate{},
|
|
552
|
-
::cuda::std::move(transform_op),
|
|
553
|
-
stream);
|
|
572
|
+
return TransformInternal(
|
|
573
|
+
::cuda::std::move(inputs),
|
|
574
|
+
::cuda::std::move(output),
|
|
575
|
+
num_items,
|
|
576
|
+
detail::transform::always_true_predicate{},
|
|
577
|
+
::cuda::std::move(transform_op),
|
|
578
|
+
get_stream(env),
|
|
579
|
+
::cuda::std::true_type{});
|
|
554
580
|
}
|
|
555
581
|
|
|
556
582
|
#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
|
|
@@ -590,21 +616,26 @@ public:
|
|
|
590
616
|
//! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value
|
|
591
617
|
//! types must be convertible to the parameters of the function object's call operator. The return type of the call
|
|
592
618
|
//! operator must be assignable to the dereferenced output iterator.
|
|
593
|
-
//! @param
|
|
594
|
-
|
|
619
|
+
//! @param env Execution environment, or cudaStream_t. Default is ``cuda::std::execution::env{}``, which will run on
|
|
620
|
+
//! stream\ :sub:`0`
|
|
621
|
+
template <typename RandomAccessIteratorIn,
|
|
622
|
+
typename RandomAccessIteratorOut,
|
|
623
|
+
typename NumItemsT,
|
|
624
|
+
typename TransformOp,
|
|
625
|
+
typename Env = ::cuda::std::execution::env<>>
|
|
595
626
|
CUB_RUNTIME_FUNCTION static cudaError_t TransformStableArgumentAddresses(
|
|
596
627
|
RandomAccessIteratorIn input,
|
|
597
628
|
RandomAccessIteratorOut output,
|
|
598
629
|
NumItemsT num_items,
|
|
599
630
|
TransformOp transform_op,
|
|
600
|
-
|
|
631
|
+
Env env = {})
|
|
601
632
|
{
|
|
602
633
|
return TransformStableArgumentAddresses(
|
|
603
634
|
::cuda::std::make_tuple(::cuda::std::move(input)),
|
|
604
635
|
::cuda::std::move(output),
|
|
605
636
|
num_items,
|
|
606
637
|
::cuda::std::move(transform_op),
|
|
607
|
-
|
|
638
|
+
get_stream(env));
|
|
608
639
|
}
|
|
609
640
|
|
|
610
641
|
#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
|
|
@@ -122,9 +122,8 @@ __launch_bounds__(
|
|
|
122
122
|
{
|
|
123
123
|
// the merge agent loads keys into a local array of KeyIt1::value_type, on which the comparisons are performed
|
|
124
124
|
using key_t = it_value_t<KeyIt1>;
|
|
125
|
-
static_assert(::cuda::std::
|
|
126
|
-
|
|
127
|
-
static_assert(::cuda::std::is_convertible_v<typename ::cuda::std::__invoke_of<CompareOp, key_t, key_t>::type, bool>,
|
|
125
|
+
static_assert(::cuda::std::is_invocable_v<CompareOp, key_t, key_t>, "Comparison operator cannot compare two keys");
|
|
126
|
+
static_assert(::cuda::std::is_convertible_v<::cuda::std::invoke_result_t<CompareOp, key_t, key_t>, bool>,
|
|
128
127
|
"Comparison operator must be convertible to bool");
|
|
129
128
|
|
|
130
129
|
using MergeAgent = typename choose_merge_agent<
|
|
@@ -790,7 +790,7 @@ struct DispatchSegmentedReduce
|
|
|
790
790
|
* Function type of cub::DeviceSegmentedReduceKernel
|
|
791
791
|
*
|
|
792
792
|
* @param[in] segmented_reduce_kernel
|
|
793
|
-
* Kernel function pointer to
|
|
793
|
+
* Kernel function pointer to instantiation of
|
|
794
794
|
* cub::DeviceSegmentedReduceKernel
|
|
795
795
|
*/
|
|
796
796
|
template <typename ActivePolicyT, typename DeviceSegmentedReduceKernelT>
|
|
@@ -809,7 +809,8 @@ struct DispatchSegmentedReduce
|
|
|
809
809
|
return cudaSuccess;
|
|
810
810
|
}
|
|
811
811
|
|
|
812
|
-
// Init kernel configuration
|
|
812
|
+
// Init kernel configuration (computes kernel occupancy)
|
|
813
|
+
// maybe only used inside CUB_DEBUG_LOG code sections
|
|
813
814
|
[[maybe_unused]] detail::KernelConfig segmented_reduce_config;
|
|
814
815
|
error =
|
|
815
816
|
CubDebug(segmented_reduce_config.Init(segmented_reduce_kernel, policy.SegmentedReduce(), launcher_factory));
|
|
@@ -839,7 +840,7 @@ struct DispatchSegmentedReduce
|
|
|
839
840
|
segmented_reduce_config.sm_occupancy);
|
|
840
841
|
#endif // CUB_DEBUG_LOG
|
|
841
842
|
|
|
842
|
-
// Invoke
|
|
843
|
+
// Invoke DeviceSegmentedReduceKernel
|
|
843
844
|
launcher_factory(
|
|
844
845
|
static_cast<::cuda::std::uint32_t>(num_current_segments), policy.SegmentedReduce().BlockThreads(), 0, stream)
|
|
845
846
|
.doit(segmented_reduce_kernel, d_in, d_out, d_begin_offsets, d_end_offsets, reduction_op, init);
|
|
@@ -77,7 +77,7 @@ namespace rfa
|
|
|
77
77
|
{
|
|
78
78
|
|
|
79
79
|
template <typename Invocable, typename InputT>
|
|
80
|
-
using transformed_input_t = ::cuda::std::decay_t
|
|
80
|
+
using transformed_input_t = ::cuda::std::decay_t<::cuda::std::invoke_result_t<Invocable, InputT>>;
|
|
81
81
|
|
|
82
82
|
template <typename InitT, typename InputIteratorT, typename TransformOpT>
|
|
83
83
|
using accum_t =
|
|
@@ -18,8 +18,8 @@
|
|
|
18
18
|
|
|
19
19
|
#include <thrust/iterator/constant_iterator.h>
|
|
20
20
|
#include <thrust/iterator/iterator_adaptor.h>
|
|
21
|
-
#include <thrust/iterator/tabulate_output_iterator.h>
|
|
22
21
|
|
|
22
|
+
#include <cuda/__iterator/tabulate_output_iterator.h>
|
|
23
23
|
#include <cuda/std/__functional/identity.h>
|
|
24
24
|
#include <cuda/std/__utility/swap.h>
|
|
25
25
|
#include <cuda/std/limits>
|
|
@@ -217,8 +217,7 @@ struct dispatch_streaming_arg_reduce_t
|
|
|
217
217
|
|
|
218
218
|
// The output iterator that implements the logic to accumulate per-partition result to a global aggregate and,
|
|
219
219
|
// eventually, write to the user-provided output iterators
|
|
220
|
-
using accumulating_transform_out_it_t =
|
|
221
|
-
THRUST_NS_QUALIFIER::tabulate_output_iterator<accumulating_transform_output_op_t>;
|
|
220
|
+
using accumulating_transform_out_it_t = ::cuda::tabulate_output_iterator<accumulating_transform_output_op_t>;
|
|
222
221
|
|
|
223
222
|
// Empty problem initialization type
|
|
224
223
|
using empty_problem_init_t = empty_problem_init_t<per_partition_accum_t>;
|
|
@@ -270,7 +269,7 @@ struct dispatch_streaming_arg_reduce_t
|
|
|
270
269
|
nullptr,
|
|
271
270
|
allocation_sizes[0],
|
|
272
271
|
d_indexed_offset_in,
|
|
273
|
-
|
|
272
|
+
::cuda::make_tabulate_output_iterator(accumulating_out_op),
|
|
274
273
|
static_cast<PerPartitionOffsetT>(largest_partition_size),
|
|
275
274
|
reduce_op,
|
|
276
275
|
initial_value,
|
|
@@ -315,7 +314,7 @@ struct dispatch_streaming_arg_reduce_t
|
|
|
315
314
|
d_temp_storage,
|
|
316
315
|
temp_storage_bytes,
|
|
317
316
|
d_indexed_offset_in,
|
|
318
|
-
|
|
317
|
+
::cuda::make_tabulate_output_iterator(accumulating_out_op),
|
|
319
318
|
static_cast<PerPartitionOffsetT>(current_num_items),
|
|
320
319
|
reduce_op,
|
|
321
320
|
initial_value,
|
|
@@ -23,7 +23,6 @@
|
|
|
23
23
|
#include <cub/util_type.cuh>
|
|
24
24
|
|
|
25
25
|
#include <thrust/iterator/offset_iterator.h>
|
|
26
|
-
#include <thrust/iterator/tabulate_output_iterator.h>
|
|
27
26
|
#include <thrust/iterator/transform_iterator.h>
|
|
28
27
|
#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
|
|
29
28
|
|
|
@@ -387,15 +387,13 @@ struct DispatchTopK
|
|
|
387
387
|
return error;
|
|
388
388
|
}
|
|
389
389
|
|
|
390
|
-
_CubLog("Invoking topk_kernel
|
|
390
|
+
_CubLog("Invoking topk_kernel<<<%d, %d, 0, "
|
|
391
391
|
"%lld>>>(), %d items per thread, %d SM occupancy\n",
|
|
392
|
-
topk_grid_size
|
|
393
|
-
topk_grid_size.y,
|
|
394
|
-
topk_grid_size.z,
|
|
392
|
+
topk_grid_size,
|
|
395
393
|
block_threads,
|
|
396
394
|
(long long) stream,
|
|
397
395
|
items_per_thread,
|
|
398
|
-
|
|
396
|
+
main_kernel_blocks_per_sm);
|
|
399
397
|
}
|
|
400
398
|
#endif // CUB_DEBUG_LOG
|
|
401
399
|
|
|
@@ -109,8 +109,9 @@ struct TransformKernelSource<Offset,
|
|
|
109
109
|
return detail::transform::make_aligned_base_ptr_kernel_arg(it, align);
|
|
110
110
|
}
|
|
111
111
|
|
|
112
|
+
private:
|
|
112
113
|
template <typename T>
|
|
113
|
-
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static auto
|
|
114
|
+
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static auto is_pointer_aligned(T it, [[maybe_unused]] int alignment)
|
|
114
115
|
{
|
|
115
116
|
if constexpr (THRUST_NS_QUALIFIER::is_contiguous_iterator_v<decltype(it)>)
|
|
116
117
|
{
|
|
@@ -121,6 +122,14 @@ struct TransformKernelSource<Offset,
|
|
|
121
122
|
return true; // fancy iterators are aligned, since the vectorized kernel chooses a different code path
|
|
122
123
|
}
|
|
123
124
|
}
|
|
125
|
+
|
|
126
|
+
public:
|
|
127
|
+
CUB_RUNTIME_FUNCTION constexpr static bool
|
|
128
|
+
CanVectorize(int vec_size, const RandomAccessIteratorOut& out, const RandomAccessIteratorsIn&... in)
|
|
129
|
+
{
|
|
130
|
+
return is_pointer_aligned(out, sizeof(it_value_t<RandomAccessIteratorOut>) * vec_size)
|
|
131
|
+
&& (is_pointer_aligned(in, sizeof(it_value_t<RandomAccessIteratorsIn>) * vec_size) && ...);
|
|
132
|
+
}
|
|
124
133
|
};
|
|
125
134
|
|
|
126
135
|
enum class requires_stable_address
|
|
@@ -384,7 +393,7 @@ struct dispatch_t<StableAddress,
|
|
|
384
393
|
}
|
|
385
394
|
|
|
386
395
|
CUB_DEFINE_SFINAE_GETTER(items_per_thread_no_input, prefetch, ItemsPerThreadNoInput)
|
|
387
|
-
CUB_DEFINE_SFINAE_GETTER(
|
|
396
|
+
CUB_DEFINE_SFINAE_GETTER(vec_size, vectorized, VecSize)
|
|
388
397
|
CUB_DEFINE_SFINAE_GETTER(items_per_thread_vectorized, vectorized, ItemsPerThreadVectorized)
|
|
389
398
|
|
|
390
399
|
#undef CUB_DEFINE_SFINAE_GETTER
|
|
@@ -441,9 +450,8 @@ struct dispatch_t<StableAddress,
|
|
|
441
450
|
// the policy already handles the compile-time checks if we can vectorize. Do the remaining alignment check here
|
|
442
451
|
if CUB_DETAIL_CONSTEXPR_ISH (Algorithm::vectorized == wrapped_policy.Algorithm())
|
|
443
452
|
{
|
|
444
|
-
const int
|
|
445
|
-
can_vectorize
|
|
446
|
-
&& kernel_source.IsPointerAligned(out, alignment);
|
|
453
|
+
const int vs = vec_size(wrapped_policy.AlgorithmPolicy());
|
|
454
|
+
can_vectorize = kernel_source.CanVectorize(vs, out, ::cuda::std::get<Is>(in)...);
|
|
447
455
|
}
|
|
448
456
|
|
|
449
457
|
int ipt = 0;
|