cuda-cccl 0.1.3.2.0.dev438__cp310-cp310-manylinux_2_24_aarch64.whl → 0.3.1__cp310-cp310-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/cooperative/__init__.py +7 -1
- cuda/cccl/cooperative/experimental/__init__.py +21 -5
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +25 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +24 -19
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
- cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +3 -2
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +18 -26
- cuda/cccl/headers/include/cub/device/device_copy.cuh +116 -27
- cuda/cccl/headers/include/cub/device/device_partition.cuh +5 -1
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +158 -247
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +4 -4
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +8 -26
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -6
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +320 -262
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +10 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +57 -10
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +37 -13
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +204 -55
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +55 -19
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
- cuda/cccl/headers/include/cub/util_device.cuh +51 -35
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
- cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +4 -4
- cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
- cuda/cccl/headers/include/cuda/__device/all_devices.h +47 -147
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +51 -49
- cuda/cccl/headers/include/cuda/__device/attributes.h +177 -127
- cuda/cccl/headers/include/cuda/__device/device_ref.h +32 -51
- cuda/cccl/headers/include/cuda/__device/physical_device.h +120 -91
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +330 -36
- cuda/cccl/headers/include/cuda/__event/event.h +8 -8
- cuda/cccl/headers/include/cuda/__event/event_ref.h +4 -5
- cuda/cccl/headers/include/cuda/__event/timed_event.h +4 -4
- cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
- cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +3 -3
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +3 -3
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +3 -3
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +158 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +8 -120
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +593 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +14 -10
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +4 -3
- cuda/cccl/headers/include/cuda/__runtime/types.h +1 -1
- cuda/cccl/headers/include/cuda/__stream/stream.h +2 -3
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -12
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +2 -2
- cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
- cuda/cccl/headers/include/cuda/algorithm +1 -1
- cuda/cccl/headers/include/cuda/devices +10 -0
- cuda/cccl/headers/include/cuda/iterator +1 -0
- cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
- cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
- cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
- cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +2 -0
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +10 -5
- cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +44 -17
- cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +1 -1
- cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +12 -12
- cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +1 -8
- cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +15 -12
- cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +3 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +2 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +3 -2
- cuda/cccl/headers/include/cuda/std/string_view +12 -5
- cuda/cccl/headers/include/cuda/std/version +1 -4
- cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +8 -22
- cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +15 -48
- cuda/cccl/parallel/experimental/__init__.py +21 -70
- cuda/compute/__init__.py +77 -0
- cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +28 -0
- cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +141 -1
- cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +4 -0
- cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
- cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -4
- cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +4 -6
- cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
- cuda/compute/algorithms/_three_way_partition.py +261 -0
- cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +4 -4
- cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
- cuda/compute/cu12/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/cu13/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +8 -8
- cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
- cuda/coop/__init__.py +8 -0
- cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
- cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
- cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
- cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
- cuda/coop/warp/__init__.py +9 -0
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/METADATA +1 -1
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/RECORD +171 -166
- cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
- cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +0 -72
- cuda/cccl/parallel/experimental/.gitignore +0 -4
- cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/numba_utils.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/typing.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -84,70 +84,6 @@ CUB_NAMESPACE_BEGIN
|
|
|
84
84
|
//! @endrst
|
|
85
85
|
struct DeviceSegmentedReduce
|
|
86
86
|
{
|
|
87
|
-
private:
|
|
88
|
-
template <typename InputIteratorT,
|
|
89
|
-
typename OutputIteratorT,
|
|
90
|
-
typename BeginOffsetIteratorT,
|
|
91
|
-
typename EndOffsetIteratorT,
|
|
92
|
-
typename OffsetT,
|
|
93
|
-
typename ReductionOpT,
|
|
94
|
-
typename InitT,
|
|
95
|
-
typename... Ts>
|
|
96
|
-
CUB_RUNTIME_FUNCTION static cudaError_t segmented_reduce(
|
|
97
|
-
::cuda::std::false_type,
|
|
98
|
-
void* d_temp_storage,
|
|
99
|
-
size_t& temp_storage_bytes,
|
|
100
|
-
InputIteratorT d_in,
|
|
101
|
-
OutputIteratorT d_out,
|
|
102
|
-
::cuda::std::int64_t num_segments,
|
|
103
|
-
BeginOffsetIteratorT d_begin_offsets,
|
|
104
|
-
EndOffsetIteratorT d_end_offsets,
|
|
105
|
-
ReductionOpT reduction_op,
|
|
106
|
-
InitT initial_value,
|
|
107
|
-
cudaStream_t stream);
|
|
108
|
-
|
|
109
|
-
template <typename InputIteratorT,
|
|
110
|
-
typename OutputIteratorT,
|
|
111
|
-
typename BeginOffsetIteratorT,
|
|
112
|
-
typename EndOffsetIteratorT,
|
|
113
|
-
typename OffsetT,
|
|
114
|
-
typename ReductionOpT,
|
|
115
|
-
typename InitT,
|
|
116
|
-
typename... Ts>
|
|
117
|
-
CUB_RUNTIME_FUNCTION static cudaError_t segmented_reduce(
|
|
118
|
-
::cuda::std::true_type,
|
|
119
|
-
void* d_temp_storage,
|
|
120
|
-
size_t& temp_storage_bytes,
|
|
121
|
-
InputIteratorT d_in,
|
|
122
|
-
OutputIteratorT d_out,
|
|
123
|
-
::cuda::std::int64_t num_segments,
|
|
124
|
-
BeginOffsetIteratorT d_begin_offsets,
|
|
125
|
-
EndOffsetIteratorT d_end_offsets,
|
|
126
|
-
ReductionOpT reduction_op,
|
|
127
|
-
InitT initial_value,
|
|
128
|
-
cudaStream_t stream)
|
|
129
|
-
{
|
|
130
|
-
return DispatchSegmentedReduce<
|
|
131
|
-
InputIteratorT,
|
|
132
|
-
OutputIteratorT,
|
|
133
|
-
BeginOffsetIteratorT,
|
|
134
|
-
EndOffsetIteratorT,
|
|
135
|
-
OffsetT,
|
|
136
|
-
ReductionOpT,
|
|
137
|
-
InitT,
|
|
138
|
-
Ts...>::Dispatch(d_temp_storage,
|
|
139
|
-
temp_storage_bytes,
|
|
140
|
-
d_in,
|
|
141
|
-
d_out,
|
|
142
|
-
num_segments,
|
|
143
|
-
d_begin_offsets,
|
|
144
|
-
d_end_offsets,
|
|
145
|
-
reduction_op,
|
|
146
|
-
initial_value,
|
|
147
|
-
stream);
|
|
148
|
-
}
|
|
149
|
-
|
|
150
|
-
public:
|
|
151
87
|
//! @rst
|
|
152
88
|
//! Computes a device-wide segmented reduction using the specified
|
|
153
89
|
//! binary ``reduction_op`` functor.
|
|
@@ -261,24 +197,29 @@ public:
|
|
|
261
197
|
{
|
|
262
198
|
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::Reduce");
|
|
263
199
|
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
200
|
+
using OffsetT = detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
|
|
201
|
+
static_assert(::cuda::std::is_integral_v<OffsetT>, "Offset iterator value type should be integral.");
|
|
202
|
+
if constexpr (::cuda::std::is_integral_v<OffsetT>)
|
|
203
|
+
{
|
|
204
|
+
return DispatchSegmentedReduce<
|
|
205
|
+
InputIteratorT,
|
|
206
|
+
OutputIteratorT,
|
|
207
|
+
BeginOffsetIteratorT,
|
|
208
|
+
EndOffsetIteratorT,
|
|
209
|
+
OffsetT,
|
|
210
|
+
ReductionOpT,
|
|
211
|
+
T>::Dispatch(d_temp_storage,
|
|
212
|
+
temp_storage_bytes,
|
|
213
|
+
d_in,
|
|
214
|
+
d_out,
|
|
215
|
+
num_segments,
|
|
216
|
+
d_begin_offsets,
|
|
217
|
+
d_end_offsets,
|
|
218
|
+
reduction_op,
|
|
219
|
+
initial_value, // forwarded as-is (not zero-initialized here)
|
|
220
|
+
stream);
|
|
221
|
+
}
|
|
222
|
+
_CCCL_UNREACHABLE();
|
|
282
223
|
}
|
|
283
224
|
|
|
284
225
|
//! @rst
|
|
@@ -465,32 +406,31 @@ public:
|
|
|
465
406
|
{
|
|
466
407
|
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::Sum");
|
|
467
408
|
|
|
468
|
-
// Integer type for global offsets
|
|
469
409
|
using OffsetT = detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
410
|
+
using OutputT = detail::non_void_value_t<OutputIteratorT, detail::it_value_t<InputIteratorT>>;
|
|
411
|
+
using init_t = OutputT;
|
|
412
|
+
static_assert(::cuda::std::is_integral_v<OffsetT>, "Offset iterator value type should be integral.");
|
|
413
|
+
if constexpr (::cuda::std::is_integral_v<OffsetT>)
|
|
414
|
+
{
|
|
415
|
+
return DispatchSegmentedReduce<
|
|
416
|
+
InputIteratorT,
|
|
417
|
+
OutputIteratorT,
|
|
418
|
+
BeginOffsetIteratorT,
|
|
419
|
+
EndOffsetIteratorT,
|
|
420
|
+
OffsetT,
|
|
421
|
+
::cuda::std::plus<>,
|
|
422
|
+
init_t>::Dispatch(d_temp_storage,
|
|
423
|
+
temp_storage_bytes,
|
|
424
|
+
d_in,
|
|
425
|
+
d_out,
|
|
426
|
+
num_segments,
|
|
427
|
+
d_begin_offsets,
|
|
428
|
+
d_end_offsets,
|
|
429
|
+
::cuda::std::plus<>{},
|
|
430
|
+
init_t{}, // zero-initialize
|
|
431
|
+
stream);
|
|
432
|
+
}
|
|
433
|
+
_CCCL_UNREACHABLE();
|
|
494
434
|
}
|
|
495
435
|
|
|
496
436
|
//! @rst
|
|
@@ -556,9 +496,7 @@ public:
|
|
|
556
496
|
// `offset_t` a.k.a `SegmentSizeT` is fixed to `int` type now, but later can be changed to accept
|
|
557
497
|
// integral constant or larger integral types
|
|
558
498
|
using offset_t = int;
|
|
559
|
-
|
|
560
|
-
// The output value type
|
|
561
|
-
using output_t = cub::detail::non_void_value_t<OutputIteratorT, cub::detail::it_value_t<InputIteratorT>>;
|
|
499
|
+
using output_t = detail::non_void_value_t<OutputIteratorT, detail::it_value_t<InputIteratorT>>;
|
|
562
500
|
|
|
563
501
|
return detail::reduce::
|
|
564
502
|
DispatchFixedSizeSegmentedReduce<InputIteratorT, OutputIteratorT, offset_t, ::cuda::std::plus<>, output_t>::Dispatch(
|
|
@@ -673,32 +611,31 @@ public:
|
|
|
673
611
|
{
|
|
674
612
|
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::Min");
|
|
675
613
|
|
|
676
|
-
// Integer type for global offsets
|
|
677
614
|
using OffsetT = detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
615
|
+
using InputT = detail::it_value_t<InputIteratorT>;
|
|
616
|
+
using init_t = InputT;
|
|
617
|
+
static_assert(::cuda::std::is_integral_v<OffsetT>, "Offset iterator value type should be integral.");
|
|
618
|
+
if constexpr (::cuda::std::is_integral_v<OffsetT>)
|
|
619
|
+
{
|
|
620
|
+
return DispatchSegmentedReduce<
|
|
621
|
+
InputIteratorT,
|
|
622
|
+
OutputIteratorT,
|
|
623
|
+
BeginOffsetIteratorT,
|
|
624
|
+
EndOffsetIteratorT,
|
|
625
|
+
OffsetT,
|
|
626
|
+
::cuda::minimum<>,
|
|
627
|
+
init_t>::Dispatch(d_temp_storage,
|
|
628
|
+
temp_storage_bytes,
|
|
629
|
+
d_in,
|
|
630
|
+
d_out,
|
|
631
|
+
num_segments,
|
|
632
|
+
d_begin_offsets,
|
|
633
|
+
d_end_offsets,
|
|
634
|
+
::cuda::minimum<>{},
|
|
635
|
+
::cuda::std::numeric_limits<init_t>::max(),
|
|
636
|
+
stream);
|
|
637
|
+
}
|
|
638
|
+
_CCCL_UNREACHABLE();
|
|
702
639
|
}
|
|
703
640
|
|
|
704
641
|
//! @rst
|
|
@@ -769,9 +706,7 @@ public:
|
|
|
769
706
|
// `offset_t` a.k.a `SegmentSizeT` is fixed to `int` type now, but later can be changed to accept
|
|
770
707
|
// integral constant or larger integral types
|
|
771
708
|
using offset_t = int;
|
|
772
|
-
|
|
773
|
-
// The input value type
|
|
774
|
-
using input_t = cub::detail::it_value_t<InputIteratorT>;
|
|
709
|
+
using input_t = detail::it_value_t<InputIteratorT>;
|
|
775
710
|
|
|
776
711
|
return detail::reduce::
|
|
777
712
|
DispatchFixedSizeSegmentedReduce<InputIteratorT, OutputIteratorT, offset_t, ::cuda::minimum<>, input_t>::Dispatch(
|
|
@@ -890,54 +825,45 @@ public:
|
|
|
890
825
|
{
|
|
891
826
|
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::ArgMin");
|
|
892
827
|
|
|
893
|
-
// Integer type for global offsets
|
|
894
828
|
// Using common iterator value type is a breaking change, see:
|
|
895
829
|
// https://github.com/NVIDIA/cccl/pull/414#discussion_r1330632615
|
|
896
830
|
using OffsetT = int; // detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
|
|
897
831
|
|
|
898
|
-
|
|
899
|
-
using
|
|
900
|
-
|
|
901
|
-
// The output tuple type
|
|
902
|
-
using OutputTupleT = cub::detail::non_void_value_t<OutputIteratorT, KeyValuePair<OffsetT, InputValueT>>;
|
|
903
|
-
|
|
904
|
-
// The output value type
|
|
832
|
+
using InputValueT = detail::it_value_t<InputIteratorT>;
|
|
833
|
+
using OutputTupleT = detail::non_void_value_t<OutputIteratorT, KeyValuePair<OffsetT, InputValueT>>;
|
|
905
834
|
using OutputValueT = typename OutputTupleT::Value;
|
|
906
|
-
|
|
907
|
-
using
|
|
908
|
-
|
|
909
|
-
using InitT = detail::reduce::empty_problem_init_t<AccumT>;
|
|
835
|
+
using AccumT = OutputTupleT;
|
|
836
|
+
using InitT = detail::reduce::empty_problem_init_t<AccumT>;
|
|
910
837
|
|
|
911
838
|
// Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
|
|
912
839
|
using ArgIndexInputIteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
|
|
913
|
-
|
|
914
840
|
ArgIndexInputIteratorT d_indexed_in(d_in);
|
|
915
841
|
|
|
916
|
-
// Initial value
|
|
917
842
|
InitT initial_value{AccumT(1, ::cuda::std::numeric_limits<InputValueT>::max())};
|
|
918
843
|
|
|
919
|
-
|
|
920
|
-
|
|
921
|
-
|
|
922
|
-
|
|
923
|
-
|
|
924
|
-
|
|
925
|
-
|
|
926
|
-
|
|
927
|
-
|
|
928
|
-
|
|
929
|
-
|
|
930
|
-
|
|
931
|
-
|
|
932
|
-
|
|
933
|
-
|
|
934
|
-
|
|
935
|
-
|
|
936
|
-
|
|
937
|
-
|
|
938
|
-
|
|
939
|
-
|
|
940
|
-
|
|
844
|
+
static_assert(::cuda::std::is_integral_v<OffsetT>, "Offset iterator value type should be integral.");
|
|
845
|
+
if constexpr (::cuda::std::is_integral_v<OffsetT>)
|
|
846
|
+
{
|
|
847
|
+
return DispatchSegmentedReduce<
|
|
848
|
+
ArgIndexInputIteratorT,
|
|
849
|
+
OutputIteratorT,
|
|
850
|
+
BeginOffsetIteratorT,
|
|
851
|
+
EndOffsetIteratorT,
|
|
852
|
+
OffsetT,
|
|
853
|
+
cub::ArgMin,
|
|
854
|
+
InitT,
|
|
855
|
+
AccumT>::Dispatch(d_temp_storage,
|
|
856
|
+
temp_storage_bytes,
|
|
857
|
+
d_indexed_in,
|
|
858
|
+
d_out,
|
|
859
|
+
num_segments,
|
|
860
|
+
d_begin_offsets,
|
|
861
|
+
d_end_offsets,
|
|
862
|
+
cub::ArgMin{},
|
|
863
|
+
initial_value,
|
|
864
|
+
stream);
|
|
865
|
+
}
|
|
866
|
+
_CCCL_UNREACHABLE();
|
|
941
867
|
}
|
|
942
868
|
|
|
943
869
|
//! @rst
|
|
@@ -1144,27 +1070,32 @@ public:
|
|
|
1144
1070
|
{
|
|
1145
1071
|
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::Max");
|
|
1146
1072
|
|
|
1147
|
-
// Integer type for global offsets
|
|
1148
1073
|
using OffsetT = detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
|
|
1149
|
-
|
|
1150
|
-
|
|
1151
|
-
|
|
1152
|
-
|
|
1153
|
-
|
|
1154
|
-
|
|
1155
|
-
|
|
1156
|
-
|
|
1157
|
-
|
|
1158
|
-
|
|
1159
|
-
|
|
1160
|
-
|
|
1161
|
-
|
|
1162
|
-
|
|
1163
|
-
|
|
1164
|
-
|
|
1165
|
-
|
|
1166
|
-
|
|
1167
|
-
|
|
1074
|
+
using InputT = cub::detail::it_value_t<InputIteratorT>;
|
|
1075
|
+
using init_t = InputT;
|
|
1076
|
+
|
|
1077
|
+
static_assert(::cuda::std::is_integral_v<OffsetT>, "Offset iterator value type should be integral.");
|
|
1078
|
+
if constexpr (::cuda::std::is_integral_v<OffsetT>)
|
|
1079
|
+
{
|
|
1080
|
+
return DispatchSegmentedReduce<
|
|
1081
|
+
InputIteratorT,
|
|
1082
|
+
OutputIteratorT,
|
|
1083
|
+
BeginOffsetIteratorT,
|
|
1084
|
+
EndOffsetIteratorT,
|
|
1085
|
+
OffsetT,
|
|
1086
|
+
::cuda::maximum<>,
|
|
1087
|
+
init_t>::Dispatch(d_temp_storage,
|
|
1088
|
+
temp_storage_bytes,
|
|
1089
|
+
d_in,
|
|
1090
|
+
d_out,
|
|
1091
|
+
num_segments,
|
|
1092
|
+
d_begin_offsets,
|
|
1093
|
+
d_end_offsets,
|
|
1094
|
+
::cuda::maximum<>{},
|
|
1095
|
+
::cuda::std::numeric_limits<init_t>::lowest(),
|
|
1096
|
+
stream);
|
|
1097
|
+
}
|
|
1098
|
+
_CCCL_UNREACHABLE();
|
|
1168
1099
|
}
|
|
1169
1100
|
|
|
1170
1101
|
//! @rst
|
|
@@ -1229,9 +1160,7 @@ public:
|
|
|
1229
1160
|
// `offset_t` a.k.a `SegmentSizeT` is fixed to `int` type now, but later can be changed to accept
|
|
1230
1161
|
// integral constant or larger integral types
|
|
1231
1162
|
using offset_t = int;
|
|
1232
|
-
|
|
1233
|
-
// The input value type
|
|
1234
|
-
using input_t = cub::detail::it_value_t<InputIteratorT>;
|
|
1163
|
+
using input_t = detail::it_value_t<InputIteratorT>;
|
|
1235
1164
|
|
|
1236
1165
|
return detail::reduce::
|
|
1237
1166
|
DispatchFixedSizeSegmentedReduce<InputIteratorT, OutputIteratorT, offset_t, ::cuda::maximum<>, input_t>::Dispatch(
|
|
@@ -1353,54 +1282,45 @@ public:
|
|
|
1353
1282
|
{
|
|
1354
1283
|
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::ArgMax");
|
|
1355
1284
|
|
|
1356
|
-
// Integer type for global offsets
|
|
1357
1285
|
// Using common iterator value type is a breaking change, see:
|
|
1358
1286
|
// https://github.com/NVIDIA/cccl/pull/414#discussion_r1330632615
|
|
1359
1287
|
using OffsetT = int; // detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
|
|
1360
1288
|
|
|
1361
|
-
|
|
1362
|
-
using InputValueT = cub::detail::it_value_t<InputIteratorT>;
|
|
1363
|
-
|
|
1364
|
-
// The output tuple type
|
|
1289
|
+
using InputValueT = cub::detail::it_value_t<InputIteratorT>;
|
|
1365
1290
|
using OutputTupleT = cub::detail::non_void_value_t<OutputIteratorT, KeyValuePair<OffsetT, InputValueT>>;
|
|
1366
|
-
|
|
1367
|
-
using
|
|
1368
|
-
|
|
1369
|
-
using InitT = detail::reduce::empty_problem_init_t<AccumT>;
|
|
1370
|
-
|
|
1371
|
-
// The output value type
|
|
1291
|
+
using AccumT = OutputTupleT;
|
|
1292
|
+
using InitT = detail::reduce::empty_problem_init_t<AccumT>;
|
|
1372
1293
|
using OutputValueT = typename OutputTupleT::Value;
|
|
1373
1294
|
|
|
1374
1295
|
// Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
|
|
1375
1296
|
using ArgIndexInputIteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
|
|
1376
|
-
|
|
1377
1297
|
ArgIndexInputIteratorT d_indexed_in(d_in);
|
|
1378
1298
|
|
|
1379
|
-
// Initial value
|
|
1380
1299
|
InitT initial_value{AccumT(1, ::cuda::std::numeric_limits<InputValueT>::lowest())};
|
|
1381
1300
|
|
|
1382
|
-
|
|
1383
|
-
|
|
1384
|
-
|
|
1385
|
-
|
|
1386
|
-
|
|
1387
|
-
|
|
1388
|
-
|
|
1389
|
-
|
|
1390
|
-
|
|
1391
|
-
|
|
1392
|
-
|
|
1393
|
-
|
|
1394
|
-
|
|
1395
|
-
|
|
1396
|
-
|
|
1397
|
-
|
|
1398
|
-
|
|
1399
|
-
|
|
1400
|
-
|
|
1401
|
-
|
|
1402
|
-
|
|
1403
|
-
|
|
1301
|
+
static_assert(::cuda::std::is_integral_v<OffsetT>, "Offset iterator value type should be integral.");
|
|
1302
|
+
if constexpr (::cuda::std::is_integral_v<OffsetT>)
|
|
1303
|
+
{
|
|
1304
|
+
return DispatchSegmentedReduce<
|
|
1305
|
+
ArgIndexInputIteratorT,
|
|
1306
|
+
OutputIteratorT,
|
|
1307
|
+
BeginOffsetIteratorT,
|
|
1308
|
+
EndOffsetIteratorT,
|
|
1309
|
+
OffsetT,
|
|
1310
|
+
cub::ArgMax,
|
|
1311
|
+
InitT,
|
|
1312
|
+
AccumT>::Dispatch(d_temp_storage,
|
|
1313
|
+
temp_storage_bytes,
|
|
1314
|
+
d_indexed_in,
|
|
1315
|
+
d_out,
|
|
1316
|
+
num_segments,
|
|
1317
|
+
d_begin_offsets,
|
|
1318
|
+
d_end_offsets,
|
|
1319
|
+
cub::ArgMax{},
|
|
1320
|
+
initial_value,
|
|
1321
|
+
stream);
|
|
1322
|
+
}
|
|
1323
|
+
_CCCL_UNREACHABLE();
|
|
1404
1324
|
}
|
|
1405
1325
|
|
|
1406
1326
|
//! @rst
|
|
@@ -1476,34 +1396,25 @@ public:
|
|
|
1476
1396
|
// integral constant or larger integral types
|
|
1477
1397
|
using input_t = int;
|
|
1478
1398
|
|
|
1479
|
-
|
|
1480
|
-
using
|
|
1481
|
-
|
|
1482
|
-
|
|
1483
|
-
using output_tuple_t = cub::detail::non_void_value_t<OutputIteratorT, ::cuda::std::pair<input_t, input_value_t>>;
|
|
1484
|
-
|
|
1485
|
-
using accum_t = output_tuple_t;
|
|
1486
|
-
|
|
1487
|
-
using init_t = detail::reduce::empty_problem_init_t<accum_t>;
|
|
1488
|
-
|
|
1489
|
-
// The output value type
|
|
1399
|
+
using input_value_t = detail::it_value_t<InputIteratorT>;
|
|
1400
|
+
using output_tuple_t = detail::non_void_value_t<OutputIteratorT, ::cuda::std::pair<input_t, input_value_t>>;
|
|
1401
|
+
using accum_t = output_tuple_t;
|
|
1402
|
+
using init_t = detail::reduce::empty_problem_init_t<accum_t>;
|
|
1490
1403
|
using output_value_t = typename output_tuple_t::second_type;
|
|
1491
1404
|
|
|
1492
1405
|
// Wrapped input iterator to produce index-value <input_t, InputT> tuples
|
|
1493
1406
|
auto d_indexed_in = THRUST_NS_QUALIFIER::make_transform_iterator(
|
|
1494
1407
|
THRUST_NS_QUALIFIER::counting_iterator<::cuda::std::int64_t>{0},
|
|
1495
1408
|
detail::reduce::generate_idx_value<InputIteratorT, output_value_t>(d_in, segment_size));
|
|
1496
|
-
|
|
1497
1409
|
using arg_index_input_iterator_t = decltype(d_indexed_in);
|
|
1498
1410
|
|
|
1499
|
-
// Initial value
|
|
1500
1411
|
init_t initial_value{accum_t(1, ::cuda::std::numeric_limits<input_value_t>::lowest())};
|
|
1501
1412
|
|
|
1502
1413
|
return detail::reduce::DispatchFixedSizeSegmentedReduce<
|
|
1503
1414
|
arg_index_input_iterator_t,
|
|
1504
1415
|
OutputIteratorT,
|
|
1505
1416
|
input_t,
|
|
1506
|
-
|
|
1417
|
+
detail::arg_max,
|
|
1507
1418
|
init_t,
|
|
1508
1419
|
accum_t>::Dispatch(d_temp_storage,
|
|
1509
1420
|
temp_storage_bytes,
|
|
@@ -1511,7 +1422,7 @@ public:
|
|
|
1511
1422
|
d_out,
|
|
1512
1423
|
num_segments,
|
|
1513
1424
|
segment_size,
|
|
1514
|
-
|
|
1425
|
+
detail::arg_max(),
|
|
1515
1426
|
initial_value,
|
|
1516
1427
|
stream);
|
|
1517
1428
|
}
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
|
2
|
+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
3
|
+
|
|
4
|
+
//! @file
|
|
5
|
+
#pragma once
|
|
6
|
+
|
|
7
|
+
#include <cub/config.cuh>
|
|
8
|
+
|
|
9
|
+
#include <cuda/std/__type_traits/is_same.h>
|
|
10
|
+
|
|
11
|
+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
|
|
12
|
+
# pragma GCC system_header
|
|
13
|
+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
|
|
14
|
+
# pragma clang system_header
|
|
15
|
+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
|
|
16
|
+
# pragma system_header
|
|
17
|
+
#endif // no system header
|
|
18
|
+
|
|
19
|
+
#include <cub/device/device_for.cuh>
|
|
20
|
+
#include <cub/device/device_transform.cuh>
|
|
21
|
+
#include <cub/util_debug.cuh>
|
|
22
|
+
|
|
23
|
+
#include <cuda/std/functional>
|
|
24
|
+
#include <cuda/std/mdspan>
|
|
25
|
+
|
|
26
|
+
CUB_NAMESPACE_BEGIN
|
|
27
|
+
|
|
28
|
+
namespace detail::copy_mdspan
|
|
29
|
+
{
|
|
30
|
+
|
|
31
|
+
template <typename MdspanIn, typename MdspanOut> // MdspanIn / MdspanOut: types of the source and destination views
|
|
32
|
+
struct copy_mdspan_t // Callable that assigns one element of mdspan_in to the same index of mdspan_out
|
|
33
|
+
{
|
|
34
|
+
MdspanIn mdspan_in; // source view; only read by operator()
|
|
35
|
+
MdspanOut mdspan_out; // destination view; written by operator()
|
|
36
|
+
|
|
37
|
+
_CCCL_API copy_mdspan_t(MdspanIn mdspan_in, MdspanOut mdspan_out) // stores copies of both views (taken by value)
|
|
38
|
+
: mdspan_in{mdspan_in}
|
|
39
|
+
, mdspan_out{mdspan_out}
|
|
40
|
+
{}
|
|
41
|
+
|
|
42
|
+
template <typename Idx, typename... Indices>
|
|
43
|
+
_CCCL_DEVICE_API _CCCL_FORCEINLINE void operator()(Idx, Indices... indices) // first argument is unnamed and unused; only `indices...` address the views
|
|
44
|
+
{
|
|
45
|
+
mdspan_out(indices...) = mdspan_in(indices...); // the same index pack is applied to both views
|
|
46
|
+
}
|
|
47
|
+
};
|
|
48
|
+
|
|
49
|
+
template <typename T_In, // copy(): copies the elements viewed by mdspan_in into mdspan_out, passing `stream` to the CUB algorithm it selects
|
|
50
|
+
typename E_In,
|
|
51
|
+
typename L_In,
|
|
52
|
+
typename A_In,
|
|
53
|
+
typename T_Out,
|
|
54
|
+
typename E_Out,
|
|
55
|
+
typename L_Out,
|
|
56
|
+
typename A_Out>
|
|
57
|
+
[[nodiscard]] CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t // returns whatever the selected CUB call returns
|
|
58
|
+
copy(::cuda::std::mdspan<T_In, E_In, L_In, A_In> mdspan_in, // source view; T/E/L/A fill mdspan's element type, extents, layout and accessor slots
|
|
59
|
+
::cuda::std::mdspan<T_Out, E_Out, L_Out, A_Out> mdspan_out, // destination view, same parameter order as the source
|
|
60
|
+
::cudaStream_t stream) // forwarded unchanged to the CUB call below
|
|
61
|
+
{
|
|
62
|
+
if (mdspan_in.is_exhaustive() && mdspan_out.is_exhaustive() // fast path requires both mappings to be exhaustive ...
|
|
63
|
+
&& detail::have_same_strides(mdspan_in.mapping(), mdspan_out.mapping())) // ... and have_same_strides() to hold for the two mappings
|
|
64
|
+
{
|
|
65
|
+
return cub::DeviceTransform::Transform( // both views are handed over as flat ranges starting at data_handle()
|
|
66
|
+
mdspan_in.data_handle(),
|
|
67
|
+
mdspan_out.data_handle(),
|
|
68
|
+
mdspan_in.size(), // element count is taken from the source view
|
|
69
|
+
::cuda::proclaim_copyable_arguments(::cuda::std::identity{}), // identity functor, so the transform performs a plain copy
|
|
70
|
+
stream);
|
|
71
|
+
}
|
|
72
|
+
// TODO (fbusato): add ForEachInLayout when mdspan_in and mdspan_out have compatible layouts
|
|
73
|
+
// Compatible layouts could use more efficient iteration patterns
|
|
74
|
+
return cub::DeviceFor::ForEachInExtents(mdspan_in.extents(), copy_mdspan_t{mdspan_in, mdspan_out}, stream); // general path: iterates the source extents and copies per index. NOTE(review): mdspan_out's extents are not compared with mdspan_in's in this function - presumably validated by the caller; confirm
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
} // namespace detail::copy_mdspan
|
|
78
|
+
|
|
79
|
+
CUB_NAMESPACE_END
|
|
@@ -144,11 +144,11 @@ __launch_bounds__(
|
|
|
144
144
|
auto& temp_storage = vsmem_helper_t::get_temp_storage(shared_temp_storage, global_temp_storage);
|
|
145
145
|
MergeAgent{
|
|
146
146
|
temp_storage.Alias(),
|
|
147
|
-
|
|
148
|
-
|
|
147
|
+
keys1,
|
|
148
|
+
items1,
|
|
149
149
|
num_keys1,
|
|
150
|
-
|
|
151
|
-
|
|
150
|
+
keys2,
|
|
151
|
+
items2,
|
|
152
152
|
num_keys2,
|
|
153
153
|
keys_result,
|
|
154
154
|
items_result,
|
|
@@ -44,7 +44,6 @@
|
|
|
44
44
|
# pragma system_header
|
|
45
45
|
#endif // no system header
|
|
46
46
|
|
|
47
|
-
#include <cub/device/dispatch/dispatch_advance_iterators.cuh>
|
|
48
47
|
#include <cub/device/dispatch/kernels/radix_sort.cuh>
|
|
49
48
|
#include <cub/device/dispatch/tuning/tuning_radix_sort.cuh>
|
|
50
49
|
#include <cub/util_debug.cuh>
|
|
@@ -1379,14 +1378,6 @@ struct DispatchSegmentedRadixSort
|
|
|
1379
1378
|
// Number of radix sort invocations until all segments have been processed
|
|
1380
1379
|
const auto num_invocations = ::cuda::ceil_div(num_segments, max_num_segments_per_invocation);
|
|
1381
1380
|
|
|
1382
|
-
// If d_begin_offsets and d_end_offsets do not support operator+ then we can't have more than
|
|
1383
|
-
// max_num_segments_per_invocation segments per invocation
|
|
1384
|
-
if (num_invocations > 1
|
|
1385
|
-
&& !detail::all_iterators_support_add_assign_operator(::cuda::std::int64_t{}, d_begin_offsets, d_end_offsets))
|
|
1386
|
-
{
|
|
1387
|
-
return cudaErrorInvalidValue;
|
|
1388
|
-
}
|
|
1389
|
-
|
|
1390
1381
|
BeginOffsetIteratorT begin_offsets_current_it = d_begin_offsets;
|
|
1391
1382
|
EndOffsetIteratorT end_offsets_current_it = d_end_offsets;
|
|
1392
1383
|
|
|
@@ -1435,8 +1426,8 @@ struct DispatchSegmentedRadixSort
|
|
|
1435
1426
|
|
|
1436
1427
|
if (invocation_index + 1 < num_invocations)
|
|
1437
1428
|
{
|
|
1438
|
-
|
|
1439
|
-
|
|
1429
|
+
begin_offsets_current_it += num_current_segments;
|
|
1430
|
+
end_offsets_current_it += num_current_segments;
|
|
1440
1431
|
}
|
|
1441
1432
|
|
|
1442
1433
|
// Sync the stream if specified to flush runtime errors
|