cuda-cccl 0.3.0-cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.1-cp312-cp312-manylinux_2_24_aarch64.whl
This diff shows the changes between two publicly released package versions, as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of cuda-cccl might be problematic.
- cuda/cccl/cooperative/__init__.py +7 -1
- cuda/cccl/cooperative/experimental/__init__.py +21 -5
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +3 -2
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +158 -247
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +4 -4
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +8 -26
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -6
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
- cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
- cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +48 -46
- cuda/cccl/headers/include/cuda/__device/attributes.h +171 -121
- cuda/cccl/headers/include/cuda/__device/device_ref.h +30 -42
- cuda/cccl/headers/include/cuda/__device/physical_device.h +120 -91
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
- cuda/cccl/headers/include/cuda/__event/event.h +1 -0
- cuda/cccl/headers/include/cuda/__event/timed_event.h +1 -0
- cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
- cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +158 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +8 -120
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +593 -0
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +4 -3
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +1 -0
- cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
- cuda/cccl/headers/include/cuda/algorithm +1 -1
- cuda/cccl/headers/include/cuda/devices +10 -0
- cuda/cccl/headers/include/cuda/iterator +1 -0
- cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
- cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
- cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
- cuda/cccl/headers/include/cuda/std/string_view +12 -5
- cuda/cccl/headers/include/cuda/std/version +1 -4
- cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
- cuda/cccl/parallel/experimental/__init__.py +21 -74
- cuda/compute/__init__.py +77 -0
- cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +1 -1
- cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
- cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +4 -4
- cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +4 -4
- cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/cu12/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu13/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +8 -8
- cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
- cuda/coop/__init__.py +8 -0
- cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
- cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
- cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
- cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
- cuda/coop/warp/__init__.py +9 -0
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/METADATA +1 -1
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/RECORD +141 -138
- cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
- cuda/cccl/parallel/experimental/.gitignore +0 -4
- /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/numba_utils.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/typing.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/licenses/LICENSE +0 -0
cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh

@@ -84,70 +84,6 @@ CUB_NAMESPACE_BEGIN
 //! @endrst
 struct DeviceSegmentedReduce
 {
-private:
-  template <typename InputIteratorT,
-            typename OutputIteratorT,
-            typename BeginOffsetIteratorT,
-            typename EndOffsetIteratorT,
-            typename OffsetT,
-            typename ReductionOpT,
-            typename InitT,
-            typename... Ts>
-  CUB_RUNTIME_FUNCTION static cudaError_t segmented_reduce(
-    ::cuda::std::false_type,
-    void* d_temp_storage,
-    size_t& temp_storage_bytes,
-    InputIteratorT d_in,
-    OutputIteratorT d_out,
-    ::cuda::std::int64_t num_segments,
-    BeginOffsetIteratorT d_begin_offsets,
-    EndOffsetIteratorT d_end_offsets,
-    ReductionOpT reduction_op,
-    InitT initial_value,
-    cudaStream_t stream);
-
-  template <typename InputIteratorT,
-            typename OutputIteratorT,
-            typename BeginOffsetIteratorT,
-            typename EndOffsetIteratorT,
-            typename OffsetT,
-            typename ReductionOpT,
-            typename InitT,
-            typename... Ts>
-  CUB_RUNTIME_FUNCTION static cudaError_t segmented_reduce(
-    ::cuda::std::true_type,
-    void* d_temp_storage,
-    size_t& temp_storage_bytes,
-    InputIteratorT d_in,
-    OutputIteratorT d_out,
-    ::cuda::std::int64_t num_segments,
-    BeginOffsetIteratorT d_begin_offsets,
-    EndOffsetIteratorT d_end_offsets,
-    ReductionOpT reduction_op,
-    InitT initial_value,
-    cudaStream_t stream)
-  {
-    return DispatchSegmentedReduce<
-      InputIteratorT,
-      OutputIteratorT,
-      BeginOffsetIteratorT,
-      EndOffsetIteratorT,
-      OffsetT,
-      ReductionOpT,
-      InitT,
-      Ts...>::Dispatch(d_temp_storage,
-                       temp_storage_bytes,
-                       d_in,
-                       d_out,
-                       num_segments,
-                       d_begin_offsets,
-                       d_end_offsets,
-                       reduction_op,
-                       initial_value,
-                       stream);
-  }
-
-public:
 //! @rst
 //! Computes a device-wide segmented reduction using the specified
 //! binary ``reduction_op`` functor.
@@ -261,24 +197,29 @@ public:
 {
   _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::Reduce");

-  [removed lines 264-281: content not captured in the source diff]
+  using OffsetT = detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
+  static_assert(::cuda::std::is_integral_v<OffsetT>, "Offset iterator value type should be integral.");
+  if constexpr (::cuda::std::is_integral_v<OffsetT>)
+  {
+    return DispatchSegmentedReduce<
+      InputIteratorT,
+      OutputIteratorT,
+      BeginOffsetIteratorT,
+      EndOffsetIteratorT,
+      OffsetT,
+      ReductionOpT,
+      T>::Dispatch(d_temp_storage,
+                   temp_storage_bytes,
+                   d_in,
+                   d_out,
+                   num_segments,
+                   d_begin_offsets,
+                   d_end_offsets,
+                   reduction_op,
+                   initial_value, // zero-initialize
+                   stream);
+  }
+  _CCCL_UNREACHABLE();
 }

 //! @rst
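Note on the hunk above: the runtime tag dispatch on ::cuda::std::true_type / ::cuda::std::false_type (deleted in the first hunk) is replaced by a static_assert plus if constexpr, so the non-integral-offset branch is never even instantiated. A minimal standalone sketch of that technique, with hypothetical names (reduce, dispatch_impl) standing in for CUB's real entry points:

#include <cstdio>
#include <type_traits>

// Hypothetical stand-in for the dispatch layer; not a CUB API.
template <typename OffsetT>
int dispatch_impl(OffsetT num_items)
{
  std::printf("dispatching %lld items\n", static_cast<long long>(num_items));
  return 0;
}

template <typename OffsetT>
int reduce(OffsetT num_items)
{
  // Compile-time guard: non-integral offsets fail loudly here...
  static_assert(std::is_integral_v<OffsetT>, "Offset type should be integral.");
  // ...and the call below is only instantiated when the guard holds, which is
  // what the deleted true_type/false_type overload pair used to arrange.
  if constexpr (std::is_integral_v<OffsetT>)
  {
    return dispatch_impl(num_items);
  }
  return -1; // never reached; CUB marks this path with _CCCL_UNREACHABLE()
}

int main()
{
  return reduce(42);
}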
@@ -465,32 +406,31 @@ public:
 {
   _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::Sum");

-  // Integer type for global offsets
   using OffsetT = detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
-  [removed lines 470-493: content not captured in the source diff]
+  using OutputT = detail::non_void_value_t<OutputIteratorT, detail::it_value_t<InputIteratorT>>;
+  using init_t = OutputT;
+  static_assert(::cuda::std::is_integral_v<OffsetT>, "Offset iterator value type should be integral.");
+  if constexpr (::cuda::std::is_integral_v<OffsetT>)
+  {
+    return DispatchSegmentedReduce<
+      InputIteratorT,
+      OutputIteratorT,
+      BeginOffsetIteratorT,
+      EndOffsetIteratorT,
+      OffsetT,
+      ::cuda::std::plus<>,
+      init_t>::Dispatch(d_temp_storage,
+                        temp_storage_bytes,
+                        d_in,
+                        d_out,
+                        num_segments,
+                        d_begin_offsets,
+                        d_end_offsets,
+                        ::cuda::std::plus<>{},
+                        init_t{}, // zero-initialize
+                        stream);
+  }
+  _CCCL_UNREACHABLE();
 }

 //! @rst
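For orientation, a hedged sketch of how this entry point is typically driven: CUB's two-phase protocol first queries the temporary-storage size with a null d_temp_storage, then runs for real. The buffers d_in, d_out, and d_offsets are assumed to be valid device allocations (d_offsets holding num_segments + 1 offsets):

#include <cub/device/device_segmented_reduce.cuh>

cudaError_t segmented_sum(const float* d_in, float* d_out, const int* d_offsets, int num_segments)
{
  void* d_temp_storage     = nullptr;
  size_t temp_storage_bytes = 0;

  // First call: no work is done; only the required temp storage size is computed.
  cub::DeviceSegmentedReduce::Sum(
    d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_offsets, d_offsets + 1);

  cudaMalloc(&d_temp_storage, temp_storage_bytes);

  // Second call: runs the reduction.
  cudaError_t status = cub::DeviceSegmentedReduce::Sum(
    d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, d_offsets, d_offsets + 1);

  cudaFree(d_temp_storage);
  return status;
}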
@@ -556,9 +496,7 @@ public:
   // `offset_t` a.k.a `SegmentSizeT` is fixed to `int` type now, but later can be changed to accept
   // integral constant or larger integral types
   using offset_t = int;
-
-  // The output value type
-  using output_t = cub::detail::non_void_value_t<OutputIteratorT, cub::detail::it_value_t<InputIteratorT>>;
+  using output_t = detail::non_void_value_t<OutputIteratorT, detail::it_value_t<InputIteratorT>>;

   return detail::reduce::
     DispatchFixedSizeSegmentedReduce<InputIteratorT, OutputIteratorT, offset_t, ::cuda::std::plus<>, output_t>::Dispatch(
@@ -673,32 +611,31 @@ public:
 {
   _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::Min");

-  // Integer type for global offsets
   using OffsetT = detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
-  [removed lines 678-701: content not captured in the source diff]
+  using InputT = detail::it_value_t<InputIteratorT>;
+  using init_t = InputT;
+  static_assert(::cuda::std::is_integral_v<OffsetT>, "Offset iterator value type should be integral.");
+  if constexpr (::cuda::std::is_integral_v<OffsetT>)
+  {
+    return DispatchSegmentedReduce<
+      InputIteratorT,
+      OutputIteratorT,
+      BeginOffsetIteratorT,
+      EndOffsetIteratorT,
+      OffsetT,
+      ::cuda::minimum<>,
+      init_t>::Dispatch(d_temp_storage,
+                        temp_storage_bytes,
+                        d_in,
+                        d_out,
+                        num_segments,
+                        d_begin_offsets,
+                        d_end_offsets,
+                        ::cuda::minimum<>{},
+                        ::cuda::std::numeric_limits<init_t>::max(),
+                        stream);
+  }
+  _CCCL_UNREACHABLE();
 }

 //! @rst
@@ -769,9 +706,7 @@ public:
   // `offset_t` a.k.a `SegmentSizeT` is fixed to `int` type now, but later can be changed to accept
   // integral constant or larger integral types
   using offset_t = int;
-
-  // The input value type
-  using input_t = cub::detail::it_value_t<InputIteratorT>;
+  using input_t = detail::it_value_t<InputIteratorT>;

   return detail::reduce::
     DispatchFixedSizeSegmentedReduce<InputIteratorT, OutputIteratorT, offset_t, ::cuda::minimum<>, input_t>::Dispatch(
@@ -890,54 +825,45 @@ public:
 {
   _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::ArgMin");

-  // Integer type for global offsets
   // Using common iterator value type is a breaking change, see:
   // https://github.com/NVIDIA/cccl/pull/414#discussion_r1330632615
   using OffsetT = int; // detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;

-
-  using InputValueT = cub::detail::it_value_t<InputIteratorT>;
-
-  // The output tuple type
-  using OutputTupleT = cub::detail::non_void_value_t<OutputIteratorT, KeyValuePair<OffsetT, InputValueT>>;
-
-  // The output value type
+  using InputValueT = detail::it_value_t<InputIteratorT>;
+  using OutputTupleT = detail::non_void_value_t<OutputIteratorT, KeyValuePair<OffsetT, InputValueT>>;
   using OutputValueT = typename OutputTupleT::Value;
-
-  using AccumT = OutputTupleT;
-
-  using InitT = detail::reduce::empty_problem_init_t<AccumT>;
+  using AccumT = OutputTupleT;
+  using InitT = detail::reduce::empty_problem_init_t<AccumT>;

   // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
   using ArgIndexInputIteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
-
   ArgIndexInputIteratorT d_indexed_in(d_in);

-  // Initial value
   InitT initial_value{AccumT(1, ::cuda::std::numeric_limits<InputValueT>::max())};

-  [removed lines 919-940: content not captured in the source diff]
+  static_assert(::cuda::std::is_integral_v<OffsetT>, "Offset iterator value type should be integral.");
+  if constexpr (::cuda::std::is_integral_v<OffsetT>)
+  {
+    return DispatchSegmentedReduce<
+      ArgIndexInputIteratorT,
+      OutputIteratorT,
+      BeginOffsetIteratorT,
+      EndOffsetIteratorT,
+      OffsetT,
+      cub::ArgMin,
+      InitT,
+      AccumT>::Dispatch(d_temp_storage,
+                        temp_storage_bytes,
+                        d_indexed_in,
+                        d_out,
+                        num_segments,
+                        d_begin_offsets,
+                        d_end_offsets,
+                        cub::ArgMin{},
+                        initial_value,
+                        stream);
+  }
+  _CCCL_UNREACHABLE();
 }

 //! @rst
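Both arg-reductions fold KeyValuePair<index, value> tuples with cub::ArgMin / cub::ArgMax; the initial value pairs an index with the value type's extreme so that empty segments are well defined. A tiny host-side sketch of the operator's semantics, assuming the thread operators in this CUB version are host-callable as shown:

#include <cub/thread/thread_operators.cuh>
#include <cub/util_type.cuh>

int main()
{
  // cub::ArgMin keeps the pair whose value is smaller; ties resolve to the lower key.
  cub::KeyValuePair<int, float> a{0, 3.5f};
  cub::KeyValuePair<int, float> b{4, 1.25f};
  cub::KeyValuePair<int, float> best = cub::ArgMin{}(a, b); // best == {4, 1.25f}
  return best.key == 4 ? 0 : 1;
}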
@@ -1144,27 +1070,32 @@ public:
 {
   _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::Max");

-  // Integer type for global offsets
   using OffsetT = detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
-  [removed lines 1149-1167: content not captured in the source diff]
+  using InputT = cub::detail::it_value_t<InputIteratorT>;
+  using init_t = InputT;
+
+  static_assert(::cuda::std::is_integral_v<OffsetT>, "Offset iterator value type should be integral.");
+  if constexpr (::cuda::std::is_integral_v<OffsetT>)
+  {
+    return DispatchSegmentedReduce<
+      InputIteratorT,
+      OutputIteratorT,
+      BeginOffsetIteratorT,
+      EndOffsetIteratorT,
+      OffsetT,
+      ::cuda::maximum<>,
+      init_t>::Dispatch(d_temp_storage,
+                        temp_storage_bytes,
+                        d_in,
+                        d_out,
+                        num_segments,
+                        d_begin_offsets,
+                        d_end_offsets,
+                        ::cuda::maximum<>{},
+                        ::cuda::std::numeric_limits<init_t>::lowest(),
+                        stream);
+  }
+  _CCCL_UNREACHABLE();
 }

 //! @rst
@@ -1229,9 +1160,7 @@ public:
   // `offset_t` a.k.a `SegmentSizeT` is fixed to `int` type now, but later can be changed to accept
   // integral constant or larger integral types
   using offset_t = int;
-
-  // The input value type
-  using input_t = cub::detail::it_value_t<InputIteratorT>;
+  using input_t = detail::it_value_t<InputIteratorT>;

   return detail::reduce::
     DispatchFixedSizeSegmentedReduce<InputIteratorT, OutputIteratorT, offset_t, ::cuda::maximum<>, input_t>::Dispatch(
@@ -1353,54 +1282,45 @@ public:
 {
   _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::ArgMax");

-  // Integer type for global offsets
   // Using common iterator value type is a breaking change, see:
   // https://github.com/NVIDIA/cccl/pull/414#discussion_r1330632615
   using OffsetT = int; // detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;

-
-  using InputValueT = cub::detail::it_value_t<InputIteratorT>;
-
-  // The output tuple type
+  using InputValueT = cub::detail::it_value_t<InputIteratorT>;
   using OutputTupleT = cub::detail::non_void_value_t<OutputIteratorT, KeyValuePair<OffsetT, InputValueT>>;
-
-  using AccumT = OutputTupleT;
-
-  using InitT = detail::reduce::empty_problem_init_t<AccumT>;
-
-  // The output value type
+  using AccumT = OutputTupleT;
+  using InitT = detail::reduce::empty_problem_init_t<AccumT>;
   using OutputValueT = typename OutputTupleT::Value;

   // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
   using ArgIndexInputIteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
-
   ArgIndexInputIteratorT d_indexed_in(d_in);

-  // Initial value
   InitT initial_value{AccumT(1, ::cuda::std::numeric_limits<InputValueT>::lowest())};

-  [removed lines 1382-1403: content not captured in the source diff]
+  static_assert(::cuda::std::is_integral_v<OffsetT>, "Offset iterator value type should be integral.");
+  if constexpr (::cuda::std::is_integral_v<OffsetT>)
+  {
+    return DispatchSegmentedReduce<
+      ArgIndexInputIteratorT,
+      OutputIteratorT,
+      BeginOffsetIteratorT,
+      EndOffsetIteratorT,
+      OffsetT,
+      cub::ArgMax,
+      InitT,
+      AccumT>::Dispatch(d_temp_storage,
+                        temp_storage_bytes,
+                        d_indexed_in,
+                        d_out,
+                        num_segments,
+                        d_begin_offsets,
+                        d_end_offsets,
+                        cub::ArgMax{},
+                        initial_value,
+                        stream);
+  }
+  _CCCL_UNREACHABLE();
 }

 //! @rst
@@ -1476,34 +1396,25 @@ public:
   // integral constant or larger integral types
   using input_t = int;

-
-  using input_value_t = cub::detail::it_value_t<InputIteratorT>;
-
-  using output_tuple_t = cub::detail::non_void_value_t<OutputIteratorT, ::cuda::std::pair<input_t, input_value_t>>;
-
-  using accum_t = output_tuple_t;
-
-  using init_t = detail::reduce::empty_problem_init_t<accum_t>;
-
-  // The output value type
+  using input_value_t = detail::it_value_t<InputIteratorT>;
+  using output_tuple_t = detail::non_void_value_t<OutputIteratorT, ::cuda::std::pair<input_t, input_value_t>>;
+  using accum_t = output_tuple_t;
+  using init_t = detail::reduce::empty_problem_init_t<accum_t>;
   using output_value_t = typename output_tuple_t::second_type;

   // Wrapped input iterator to produce index-value <input_t, InputT> tuples
   auto d_indexed_in = THRUST_NS_QUALIFIER::make_transform_iterator(
     THRUST_NS_QUALIFIER::counting_iterator<::cuda::std::int64_t>{0},
     detail::reduce::generate_idx_value<InputIteratorT, output_value_t>(d_in, segment_size));
-
   using arg_index_input_iterator_t = decltype(d_indexed_in);

-  // Initial value
   init_t initial_value{accum_t(1, ::cuda::std::numeric_limits<input_value_t>::lowest())};

   return detail::reduce::DispatchFixedSizeSegmentedReduce<
     arg_index_input_iterator_t,
     OutputIteratorT,
     input_t,
-    [removed line: content not captured in the source diff]
+    detail::arg_max,
     init_t,
     accum_t>::Dispatch(d_temp_storage,
                        temp_storage_bytes,
@@ -1511,7 +1422,7 @@ public:
       d_out,
       num_segments,
       segment_size,
-      [removed line: content not captured in the source diff]
+      detail::arg_max(),
       initial_value,
       stream);
 }
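The fixed-size path above synthesizes its index-value stream from a counting_iterator fed through a transform functor instead of ArgIndexInputIterator. A rough host-side sketch of that construction; idx_value here is a hypothetical stand-in for detail::reduce::generate_idx_value:

#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <cuda/std/utility>
#include <cstdint>

// Hypothetical functor: maps a flat element index to a
// (segment-local index, value) pair, as the arg-reduction consumes it.
struct idx_value
{
  const float* in;
  std::int64_t segment_size;

  __host__ __device__ cuda::std::pair<int, float> operator()(std::int64_t i) const
  {
    return {static_cast<int>(i % segment_size), in[i]};
  }
};

int main()
{
  static const float h_in[6] = {3.f, 1.f, 4.f, 1.f, 5.f, 9.f};
  auto indexed = thrust::make_transform_iterator(
    thrust::counting_iterator<std::int64_t>{0}, idx_value{h_in, 3});
  auto p = indexed[4]; // element 4 sits at index 1 of its segment, value 5
  return (p.first == 1 && p.second == 5.f) ? 0 : 1;
}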
cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh

@@ -144,11 +144,11 @@ __launch_bounds__(
   auto& temp_storage = vsmem_helper_t::get_temp_storage(shared_temp_storage, global_temp_storage);
   MergeAgent{
     temp_storage.Alias(),
-    [removed lines: content not captured in the source diff]
+    keys1,
+    items1,
     num_keys1,
-    [removed lines: content not captured in the source diff]
+    keys2,
+    items2,
     num_keys2,
     keys_result,
     items_result,
cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh

@@ -44,7 +44,6 @@
 # pragma system_header
 #endif // no system header

-#include <cub/device/dispatch/dispatch_advance_iterators.cuh>
 #include <cub/device/dispatch/kernels/radix_sort.cuh>
 #include <cub/device/dispatch/tuning/tuning_radix_sort.cuh>
 #include <cub/util_debug.cuh>
@@ -1379,14 +1378,6 @@ struct DispatchSegmentedRadixSort
   // Number of radix sort invocations until all segments have been processed
   const auto num_invocations = ::cuda::ceil_div(num_segments, max_num_segments_per_invocation);

-  // If d_begin_offsets and d_end_offsets do not support operator+ then we can't have more than
-  // max_num_segments_per_invocation segments per invocation
-  if (num_invocations > 1
-      && !detail::all_iterators_support_add_assign_operator(::cuda::std::int64_t{}, d_begin_offsets, d_end_offsets))
-  {
-    return cudaErrorInvalidValue;
-  }
-
   BeginOffsetIteratorT begin_offsets_current_it = d_begin_offsets;
   EndOffsetIteratorT end_offsets_current_it = d_end_offsets;
@@ -1435,8 +1426,8 @@ struct DispatchSegmentedRadixSort

   if (invocation_index + 1 < num_invocations)
   {
-    [removed lines: content not captured in the source diff]
+    begin_offsets_current_it += num_current_segments;
+    end_offsets_current_it += num_current_segments;
   }

   // Sync the stream if specified to flush runtime errors
cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh

@@ -46,7 +46,6 @@

 #include <cub/detail/launcher/cuda_runtime.cuh>
 #include <cub/detail/type_traits.cuh> // for cub::detail::invoke_result_t
-#include <cub/device/dispatch/dispatch_advance_iterators.cuh>
 #include <cub/device/dispatch/kernels/reduce.cuh>
 #include <cub/device/dispatch/kernels/segmented_reduce.cuh>
 #include <cub/device/dispatch/tuning/tuning_reduce.cuh>
@@ -823,17 +822,6 @@ struct DispatchSegmentedReduce
     static_cast<::cuda::std::int64_t>(::cuda::std::numeric_limits<::cuda::std::int32_t>::max());
   const ::cuda::std::int64_t num_invocations = ::cuda::ceil_div(num_segments, num_segments_per_invocation);

-  // If we need multiple passes over the segments but the iterators do not support the + operator, we cannot use the
-  // streaming approach and have to fail, returning cudaErrorInvalidValue. This is because c.parallel passes
-  // indirect_arg_t as the iterator type, which does not support the + operator.
-  // TODO (elstehle): Remove this check once https://github.com/NVIDIA/cccl/issues/4148 is resolved.
-  if (num_invocations > 1
-      && !detail::all_iterators_support_add_assign_operator(
-        ::cuda::std::int64_t{}, d_out, d_begin_offsets, d_end_offsets))
-  {
-    return cudaErrorInvalidValue;
-  }
-
   for (::cuda::std::int64_t invocation_index = 0; invocation_index < num_invocations; invocation_index++)
   {
     const auto current_seg_offset = invocation_index * num_segments_per_invocation;
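The guard deleted here (and the analogous one in DispatchSegmentedRadixSort above) tested whether every iterator supports += before allowing multiple streaming passes. A freestanding sketch of how such a check can be built; supports_add_assign and all_support_add_assign are hypothetical names, not CCCL's detail::all_iterators_support_add_assign_operator:

#include <cstddef>
#include <cstdint>
#include <type_traits>

// Detect whether `t += offset` is well-formed for an lvalue of T.
template <typename T, typename Offset, typename = void>
struct supports_add_assign : std::false_type
{};

template <typename T, typename Offset>
struct supports_add_assign<
  T, Offset, std::void_t<decltype(std::declval<T&>() += std::declval<Offset>())>>
    : std::true_type
{};

// All-of fold over a pack of iterator types, mirroring the deleted check.
template <typename Offset, typename... Its>
inline constexpr bool all_support_add_assign = (supports_add_assign<Its, Offset>::value && ...);

static_assert(all_support_add_assign<std::int64_t, int*, const float*>, "raw pointers advance");
static_assert(!all_support_add_assign<std::int64_t, std::nullptr_t>, "nullptr_t does not");

int main() { return 0; }

With the guard gone, an iterator that cannot advance would presumably now surface as a compile-time error when the += lines in the next hunk are instantiated, rather than as a cudaErrorInvalidValue at run time.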
@@ -865,9 +853,9 @@ struct DispatchSegmentedReduce

   if (invocation_index + 1 < num_invocations)
   {
-    [removed lines: content not captured in the source diff]
+    d_out += num_current_segments;
+    d_begin_offsets += num_current_segments;
+    d_end_offsets += num_current_segments;
   }

   // Sync the stream if specified to flush runtime errors
@@ -1182,15 +1170,6 @@ struct DispatchFixedSizeSegmentedReduce

   const ::cuda::std::int64_t num_invocations = ::cuda::ceil_div(num_segments, num_segments_per_invocation);

-  // If we need multiple passes over the segments but the iterators do not support the + operator, we cannot use the
-  // streaming approach and have to fail, returning cudaErrorInvalidValue. This is because c.parallel passes
-  // indirect_arg_t as the iterator type, which does not support the + operator.
-  // TODO (srinivas/elstehle): Remove this check once https://github.com/NVIDIA/cccl/issues/4148 is resolved.
-  if (num_invocations > 1 && !detail::all_iterators_support_plus_operator(::cuda::std::int64_t{}, d_in, d_out))
-  {
-    return cudaErrorInvalidValue;
-  }
-
   cudaError error = cudaSuccess;
   for (::cuda::std::int64_t invocation_index = 0; invocation_index < num_invocations; invocation_index++)
   {
@@ -1204,13 +1183,16 @@ struct DispatchFixedSizeSegmentedReduce
   launcher_factory(
     static_cast<::cuda::std::int32_t>(num_current_blocks), ActivePolicyT::ReducePolicy::BLOCK_THREADS, 0, stream)
     .doit(fixed_size_segmented_reduce_kernel,
-      [removed lines: content not captured in the source diff]
+      d_in,
+      d_out,
       segment_size,
       static_cast<::cuda::std::int32_t>(num_current_segments),
       reduction_op,
       init);

+  d_in += num_segments_per_invocation * segment_size;
+  d_out += num_segments_per_invocation;
+
   error = CubDebug(cudaPeekAtLastError());
   if (cudaSuccess != error)
   {