cuda-cccl 0.1.3.2.0.dev438__cp311-cp311-manylinux_2_24_aarch64.whl → 0.3.1__cp311-cp311-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/cooperative/__init__.py +7 -1
- cuda/cccl/cooperative/experimental/__init__.py +21 -5
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +25 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +24 -19
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
- cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +3 -2
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +18 -26
- cuda/cccl/headers/include/cub/device/device_copy.cuh +116 -27
- cuda/cccl/headers/include/cub/device/device_partition.cuh +5 -1
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +158 -247
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +4 -4
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +8 -26
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -6
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +320 -262
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +10 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +57 -10
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +37 -13
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +204 -55
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +55 -19
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
- cuda/cccl/headers/include/cub/util_device.cuh +51 -35
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
- cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +4 -4
- cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
- cuda/cccl/headers/include/cuda/__device/all_devices.h +47 -147
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +51 -49
- cuda/cccl/headers/include/cuda/__device/attributes.h +177 -127
- cuda/cccl/headers/include/cuda/__device/device_ref.h +32 -51
- cuda/cccl/headers/include/cuda/__device/physical_device.h +120 -91
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +330 -36
- cuda/cccl/headers/include/cuda/__event/event.h +8 -8
- cuda/cccl/headers/include/cuda/__event/event_ref.h +4 -5
- cuda/cccl/headers/include/cuda/__event/timed_event.h +4 -4
- cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
- cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +3 -3
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +3 -3
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +3 -3
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +158 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +8 -120
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +593 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +14 -10
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +4 -3
- cuda/cccl/headers/include/cuda/__runtime/types.h +1 -1
- cuda/cccl/headers/include/cuda/__stream/stream.h +2 -3
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -12
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +2 -2
- cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
- cuda/cccl/headers/include/cuda/algorithm +1 -1
- cuda/cccl/headers/include/cuda/devices +10 -0
- cuda/cccl/headers/include/cuda/iterator +1 -0
- cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
- cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
- cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
- cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +2 -0
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +10 -5
- cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +44 -17
- cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +1 -1
- cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +12 -12
- cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +1 -8
- cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +15 -12
- cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +3 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +2 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +3 -2
- cuda/cccl/headers/include/cuda/std/string_view +12 -5
- cuda/cccl/headers/include/cuda/std/version +1 -4
- cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +8 -22
- cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +15 -48
- cuda/cccl/parallel/experimental/__init__.py +21 -70
- cuda/compute/__init__.py +77 -0
- cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +28 -0
- cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +141 -1
- cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +4 -0
- cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
- cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -4
- cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +4 -6
- cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
- cuda/compute/algorithms/_three_way_partition.py +261 -0
- cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +4 -4
- cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
- cuda/compute/cu12/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/cu13/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +8 -8
- cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
- cuda/coop/__init__.py +8 -0
- cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
- cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
- cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
- cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
- cuda/coop/warp/__init__.py +9 -0
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/METADATA +1 -1
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/RECORD +171 -166
- cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
- cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +0 -72
- cuda/cccl/parallel/experimental/.gitignore +0 -4
- cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/numba_utils.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/typing.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -47,9 +47,7 @@
|
|
|
47
47
|
|
|
48
48
|
CUB_NAMESPACE_BEGIN
|
|
49
49
|
|
|
50
|
-
namespace detail
|
|
51
|
-
{
|
|
52
|
-
namespace reduce
|
|
50
|
+
namespace detail::reduce
|
|
53
51
|
{
|
|
54
52
|
|
|
55
53
|
/**
|
|
@@ -172,6 +170,10 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)
|
|
|
172
170
|
AccumT,
|
|
173
171
|
TransformOpT>;
|
|
174
172
|
|
|
173
|
+
static_assert(sizeof(typename AgentReduceT::TempStorage) <= max_smem_per_block,
|
|
174
|
+
"cub::DeviceReduce ran out of CUDA shared memory, which we judged to be extremely unlikely. Please "
|
|
175
|
+
"file an issue at: https://github.com/NVIDIA/cccl/issues");
|
|
176
|
+
|
|
175
177
|
// Shared memory storage
|
|
176
178
|
__shared__ typename AgentReduceT::TempStorage temp_storage;
|
|
177
179
|
|
|
@@ -253,6 +255,10 @@ CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(
|
|
|
253
255
|
AccumT,
|
|
254
256
|
TransformOpT>;
|
|
255
257
|
|
|
258
|
+
static_assert(sizeof(typename AgentReduceT::TempStorage) <= max_smem_per_block,
|
|
259
|
+
"cub::DeviceReduce ran out of CUDA shared memory, which we judged to be extremely unlikely. Please "
|
|
260
|
+
"file an issue at: https://github.com/NVIDIA/cccl/issues");
|
|
261
|
+
|
|
256
262
|
// Shared memory storage
|
|
257
263
|
__shared__ typename AgentReduceT::TempStorage temp_storage;
|
|
258
264
|
|
|
@@ -572,7 +578,6 @@ CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(int(
|
|
|
572
578
|
}
|
|
573
579
|
}
|
|
574
580
|
|
|
575
|
-
} // namespace reduce
|
|
576
|
-
} // namespace detail
|
|
581
|
+
} // namespace detail::reduce
|
|
577
582
|
|
|
578
583
|
CUB_NAMESPACE_END
|
|
@@ -42,9 +42,7 @@
|
|
|
42
42
|
|
|
43
43
|
CUB_NAMESPACE_BEGIN
|
|
44
44
|
|
|
45
|
-
namespace detail
|
|
46
|
-
{
|
|
47
|
-
namespace scan
|
|
45
|
+
namespace detail::scan
|
|
48
46
|
{
|
|
49
47
|
|
|
50
48
|
/******************************************************************************
|
|
@@ -186,7 +184,6 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS))
|
|
|
186
184
|
AgentScanT(temp_storage, d_in, d_out, scan_op, real_init_value).ConsumeRange(num_items, tile_state, start_tile);
|
|
187
185
|
}
|
|
188
186
|
|
|
189
|
-
} // namespace scan
|
|
190
|
-
} // namespace detail
|
|
187
|
+
} // namespace detail::scan
|
|
191
188
|
|
|
192
189
|
CUB_NAMESPACE_END
|
|
@@ -43,9 +43,7 @@
|
|
|
43
43
|
|
|
44
44
|
CUB_NAMESPACE_BEGIN
|
|
45
45
|
|
|
46
|
-
namespace detail
|
|
47
|
-
{
|
|
48
|
-
namespace reduce
|
|
46
|
+
namespace detail::reduce
|
|
49
47
|
{
|
|
50
48
|
|
|
51
49
|
/// Normalize input iterator to segment offset
|
|
@@ -318,7 +316,6 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)
|
|
|
318
316
|
}
|
|
319
317
|
}
|
|
320
318
|
|
|
321
|
-
} // namespace reduce
|
|
322
|
-
} // namespace detail
|
|
319
|
+
} // namespace detail::reduce
|
|
323
320
|
|
|
324
321
|
CUB_NAMESPACE_END
|
|
@@ -29,6 +29,56 @@ using local_segment_index_t = ::cuda::std::uint32_t;
|
|
|
29
29
|
// Type used for total number of segments and to index within segments globally
|
|
30
30
|
using global_segment_offset_t = ::cuda::std::int64_t;
|
|
31
31
|
|
|
32
|
+
template <typename OffsetT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
|
|
33
|
+
struct LargeSegmentsSelectorT
|
|
34
|
+
{
|
|
35
|
+
OffsetT value{};
|
|
36
|
+
BeginOffsetIteratorT d_offset_begin{};
|
|
37
|
+
EndOffsetIteratorT d_offset_end{};
|
|
38
|
+
global_segment_offset_t base_segment_offset{};
|
|
39
|
+
|
|
40
|
+
#if !_CCCL_COMPILER(NVRTC)
|
|
41
|
+
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE
|
|
42
|
+
LargeSegmentsSelectorT(OffsetT value, BeginOffsetIteratorT d_offset_begin, EndOffsetIteratorT d_offset_end)
|
|
43
|
+
: value(value)
|
|
44
|
+
, d_offset_begin(d_offset_begin)
|
|
45
|
+
, d_offset_end(d_offset_end)
|
|
46
|
+
{}
|
|
47
|
+
#endif
|
|
48
|
+
|
|
49
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE bool operator()(local_segment_index_t segment_id) const
|
|
50
|
+
{
|
|
51
|
+
const OffsetT segment_size =
|
|
52
|
+
d_offset_end[base_segment_offset + segment_id] - d_offset_begin[base_segment_offset + segment_id];
|
|
53
|
+
return segment_size > value;
|
|
54
|
+
}
|
|
55
|
+
};
|
|
56
|
+
|
|
57
|
+
template <typename OffsetT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
|
|
58
|
+
struct SmallSegmentsSelectorT
|
|
59
|
+
{
|
|
60
|
+
OffsetT value{};
|
|
61
|
+
BeginOffsetIteratorT d_offset_begin{};
|
|
62
|
+
EndOffsetIteratorT d_offset_end{};
|
|
63
|
+
global_segment_offset_t base_segment_offset{};
|
|
64
|
+
|
|
65
|
+
#if !_CCCL_COMPILER(NVRTC)
|
|
66
|
+
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE
|
|
67
|
+
SmallSegmentsSelectorT(OffsetT value, BeginOffsetIteratorT d_offset_begin, EndOffsetIteratorT d_offset_end)
|
|
68
|
+
: value(value)
|
|
69
|
+
, d_offset_begin(d_offset_begin)
|
|
70
|
+
, d_offset_end(d_offset_end)
|
|
71
|
+
{}
|
|
72
|
+
#endif
|
|
73
|
+
|
|
74
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE bool operator()(local_segment_index_t segment_id) const
|
|
75
|
+
{
|
|
76
|
+
const OffsetT segment_size =
|
|
77
|
+
d_offset_end[base_segment_offset + segment_id] - d_offset_begin[base_segment_offset + segment_id];
|
|
78
|
+
return segment_size < value;
|
|
79
|
+
}
|
|
80
|
+
};
|
|
81
|
+
|
|
32
82
|
/**
|
|
33
83
|
* @brief Fallback kernel, in case there's not enough segments to
|
|
34
84
|
* take advantage of partitioning.
|
|
@@ -89,7 +139,7 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREAD
|
|
|
89
139
|
{
|
|
90
140
|
using ActivePolicyT = typename ChainedPolicyT::ActivePolicy;
|
|
91
141
|
using LargeSegmentPolicyT = typename ActivePolicyT::LargeSegmentPolicy;
|
|
92
|
-
using MediumPolicyT = typename ActivePolicyT::SmallAndMediumSegmentedSortPolicyT::MediumPolicyT;
|
|
142
|
+
using MediumPolicyT = typename ActivePolicyT::MediumSegmentPolicy;
|
|
93
143
|
|
|
94
144
|
const auto segment_id = static_cast<local_segment_index_t>(blockIdx.x);
|
|
95
145
|
OffsetT segment_begin = d_begin_offsets[segment_id];
|
|
@@ -253,7 +303,7 @@ template <SortOrder Order,
|
|
|
253
303
|
typename BeginOffsetIteratorT,
|
|
254
304
|
typename EndOffsetIteratorT,
|
|
255
305
|
typename OffsetT>
|
|
256
|
-
__launch_bounds__(ChainedPolicyT::ActivePolicy::SmallAndMediumSegmentedSortPolicyT::BLOCK_THREADS)
|
|
306
|
+
__launch_bounds__(ChainedPolicyT::ActivePolicy::SmallSegmentPolicy::BLOCK_THREADS)
|
|
257
307
|
CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedSortKernelSmall(
|
|
258
308
|
local_segment_index_t small_segments,
|
|
259
309
|
local_segment_index_t medium_segments,
|
|
@@ -272,10 +322,9 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::SmallAndMediumSegmentedSortPolic
|
|
|
272
322
|
const local_segment_index_t tid = threadIdx.x;
|
|
273
323
|
const local_segment_index_t bid = blockIdx.x;
|
|
274
324
|
|
|
275
|
-
using ActivePolicyT = typename ChainedPolicyT::ActivePolicy;
|
|
276
|
-
using SmallAndMediumPolicyT = typename ActivePolicyT::SmallAndMediumSegmentedSortPolicyT;
|
|
277
|
-
using MediumPolicyT = typename SmallAndMediumPolicyT::MediumPolicyT;
|
|
278
|
-
using SmallPolicyT = typename SmallAndMediumPolicyT::SmallPolicyT;
|
|
325
|
+
using ActivePolicyT = typename ChainedPolicyT::ActivePolicy;
|
|
326
|
+
using SmallPolicyT = typename ActivePolicyT::SmallSegmentPolicy;
|
|
327
|
+
using MediumPolicyT = typename ActivePolicyT::MediumSegmentPolicy;
|
|
279
328
|
|
|
280
329
|
constexpr auto threads_per_medium_segment = static_cast<local_segment_index_t>(MediumPolicyT::WARP_THREADS);
|
|
281
330
|
constexpr auto threads_per_small_segment = static_cast<local_segment_index_t>(SmallPolicyT::WARP_THREADS);
|
|
@@ -286,11 +335,9 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::SmallAndMediumSegmentedSortPolic
|
|
|
286
335
|
using SmallAgentWarpMergeSortT =
|
|
287
336
|
sub_warp_merge_sort::AgentSubWarpSort<Order == SortOrder::Descending, SmallPolicyT, KeyT, ValueT, OffsetT>;
|
|
288
337
|
|
|
289
|
-
constexpr auto segments_per_medium_block =
|
|
290
|
-
static_cast<local_segment_index_t>(SmallAndMediumPolicyT::SEGMENTS_PER_MEDIUM_BLOCK);
|
|
338
|
+
constexpr auto segments_per_medium_block = static_cast<local_segment_index_t>(MediumPolicyT::SEGMENTS_PER_BLOCK);
|
|
291
339
|
|
|
292
|
-
constexpr auto segments_per_small_block =
|
|
293
|
-
static_cast<local_segment_index_t>(SmallAndMediumPolicyT::SEGMENTS_PER_SMALL_BLOCK);
|
|
340
|
+
constexpr auto segments_per_small_block = static_cast<local_segment_index_t>(SmallPolicyT::SEGMENTS_PER_BLOCK);
|
|
294
341
|
|
|
295
342
|
__shared__ union
|
|
296
343
|
{
|
|
@@ -202,14 +202,18 @@ _CCCL_HOST_DEVICE _CCCL_CONSTEVAL auto load_store_type()
|
|
|
202
202
|
}
|
|
203
203
|
}
|
|
204
204
|
|
|
205
|
-
template <typename VectorizedPolicy,
|
|
205
|
+
template <typename VectorizedPolicy,
|
|
206
|
+
typename Offset,
|
|
207
|
+
typename F,
|
|
208
|
+
typename RandomAccessIteratorOut,
|
|
209
|
+
typename... RandomAccessIteratorsIn>
|
|
206
210
|
_CCCL_DEVICE void transform_kernel_vectorized(
|
|
207
211
|
Offset num_items,
|
|
208
212
|
int num_elem_per_thread_prefetch,
|
|
209
213
|
bool can_vectorize,
|
|
210
214
|
F f,
|
|
211
215
|
RandomAccessIteratorOut out,
|
|
212
|
-
|
|
216
|
+
RandomAccessIteratorsIn... ins)
|
|
213
217
|
{
|
|
214
218
|
constexpr int block_dim = VectorizedPolicy::block_threads;
|
|
215
219
|
constexpr int items_per_thread = VectorizedPolicy::items_per_thread_vectorized;
|
|
@@ -240,9 +244,12 @@ _CCCL_DEVICE void transform_kernel_vectorized(
|
|
|
240
244
|
constexpr int load_store_size = VectorizedPolicy::load_store_word_size;
|
|
241
245
|
using load_store_t = decltype(load_store_type<load_store_size>());
|
|
242
246
|
using output_t = it_value_t<RandomAccessIteratorOut>;
|
|
243
|
-
using result_t
|
|
247
|
+
using result_t = ::cuda::std::decay_t<::cuda::std::invoke_result_t<F, const it_value_t<RandomAccessIteratorsIn>&...>>;
|
|
244
248
|
// picks output type size if there are no inputs
|
|
245
|
-
constexpr int element_size = int{
|
|
249
|
+
constexpr int element_size = int{first_nonzero_value(
|
|
250
|
+
(sizeof(it_value_t<RandomAccessIteratorsIn>)
|
|
251
|
+
* THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorsIn>) ...,
|
|
252
|
+
size_of<output_t>)};
|
|
246
253
|
constexpr int load_store_count = (items_per_thread * element_size) / load_store_size;
|
|
247
254
|
|
|
248
255
|
static_assert((items_per_thread * element_size) % load_store_size == 0);
|
|
@@ -258,18 +265,35 @@ _CCCL_DEVICE void transform_kernel_vectorized(
|
|
|
258
265
|
|
|
259
266
|
auto provide_array = [&](auto... inputs) {
|
|
260
267
|
// load inputs
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
auto in_vec = reinterpret_cast<const load_store_t*>(in);
|
|
264
|
-
auto input_vec = reinterpret_cast<load_store_t*>(input.data());
|
|
265
|
-
_CCCL_PRAGMA_UNROLL_FULL()
|
|
266
|
-
for (int i = 0; i < load_store_count; ++i)
|
|
268
|
+
[[maybe_unused]] auto load_tile = [](auto in, auto& input) {
|
|
269
|
+
if constexpr (THRUST_NS_QUALIFIER::is_contiguous_iterator_v<decltype(in)>)
|
|
267
270
|
{
|
|
268
|
-
|
|
271
|
+
auto in_vec = reinterpret_cast<const load_store_t*>(in) + threadIdx.x;
|
|
272
|
+
auto input_vec = reinterpret_cast<load_store_t*>(input.data());
|
|
273
|
+
_CCCL_PRAGMA_UNROLL_FULL()
|
|
274
|
+
for (int i = 0; i < load_store_count; ++i)
|
|
275
|
+
{
|
|
276
|
+
input_vec[i] = in_vec[i * VectorizedPolicy::block_threads];
|
|
277
|
+
}
|
|
278
|
+
}
|
|
279
|
+
else
|
|
280
|
+
{
|
|
281
|
+
constexpr int elems = load_store_size / element_size;
|
|
282
|
+
in += threadIdx.x * elems;
|
|
283
|
+
_CCCL_PRAGMA_UNROLL_FULL()
|
|
284
|
+
for (int i = 0; i < load_store_count; ++i)
|
|
285
|
+
{
|
|
286
|
+
_CCCL_PRAGMA_UNROLL_FULL()
|
|
287
|
+
for (int j = 0; j < elems; ++j)
|
|
288
|
+
{
|
|
289
|
+
input[i * elems + j] = in[i * elems * VectorizedPolicy::block_threads + j];
|
|
290
|
+
}
|
|
291
|
+
}
|
|
269
292
|
}
|
|
270
293
|
};
|
|
271
294
|
_CCCL_PDL_GRID_DEPENDENCY_SYNC();
|
|
272
|
-
(
|
|
295
|
+
(load_tile(ins, inputs), ...);
|
|
296
|
+
|
|
273
297
|
// Benchmarks showed up to 38% slowdown on H200 (some improvements as well), so omitted. See #5249 for details.
|
|
274
298
|
// _CCCL_PDL_TRIGGER_NEXT_LAUNCH();
|
|
275
299
|
|
|
@@ -280,7 +304,7 @@ _CCCL_DEVICE void transform_kernel_vectorized(
|
|
|
280
304
|
output[i] = f(inputs[i]...);
|
|
281
305
|
}
|
|
282
306
|
};
|
|
283
|
-
provide_array(uninitialized_array<
|
|
307
|
+
provide_array(uninitialized_array<it_value_t<RandomAccessIteratorsIn>, items_per_thread>{}...);
|
|
284
308
|
|
|
285
309
|
// write output
|
|
286
310
|
if constexpr (can_vectorize_store)
|
|
@@ -43,9 +43,7 @@
|
|
|
43
43
|
|
|
44
44
|
CUB_NAMESPACE_BEGIN
|
|
45
45
|
|
|
46
|
-
namespace detail
|
|
47
|
-
{
|
|
48
|
-
namespace adjacent_difference
|
|
46
|
+
namespace detail::adjacent_difference
|
|
49
47
|
{
|
|
50
48
|
template <typename InputIteratorT, bool MayAlias>
|
|
51
49
|
struct policy_hub
|
|
@@ -64,7 +62,6 @@ struct policy_hub
|
|
|
64
62
|
|
|
65
63
|
using MaxPolicy = Policy500;
|
|
66
64
|
};
|
|
67
|
-
} // namespace adjacent_difference
|
|
68
|
-
} // namespace detail
|
|
65
|
+
} // namespace detail::adjacent_difference
|
|
69
66
|
|
|
70
67
|
CUB_NAMESPACE_END
|
|
@@ -43,9 +43,7 @@
|
|
|
43
43
|
|
|
44
44
|
CUB_NAMESPACE_BEGIN
|
|
45
45
|
|
|
46
|
-
namespace detail
|
|
47
|
-
{
|
|
48
|
-
namespace batch_memcpy
|
|
46
|
+
namespace detail::batch_memcpy
|
|
49
47
|
{
|
|
50
48
|
/**
|
|
51
49
|
* Parameterizable tuning policy type for AgentBatchMemcpy
|
|
@@ -115,7 +113,6 @@ struct policy_hub
|
|
|
115
113
|
|
|
116
114
|
using MaxPolicy = Policy700;
|
|
117
115
|
};
|
|
118
|
-
} // namespace batch_memcpy
|
|
119
|
-
} // namespace detail
|
|
116
|
+
} // namespace detail::batch_memcpy
|
|
120
117
|
|
|
121
118
|
CUB_NAMESPACE_END
|
|
@@ -42,9 +42,7 @@
|
|
|
42
42
|
|
|
43
43
|
CUB_NAMESPACE_BEGIN
|
|
44
44
|
|
|
45
|
-
namespace detail
|
|
46
|
-
{
|
|
47
|
-
namespace for_each
|
|
45
|
+
namespace detail::for_each
|
|
48
46
|
{
|
|
49
47
|
|
|
50
48
|
struct policy_hub_t
|
|
@@ -57,7 +55,6 @@ struct policy_hub_t
|
|
|
57
55
|
using MaxPolicy = policy_500_t;
|
|
58
56
|
};
|
|
59
57
|
|
|
60
|
-
} // namespace for_each
|
|
61
|
-
} // namespace detail
|
|
58
|
+
} // namespace detail::for_each
|
|
62
59
|
|
|
63
60
|
CUB_NAMESPACE_END
|
|
@@ -46,9 +46,7 @@
|
|
|
46
46
|
|
|
47
47
|
CUB_NAMESPACE_BEGIN
|
|
48
48
|
|
|
49
|
-
namespace detail
|
|
50
|
-
{
|
|
51
|
-
namespace histogram
|
|
49
|
+
namespace detail::histogram
|
|
52
50
|
{
|
|
53
51
|
enum class primitive_sample
|
|
54
52
|
{
|
|
@@ -272,7 +270,6 @@ struct policy_hub
|
|
|
272
270
|
|
|
273
271
|
using MaxPolicy = Policy1000;
|
|
274
272
|
};
|
|
275
|
-
} // namespace histogram
|
|
276
|
-
} // namespace detail
|
|
273
|
+
} // namespace detail::histogram
|
|
277
274
|
|
|
278
275
|
CUB_NAMESPACE_END
|
|
@@ -42,9 +42,7 @@
|
|
|
42
42
|
|
|
43
43
|
CUB_NAMESPACE_BEGIN
|
|
44
44
|
|
|
45
|
-
namespace detail
|
|
46
|
-
{
|
|
47
|
-
namespace merge
|
|
45
|
+
namespace detail::merge
|
|
48
46
|
{
|
|
49
47
|
template <typename KeyT, typename ValueT>
|
|
50
48
|
struct policy_hub
|
|
@@ -73,7 +71,6 @@ struct policy_hub
|
|
|
73
71
|
|
|
74
72
|
using max_policy = policy600;
|
|
75
73
|
};
|
|
76
|
-
} // namespace merge
|
|
77
|
-
} // namespace detail
|
|
74
|
+
} // namespace detail::merge
|
|
78
75
|
|
|
79
76
|
CUB_NAMESPACE_END
|
|
@@ -62,6 +62,14 @@ struct MergeSortPolicyWrapper<StaticPolicyT, ::cuda::std::void_t<decltype(Static
|
|
|
62
62
|
{}
|
|
63
63
|
|
|
64
64
|
CUB_DEFINE_SUB_POLICY_GETTER(MergeSort);
|
|
65
|
+
|
|
66
|
+
#if defined(CUB_ENABLE_POLICY_PTX_JSON)
|
|
67
|
+
_CCCL_DEVICE static constexpr auto EncodedPolicy()
|
|
68
|
+
{
|
|
69
|
+
using namespace ptx_json;
|
|
70
|
+
return object<key<"MergeSortPolicy">() = MergeSort().EncodedPolicy()>();
|
|
71
|
+
}
|
|
72
|
+
#endif
|
|
65
73
|
};
|
|
66
74
|
|
|
67
75
|
template <typename PolicyT>
|
|
@@ -46,9 +46,7 @@
|
|
|
46
46
|
|
|
47
47
|
CUB_NAMESPACE_BEGIN
|
|
48
48
|
|
|
49
|
-
namespace detail
|
|
50
|
-
{
|
|
51
|
-
namespace radix
|
|
49
|
+
namespace detail::radix
|
|
52
50
|
{
|
|
53
51
|
// sm90 default
|
|
54
52
|
template <size_t KeySize, size_t ValueSize, size_t OffsetSize>
|
|
@@ -1062,7 +1060,6 @@ struct policy_hub
|
|
|
1062
1060
|
using MaxPolicy = Policy1000;
|
|
1063
1061
|
};
|
|
1064
1062
|
|
|
1065
|
-
} // namespace radix
|
|
1066
|
-
} // namespace detail
|
|
1063
|
+
} // namespace detail::radix
|
|
1067
1064
|
|
|
1068
1065
|
CUB_NAMESPACE_END
|
|
@@ -50,9 +50,7 @@
|
|
|
50
50
|
|
|
51
51
|
CUB_NAMESPACE_BEGIN
|
|
52
52
|
|
|
53
|
-
namespace detail
|
|
54
|
-
{
|
|
55
|
-
namespace reduce_by_key
|
|
53
|
+
namespace detail::reduce_by_key
|
|
56
54
|
{
|
|
57
55
|
enum class primitive_key
|
|
58
56
|
{
|
|
@@ -939,7 +937,6 @@ struct policy_hub
|
|
|
939
937
|
};
|
|
940
938
|
using MaxPolicy = Policy1000;
|
|
941
939
|
};
|
|
942
|
-
} // namespace reduce_by_key
|
|
943
|
-
} // namespace detail
|
|
940
|
+
} // namespace detail::reduce_by_key
|
|
944
941
|
|
|
945
942
|
CUB_NAMESPACE_END
|
|
@@ -52,9 +52,7 @@
|
|
|
52
52
|
|
|
53
53
|
CUB_NAMESPACE_BEGIN
|
|
54
54
|
|
|
55
|
-
namespace detail
|
|
56
|
-
{
|
|
57
|
-
namespace rle
|
|
55
|
+
namespace detail::rle
|
|
58
56
|
{
|
|
59
57
|
enum class primitive_key
|
|
60
58
|
{
|
|
@@ -670,7 +668,6 @@ struct policy_hub
|
|
|
670
668
|
using MaxPolicy = Policy1000;
|
|
671
669
|
};
|
|
672
670
|
} // namespace non_trivial_runs
|
|
673
|
-
} // namespace rle
|
|
674
|
-
} // namespace detail
|
|
671
|
+
} // namespace detail::rle
|
|
675
672
|
|
|
676
673
|
CUB_NAMESPACE_END
|
|
@@ -53,9 +53,7 @@
|
|
|
53
53
|
|
|
54
54
|
CUB_NAMESPACE_BEGIN
|
|
55
55
|
|
|
56
|
-
namespace detail
|
|
57
|
-
{
|
|
58
|
-
namespace scan
|
|
56
|
+
namespace detail::scan
|
|
59
57
|
{
|
|
60
58
|
enum class keep_rejects
|
|
61
59
|
{
|
|
@@ -615,7 +613,6 @@ struct policy_hub
|
|
|
615
613
|
|
|
616
614
|
using MaxPolicy = Policy1000;
|
|
617
615
|
};
|
|
618
|
-
} // namespace scan
|
|
619
|
-
} // namespace detail
|
|
616
|
+
} // namespace detail::scan
|
|
620
617
|
|
|
621
618
|
CUB_NAMESPACE_END
|
|
@@ -49,9 +49,7 @@
|
|
|
49
49
|
|
|
50
50
|
CUB_NAMESPACE_BEGIN
|
|
51
51
|
|
|
52
|
-
namespace detail
|
|
53
|
-
{
|
|
54
|
-
namespace scan_by_key
|
|
52
|
+
namespace detail::scan_by_key
|
|
55
53
|
{
|
|
56
54
|
enum class primitive_accum
|
|
57
55
|
{
|
|
@@ -1007,7 +1005,6 @@ struct policy_hub
|
|
|
1007
1005
|
|
|
1008
1006
|
using MaxPolicy = Policy1000;
|
|
1009
1007
|
};
|
|
1010
|
-
} // namespace scan_by_key
|
|
1011
|
-
} // namespace detail
|
|
1008
|
+
} // namespace detail::scan_by_key
|
|
1012
1009
|
|
|
1013
1010
|
CUB_NAMESPACE_END
|