cuda-cccl 0.1.3.2.0.dev438__cp311-cp311-manylinux_2_26_x86_64.whl → 0.3.0__cp311-cp311-manylinux_2_26_x86_64.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of cuda-cccl might be problematic.

Files changed (60)
  1. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +23 -0
  2. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +22 -14
  3. cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
  4. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +18 -26
  5. cuda/cccl/headers/include/cub/device/device_copy.cuh +116 -27
  6. cuda/cccl/headers/include/cub/device/device_partition.cuh +5 -1
  7. cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
  8. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +321 -262
  9. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +8 -0
  10. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +57 -10
  11. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +37 -13
  12. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +203 -51
  13. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +55 -19
  14. cuda/cccl/headers/include/cub/util_device.cuh +51 -35
  15. cuda/cccl/headers/include/cuda/__algorithm/copy.h +3 -3
  16. cuda/cccl/headers/include/cuda/__device/all_devices.h +3 -6
  17. cuda/cccl/headers/include/cuda/__device/arch_traits.h +3 -3
  18. cuda/cccl/headers/include/cuda/__device/attributes.h +7 -7
  19. cuda/cccl/headers/include/cuda/__device/device_ref.h +3 -10
  20. cuda/cccl/headers/include/cuda/__driver/driver_api.h +225 -33
  21. cuda/cccl/headers/include/cuda/__event/event.h +7 -8
  22. cuda/cccl/headers/include/cuda/__event/event_ref.h +4 -5
  23. cuda/cccl/headers/include/cuda/__event/timed_event.h +3 -4
  24. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +3 -3
  25. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +3 -3
  26. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +3 -3
  27. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +14 -10
  28. cuda/cccl/headers/include/cuda/__runtime/types.h +1 -1
  29. cuda/cccl/headers/include/cuda/__stream/stream.h +2 -3
  30. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +17 -12
  31. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +2 -2
  32. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +2 -2
  33. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +2 -0
  34. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +10 -5
  35. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +44 -17
  36. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +1 -1
  37. cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +12 -12
  38. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +1 -8
  39. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +15 -12
  40. cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +3 -0
  41. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +2 -1
  42. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +3 -2
  43. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +8 -22
  44. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +15 -48
  45. cuda/cccl/parallel/experimental/__init__.py +4 -0
  46. cuda/cccl/parallel/experimental/_bindings.pyi +28 -0
  47. cuda/cccl/parallel/experimental/_bindings_impl.pyx +140 -0
  48. cuda/cccl/parallel/experimental/algorithms/__init__.py +4 -0
  49. cuda/cccl/parallel/experimental/algorithms/_reduce.py +0 -2
  50. cuda/cccl/parallel/experimental/algorithms/_scan.py +0 -2
  51. cuda/cccl/parallel/experimental/algorithms/_three_way_partition.py +261 -0
  52. cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-311-x86_64-linux-gnu.so +0 -0
  53. cuda/cccl/parallel/experimental/cu12/cccl/libcccl.c.parallel.so +0 -0
  54. cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-311-x86_64-linux-gnu.so +0 -0
  55. cuda/cccl/parallel/experimental/cu13/cccl/libcccl.c.parallel.so +0 -0
  56. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.0.dist-info}/METADATA +1 -1
  57. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.0.dist-info}/RECORD +59 -57
  58. cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +0 -72
  59. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.0.dist-info}/WHEEL +0 -0
  60. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.0.dist-info}/licenses/LICENSE +0 -0

cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh

@@ -51,6 +51,7 @@
  #include <cub/block/radix_rank_sort_operations.cuh>
  #include <cub/iterator/cache_modified_input_iterator.cuh>
  #include <cub/thread/thread_load.cuh>
+ #include <cub/util_device.cuh>
  #include <cub/util_type.cuh>

  #include <cuda/std/cstdint>
@@ -119,6 +120,28 @@ struct AgentRadixSortDownsweepPolicy : ScalingType
  static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM;
  };

+ #if defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
+ namespace detail
+ {
+ // Only define this when needed.
+ // Because of overload woes, this depends on C++20 concepts. util_device.h checks that concepts are available when
+ // either runtime policies or PTX JSON information are enabled, so if they are, this is always valid. The generic
+ // version is always defined, and that's the only one needed for regular CUB operations.
+ //
+ // TODO: enable this unconditionally once concepts are always available
+ CUB_DETAIL_POLICY_WRAPPER_DEFINE(
+ RadixSortDownsweepAgentPolicy,
+ (GenericAgentPolicy),
+ (BLOCK_THREADS, BlockThreads, int),
+ (ITEMS_PER_THREAD, ItemsPerThread, int),
+ (RADIX_BITS, RadixBits, int),
+ (LOAD_ALGORITHM, LoadAlgorithm, cub::BlockLoadAlgorithm),
+ (LOAD_MODIFIER, LoadModifier, cub::CacheLoadModifier),
+ (RANK_ALGORITHM, RankAlgorithm, cub::RadixRankAlgorithm),
+ (SCAN_ALGORITHM, ScanAlgorithm, cub::BlockScanAlgorithm))
+ } // namespace detail
+ #endif // defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
+
  /******************************************************************************
  * Thread block abstractions
  ******************************************************************************/

cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh

@@ -48,33 +48,41 @@

  CUB_NAMESPACE_BEGIN

- template <int WARP_THREADS_ARG,
+ template <int BLOCK_THREADS_ARG,
+ int WARP_THREADS_ARG,
  int ITEMS_PER_THREAD_ARG,
  cub::WarpLoadAlgorithm LOAD_ALGORITHM_ARG = cub::WARP_LOAD_DIRECT,
  cub::CacheLoadModifier LOAD_MODIFIER_ARG = cub::LOAD_LDG,
  cub::WarpStoreAlgorithm STORE_ALGORITHM_ARG = cub::WARP_STORE_DIRECT>
  struct AgentSubWarpMergeSortPolicy
  {
- static constexpr int WARP_THREADS = WARP_THREADS_ARG;
- static constexpr int ITEMS_PER_THREAD = ITEMS_PER_THREAD_ARG;
- static constexpr int ITEMS_PER_TILE = WARP_THREADS * ITEMS_PER_THREAD;
+ static constexpr int BLOCK_THREADS = BLOCK_THREADS_ARG;
+ static constexpr int WARP_THREADS = WARP_THREADS_ARG;
+ static constexpr int ITEMS_PER_THREAD = ITEMS_PER_THREAD_ARG;
+ static constexpr int ITEMS_PER_TILE = WARP_THREADS * ITEMS_PER_THREAD;
+ static constexpr int SEGMENTS_PER_BLOCK = BLOCK_THREADS / WARP_THREADS;

  static constexpr cub::WarpLoadAlgorithm LOAD_ALGORITHM = LOAD_ALGORITHM_ARG;
  static constexpr cub::CacheLoadModifier LOAD_MODIFIER = LOAD_MODIFIER_ARG;
  static constexpr cub::WarpStoreAlgorithm STORE_ALGORITHM = STORE_ALGORITHM_ARG;
  };

- template <int BLOCK_THREADS_ARG, typename SmallPolicy, typename MediumPolicy>
- struct AgentSmallAndMediumSegmentedSortPolicy
+ #if defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
+ namespace detail
  {
- static constexpr int BLOCK_THREADS = BLOCK_THREADS_ARG;
- using SmallPolicyT = SmallPolicy;
- using MediumPolicyT = MediumPolicy;
-
- static constexpr int SEGMENTS_PER_MEDIUM_BLOCK = BLOCK_THREADS / MediumPolicyT::WARP_THREADS;
-
- static constexpr int SEGMENTS_PER_SMALL_BLOCK = BLOCK_THREADS / SmallPolicyT::WARP_THREADS;
- };
+ CUB_DETAIL_POLICY_WRAPPER_DEFINE(
+ SubWarpMergeSortAgentPolicy,
+ (GenericAgentPolicy),
+ (BLOCK_THREADS, BlockThreads, int),
+ (WARP_THREADS, WarpThreads, int),
+ (ITEMS_PER_THREAD, ItemsPerThread, int),
+ (ITEMS_PER_TILE, ItemsPerTile, int),
+ (SEGMENTS_PER_BLOCK, SegmentsPerBlock, int),
+ (LOAD_ALGORITHM, LoadAlgorithm, cub::WarpLoadAlgorithm),
+ (LOAD_MODIFIER, LoadModifier, cub::CacheLoadModifier),
+ (STORE_ALGORITHM, StoreAlgorithm, cub::WarpStoreAlgorithm))
+ } // namespace detail
+ #endif // defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)

  namespace detail
  {

cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh (new file)

@@ -0,0 +1,432 @@
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+ //! @file
+ //! The @c cub::BlockLoadToShared class provides a :ref:`collective <collective-primitives>` method for asynchronously
+ //! loading data from global to shared memory.
+
+ #pragma once
+
+ #include <cub/config.cuh>
+
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+ #  pragma GCC system_header
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+ #  pragma clang system_header
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+ #  pragma system_header
+ #endif // no system header
+
+ #include <cub/util_ptx.cuh>
+ #include <cub/util_type.cuh>
+
+ #include <thrust/type_traits/is_trivially_relocatable.h>
+
+ #include <cuda/cmath>
+ #include <cuda/memory>
+ #include <cuda/ptx>
+ #include <cuda/std/__algorithm/max.h>
+ #include <cuda/std/__bit/has_single_bit.h>
+ #include <cuda/std/cstdint>
+ #include <cuda/std/span>
+
+ #include <nv/target>
+
+ CUB_NAMESPACE_BEGIN
+
+ namespace detail
+ {
+
+ //! @rst
+ //! The @c BlockLoadToShared class provides a :ref:`collective <collective-primitives>` method for asynchronously
+ //! loading data from global to shared memory.
+ //!
+ //! Overview
+ //! +++++++++++++++++++++++++++++++++++++++++++++
+ //!
+ //! - Given one or more spans of input elements in global memory and buffers in shared memory, this primitive
+ //! asynchronously copies the elements to shared memory and takes care of synchronization.
+ //! - @rowmajor
+ //! - Shared memory buffers are assumed to be aligned according to `SharedBufferAlignBytes<T>()`.
+ //! - Global memory spans are by default assumed to be aligned according to the value type. Higher alignment guarantees
+ //! can optionally be specified.
+ //! - After one or more calls to `CopyAsync`, `Commit` needs to be called before optionally doing other work and then
+ //! calling `Wait` which guarantees the data to be available in shared memory and resets the state and allows for the
+ //! next wave of `CopyAsync`.
+ //!
+ //! Performance Considerations
+ //! +++++++++++++++++++++++++++++++++++++++++++++
+ //!
+ //! - Uses special instructions/hardware acceleration when available (cp.async.bulk on Hopper+, copy.async on Ampere).
+ //! - By guaranteeing 16 byte alignment and size multiple for the global span, a faster path is taken.
+ template <int BlockDimX, int BlockDimY = 1, int BlockDimZ = 1>
+ struct BlockLoadToShared
+ {
+ private:
+ /// Constants
+ static constexpr int block_threads = BlockDimX * BlockDimY * BlockDimZ;
+ // The alignment needed for cp.async.bulk and L1-skipping cp.async
+ static constexpr int minimum_align = 16;
+
+ // Helper for fallback to gmem->reg->smem
+ struct alignas(minimum_align) vec_load_t
+ {
+ char c_array[minimum_align];
+ };
+
+ struct _TempStorage
+ {
+ ::cuda::std::uint64_t mbarrier_handle;
+ };
+
+ #ifdef CCCL_ENABLE_DEVICE_ASSERTIONS
+ enum struct State
+ {
+ ready_to_copy,
+ ready_to_copy_or_commit,
+ committed,
+ invalidated,
+ };
+ #endif // CCCL_ENABLE_DEVICE_ASSERTIONS
+
+ /// Shared storage reference
+ _TempStorage& temp_storage;
+
+ const int linear_tid{cub::RowMajorTid(BlockDimX, BlockDimY, BlockDimZ)};
+
+ // Thread selection for uniform operations
+ const bool elected{__elect_thread()};
+ // Keep track of current mbarrier phase for waiting.
+ uint32_t phase_parity{};
+ // Keep track of the amount of bytes from multiple transactions for Commit() (only needed for TMA).
+ // Also used to check for proper ordering of member function calls in debug mode.
+ uint32_t num_bytes_bulk_total{};
+
+ #ifdef CCCL_ENABLE_DEVICE_ASSERTIONS
+ State state{State::ready_to_copy};
+ #endif // CCCL_ENABLE_DEVICE_ASSERTIONS
+
+ /// Internal storage allocator
+ _CCCL_DEVICE _CCCL_FORCEINLINE _TempStorage& __private_storage()
+ {
+ __shared__ _TempStorage private_storage;
+ return private_storage;
+ }
+
+ _CCCL_DEVICE _CCCL_FORCEINLINE bool __elect_thread() const
+ {
+ // Otherwise elect.sync in the last warp with a full mask is UB.
+ static_assert(block_threads % cub::detail::warp_threads == 0, "The block size must be a multiple of the warp size");
+ return NV_DISPATCH_TARGET(
+ NV_PROVIDES_SM_90,
+ ( // Use last warp to try to avoid having the elected thread also working on the peeling in the first warp.
+ (linear_tid >= block_threads - cub::detail::warp_threads) && ::cuda::ptx::elect_sync(~0u)),
+ NV_IS_DEVICE,
+ (linear_tid == 0));
+ }
+
+ _CCCL_DEVICE _CCCL_FORCEINLINE void __init_mbarrier()
+ {
+ {
+ NV_IF_TARGET(NV_PROVIDES_SM_90,
+ (if (elected) { ::cuda::ptx::mbarrier_init(&temp_storage.mbarrier_handle, 1); }
+ // TODO The following sync was added to avoid a racecheck posititive. Is it really needed?
+ __syncthreads();));
+ }
+ }
+
+ _CCCL_DEVICE _CCCL_FORCEINLINE void __copy_aligned_async_bulk(char* smem_dst, const char* gmem_src, int num_bytes)
+ {
+ if (elected)
+ {
+ #if __cccl_ptx_isa >= 860
+ NV_IF_TARGET(
+ NV_PROVIDES_SM_90,
+ (::cuda::ptx::cp_async_bulk(
+ ::cuda::ptx::space_shared,
+ ::cuda::ptx::space_global,
+ smem_dst,
+ gmem_src,
+ num_bytes,
+ &temp_storage.mbarrier_handle);));
+ #else
+ NV_IF_TARGET(
+ NV_PROVIDES_SM_90,
+ (::cuda::ptx::cp_async_bulk(
+ ::cuda::ptx::space_cluster,
+ ::cuda::ptx::space_global,
+ smem_dst,
+ gmem_src,
+ num_bytes,
+ &temp_storage.mbarrier_handle);));
+ #endif // __cccl_ptx_isa >= 800
+ // Needed for arrival on mbarrier in Commit()
+ num_bytes_bulk_total += num_bytes;
+ }
+ }
+
+ _CCCL_DEVICE _CCCL_FORCEINLINE void __copy_aligned_async(char* smem_dst, const char* gmem_src, int num_bytes)
+ {
+ for (int offset = linear_tid * minimum_align; offset < num_bytes; offset += block_threads * minimum_align)
+ {
+ [[maybe_unused]] const auto thread_src = gmem_src + offset;
+ [[maybe_unused]] const auto thread_dst = smem_dst + offset;
+ // LDGSTS borrowed from cuda::memcpy_async, assumes 16 byte alignment to avoid L1 (.cg)
+ NV_IF_TARGET(NV_PROVIDES_SM_80,
+ (asm volatile("cp.async.cg.shared.global [%0], [%1], %2, %2;" : : "r"(
+ static_cast<::cuda::std::uint32_t>(::__cvta_generic_to_shared(thread_dst))),
+ "l"(thread_src),
+ "n"(16) : "memory");));
+ }
+ }
+
+ _CCCL_DEVICE _CCCL_FORCEINLINE void __copy_aligned_fallback(char* smem_dst, const char* gmem_src, int num_bytes)
+ {
+ for (int offset = linear_tid * minimum_align; offset < num_bytes; offset += block_threads * minimum_align)
+ {
+ const auto thread_src = gmem_src + offset;
+ const auto thread_dst = smem_dst + offset;
+ *::cuda::ptr_rebind<vec_load_t>(thread_dst) = *::cuda::ptr_rebind<vec_load_t>(thread_src);
+ }
+ }
+
+ _CCCL_DEVICE _CCCL_FORCEINLINE void __copy_aligned(char* smem_dst, const char* gmem_src, int num_bytes)
+ {
+ NV_DISPATCH_TARGET(
+ NV_PROVIDES_SM_90,
+ (__copy_aligned_async_bulk(smem_dst, gmem_src, num_bytes);),
+ NV_PROVIDES_SM_80,
+ (__copy_aligned_async(smem_dst, gmem_src, num_bytes);),
+ NV_IS_DEVICE,
+ (__copy_aligned_fallback(smem_dst, gmem_src, num_bytes);));
+ }
+
+ // Dispatch to fallback for waiting pre TMA/SM_90
+ _CCCL_DEVICE _CCCL_FORCEINLINE bool __try_wait()
+ {
+ NV_DISPATCH_TARGET(
+ NV_PROVIDES_SM_90,
+ (return ::cuda::ptx::mbarrier_try_wait_parity(&temp_storage.mbarrier_handle, phase_parity);),
+ NV_PROVIDES_SM_80,
+ (asm volatile("cp.async.wait_group 0;" :: : "memory"); //
+ __syncthreads();
+ return true;),
+ NV_IS_DEVICE,
+ (__syncthreads(); //
+ return true;));
+ }
+
+ public:
+ /// @smemstorage{BlockLoadToShared}
+ using TempStorage = cub::Uninitialized<_TempStorage>;
+
+ //! @name Collective constructors
+ //! @{
+
+ //! @brief Collective constructor using a private static allocation of shared memory as temporary storage.
+ _CCCL_DEVICE _CCCL_FORCEINLINE BlockLoadToShared()
+ : temp_storage(__private_storage())
+ {
+ __init_mbarrier();
+ }
+
+ //! @brief Collective constructor using the specified memory allocation as temporary storage.
+ //!
+ //! @param[in] temp_storage
+ //! Reference to memory allocation having layout type TempStorage
+ _CCCL_DEVICE _CCCL_FORCEINLINE BlockLoadToShared(TempStorage& temp_storage)
+ : temp_storage(temp_storage.Alias())
+ {
+ _CCCL_ASSERT(::cuda::device::is_object_from(temp_storage, ::cuda::device::address_space::shared),
+ "temp_storage has to be in shared memory");
+ __init_mbarrier();
+ }
+
+ _CCCL_DEVICE BlockLoadToShared(const BlockLoadToShared<BlockDimX, BlockDimY, BlockDimZ>&) = delete;
+
+ //! @} end member group
+
+ _CCCL_DEVICE BlockLoadToShared& operator=(const BlockLoadToShared<BlockDimX, BlockDimY, BlockDimZ>&) = delete;
+
+ //! @brief Invalidates underlying @c mbarrier enabling reuse of its temporary storage.
+ //! @note
+ //! Block-synchronization is needed after calling `Invalidate()` to reuse the shared memory from the temporary
+ //! storage.
+ // This is not the destructor to avoid overhead when shared memory reuse is not needed.
+ _CCCL_DEVICE _CCCL_FORCEINLINE void Invalidate()
+ {
+ #ifdef CCCL_ENABLE_DEVICE_ASSERTIONS
+ _CCCL_ASSERT(state == State::ready_to_copy, "Wait() must be called before Invalidate()");
+ state = State::invalidated;
+ #endif // CCCL_ENABLE_DEVICE_ASSERTIONS
+ // Make sure all threads are done interacting with the mbarrier
+ __syncthreads();
+ if (elected)
+ {
+ NV_IF_TARGET(NV_PROVIDES_SM_90,
+ (
+ // Borrowed from cuda::barrier
+ // TODO Make this available through cuda::ptx::
+ asm volatile("mbarrier.inval.shared.b64 [%0];" ::"r"(static_cast<::cuda::std::uint32_t>(
+ ::__cvta_generic_to_shared(&temp_storage.mbarrier_handle))) : "memory");));
+ }
+ // Make sure the elected thread is done invalidating the mbarrier
+ __syncthreads();
+ }
+
+ //! @brief Copy elements from global to shared memory
+ //! @tparam T
+ //! **[inferred]** Value type for this transaction
+ //! @tparam GmemAlign
+ //! Guaranteed alignment in bytes of the source range (both begin and end) in global memory
+ //! @param[in] smem_dst
+ //! Destination buffer in shared memory that is aligned to `SharedBufferAlignBytes<T>()` and at least
+ //! `SharedBufferSizeBytes<T, GmemAlign>(size(gmem_src))` big.
+ //! @param[in] gmem_src
+ //! Source range in global memory, determines the size of the transaction
+ //! @return
+ //! The range in shared memory (same size as `gmem_src`) which should be used to access the data after `Commit` and
+ //! `Wait`.
+ // TODO Allow spans with static sizes?
+ template <typename T, int GmemAlign = alignof(T)>
+ [[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE ::cuda::std::span<T>
+ CopyAsync(::cuda::std::span<char> smem_dst, ::cuda::std::span<const T> gmem_src)
+ {
+ // TODO Should this be weakened to thrust::is_trivially_relocatable?
+ static_assert(THRUST_NS_QUALIFIER::is_trivially_relocatable_v<T>);
+ static_assert(::cuda::std::has_single_bit(unsigned{GmemAlign}));
+ static_assert(GmemAlign >= int{alignof(T)});
+ constexpr bool bulk_aligned = GmemAlign >= minimum_align;
+ // Avoid 64b multiplication in span::size_bytes()
+ const int num_bytes = static_cast<int>(sizeof(T)) * static_cast<int>(size(gmem_src));
+ const auto dst_ptr = data(smem_dst);
+ const auto src_ptr = ::cuda::ptr_rebind<char>(data(gmem_src));
+ _CCCL_ASSERT(dst_ptr == nullptr || ::cuda::device::is_address_from(dst_ptr, ::cuda::device::address_space::shared),
+ "Destination address needs to point to shared memory");
+ _CCCL_ASSERT(src_ptr == nullptr || ::cuda::device::is_address_from(src_ptr, ::cuda::device::address_space::global),
+ "Source address needs to point to global memory");
+ _CCCL_ASSERT((src_ptr != nullptr && dst_ptr != nullptr) || num_bytes == 0,
+ "Only when the source range is empty are nullptrs allowed");
+ _CCCL_ASSERT(::cuda::is_aligned(src_ptr, GmemAlign),
+ "Begin of global memory range needs to be aligned according to GmemAlign.");
+ _CCCL_ASSERT(::cuda::is_aligned(src_ptr + num_bytes, GmemAlign),
+ "End of global memory range needs to be aligned according to GmemAlign.");
+ _CCCL_ASSERT(::cuda::is_aligned(dst_ptr, SharedBufferAlignBytes<T>()),
+ "Shared memory needs to be 16 byte aligned.");
+ _CCCL_ASSERT((static_cast<int>(size(smem_dst)) >= SharedBufferSizeBytes<T, GmemAlign>(size(gmem_src))),
+ "Shared memory destination buffer must have enough space");
+ #ifdef CCCL_ENABLE_DEVICE_ASSERTIONS
+ _CCCL_ASSERT(state == State::ready_to_copy || state == State::ready_to_copy_or_commit,
+ "Wait() must be called before another CopyAsync()");
+ state = State::ready_to_copy_or_commit;
+ #endif // CCCL_ENABLE_DEVICE_ASSERTIONS
+ if constexpr (bulk_aligned)
+ {
+ __copy_aligned(dst_ptr, src_ptr, num_bytes);
+ return {::cuda::ptr_rebind<T>(data(smem_dst)), size(gmem_src)};
+ }
+ else
+ {
+ const auto src_ptr_aligned = ::cuda::align_up(src_ptr, minimum_align);
+ const int align_diff = static_cast<int>(src_ptr_aligned - src_ptr);
+ const int head_padding_bytes = (minimum_align - align_diff) % minimum_align;
+ const auto actual_dst_ptr = dst_ptr + head_padding_bytes;
+ const int head_peeling_bytes = ::cuda::std::min(align_diff, num_bytes);
+ const int num_bytes_bulk = ::cuda::round_down(num_bytes - head_peeling_bytes, minimum_align);
+ __copy_aligned(actual_dst_ptr + head_peeling_bytes, src_ptr_aligned, num_bytes_bulk);
+
+ // Peel head and tail
+ // Make sure we have enough threads for the worst case of minimum_align bytes on each side.
+ static_assert(block_threads >= 2 * (minimum_align - 1));
+ // |-------------head--------------|--------------------------tail--------------------------|
+ // 0, 1, ... head_peeling_bytes - 1, head_peeling_bytes + num_bytes_bulk, ..., num_bytes - 1
+ const int begin_offset = linear_tid < head_peeling_bytes ? 0 : num_bytes_bulk;
+ if (const int idx = begin_offset + linear_tid; idx < num_bytes)
+ {
+ actual_dst_ptr[idx] = src_ptr[idx];
+ }
+ return {::cuda::ptr_rebind<T>(actual_dst_ptr), size(gmem_src)};
+ }
+ }
+
+ // Avoid need to explicitly specify `T` for non-const src.
+ //! @brief Convenience overload, see `CopyAsync(span<char>, span<const T>)`.
+ template <typename T, int GmemAlign = alignof(T)>
+ [[nodiscard]] _CCCL_DEVICE _CCCL_FORCEINLINE ::cuda::std::span<T>
+ CopyAsync(::cuda::std::span<char> smem_dst, ::cuda::std::span<T> gmem_src)
+ {
+ return CopyAsync<T, GmemAlign>(smem_dst, ::cuda::std::span<const T>{gmem_src});
+ }
+
+ //! @brief Commit one or more @c CopyAsync() calls.
+ _CCCL_DEVICE _CCCL_FORCEINLINE void Commit()
+ {
+ #ifdef CCCL_ENABLE_DEVICE_ASSERTIONS
+ _CCCL_ASSERT(state == State::ready_to_copy_or_commit, "CopyAsync() must be called before Commit()");
+ state = State::committed;
+ #endif // CCCL_ENABLE_DEVICE_ASSERTIONS
+
+ NV_DISPATCH_TARGET(
+ NV_PROVIDES_SM_90,
+ (if (elected) {
+ ::cuda::ptx::mbarrier_arrive_expect_tx(
+ ::cuda::ptx::sem_release,
+ ::cuda::ptx::scope_cta,
+ ::cuda::ptx::space_shared,
+ &temp_storage.mbarrier_handle,
+ num_bytes_bulk_total);
+ num_bytes_bulk_total = 0u;
+ } //
+ __syncthreads();),
+ NV_PROVIDES_SM_80,
+ (asm volatile("cp.async.commit_group ;" :: : "memory");));
+ }
+
+ //! @brief Wait for previously committed copies to arrive. Prepare for next calls to @c CopyAsync() .
+ _CCCL_DEVICE _CCCL_FORCEINLINE void Wait()
+ {
+ #ifdef CCCL_ENABLE_DEVICE_ASSERTIONS
+ _CCCL_ASSERT(state == State::committed, "Commit() must be called before Wait()");
+ state = State::ready_to_copy;
+ #endif // CCCL_ENABLE_DEVICE_ASSERTIONS
+
+ while (!__try_wait())
+ ;
+ phase_parity ^= 1u;
+ }
+
+ // Having these as static members does require using "template" in user code which is kludgy.
+
+ //! @brief Returns the alignment needed for the shared memory destination buffer.
+ //! @tparam T
+ //! Value type to be loaded.
+ template <typename T>
+ _CCCL_HOST_DEVICE static constexpr int SharedBufferAlignBytes()
+ {
+ return (::cuda::std::max) (int{alignof(T)}, minimum_align);
+ }
+
+ //! @brief Returns the size needed for the shared memory destination buffer.
+ //! @tparam T
+ //! Value type to be loaded.
+ //! @tparam GmemAlign
+ //! Guaranteed alignment in bytes of the source range (both begin and end) in global memory
+ //! @param[in] num_items
+ //! Size of the source range in global memory
+ template <typename T, int GmemAlign = alignof(T)>
+ _CCCL_HOST_DEVICE static constexpr int SharedBufferSizeBytes(::cuda::std::size_t num_items)
+ {
+ static_assert(::cuda::std::has_single_bit(unsigned{GmemAlign}));
+ static_assert(GmemAlign >= int{alignof(T)});
+ _CCCL_ASSERT(num_items <= ::cuda::std::size_t{::cuda::std::numeric_limits<int>::max()},
+ "num_items must fit into an int");
+ constexpr bool bulk_aligned = GmemAlign >= minimum_align;
+ const int num_bytes = static_cast<int>(num_items) * int{sizeof(T)};
+ const int extra_space = (bulk_aligned || num_bytes == 0) ? 0 : minimum_align;
+ return bulk_aligned ? num_bytes : (::cuda::round_up(num_bytes, minimum_align) + extra_space);
+ }
+ };
+
+ } // namespace detail
+
+ CUB_NAMESPACE_END
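
For orientation, the call sequence documented in the new header (one or more CopyAsync calls, then Commit, optional independent work, then Wait) could look roughly like the sketch below. This is not code from the package: the kernel, its tile-sum body, and the launch parameters are hypothetical, and the primitive lives in the internal cub::detail namespace, so its interface may change.

#include <cub/block/block_load_to_shared.cuh>

#include <cuda/std/span>

template <int BlockThreads, int ItemsPerThread>
__global__ void tile_sum_kernel(const float* in, float* out, int num_items)
{
  // BlockThreads must be a multiple of the warp size (enforced by a static_assert in the primitive).
  using Loader = cub::detail::BlockLoadToShared<BlockThreads>;
  constexpr int tile_items = BlockThreads * ItemsPerThread;

  __shared__ typename Loader::TempStorage temp_storage;
  // Destination buffer sized and aligned via the static helpers ("template" keyword needed for the dependent Loader).
  __shared__ alignas(Loader::template SharedBufferAlignBytes<float>())
      char smem_buf[Loader::template SharedBufferSizeBytes<float>(tile_items)];

  Loader loader{temp_storage};

  const int tile_begin = blockIdx.x * tile_items;
  const int tile_size  = min(tile_items, num_items - tile_begin); // assumes every block has at least one item

  // Stage this block's tile from global to shared memory.
  ::cuda::std::span<const float> gmem{in + tile_begin, static_cast<size_t>(tile_size)};
  ::cuda::std::span<char> smem{smem_buf, sizeof(smem_buf)};
  ::cuda::std::span<float> tile = loader.CopyAsync(smem, gmem);
  loader.Commit();
  // ... independent work that does not touch the tile could overlap with the copy here ...
  loader.Wait(); // the staged data is now valid in shared memory

  // Illustrative consumption: block-strided partial sums over the staged tile.
  float sum = 0.0f;
  for (int i = threadIdx.x; i < static_cast<int>(tile.size()); i += BlockThreads)
  {
    sum += tile[i];
  }
  atomicAdd(out, sum);
}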

cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh

@@ -1,29 +1,5 @@
- /******************************************************************************
- * Copyright (c) 2011-2024, NVIDIA CORPORATION. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * * Neither the name of the NVIDIA CORPORATION nor the
- * names of its contributors may be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
+ // SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ // SPDX-License-Identifier: BSD-3-Clause

  #pragma once

@@ -113,6 +89,22 @@ template <int Rank, typename Extents>
  return true;
  }

+ template <typename MappingTypeLhs, typename MappingTypeRhs>
+ [[nodiscard]] _CCCL_API bool have_same_strides(const MappingTypeLhs& mapping_lhs, const MappingTypeRhs& mapping_rhs)
+ {
+ auto extents_lhs = mapping_lhs.extents();
+ auto extents_rhs = mapping_rhs.extents();
+ _CCCL_ASSERT(extents_lhs.rank() == extents_rhs.rank(), "extents must have the same rank");
+ for (size_t i = 0; i < extents_lhs.rank(); i++)
+ {
+ if (mapping_lhs.stride(i) != mapping_rhs.stride(i))
+ {
+ return false;
+ }
+ }
+ return true;
+ }
+
  } // namespace detail

  CUB_NAMESPACE_END
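
As a hypothetical illustration (not part of the diff), the new have_same_strides helper compares the strides of two mdspan layout mappings of equal rank element by element, which is useful when deciding whether two views are laid out identically. The example below, with invented names and a host-side caller, shows that a row-major mapping and an equivalent layout_stride mapping compare equal; it assumes the internal header can be included directly.

#include <cub/detail/mdspan_utils.cuh>

#include <cuda/std/array>
#include <cuda/std/mdspan>

bool strides_match_example()
{
  using extents_t = ::cuda::std::dextents<int, 2>;
  const extents_t ext{4, 8};

  // A row-major mapping of a 4 x 8 extent has strides (8, 1).
  ::cuda::std::layout_right::mapping<extents_t> row_major{ext};
  // A layout_stride mapping constructed with the same strides.
  ::cuda::std::layout_stride::mapping<extents_t> strided{ext, ::cuda::std::array<int, 2>{8, 1}};

  // The strides agree element-wise, so this returns true.
  return cub::detail::have_same_strides(row_major, strided);
}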