cuda-cccl 0.3.2__cp313-cp313-manylinux_2_24_aarch64.whl → 0.3.4__cp313-cp313-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (911) hide show
  1. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +12 -38
  2. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +16 -40
  3. cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -28
  4. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +24 -56
  5. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +12 -38
  6. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +31 -56
  7. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +31 -35
  8. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +47 -48
  9. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +39 -42
  10. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +33 -60
  11. cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +18 -44
  12. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +26 -55
  13. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +22 -49
  14. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +15 -41
  15. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +9 -35
  16. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +20 -49
  17. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +14 -40
  18. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +18 -40
  19. cuda/cccl/headers/include/cub/agent/agent_topk.cuh +0 -2
  20. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +20 -46
  21. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +3 -28
  22. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +7 -31
  23. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +10 -34
  24. cuda/cccl/headers/include/cub/block/block_exchange.cuh +120 -154
  25. cuda/cccl/headers/include/cub/block/block_histogram.cuh +28 -52
  26. cuda/cccl/headers/include/cub/block/block_load.cuh +124 -146
  27. cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +0 -16
  28. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +58 -87
  29. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +81 -100
  30. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +92 -156
  31. cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +8 -32
  32. cuda/cccl/headers/include/cub/block/block_reduce.cuh +21 -46
  33. cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +51 -79
  34. cuda/cccl/headers/include/cub/block/block_scan.cuh +94 -401
  35. cuda/cccl/headers/include/cub/block/block_shuffle.cuh +10 -34
  36. cuda/cccl/headers/include/cub/block/block_store.cuh +73 -97
  37. cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +2 -29
  38. cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +5 -29
  39. cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +25 -49
  40. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +12 -34
  41. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +10 -34
  42. cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +3 -27
  43. cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +12 -36
  44. cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +9 -33
  45. cuda/cccl/headers/include/cub/config.cuh +2 -26
  46. cuda/cccl/headers/include/cub/cub.cuh +3 -27
  47. cuda/cccl/headers/include/cub/detail/array_utils.cuh +2 -26
  48. cuda/cccl/headers/include/cub/detail/choose_offset.cuh +2 -28
  49. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +3 -27
  50. cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +0 -2
  51. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -3
  52. cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +2 -28
  53. cuda/cccl/headers/include/cub/detail/integer_utils.cuh +0 -2
  54. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +0 -2
  55. cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +0 -2
  56. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +0 -2
  57. cuda/cccl/headers/include/cub/detail/ptx-json/README.md +7 -12
  58. cuda/cccl/headers/include/cub/detail/ptx-json/array.h +6 -33
  59. cuda/cccl/headers/include/cub/detail/ptx-json/json.h +13 -36
  60. cuda/cccl/headers/include/cub/detail/ptx-json/object.h +9 -38
  61. cuda/cccl/headers/include/cub/detail/ptx-json/string.h +58 -32
  62. cuda/cccl/headers/include/cub/detail/ptx-json/value.h +51 -51
  63. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +7 -31
  64. cuda/cccl/headers/include/cub/detail/rfa.cuh +2 -27
  65. cuda/cccl/headers/include/cub/detail/strong_load.cuh +3 -29
  66. cuda/cccl/headers/include/cub/detail/strong_store.cuh +3 -29
  67. cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +2 -9
  68. cuda/cccl/headers/include/cub/detail/type_traits.cuh +0 -2
  69. cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +6 -31
  70. cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +2 -25
  71. cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +2 -26
  72. cuda/cccl/headers/include/cub/device/device_for.cuh +3 -5
  73. cuda/cccl/headers/include/cub/device/device_histogram.cuh +3 -27
  74. cuda/cccl/headers/include/cub/device/device_memcpy.cuh +2 -26
  75. cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +2 -26
  76. cuda/cccl/headers/include/cub/device/device_partition.cuh +3 -27
  77. cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3 -27
  78. cuda/cccl/headers/include/cub/device/device_reduce.cuh +10 -31
  79. cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +3 -27
  80. cuda/cccl/headers/include/cub/device/device_scan.cuh +16 -34
  81. cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +3 -27
  82. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +3 -27
  83. cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2 -26
  84. cuda/cccl/headers/include/cub/device/device_select.cuh +3 -27
  85. cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +2 -28
  86. cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +2 -27
  87. cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +0 -2
  88. cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +3 -29
  89. cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +14 -34
  90. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +5 -30
  91. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +4 -29
  92. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +5 -32
  93. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +3 -29
  94. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +2 -29
  95. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +1 -2
  96. cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +47 -59
  97. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +21 -30
  98. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +2 -27
  99. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +3 -27
  100. cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +3 -27
  101. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +0 -2
  102. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +51 -36
  103. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +3 -28
  104. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +0 -1
  105. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +27 -55
  106. cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +4 -28
  107. cuda/cccl/headers/include/cub/device/dispatch/kernels/{for_each.cuh → kernel_for_each.cuh} +0 -2
  108. cuda/cccl/headers/include/cub/device/dispatch/kernels/{histogram.cuh → kernel_histogram.cuh} +149 -157
  109. cuda/cccl/headers/include/cub/device/dispatch/kernels/{merge_sort.cuh → kernel_merge_sort.cuh} +0 -2
  110. cuda/cccl/headers/include/cub/device/dispatch/kernels/{radix_sort.cuh → kernel_radix_sort.cuh} +0 -2
  111. cuda/cccl/headers/include/cub/device/dispatch/kernels/{reduce.cuh → kernel_reduce.cuh} +2 -28
  112. cuda/cccl/headers/include/cub/device/dispatch/kernels/{scan.cuh → kernel_scan.cuh} +2 -28
  113. cuda/cccl/headers/include/cub/device/dispatch/kernels/{segmented_reduce.cuh → kernel_segmented_reduce.cuh} +3 -29
  114. cuda/cccl/headers/include/cub/device/dispatch/kernels/{segmented_sort.cuh → kernel_segmented_sort.cuh} +0 -1
  115. cuda/cccl/headers/include/cub/device/dispatch/kernels/{three_way_partition.cuh → kernel_three_way_partition.cuh} +0 -1
  116. cuda/cccl/headers/include/cub/device/dispatch/kernels/{transform.cuh → kernel_transform.cuh} +11 -11
  117. cuda/cccl/headers/include/cub/device/dispatch/kernels/{unique_by_key.cuh → kernel_unique_by_key.cuh} +0 -1
  118. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -26
  119. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -26
  120. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -28
  121. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +6 -26
  122. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -26
  123. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +5 -31
  124. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +31 -33
  125. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +15 -40
  126. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -26
  127. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -28
  128. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +20 -44
  129. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -26
  130. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +20 -45
  131. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +2 -27
  132. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +11 -36
  133. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +0 -1
  134. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +2 -27
  135. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +14 -40
  136. cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +3 -27
  137. cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +3 -27
  138. cuda/cccl/headers/include/cub/grid/grid_queue.cuh +3 -27
  139. cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +3 -27
  140. cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +3 -27
  141. cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +3 -27
  142. cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +3 -27
  143. cuda/cccl/headers/include/cub/thread/thread_load.cuh +3 -28
  144. cuda/cccl/headers/include/cub/thread/thread_operators.cuh +3 -27
  145. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +3 -26
  146. cuda/cccl/headers/include/cub/thread/thread_scan.cuh +3 -29
  147. cuda/cccl/headers/include/cub/thread/thread_search.cuh +3 -27
  148. cuda/cccl/headers/include/cub/thread/thread_simd.cuh +0 -2
  149. cuda/cccl/headers/include/cub/thread/thread_sort.cuh +2 -26
  150. cuda/cccl/headers/include/cub/thread/thread_store.cuh +3 -27
  151. cuda/cccl/headers/include/cub/util_allocator.cuh +3 -27
  152. cuda/cccl/headers/include/cub/util_arch.cuh +3 -29
  153. cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +2 -26
  154. cuda/cccl/headers/include/cub/util_debug.cuh +3 -27
  155. cuda/cccl/headers/include/cub/util_device.cuh +18 -59
  156. cuda/cccl/headers/include/cub/util_macro.cuh +4 -28
  157. cuda/cccl/headers/include/cub/util_math.cuh +2 -28
  158. cuda/cccl/headers/include/cub/util_namespace.cuh +3 -28
  159. cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +3 -27
  160. cuda/cccl/headers/include/cub/util_ptx.cuh +6 -30
  161. cuda/cccl/headers/include/cub/util_temporary_storage.cuh +3 -29
  162. cuda/cccl/headers/include/cub/util_type.cuh +5 -32
  163. cuda/cccl/headers/include/cub/util_vsmem.cuh +2 -28
  164. cuda/cccl/headers/include/cub/version.cuh +2 -26
  165. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +10 -35
  166. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +5 -30
  167. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +15 -39
  168. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +5 -35
  169. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +22 -46
  170. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +3 -27
  171. cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +2 -26
  172. cuda/cccl/headers/include/cub/warp/warp_load.cuh +4 -27
  173. cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +2 -26
  174. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +3 -22
  175. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +3 -27
  176. cuda/cccl/headers/include/cub/warp/warp_store.cuh +4 -27
  177. cuda/cccl/headers/include/cub/warp/warp_utils.cuh +0 -2
  178. cuda/cccl/headers/include/cuda/__barrier/barrier.h +1 -1
  179. cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +0 -1
  180. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +277 -235
  181. cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +0 -1
  182. cuda/cccl/headers/include/cuda/__driver/driver_api.h +13 -0
  183. cuda/cccl/headers/include/cuda/__execution/determinism.h +0 -2
  184. cuda/cccl/headers/include/cuda/__execution/output_ordering.h +0 -2
  185. cuda/cccl/headers/include/cuda/__functional/maximum.h +25 -7
  186. cuda/cccl/headers/include/cuda/__functional/minimum.h +25 -7
  187. cuda/cccl/headers/include/cuda/__functional/minimum_maximum_common.h +52 -0
  188. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +0 -2
  189. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +13 -4
  190. cuda/cccl/headers/include/cuda/__iterator/zip_function.h +4 -2
  191. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +0 -1
  192. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +28 -7
  193. cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +1 -1
  194. cuda/cccl/headers/include/cuda/__memcpy_async/elect_one.h +52 -0
  195. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +2 -3
  196. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +1 -7
  197. cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +0 -1
  198. cuda/cccl/headers/include/cuda/__memory/get_device_address.h +1 -1
  199. cuda/cccl/headers/include/cuda/__memory/ranges_overlap.h +126 -0
  200. cuda/cccl/headers/include/cuda/__memory_resource/any_resource.h +898 -0
  201. cuda/cccl/headers/include/cuda/__memory_resource/device_memory_pool.h +149 -0
  202. cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +3 -3
  203. cuda/cccl/headers/include/cuda/__memory_resource/legacy_managed_memory_resource.h +148 -0
  204. cuda/cccl/headers/include/cuda/__memory_resource/legacy_pinned_memory_resource.h +139 -0
  205. cuda/cccl/headers/include/cuda/__memory_resource/managed_memory_pool.h +146 -0
  206. cuda/cccl/headers/include/cuda/__memory_resource/memory_resource_base.h +578 -0
  207. cuda/cccl/headers/include/cuda/__memory_resource/pinned_memory_pool.h +188 -0
  208. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +3 -3
  209. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +37 -3
  210. cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +13 -3
  211. cuda/cccl/headers/include/cuda/__numeric/div_overflow.h +150 -0
  212. cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +2 -2
  213. cuda/cccl/headers/include/cuda/__numeric/sub_overflow.h +344 -0
  214. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +0 -6
  215. cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +1 -1
  216. cuda/cccl/headers/include/cuda/__ptx/pragmas/enable_smem_spilling.h +47 -0
  217. cuda/cccl/headers/include/cuda/{std/__cuda → __runtime}/api_wrapper.h +3 -3
  218. cuda/cccl/headers/include/cuda/__stream/get_stream.h +0 -1
  219. cuda/cccl/headers/include/cuda/{__fwd/barrier_native_handle.h → __stream/internal_streams.h} +17 -15
  220. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +2 -2
  221. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +1 -0
  222. cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +1 -0
  223. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +2 -1
  224. cuda/cccl/headers/include/cuda/barrier +42 -16
  225. cuda/cccl/headers/include/cuda/memory +1 -0
  226. cuda/cccl/headers/include/cuda/memory_resource +6 -1
  227. cuda/cccl/headers/include/cuda/numeric +2 -0
  228. cuda/cccl/headers/include/cuda/pipeline +3 -2
  229. cuda/cccl/headers/include/cuda/ptx +1 -0
  230. cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +0 -2
  231. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +1 -1
  232. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +115 -58
  233. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +844 -378
  234. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +12 -5
  235. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +31 -0
  236. cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +10 -0
  237. cuda/cccl/headers/include/cuda/std/__atomic/types.h +2 -3
  238. cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +37 -13
  239. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +0 -28
  240. cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +7 -0
  241. cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +10 -0
  242. cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +2 -45
  243. cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +0 -2
  244. cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +8 -0
  245. cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +0 -2
  246. cuda/cccl/headers/include/cuda/std/__chrono/day.h +0 -2
  247. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +13 -17
  248. cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +0 -2
  249. cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +0 -2
  250. cuda/cccl/headers/include/cuda/std/__chrono/month.h +0 -2
  251. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +0 -2
  252. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +0 -2
  253. cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +5 -8
  254. cuda/cccl/headers/include/cuda/std/__chrono/year.h +0 -2
  255. cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +4 -0
  256. cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +2 -3
  257. cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +4 -0
  258. cuda/cccl/headers/include/cuda/std/__cmath/fma.h +4 -0
  259. cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +2 -3
  260. cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +2 -3
  261. cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +2 -3
  262. cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +2 -3
  263. cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +2 -3
  264. cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +2 -3
  265. cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +2 -3
  266. cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +2 -3
  267. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +2 -3
  268. cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +2 -3
  269. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +2 -2
  270. cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +4 -0
  271. cuda/cccl/headers/include/cuda/std/__cmath/roots.h +2 -3
  272. cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +2 -3
  273. cuda/cccl/headers/include/cuda/std/__cmath/traits.h +4 -0
  274. cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +2 -3
  275. cuda/cccl/headers/include/cuda/std/__complex/complex.h +0 -6
  276. cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +2 -2
  277. cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +27 -1
  278. cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +2 -4
  279. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +15 -36
  280. cuda/cccl/headers/include/cuda/std/__exception/exception_macros.h +93 -0
  281. cuda/cccl/headers/include/cuda/std/{detail/libcxx/include/stdexcept → __exception/throw_error.h} +3 -3
  282. cuda/cccl/headers/include/cuda/std/__expected/expected.h +28 -43
  283. cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +2 -10
  284. cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +2 -2
  285. cuda/cccl/headers/include/cuda/std/__functional/bind.h +6 -6
  286. cuda/cccl/headers/include/cuda/std/__functional/function.h +2 -6
  287. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +5 -5
  288. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +5 -0
  289. cuda/cccl/headers/include/cuda/std/__fwd/array.h +2 -2
  290. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +12 -0
  291. cuda/cccl/headers/include/cuda/std/__fwd/expected.h +46 -0
  292. cuda/cccl/headers/include/cuda/std/__fwd/get.h +21 -22
  293. cuda/cccl/headers/include/cuda/std/{detail/libcxx/include/iosfwd → __fwd/ios.h} +5 -10
  294. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +19 -10
  295. cuda/cccl/headers/include/cuda/std/__fwd/optional.h +2 -2
  296. cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +5 -0
  297. cuda/cccl/headers/include/cuda/std/__fwd/span.h +2 -2
  298. cuda/cccl/headers/include/cuda/std/__fwd/string.h +7 -0
  299. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +18 -0
  300. cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +3 -0
  301. cuda/cccl/headers/include/cuda/std/__fwd/unexpected.h +40 -0
  302. cuda/cccl/headers/include/cuda/std/{__type_traits/is_reference_wrapper.h → __fwd/variant.h} +16 -15
  303. cuda/cccl/headers/include/cuda/std/__internal/features.h +14 -0
  304. cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +1 -1
  305. cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +1 -1
  306. cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +58 -40
  307. cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +1 -1
  308. cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +1 -1
  309. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +0 -5
  310. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +4 -18
  311. cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +1 -2
  312. cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +0 -2
  313. cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +0 -2
  314. cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +0 -4
  315. cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +0 -5
  316. cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +3 -10
  317. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +4 -15
  318. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +4 -4
  319. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +4 -4
  320. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +2 -4
  321. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +3 -3
  322. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +1 -1
  323. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +1 -0
  324. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +6 -12
  325. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -5
  326. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +7 -2
  327. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +1 -0
  328. cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +5 -0
  329. cuda/cccl/headers/include/cuda/std/__new/allocate.h +5 -0
  330. cuda/cccl/headers/include/cuda/{__barrier/barrier_native_handle.h → std/__new/device_new.h} +9 -24
  331. cuda/cccl/headers/include/cuda/std/__new_ +1 -0
  332. cuda/cccl/headers/include/cuda/std/__optional/optional.h +5 -4
  333. cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +4 -4
  334. cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +1 -1
  335. cuda/cccl/headers/include/cuda/std/__random/philox_engine.h +562 -0
  336. cuda/cccl/headers/include/cuda/std/__random/seed_seq.h +204 -0
  337. cuda/cccl/headers/include/cuda/std/__random_ +2 -0
  338. cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +7 -19
  339. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -4
  340. cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +5 -4
  341. cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +1 -1
  342. cuda/cccl/headers/include/cuda/std/__string/string_view.h +5 -5
  343. cuda/cccl/headers/include/cuda/std/__tuple_dir/apply.h +82 -0
  344. cuda/cccl/headers/include/cuda/std/__tuple_dir/get.h +122 -0
  345. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +0 -160
  346. cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +123 -129
  347. cuda/cccl/headers/include/cuda/std/__tuple_dir/tie.h +55 -0
  348. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple.h +457 -0
  349. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_cat.h +158 -0
  350. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_constraints.h +286 -0
  351. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +7 -0
  352. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_leaf.h +452 -0
  353. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +1 -2
  354. cuda/cccl/headers/include/cuda/std/__type_traits/is_comparable.h +78 -0
  355. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +1 -1
  356. cuda/cccl/headers/include/cuda/std/__type_traits/is_fully_bounded_array.h +47 -0
  357. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +0 -2
  358. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +4 -24
  359. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +0 -2
  360. cuda/cccl/headers/include/cuda/std/__utility/pair.h +20 -20
  361. cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +0 -2
  362. cuda/cccl/headers/include/cuda/std/__variant/bad_variant_access.h +74 -0
  363. cuda/cccl/headers/include/cuda/std/__variant/comparison.h +207 -0
  364. cuda/cccl/headers/include/cuda/std/__variant/get.h +192 -0
  365. cuda/cccl/headers/include/cuda/std/__variant/hash.h +82 -0
  366. cuda/cccl/headers/include/cuda/std/__variant/sfinae_helpers.h +89 -0
  367. cuda/cccl/headers/include/cuda/std/__variant/variant.h +250 -0
  368. cuda/cccl/headers/include/cuda/std/__variant/variant_access.h +70 -0
  369. cuda/cccl/headers/include/cuda/std/__variant/variant_base.h +683 -0
  370. cuda/cccl/headers/include/cuda/std/__variant/variant_constraints.h +135 -0
  371. cuda/cccl/headers/include/cuda/std/__variant/variant_match.h +126 -0
  372. cuda/cccl/headers/include/cuda/std/__variant/variant_traits.h +184 -0
  373. cuda/cccl/headers/include/cuda/std/__variant/variant_visit.h +225 -0
  374. cuda/cccl/headers/include/cuda/std/__variant/visit.h +148 -0
  375. cuda/cccl/headers/include/cuda/std/array +1 -1
  376. cuda/cccl/headers/include/cuda/std/atomic +1 -1
  377. cuda/cccl/headers/include/cuda/std/bitset +2 -10
  378. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +6 -6
  379. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1 -4
  380. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +3 -6
  381. cuda/cccl/headers/include/cuda/std/functional +1 -1
  382. cuda/cccl/headers/include/cuda/std/initializer_list +8 -0
  383. cuda/cccl/headers/include/cuda/std/inplace_vector +6 -5
  384. cuda/cccl/headers/include/cuda/std/iterator +1 -1
  385. cuda/cccl/headers/include/cuda/std/numbers +0 -2
  386. cuda/cccl/headers/include/cuda/std/ratio +2 -2
  387. cuda/cccl/headers/include/cuda/std/span +2 -2
  388. cuda/cccl/headers/include/cuda/std/string_view +24 -42
  389. cuda/cccl/headers/include/cuda/std/tuple +18 -1
  390. cuda/cccl/headers/include/cuda/std/type_traits +0 -1
  391. cuda/cccl/headers/include/cuda/std/variant +8 -1
  392. cuda/cccl/headers/include/nv/target +2 -6
  393. cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +15 -2
  394. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +0 -2
  395. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +0 -1
  396. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +0 -1
  397. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +0 -2
  398. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +0 -2
  399. cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +0 -2
  400. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +0 -2
  401. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +0 -2
  402. cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +0 -4
  403. cuda/cccl/headers/include/thrust/detail/binary_search.inl +14 -2
  404. cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +2 -7
  405. cuda/cccl/headers/include/thrust/detail/complex/c99math.h +2 -8
  406. cuda/cccl/headers/include/thrust/detail/complex/catrig.h +2 -8
  407. cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +2 -8
  408. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +2 -8
  409. cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +2 -8
  410. cuda/cccl/headers/include/thrust/detail/complex/cexp.h +2 -7
  411. cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +2 -8
  412. cuda/cccl/headers/include/thrust/detail/complex/clog.h +2 -8
  413. cuda/cccl/headers/include/thrust/detail/complex/clogf.h +2 -8
  414. cuda/cccl/headers/include/thrust/detail/complex/cproj.h +2 -7
  415. cuda/cccl/headers/include/thrust/detail/complex/csinh.h +2 -8
  416. cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +2 -8
  417. cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +2 -8
  418. cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +2 -8
  419. cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +2 -8
  420. cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +2 -8
  421. cuda/cccl/headers/include/thrust/detail/complex/math_private.h +2 -8
  422. cuda/cccl/headers/include/thrust/detail/config/device_system.h +2 -0
  423. cuda/cccl/headers/include/thrust/detail/config/host_system.h +2 -0
  424. cuda/cccl/headers/include/thrust/detail/config/namespace.h +0 -1
  425. cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +0 -2
  426. cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +0 -2
  427. cuda/cccl/headers/include/thrust/detail/copy.h +0 -2
  428. cuda/cccl/headers/include/thrust/detail/copy.inl +14 -4
  429. cuda/cccl/headers/include/thrust/detail/copy_if.inl +14 -2
  430. cuda/cccl/headers/include/thrust/detail/count.inl +14 -2
  431. cuda/cccl/headers/include/thrust/detail/equal.inl +14 -2
  432. cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +4 -5
  433. cuda/cccl/headers/include/thrust/detail/extrema.inl +14 -2
  434. cuda/cccl/headers/include/thrust/detail/fill.inl +14 -2
  435. cuda/cccl/headers/include/thrust/detail/find.inl +14 -2
  436. cuda/cccl/headers/include/thrust/detail/for_each.inl +14 -2
  437. cuda/cccl/headers/include/thrust/detail/functional/actor.h +2 -5
  438. cuda/cccl/headers/include/thrust/detail/functional/operators.h +2 -5
  439. cuda/cccl/headers/include/thrust/detail/gather.inl +14 -2
  440. cuda/cccl/headers/include/thrust/detail/generate.inl +14 -2
  441. cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +0 -2
  442. cuda/cccl/headers/include/thrust/detail/inner_product.inl +14 -2
  443. cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -0
  444. cuda/cccl/headers/include/thrust/detail/logical.inl +14 -2
  445. cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +13 -1
  446. cuda/cccl/headers/include/thrust/detail/merge.inl +14 -2
  447. cuda/cccl/headers/include/thrust/detail/mismatch.inl +14 -2
  448. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +0 -4
  449. cuda/cccl/headers/include/thrust/detail/partition.inl +14 -2
  450. cuda/cccl/headers/include/thrust/detail/random_bijection.h +0 -2
  451. cuda/cccl/headers/include/thrust/detail/range/head_flags.h +0 -2
  452. cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +0 -2
  453. cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +0 -6
  454. cuda/cccl/headers/include/thrust/detail/reduce.inl +21 -3
  455. cuda/cccl/headers/include/thrust/detail/reference.h +27 -3
  456. cuda/cccl/headers/include/thrust/detail/remove.inl +14 -2
  457. cuda/cccl/headers/include/thrust/detail/replace.inl +14 -2
  458. cuda/cccl/headers/include/thrust/detail/reverse.inl +14 -2
  459. cuda/cccl/headers/include/thrust/detail/scan.inl +21 -3
  460. cuda/cccl/headers/include/thrust/detail/scatter.inl +14 -2
  461. cuda/cccl/headers/include/thrust/detail/sequence.inl +13 -1
  462. cuda/cccl/headers/include/thrust/detail/set_operations.inl +13 -1
  463. cuda/cccl/headers/include/thrust/detail/sort.inl +13 -1
  464. cuda/cccl/headers/include/thrust/detail/static_assert.h +0 -2
  465. cuda/cccl/headers/include/thrust/detail/static_map.h +0 -3
  466. cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +13 -1
  467. cuda/cccl/headers/include/thrust/detail/tabulate.inl +14 -2
  468. cuda/cccl/headers/include/thrust/detail/temporary_array.h +0 -4
  469. cuda/cccl/headers/include/thrust/detail/temporary_array.inl +0 -1
  470. cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +14 -3
  471. cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +13 -1
  472. cuda/cccl/headers/include/thrust/detail/transform_scan.inl +13 -1
  473. cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +0 -2
  474. cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +0 -2
  475. cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +2 -7
  476. cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +0 -2
  477. cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +0 -4
  478. cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +0 -4
  479. cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +14 -2
  480. cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +14 -2
  481. cuda/cccl/headers/include/thrust/detail/unique.inl +21 -3
  482. cuda/cccl/headers/include/thrust/detail/vector_base.h +0 -2
  483. cuda/cccl/headers/include/thrust/detail/vector_base.inl +0 -2
  484. cuda/cccl/headers/include/thrust/execution_policy.h +10 -9
  485. cuda/cccl/headers/include/thrust/functional.h +0 -2
  486. cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +9 -4
  487. cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +8 -4
  488. cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +0 -1
  489. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +0 -1
  490. cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +0 -1
  491. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +0 -1
  492. cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +0 -1
  493. cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +2 -6
  494. cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +0 -1
  495. cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +0 -2
  496. cuda/cccl/headers/include/thrust/mr/allocator.h +0 -2
  497. cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +9 -4
  498. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +10 -10
  499. cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +0 -2
  500. cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +0 -2
  501. cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +0 -2
  502. cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +8 -4
  503. cuda/cccl/headers/include/thrust/mr/memory_resource.h +0 -2
  504. cuda/cccl/headers/include/thrust/mr/new.h +0 -2
  505. cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +0 -2
  506. cuda/cccl/headers/include/thrust/mr/pool.h +10 -10
  507. cuda/cccl/headers/include/thrust/mr/pool_options.h +4 -6
  508. cuda/cccl/headers/include/thrust/mr/sync_pool.h +0 -2
  509. cuda/cccl/headers/include/thrust/mr/tls_pool.h +0 -2
  510. cuda/cccl/headers/include/thrust/mr/validator.h +0 -2
  511. cuda/cccl/headers/include/thrust/per_device_resource.h +13 -1
  512. cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +0 -2
  513. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +0 -2
  514. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +2 -9
  515. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +0 -2
  516. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +2 -9
  517. cuda/cccl/headers/include/thrust/random/detail/mod.h +2 -9
  518. cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +0 -2
  519. cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +2 -7
  520. cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +2 -9
  521. cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +0 -2
  522. cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +0 -2
  523. cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +0 -2
  524. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +0 -2
  525. cuda/cccl/headers/include/thrust/random/discard_block_engine.h +0 -2
  526. cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +0 -2
  527. cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +0 -2
  528. cuda/cccl/headers/include/thrust/random/normal_distribution.h +0 -2
  529. cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +0 -2
  530. cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +0 -2
  531. cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +0 -2
  532. cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +0 -2
  533. cuda/cccl/headers/include/thrust/random.h +0 -2
  534. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +15 -11
  535. cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +2 -7
  536. cuda/cccl/headers/include/thrust/system/cpp/memory.h +0 -1
  537. cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +0 -2
  538. cuda/cccl/headers/include/thrust/system/cpp/pointer.h +0 -2
  539. cuda/cccl/headers/include/thrust/system/cpp/vector.h +0 -1
  540. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +0 -4
  541. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +0 -1
  542. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +0 -4
  543. cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +2 -9
  544. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +4 -32
  545. cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +2 -9
  546. cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +0 -2
  547. cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +0 -2
  548. cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +23 -2
  549. cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +0 -2
  550. cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +2 -11
  551. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +2 -0
  552. cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +0 -4
  553. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +0 -1
  554. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +0 -5
  555. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +0 -1
  556. cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +0 -2
  557. cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +0 -2
  558. cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +0 -1
  559. cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +2 -8
  560. cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +0 -2
  561. cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +0 -2
  562. cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +2 -26
  563. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +7 -142
  564. cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +0 -2
  565. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +0 -4
  566. cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +0 -2
  567. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +0 -5
  568. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +0 -4
  569. cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +0 -2
  570. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +0 -1
  571. cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +0 -4
  572. cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +0 -4
  573. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +0 -3
  574. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +0 -2
  575. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +3 -5
  576. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +8 -10
  577. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +0 -2
  578. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +0 -1
  579. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +0 -4
  580. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +0 -2
  581. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +1 -7
  582. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +2 -7
  583. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +0 -3
  584. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +0 -4
  585. cuda/cccl/headers/include/thrust/system/cuda/error.h +2 -11
  586. cuda/cccl/headers/include/thrust/system/cuda/memory.h +2 -6
  587. cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +2 -9
  588. cuda/cccl/headers/include/thrust/system/cuda/pointer.h +2 -7
  589. cuda/cccl/headers/include/thrust/system/cuda/vector.h +2 -6
  590. cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +0 -2
  591. cuda/cccl/headers/include/thrust/system/detail/errno.h +0 -2
  592. cuda/cccl/headers/include/thrust/system/detail/error_category.inl +0 -4
  593. cuda/cccl/headers/include/thrust/system/detail/error_code.inl +0 -2
  594. cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +0 -2
  595. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +0 -2
  596. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +0 -2
  597. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +0 -2
  598. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +0 -3
  599. cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +0 -2
  600. cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +0 -2
  601. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +0 -2
  602. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +0 -3
  603. cuda/cccl/headers/include/thrust/system/detail/generic/count.h +0 -2
  604. cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +0 -2
  605. cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +0 -2
  606. cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +0 -2
  607. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +0 -2
  608. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +0 -3
  609. cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +0 -2
  610. cuda/cccl/headers/include/thrust/system/detail/generic/find.h +0 -2
  611. cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +0 -2
  612. cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +0 -2
  613. cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +0 -2
  614. cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +0 -2
  615. cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +0 -2
  616. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +0 -2
  617. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +0 -2
  618. cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +0 -2
  619. cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +0 -2
  620. cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +0 -3
  621. cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +0 -2
  622. cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +0 -2
  623. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +0 -2
  624. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +0 -2
  625. cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +0 -2
  626. cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +0 -2
  627. cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +0 -2
  628. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +0 -2
  629. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +0 -2
  630. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +0 -2
  631. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +0 -3
  632. cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +0 -2
  633. cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +0 -2
  634. cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +0 -2
  635. cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +0 -3
  636. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +0 -2
  637. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +0 -2
  638. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +0 -2
  639. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +0 -2
  640. cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +26 -12
  641. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +0 -2
  642. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +0 -3
  643. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +0 -2
  644. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +0 -2
  645. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +0 -1
  646. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +0 -2
  647. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +0 -2
  648. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +0 -2
  649. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +0 -2
  650. cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +0 -2
  651. cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +0 -2
  652. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +0 -2
  653. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +0 -3
  654. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +0 -2
  655. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +0 -2
  656. cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +0 -2
  657. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +0 -2
  658. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +0 -2
  659. cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +0 -2
  660. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +0 -2
  661. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +0 -2
  662. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +0 -2
  663. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +0 -2
  664. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +0 -2
  665. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +2 -4
  666. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +0 -2
  667. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +0 -3
  668. cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +0 -2
  669. cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +0 -2
  670. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +0 -2
  671. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +0 -2
  672. cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +0 -2
  673. cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +0 -2
  674. cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +0 -2
  675. cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +0 -2
  676. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +76 -5
  677. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +0 -2
  678. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +0 -2
  679. cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +0 -2
  680. cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +0 -2
  681. cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +0 -2
  682. cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +0 -3
  683. cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +0 -2
  684. cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +0 -2
  685. cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +0 -2
  686. cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +0 -2
  687. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +78 -6
  688. cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +0 -4
  689. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +0 -2
  690. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +0 -2
  691. cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +0 -2
  692. cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +0 -2
  693. cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +0 -2
  694. cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +0 -2
  695. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +67 -6
  696. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +310 -11
  697. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +78 -5
  698. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +543 -7
  699. cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +0 -2
  700. cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +0 -2
  701. cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +0 -2
  702. cuda/cccl/headers/include/thrust/system/detail/system_error.inl +0 -2
  703. cuda/cccl/headers/include/thrust/system/error_code.h +0 -4
  704. cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +5 -25
  705. cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +2 -15
  706. cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +5 -25
  707. cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +40 -29
  708. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +11 -28
  709. cuda/cccl/headers/include/thrust/system/omp/detail/count.h +2 -15
  710. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +26 -28
  711. cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +2 -15
  712. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +18 -13
  713. cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +5 -25
  714. cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +2 -15
  715. cuda/cccl/headers/include/thrust/system/omp/detail/find.h +5 -25
  716. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +47 -30
  717. cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +2 -15
  718. cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +2 -15
  719. cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +2 -15
  720. cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +2 -15
  721. cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +2 -15
  722. cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +2 -15
  723. cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +2 -15
  724. cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +2 -15
  725. cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +2 -15
  726. cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +26 -31
  727. cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +2 -15
  728. cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +2 -26
  729. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +35 -27
  730. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +13 -28
  731. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +56 -28
  732. cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +26 -31
  733. cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +2 -15
  734. cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +2 -15
  735. cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +176 -17
  736. cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +8 -15
  737. cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +2 -15
  738. cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +2 -15
  739. cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +2 -15
  740. cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +213 -28
  741. cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +2 -15
  742. cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +2 -15
  743. cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +2 -15
  744. cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +2 -15
  745. cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +2 -15
  746. cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +2 -15
  747. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +2 -15
  748. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +2 -15
  749. cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +21 -30
  750. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +17 -29
  751. cuda/cccl/headers/include/thrust/system/omp/memory.h +51 -9
  752. cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +3 -7
  753. cuda/cccl/headers/include/thrust/system/omp/pointer.h +3 -7
  754. cuda/cccl/headers/include/thrust/system/omp/vector.h +3 -6
  755. cuda/cccl/headers/include/thrust/system/system_error.h +0 -2
  756. cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +4 -25
  757. cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +2 -15
  758. cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +2 -15
  759. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +38 -29
  760. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +91 -24
  761. cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +2 -15
  762. cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +2 -15
  763. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +17 -13
  764. cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +4 -25
  765. cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +2 -15
  766. cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +4 -25
  767. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +47 -28
  768. cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +2 -15
  769. cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +2 -15
  770. cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +2 -15
  771. cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +2 -15
  772. cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +2 -15
  773. cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +2 -15
  774. cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +2 -15
  775. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +254 -29
  776. cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +2 -15
  777. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +25 -31
  778. cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +2 -15
  779. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +95 -29
  780. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +345 -28
  781. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +4 -26
  782. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +32 -42
  783. cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +2 -15
  784. cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +2 -15
  785. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +265 -30
  786. cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +7 -17
  787. cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +2 -15
  788. cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +2 -15
  789. cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +2 -15
  790. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +244 -32
  791. cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +2 -15
  792. cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +2 -15
  793. cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +2 -15
  794. cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +2 -15
  795. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +2 -15
  796. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +2 -15
  797. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +2 -15
  798. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +2 -15
  799. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +23 -33
  800. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +16 -29
  801. cuda/cccl/headers/include/thrust/system/tbb/memory.h +52 -24
  802. cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +4 -22
  803. cuda/cccl/headers/include/thrust/system/tbb/pointer.h +4 -22
  804. cuda/cccl/headers/include/thrust/system/tbb/vector.h +4 -21
  805. cuda/cccl/headers/include/thrust/transform.h +14 -3
  806. cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +0 -4
  807. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +0 -1
  808. cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +0 -4
  809. cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +0 -4
  810. cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +0 -4
  811. cuda/cccl/headers/include/thrust/universal_allocator.h +8 -0
  812. cuda/cccl/headers/include/thrust/universal_vector.h +9 -0
  813. cuda/cccl/headers/include/thrust/zip_function.h +2 -28
  814. cuda/compute/__init__.py +4 -0
  815. cuda/compute/_bindings.pyi +26 -3
  816. cuda/compute/_bindings_impl.pyx +143 -1
  817. cuda/compute/algorithms/__init__.py +9 -5
  818. cuda/compute/algorithms/_sort/__init__.py +23 -0
  819. cuda/compute/algorithms/{_merge_sort.py → _sort/_merge_sort.py} +10 -10
  820. cuda/compute/algorithms/{_radix_sort.py → _sort/_radix_sort.py} +9 -58
  821. cuda/compute/algorithms/_sort/_segmented_sort.py +288 -0
  822. cuda/compute/algorithms/_sort/_sort_common.py +52 -0
  823. cuda/compute/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  824. cuda/compute/cu12/cccl/libcccl.c.parallel.so +0 -0
  825. cuda/compute/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  826. cuda/compute/cu13/cccl/libcccl.c.parallel.so +0 -0
  827. cuda_cccl-0.3.4.dist-info/METADATA +78 -0
  828. {cuda_cccl-0.3.2.dist-info → cuda_cccl-0.3.4.dist-info}/RECORD +830 -867
  829. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +0 -652
  830. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +0 -1365
  831. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +0 -2144
  832. cuda/cccl/headers/include/thrust/detail/integer_math.h +0 -113
  833. cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +0 -51
  834. cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +0 -51
  835. cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +0 -51
  836. cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +0 -51
  837. cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +0 -52
  838. cuda/cccl/headers/include/thrust/system/detail/adl/count.h +0 -51
  839. cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +0 -51
  840. cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +0 -51
  841. cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +0 -51
  842. cuda/cccl/headers/include/thrust/system/detail/adl/find.h +0 -51
  843. cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +0 -51
  844. cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +0 -51
  845. cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +0 -51
  846. cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +0 -51
  847. cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +0 -51
  848. cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +0 -51
  849. cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +0 -51
  850. cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +0 -51
  851. cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +0 -51
  852. cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +0 -51
  853. cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +0 -51
  854. cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +0 -51
  855. cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +0 -51
  856. cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +0 -51
  857. cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +0 -51
  858. cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +0 -51
  859. cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +0 -51
  860. cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +0 -51
  861. cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +0 -51
  862. cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +0 -51
  863. cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +0 -51
  864. cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +0 -51
  865. cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +0 -51
  866. cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +0 -51
  867. cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +0 -51
  868. cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +0 -51
  869. cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +0 -51
  870. cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +0 -51
  871. cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +0 -51
  872. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +0 -51
  873. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +0 -51
  874. cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +0 -51
  875. cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +0 -51
  876. cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +0 -85
  877. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +0 -119
  878. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +0 -145
  879. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +0 -116
  880. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +0 -356
  881. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +0 -124
  882. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +0 -586
  883. cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +0 -74
  884. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +0 -59
  885. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +0 -65
  886. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +0 -87
  887. cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +0 -93
  888. cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +0 -102
  889. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +0 -78
  890. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +0 -65
  891. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +0 -103
  892. cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +0 -87
  893. cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +0 -265
  894. cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +0 -71
  895. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +0 -75
  896. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +0 -73
  897. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +0 -136
  898. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +0 -91
  899. cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +0 -94
  900. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +0 -327
  901. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +0 -98
  902. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +0 -137
  903. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +0 -400
  904. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +0 -87
  905. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +0 -312
  906. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +0 -295
  907. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +0 -71
  908. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +0 -75
  909. cuda_cccl-0.3.2.dist-info/METADATA +0 -42
  910. {cuda_cccl-0.3.2.dist-info → cuda_cccl-0.3.4.dist-info}/WHEEL +0 -0
  911. {cuda_cccl-0.3.2.dist-info → cuda_cccl-0.3.4.dist-info}/licenses/LICENSE +0 -0
@@ -1,30 +1,6 @@
1
- /******************************************************************************
2
- * Copyright (c) 2011, Duane Merrill. All rights reserved.
3
- * Copyright (c) 2011-2024, NVIDIA CORPORATION. All rights reserved.
4
- *
5
- * Redistribution and use in source and binary forms, with or without
6
- * modification, are permitted provided that the following conditions are met:
7
- * * Redistributions of source code must retain the above copyright
8
- * notice, this list of conditions and the following disclaimer.
9
- * * Redistributions in binary form must reproduce the above copyright
10
- * notice, this list of conditions and the following disclaimer in the
11
- * documentation and/or other materials provided with the distribution.
12
- * * Neither the name of the NVIDIA CORPORATION nor the
13
- * names of its contributors may be used to endorse or promote products
14
- * derived from this software without specific prior written permission.
15
- *
16
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
- *
27
- ******************************************************************************/
1
+ // SPDX-FileCopyrightText: Copyright (c) 2011, Duane Merrill. All rights reserved.
2
+ // SPDX-FileCopyrightText: Copyright (c) 2011-2024, NVIDIA CORPORATION. All rights reserved.
3
+ // SPDX-License-Identifier: BSD-3
28
4
 
29
5
  //! @file
30
6
  //! The cub::BlockExchange class provides :ref:`collective <collective-primitives>` methods for
@@ -47,6 +23,7 @@
47
23
  #include <cub/util_type.cuh>
48
24
  #include <cub/warp/warp_exchange.cuh>
49
25
 
26
+ #include <cuda/__cmath/pow2.h>
50
27
  #include <cuda/__ptx/instructions/get_sreg.h>
51
28
  #include <cuda/std/__algorithm/min.h>
52
29
  #include <cuda/std/__type_traits/integral_constant.h>
@@ -123,48 +100,43 @@ CUB_NAMESPACE_BEGIN
123
100
  //! @tparam T
124
101
  //! The data type to be exchanged
125
102
  //!
126
- //! @tparam BLOCK_DIM_X
103
+ //! @tparam BlockDimX
127
104
  //! The thread block length in threads along the X dimension
128
105
  //!
129
- //! @tparam ITEMS_PER_THREAD
106
+ //! @tparam ItemsPerThread
130
107
  //! The number of items partitioned onto each thread.
131
108
  //!
132
- //! @tparam WARP_TIME_SLICING
109
+ //! @tparam WarpTimeSlicing
133
110
  //! **[optional]** When `true`, only use enough shared memory for a single warp's worth of
134
111
  //! tile data, time-slicing the block-wide exchange over multiple synchronized rounds. Yields a smaller memory footprint
135
112
  //! at the expense of decreased parallelism. (Default: false)
136
113
  //!
137
- //! @tparam BLOCK_DIM_Y
114
+ //! @tparam BlockDimY
138
115
  //! **[optional]** The thread block length in threads along the Y dimension (default: 1)
139
116
  //!
140
- //! @tparam BLOCK_DIM_Z
117
+ //! @tparam BlockDimZ
141
118
  //! **[optional]** The thread block length in threads along the Z dimension (default: 1)
142
119
  //!
143
- template <typename T,
144
- int BLOCK_DIM_X,
145
- int ITEMS_PER_THREAD,
146
- bool WARP_TIME_SLICING = false,
147
- int BLOCK_DIM_Y = 1,
148
- int BLOCK_DIM_Z = 1>
120
+ template <typename T, int BlockDimX, int ItemsPerThread, bool WarpTimeSlicing = false, int BlockDimY = 1, int BlockDimZ = 1>
149
121
  class BlockExchange
150
122
  {
151
- static constexpr int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z; ///< The thread block size in threads
123
+ static constexpr int BLOCK_THREADS = BlockDimX * BlockDimY * BlockDimZ; ///< The thread block size in threads
152
124
  static constexpr int WARP_THREADS = detail::warp_threads;
153
125
  static constexpr int WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS; // TODO(bgruber): use ceil_div in
154
126
  // C++14
155
127
  static constexpr int LOG_SMEM_BANKS = detail::log2_smem_banks;
156
128
 
157
- static constexpr int TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD;
158
- static constexpr int TIME_SLICES = WARP_TIME_SLICING ? WARPS : 1;
129
+ static constexpr int TILE_ITEMS = BLOCK_THREADS * ItemsPerThread;
130
+ static constexpr int TIME_SLICES = WarpTimeSlicing ? WARPS : 1;
159
131
  static constexpr int TIME_SLICED_THREADS =
160
- WARP_TIME_SLICING ? ::cuda::std::min(BLOCK_THREADS, WARP_THREADS) : BLOCK_THREADS;
161
- static constexpr int TIME_SLICED_ITEMS = TIME_SLICED_THREADS * ITEMS_PER_THREAD;
132
+ WarpTimeSlicing ? ::cuda::std::min(BLOCK_THREADS, WARP_THREADS) : BLOCK_THREADS;
133
+ static constexpr int TIME_SLICED_ITEMS = TIME_SLICED_THREADS * ItemsPerThread;
162
134
  static constexpr int WARP_TIME_SLICED_THREADS = ::cuda::std::min(BLOCK_THREADS, WARP_THREADS);
163
- static constexpr int WARP_TIME_SLICED_ITEMS = WARP_TIME_SLICED_THREADS * ITEMS_PER_THREAD;
135
+ static constexpr int WARP_TIME_SLICED_ITEMS = WARP_TIME_SLICED_THREADS * ItemsPerThread;
164
136
 
165
137
  // Insert padding to avoid bank conflicts during raking when items per thread is a power of two and > 4 (otherwise
166
138
  // we can typically use 128b loads)
167
- static constexpr bool INSERT_PADDING = ITEMS_PER_THREAD > 4 && PowerOfTwo<ITEMS_PER_THREAD>::VALUE;
139
+ static constexpr bool INSERT_PADDING = ItemsPerThread > 4 && ::cuda::is_power_of_two(ItemsPerThread);
168
140
  static constexpr int PADDING_ITEMS = INSERT_PADDING ? (TIME_SLICED_ITEMS >> LOG_SMEM_BANKS) : 0;
169
141
 
170
142
  /// Shared memory storage layout type
@@ -181,7 +153,7 @@ private:
181
153
  _TempStorage& temp_storage;
182
154
 
183
155
  // TODO(bgruber): can we use signed int here? Only these variables are unsigned:
184
- unsigned int linear_tid = RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z);
156
+ unsigned int linear_tid = RowMajorTid(BlockDimX, BlockDimY, BlockDimZ);
185
157
  unsigned int lane_id = ::cuda::ptx::get_sreg_laneid();
186
158
  unsigned int warp_id = WARPS == 1 ? 0 : linear_tid / WARP_THREADS;
187
159
  unsigned int warp_offset = warp_id * WARP_TIME_SLICED_ITEMS;
@@ -203,14 +175,14 @@ private:
203
175
  //! Items to exchange, converting between **blocked** and **striped** arrangements.
204
176
  template <typename OutputT>
205
177
  _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToStriped(
206
- const T (&input_items)[ITEMS_PER_THREAD],
207
- OutputT (&output_items)[ITEMS_PER_THREAD],
178
+ const T (&input_items)[ItemsPerThread],
179
+ OutputT (&output_items)[ItemsPerThread],
208
180
  ::cuda::std::false_type /*time_slicing*/)
209
181
  {
210
182
  _CCCL_PRAGMA_UNROLL_FULL()
211
- for (int i = 0; i < ITEMS_PER_THREAD; i++)
183
+ for (int i = 0; i < ItemsPerThread; i++)
212
184
  {
213
- int item_offset = linear_tid * ITEMS_PER_THREAD + i;
185
+ int item_offset = linear_tid * ItemsPerThread + i;
214
186
  if constexpr (INSERT_PADDING)
215
187
  {
216
188
  item_offset += item_offset >> LOG_SMEM_BANKS;
@@ -221,7 +193,7 @@ private:
221
193
  __syncthreads();
222
194
 
223
195
  _CCCL_PRAGMA_UNROLL_FULL()
224
- for (int i = 0; i < ITEMS_PER_THREAD; i++)
196
+ for (int i = 0; i < ItemsPerThread; i++)
225
197
  {
226
198
  int item_offset = i * BLOCK_THREADS + linear_tid;
227
199
  if constexpr (INSERT_PADDING)
@@ -242,11 +214,11 @@ private:
242
214
  //! Items to exchange, converting between **blocked** and **striped** arrangements.
243
215
  template <typename OutputT>
244
216
  _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToStriped(
245
- const T (&input_items)[ITEMS_PER_THREAD],
246
- OutputT (&output_items)[ITEMS_PER_THREAD],
217
+ const T (&input_items)[ItemsPerThread],
218
+ OutputT (&output_items)[ItemsPerThread],
247
219
  ::cuda::std::true_type /*time_slicing*/)
248
220
  {
249
- T temp_items[ITEMS_PER_THREAD];
221
+ T temp_items[ItemsPerThread];
250
222
 
251
223
  _CCCL_PRAGMA_UNROLL_FULL()
252
224
  for (int slice = 0; slice < TIME_SLICES; slice++)
@@ -259,9 +231,9 @@ private:
259
231
  if (warp_id == slice)
260
232
  {
261
233
  _CCCL_PRAGMA_UNROLL_FULL()
262
- for (int i = 0; i < ITEMS_PER_THREAD; i++)
234
+ for (int i = 0; i < ItemsPerThread; i++)
263
235
  {
264
- int item_offset = lane_id * ITEMS_PER_THREAD + i;
236
+ int item_offset = lane_id * ItemsPerThread + i;
265
237
  if constexpr (INSERT_PADDING)
266
238
  {
267
239
  item_offset += item_offset >> LOG_SMEM_BANKS;
@@ -273,7 +245,7 @@ private:
273
245
  __syncthreads();
274
246
 
275
247
  _CCCL_PRAGMA_UNROLL_FULL()
276
- for (int i = 0; i < ITEMS_PER_THREAD; i++)
248
+ for (int i = 0; i < ItemsPerThread; i++)
277
249
  {
278
250
  // Read a strip of items
279
251
  const int strip_offset = i * BLOCK_THREADS;
@@ -296,7 +268,7 @@ private:
296
268
 
297
269
  // Copy
298
270
  _CCCL_PRAGMA_UNROLL_FULL()
299
- for (int i = 0; i < ITEMS_PER_THREAD; i++)
271
+ for (int i = 0; i < ItemsPerThread; i++)
300
272
  {
301
273
  output_items[i] = temp_items[i];
302
274
  }
@@ -312,14 +284,14 @@ private:
312
284
  //! Items to exchange, converting between **blocked** and **striped** arrangements.
313
285
  template <typename OutputT>
314
286
  _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToWarpStriped(
315
- const T (&input_items)[ITEMS_PER_THREAD],
316
- OutputT (&output_items)[ITEMS_PER_THREAD],
287
+ const T (&input_items)[ItemsPerThread],
288
+ OutputT (&output_items)[ItemsPerThread],
317
289
  ::cuda::std::false_type /*time_slicing*/)
318
290
  {
319
291
  _CCCL_PRAGMA_UNROLL_FULL()
320
- for (int i = 0; i < ITEMS_PER_THREAD; i++)
292
+ for (int i = 0; i < ItemsPerThread; i++)
321
293
  {
322
- int item_offset = warp_offset + i + (lane_id * ITEMS_PER_THREAD);
294
+ int item_offset = warp_offset + i + (lane_id * ItemsPerThread);
323
295
  if constexpr (INSERT_PADDING)
324
296
  {
325
297
  item_offset += item_offset >> LOG_SMEM_BANKS;
@@ -330,7 +302,7 @@ private:
330
302
  __syncwarp(0xffffffff);
331
303
 
332
304
  _CCCL_PRAGMA_UNROLL_FULL()
333
- for (int i = 0; i < ITEMS_PER_THREAD; i++)
305
+ for (int i = 0; i < ItemsPerThread; i++)
334
306
  {
335
307
  int item_offset = warp_offset + (i * WARP_TIME_SLICED_THREADS) + lane_id;
336
308
  if constexpr (INSERT_PADDING)
@@ -351,16 +323,16 @@ private:
351
323
  //! Items to exchange, converting between **blocked** and **striped** arrangements.
352
324
  template <typename OutputT>
353
325
  _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToWarpStriped(
354
- const T (&input_items)[ITEMS_PER_THREAD],
355
- OutputT (&output_items)[ITEMS_PER_THREAD],
326
+ const T (&input_items)[ItemsPerThread],
327
+ OutputT (&output_items)[ItemsPerThread],
356
328
  ::cuda::std::true_type /*time_slicing*/)
357
329
  {
358
330
  if (warp_id == 0)
359
331
  {
360
332
  _CCCL_PRAGMA_UNROLL_FULL()
361
- for (int i = 0; i < ITEMS_PER_THREAD; i++)
333
+ for (int i = 0; i < ItemsPerThread; i++)
362
334
  {
363
- int item_offset = i + lane_id * ITEMS_PER_THREAD;
335
+ int item_offset = i + lane_id * ItemsPerThread;
364
336
  if constexpr (INSERT_PADDING)
365
337
  {
366
338
  item_offset += item_offset >> LOG_SMEM_BANKS;
@@ -371,7 +343,7 @@ private:
371
343
  __syncwarp(0xffffffff);
372
344
 
373
345
  _CCCL_PRAGMA_UNROLL_FULL()
374
- for (int i = 0; i < ITEMS_PER_THREAD; i++)
346
+ for (int i = 0; i < ItemsPerThread; i++)
375
347
  {
376
348
  int item_offset = i * WARP_TIME_SLICED_THREADS + lane_id;
377
349
  if constexpr (INSERT_PADDING)
@@ -390,9 +362,9 @@ private:
390
362
  if (warp_id == slice)
391
363
  {
392
364
  _CCCL_PRAGMA_UNROLL_FULL()
393
- for (int i = 0; i < ITEMS_PER_THREAD; i++)
365
+ for (int i = 0; i < ItemsPerThread; i++)
394
366
  {
395
- int item_offset = i + lane_id * ITEMS_PER_THREAD;
367
+ int item_offset = i + lane_id * ItemsPerThread;
396
368
  if constexpr (INSERT_PADDING)
397
369
  {
398
370
  item_offset += item_offset >> LOG_SMEM_BANKS;
@@ -403,7 +375,7 @@ private:
403
375
  __syncwarp(0xffffffff);
404
376
 
405
377
  _CCCL_PRAGMA_UNROLL_FULL()
406
- for (int i = 0; i < ITEMS_PER_THREAD; i++)
378
+ for (int i = 0; i < ItemsPerThread; i++)
407
379
  {
408
380
  int item_offset = i * WARP_TIME_SLICED_THREADS + lane_id;
409
381
  if constexpr (INSERT_PADDING)
@@ -426,12 +398,12 @@ private:
426
398
  //! Items to exchange, converting between **blocked** and **striped** arrangements.
427
399
  template <typename OutputT>
428
400
  _CCCL_DEVICE _CCCL_FORCEINLINE void StripedToBlocked(
429
- const T (&input_items)[ITEMS_PER_THREAD],
430
- OutputT (&output_items)[ITEMS_PER_THREAD],
401
+ const T (&input_items)[ItemsPerThread],
402
+ OutputT (&output_items)[ItemsPerThread],
431
403
  ::cuda::std::false_type /*time_slicing*/)
432
404
  {
433
405
  _CCCL_PRAGMA_UNROLL_FULL()
434
- for (int i = 0; i < ITEMS_PER_THREAD; i++)
406
+ for (int i = 0; i < ItemsPerThread; i++)
435
407
  {
436
408
  int item_offset = i * BLOCK_THREADS + linear_tid;
437
409
  if constexpr (INSERT_PADDING)
@@ -445,9 +417,9 @@ private:
445
417
 
446
418
  // No timeslicing
447
419
  _CCCL_PRAGMA_UNROLL_FULL()
448
- for (int i = 0; i < ITEMS_PER_THREAD; i++)
420
+ for (int i = 0; i < ItemsPerThread; i++)
449
421
  {
450
- int item_offset = linear_tid * ITEMS_PER_THREAD + i;
422
+ int item_offset = linear_tid * ItemsPerThread + i;
451
423
  if constexpr (INSERT_PADDING)
452
424
  {
453
425
  item_offset += item_offset >> LOG_SMEM_BANKS;
@@ -466,12 +438,12 @@ private:
466
438
  //! Items to exchange, converting between **blocked** and **striped** arrangements.
467
439
  template <typename OutputT>
468
440
  _CCCL_DEVICE _CCCL_FORCEINLINE void StripedToBlocked(
469
- const T (&input_items)[ITEMS_PER_THREAD],
470
- OutputT (&output_items)[ITEMS_PER_THREAD],
441
+ const T (&input_items)[ItemsPerThread],
442
+ OutputT (&output_items)[ItemsPerThread],
471
443
  ::cuda::std::true_type /*time_slicing*/)
472
444
  {
473
445
  // Warp time-slicing
474
- T temp_items[ITEMS_PER_THREAD];
446
+ T temp_items[ItemsPerThread];
475
447
 
476
448
  _CCCL_PRAGMA_UNROLL_FULL()
477
449
  for (int slice = 0; slice < TIME_SLICES; slice++)
@@ -482,7 +454,7 @@ private:
482
454
  __syncthreads();
483
455
 
484
456
  _CCCL_PRAGMA_UNROLL_FULL()
485
- for (int i = 0; i < ITEMS_PER_THREAD; i++)
457
+ for (int i = 0; i < ItemsPerThread; i++)
486
458
  {
487
459
  // Write a strip of items
488
460
  const int strip_offset = i * BLOCK_THREADS;
@@ -507,9 +479,9 @@ private:
507
479
  if (warp_id == slice)
508
480
  {
509
481
  _CCCL_PRAGMA_UNROLL_FULL()
510
- for (int i = 0; i < ITEMS_PER_THREAD; i++)
482
+ for (int i = 0; i < ItemsPerThread; i++)
511
483
  {
512
- int item_offset = lane_id * ITEMS_PER_THREAD + i;
484
+ int item_offset = lane_id * ItemsPerThread + i;
513
485
  if constexpr (INSERT_PADDING)
514
486
  {
515
487
  item_offset += item_offset >> LOG_SMEM_BANKS;
@@ -521,7 +493,7 @@ private:
521
493
 
522
494
  // Copy
523
495
  _CCCL_PRAGMA_UNROLL_FULL()
524
- for (int i = 0; i < ITEMS_PER_THREAD; i++)
496
+ for (int i = 0; i < ItemsPerThread; i++)
525
497
  {
526
498
  output_items[i] = temp_items[i];
527
499
  }
@@ -537,12 +509,12 @@ private:
537
509
  //! Items to exchange, converting between **blocked** and **striped** arrangements.
538
510
  template <typename OutputT>
539
511
  _CCCL_DEVICE _CCCL_FORCEINLINE void WarpStripedToBlocked(
540
- const T (&input_items)[ITEMS_PER_THREAD],
541
- OutputT (&output_items)[ITEMS_PER_THREAD],
512
+ const T (&input_items)[ItemsPerThread],
513
+ OutputT (&output_items)[ItemsPerThread],
542
514
  ::cuda::std::false_type /*time_slicing*/)
543
515
  {
544
516
  _CCCL_PRAGMA_UNROLL_FULL()
545
- for (int i = 0; i < ITEMS_PER_THREAD; i++)
517
+ for (int i = 0; i < ItemsPerThread; i++)
546
518
  {
547
519
  int item_offset = warp_offset + (i * WARP_TIME_SLICED_THREADS) + lane_id;
548
520
  if constexpr (INSERT_PADDING)
@@ -555,9 +527,9 @@ private:
555
527
  __syncwarp(0xffffffff);
556
528
 
557
529
  _CCCL_PRAGMA_UNROLL_FULL()
558
- for (int i = 0; i < ITEMS_PER_THREAD; i++)
530
+ for (int i = 0; i < ItemsPerThread; i++)
559
531
  {
560
- int item_offset = warp_offset + i + (lane_id * ITEMS_PER_THREAD);
532
+ int item_offset = warp_offset + i + (lane_id * ItemsPerThread);
561
533
  if constexpr (INSERT_PADDING)
562
534
  {
563
535
  item_offset += item_offset >> LOG_SMEM_BANKS;
@@ -576,8 +548,8 @@ private:
576
548
  //! Items to exchange, converting between **blocked** and **striped** arrangements.
577
549
  template <typename OutputT>
578
550
  _CCCL_DEVICE _CCCL_FORCEINLINE void WarpStripedToBlocked(
579
- const T (&input_items)[ITEMS_PER_THREAD],
580
- OutputT (&output_items)[ITEMS_PER_THREAD],
551
+ const T (&input_items)[ItemsPerThread],
552
+ OutputT (&output_items)[ItemsPerThread],
581
553
  ::cuda::std::true_type /*time_slicing*/)
582
554
  {
583
555
  _CCCL_PRAGMA_UNROLL_FULL()
@@ -588,7 +560,7 @@ private:
588
560
  if (warp_id == slice)
589
561
  {
590
562
  _CCCL_PRAGMA_UNROLL_FULL()
591
- for (int i = 0; i < ITEMS_PER_THREAD; i++)
563
+ for (int i = 0; i < ItemsPerThread; i++)
592
564
  {
593
565
  int item_offset = i * WARP_TIME_SLICED_THREADS + lane_id;
594
566
  if constexpr (INSERT_PADDING)
@@ -601,9 +573,9 @@ private:
601
573
  __syncwarp(0xffffffff);
602
574
 
603
575
  _CCCL_PRAGMA_UNROLL_FULL()
604
- for (int i = 0; i < ITEMS_PER_THREAD; i++)
576
+ for (int i = 0; i < ItemsPerThread; i++)
605
577
  {
606
- int item_offset = i + lane_id * ITEMS_PER_THREAD;
578
+ int item_offset = i + lane_id * ItemsPerThread;
607
579
  if constexpr (INSERT_PADDING)
608
580
  {
609
581
  item_offset += item_offset >> LOG_SMEM_BANKS;
@@ -626,13 +598,13 @@ private:
626
598
  //! Corresponding scatter ranks
627
599
  template <typename OutputT, typename OffsetT>
628
600
  _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToBlocked(
629
- const T (&input_items)[ITEMS_PER_THREAD],
630
- OutputT (&output_items)[ITEMS_PER_THREAD],
631
- OffsetT (&ranks)[ITEMS_PER_THREAD],
601
+ const T (&input_items)[ItemsPerThread],
602
+ OutputT (&output_items)[ItemsPerThread],
603
+ OffsetT (&ranks)[ItemsPerThread],
632
604
  ::cuda::std::false_type /*time_slicing*/)
633
605
  {
634
606
  _CCCL_PRAGMA_UNROLL_FULL()
635
- for (int i = 0; i < ITEMS_PER_THREAD; i++)
607
+ for (int i = 0; i < ItemsPerThread; i++)
636
608
  {
637
609
  int item_offset = ranks[i];
638
610
  if constexpr (INSERT_PADDING)
@@ -645,9 +617,9 @@ private:
645
617
  __syncthreads();
646
618
 
647
619
  _CCCL_PRAGMA_UNROLL_FULL()
648
- for (int i = 0; i < ITEMS_PER_THREAD; i++)
620
+ for (int i = 0; i < ItemsPerThread; i++)
649
621
  {
650
- int item_offset = linear_tid * ITEMS_PER_THREAD + i;
622
+ int item_offset = linear_tid * ItemsPerThread + i;
651
623
  if constexpr (INSERT_PADDING)
652
624
  {
653
625
  item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset;
@@ -668,12 +640,12 @@ private:
668
640
  //! Corresponding scatter ranks
669
641
  template <typename OutputT, typename OffsetT>
670
642
  _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToBlocked(
671
- const T (&input_items)[ITEMS_PER_THREAD],
672
- OutputT (&output_items)[ITEMS_PER_THREAD],
673
- OffsetT ranks[ITEMS_PER_THREAD],
643
+ const T (&input_items)[ItemsPerThread],
644
+ OutputT (&output_items)[ItemsPerThread],
645
+ OffsetT ranks[ItemsPerThread],
674
646
  ::cuda::std::true_type /*time_slicing*/)
675
647
  {
676
- T temp_items[ITEMS_PER_THREAD];
648
+ T temp_items[ItemsPerThread];
677
649
 
678
650
  _CCCL_PRAGMA_UNROLL_FULL()
679
651
  for (int slice = 0; slice < TIME_SLICES; slice++)
@@ -683,7 +655,7 @@ private:
683
655
  const int slice_offset = TIME_SLICED_ITEMS * slice;
684
656
 
685
657
  _CCCL_PRAGMA_UNROLL_FULL()
686
- for (int i = 0; i < ITEMS_PER_THREAD; i++)
658
+ for (int i = 0; i < ItemsPerThread; i++)
687
659
  {
688
660
  int item_offset = ranks[i] - slice_offset;
689
661
  if (item_offset >= 0 && item_offset < WARP_TIME_SLICED_ITEMS)
@@ -701,9 +673,9 @@ private:
701
673
  if (warp_id == slice)
702
674
  {
703
675
  _CCCL_PRAGMA_UNROLL_FULL()
704
- for (int i = 0; i < ITEMS_PER_THREAD; i++)
676
+ for (int i = 0; i < ItemsPerThread; i++)
705
677
  {
706
- int item_offset = lane_id * ITEMS_PER_THREAD + i;
678
+ int item_offset = lane_id * ItemsPerThread + i;
707
679
  if constexpr (INSERT_PADDING)
708
680
  {
709
681
  item_offset = (item_offset >> LOG_SMEM_BANKS) + item_offset;
@@ -715,7 +687,7 @@ private:
715
687
 
716
688
  // Copy
717
689
  _CCCL_PRAGMA_UNROLL_FULL()
718
- for (int i = 0; i < ITEMS_PER_THREAD; i++)
690
+ for (int i = 0; i < ItemsPerThread; i++)
719
691
  {
720
692
  output_items[i] = temp_items[i];
721
693
  }
@@ -733,13 +705,13 @@ private:
733
705
  //! Corresponding scatter ranks
734
706
  template <typename OutputT, typename OffsetT>
735
707
  _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStriped(
736
- const T (&input_items)[ITEMS_PER_THREAD],
737
- OutputT (&output_items)[ITEMS_PER_THREAD],
738
- OffsetT (&ranks)[ITEMS_PER_THREAD],
708
+ const T (&input_items)[ItemsPerThread],
709
+ OutputT (&output_items)[ItemsPerThread],
710
+ OffsetT (&ranks)[ItemsPerThread],
739
711
  ::cuda::std::false_type /*time_slicing*/)
740
712
  {
741
713
  _CCCL_PRAGMA_UNROLL_FULL()
742
- for (int i = 0; i < ITEMS_PER_THREAD; i++)
714
+ for (int i = 0; i < ItemsPerThread; i++)
743
715
  {
744
716
  int item_offset = ranks[i];
745
717
  if constexpr (INSERT_PADDING)
@@ -752,7 +724,7 @@ private:
752
724
  __syncthreads();
753
725
 
754
726
  _CCCL_PRAGMA_UNROLL_FULL()
755
- for (int i = 0; i < ITEMS_PER_THREAD; i++)
727
+ for (int i = 0; i < ItemsPerThread; i++)
756
728
  {
757
729
  int item_offset = i * BLOCK_THREADS + linear_tid;
758
730
  if constexpr (INSERT_PADDING)
@@ -775,12 +747,12 @@ private:
775
747
  //! Corresponding scatter ranks
776
748
  template <typename OutputT, typename OffsetT>
777
749
  _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStriped(
778
- const T (&input_items)[ITEMS_PER_THREAD],
779
- OutputT (&output_items)[ITEMS_PER_THREAD],
780
- OffsetT (&ranks)[ITEMS_PER_THREAD],
750
+ const T (&input_items)[ItemsPerThread],
751
+ OutputT (&output_items)[ItemsPerThread],
752
+ OffsetT (&ranks)[ItemsPerThread],
781
753
  ::cuda::std::true_type /*time_slicing*/)
782
754
  {
783
- T temp_items[ITEMS_PER_THREAD];
755
+ T temp_items[ItemsPerThread];
784
756
 
785
757
  _CCCL_PRAGMA_UNROLL_FULL()
786
758
  for (int slice = 0; slice < TIME_SLICES; slice++)
@@ -791,7 +763,7 @@ private:
791
763
  __syncthreads();
792
764
 
793
765
  _CCCL_PRAGMA_UNROLL_FULL()
794
- for (int i = 0; i < ITEMS_PER_THREAD; i++)
766
+ for (int i = 0; i < ItemsPerThread; i++)
795
767
  {
796
768
  int item_offset = ranks[i] - slice_offset;
797
769
  if (item_offset >= 0 && item_offset < WARP_TIME_SLICED_ITEMS)
@@ -807,7 +779,7 @@ private:
807
779
  __syncthreads();
808
780
 
809
781
  _CCCL_PRAGMA_UNROLL_FULL()
810
- for (int i = 0; i < ITEMS_PER_THREAD; i++)
782
+ for (int i = 0; i < ItemsPerThread; i++)
811
783
  {
812
784
  // Read a strip of items
813
785
  const int strip_offset = i * BLOCK_THREADS;
@@ -830,7 +802,7 @@ private:
830
802
 
831
803
  // Copy
832
804
  _CCCL_PRAGMA_UNROLL_FULL()
833
- for (int i = 0; i < ITEMS_PER_THREAD; i++)
805
+ for (int i = 0; i < ItemsPerThread; i++)
834
806
  {
835
807
  output_items[i] = temp_items[i];
836
808
  }
@@ -898,9 +870,9 @@ public:
898
870
  //! Items from exchange, converting between **striped** and **blocked** arrangements.
899
871
  template <typename OutputT>
900
872
  _CCCL_DEVICE _CCCL_FORCEINLINE void
901
- StripedToBlocked(const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD])
873
+ StripedToBlocked(const T (&input_items)[ItemsPerThread], OutputT (&output_items)[ItemsPerThread])
902
874
  {
903
- StripedToBlocked(input_items, output_items, detail::bool_constant_v<WARP_TIME_SLICING>);
875
+ StripedToBlocked(input_items, output_items, detail::bool_constant_v<WarpTimeSlicing>);
904
876
  }
905
877
 
906
878
  //! @rst
@@ -950,9 +922,9 @@ public:
950
922
  //! Items from exchange, converting between **striped** and **blocked** arrangements.
951
923
  template <typename OutputT>
952
924
  _CCCL_DEVICE _CCCL_FORCEINLINE void
953
- BlockedToStriped(const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD])
925
+ BlockedToStriped(const T (&input_items)[ItemsPerThread], OutputT (&output_items)[ItemsPerThread])
954
926
  {
955
- BlockedToStriped(input_items, output_items, detail::bool_constant_v<WARP_TIME_SLICING>);
927
+ BlockedToStriped(input_items, output_items, detail::bool_constant_v<WarpTimeSlicing>);
956
928
  }
957
929
 
958
930
  //! @rst
@@ -1002,9 +974,9 @@ public:
1002
974
  //! Items from exchange, converting between **striped** and **blocked** arrangements.
1003
975
  template <typename OutputT>
1004
976
  _CCCL_DEVICE _CCCL_FORCEINLINE void
1005
- WarpStripedToBlocked(const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD])
977
+ WarpStripedToBlocked(const T (&input_items)[ItemsPerThread], OutputT (&output_items)[ItemsPerThread])
1006
978
  {
1007
- WarpStripedToBlocked(input_items, output_items, detail::bool_constant_v<WARP_TIME_SLICING>);
979
+ WarpStripedToBlocked(input_items, output_items, detail::bool_constant_v<WarpTimeSlicing>);
1008
980
  }
1009
981
 
1010
982
  //! @rst
@@ -1057,9 +1029,9 @@ public:
1057
1029
  //! Items from exchange, converting between **striped** and **blocked** arrangements.
1058
1030
  template <typename OutputT>
1059
1031
  _CCCL_DEVICE _CCCL_FORCEINLINE void
1060
- BlockedToWarpStriped(const T (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD])
1032
+ BlockedToWarpStriped(const T (&input_items)[ItemsPerThread], OutputT (&output_items)[ItemsPerThread])
1061
1033
  {
1062
- BlockedToWarpStriped(input_items, output_items, detail::bool_constant_v<WARP_TIME_SLICING>);
1034
+ BlockedToWarpStriped(input_items, output_items, detail::bool_constant_v<WarpTimeSlicing>);
1063
1035
  }
1064
1036
 
1065
1037
  //! @} end member group
@@ -1085,11 +1057,9 @@ public:
1085
1057
  //! Corresponding scatter ranks
1086
1058
  template <typename OutputT, typename OffsetT>
1087
1059
  _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToBlocked(
1088
- const T (&input_items)[ITEMS_PER_THREAD],
1089
- OutputT (&output_items)[ITEMS_PER_THREAD],
1090
- OffsetT (&ranks)[ITEMS_PER_THREAD])
1060
+ const T (&input_items)[ItemsPerThread], OutputT (&output_items)[ItemsPerThread], OffsetT (&ranks)[ItemsPerThread])
1091
1061
  {
1092
- ScatterToBlocked(input_items, output_items, ranks, detail::bool_constant_v<WARP_TIME_SLICING>);
1062
+ ScatterToBlocked(input_items, output_items, ranks, detail::bool_constant_v<WarpTimeSlicing>);
1093
1063
  }
1094
1064
 
1095
1065
  //! @rst
@@ -1112,11 +1082,9 @@ public:
1112
1082
  //! Corresponding scatter ranks
1113
1083
  template <typename OutputT, typename OffsetT>
1114
1084
  _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStriped(
1115
- const T (&input_items)[ITEMS_PER_THREAD],
1116
- OutputT (&output_items)[ITEMS_PER_THREAD],
1117
- OffsetT (&ranks)[ITEMS_PER_THREAD])
1085
+ const T (&input_items)[ItemsPerThread], OutputT (&output_items)[ItemsPerThread], OffsetT (&ranks)[ItemsPerThread])
1118
1086
  {
1119
- ScatterToStriped(input_items, output_items, ranks, detail::bool_constant_v<WARP_TIME_SLICING>);
1087
+ ScatterToStriped(input_items, output_items, ranks, detail::bool_constant_v<WarpTimeSlicing>);
1120
1088
  }
1121
1089
 
1122
1090
  //! @rst
@@ -1139,12 +1107,10 @@ public:
1139
1107
  //! Corresponding scatter ranks
1140
1108
  template <typename OutputT, typename OffsetT>
1141
1109
  _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStripedGuarded(
1142
- const T (&input_items)[ITEMS_PER_THREAD],
1143
- OutputT (&output_items)[ITEMS_PER_THREAD],
1144
- OffsetT (&ranks)[ITEMS_PER_THREAD])
1110
+ const T (&input_items)[ItemsPerThread], OutputT (&output_items)[ItemsPerThread], OffsetT (&ranks)[ItemsPerThread])
1145
1111
  {
1146
1112
  _CCCL_PRAGMA_UNROLL_FULL()
1147
- for (int i = 0; i < ITEMS_PER_THREAD; i++)
1113
+ for (int i = 0; i < ItemsPerThread; i++)
1148
1114
  {
1149
1115
  int item_offset = ranks[i];
1150
1116
  if constexpr (INSERT_PADDING)
@@ -1160,7 +1126,7 @@ public:
1160
1126
  __syncthreads();
1161
1127
 
1162
1128
  _CCCL_PRAGMA_UNROLL_FULL()
1163
- for (int i = 0; i < ITEMS_PER_THREAD; i++)
1129
+ for (int i = 0; i < ItemsPerThread; i++)
1164
1130
  {
1165
1131
  int item_offset = i * BLOCK_THREADS + linear_tid;
1166
1132
  if constexpr (INSERT_PADDING)
@@ -1197,13 +1163,13 @@ public:
1197
1163
  //! Corresponding flag denoting item validity
1198
1164
  template <typename OutputT, typename OffsetT, typename ValidFlag>
1199
1165
  _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStripedFlagged(
1200
- const T (&input_items)[ITEMS_PER_THREAD],
1201
- OutputT (&output_items)[ITEMS_PER_THREAD],
1202
- OffsetT (&ranks)[ITEMS_PER_THREAD],
1203
- ValidFlag (&is_valid)[ITEMS_PER_THREAD])
1166
+ const T (&input_items)[ItemsPerThread],
1167
+ OutputT (&output_items)[ItemsPerThread],
1168
+ OffsetT (&ranks)[ItemsPerThread],
1169
+ ValidFlag (&is_valid)[ItemsPerThread])
1204
1170
  {
1205
1171
  _CCCL_PRAGMA_UNROLL_FULL()
1206
- for (int i = 0; i < ITEMS_PER_THREAD; i++)
1172
+ for (int i = 0; i < ItemsPerThread; i++)
1207
1173
  {
1208
1174
  int item_offset = ranks[i];
1209
1175
  if constexpr (INSERT_PADDING)
@@ -1219,7 +1185,7 @@ public:
1219
1185
  __syncthreads();
1220
1186
 
1221
1187
  _CCCL_PRAGMA_UNROLL_FULL()
1222
- for (int i = 0; i < ITEMS_PER_THREAD; i++)
1188
+ for (int i = 0; i < ItemsPerThread; i++)
1223
1189
  {
1224
1190
  int item_offset = i * BLOCK_THREADS + linear_tid;
1225
1191
  if constexpr (INSERT_PADDING)
@@ -1236,28 +1202,28 @@ public:
1236
1202
 
1237
1203
  /// @param[in-out] items
1238
1204
  /// Items to exchange, converting between **striped** and **blocked** arrangements.
1239
- _CCCL_DEVICE _CCCL_FORCEINLINE void StripedToBlocked(T (&items)[ITEMS_PER_THREAD])
1205
+ _CCCL_DEVICE _CCCL_FORCEINLINE void StripedToBlocked(T (&items)[ItemsPerThread])
1240
1206
  {
1241
1207
  StripedToBlocked(items, items);
1242
1208
  }
1243
1209
 
1244
1210
  /// @param[in-out] items
1245
1211
  /// Items to exchange, converting between **striped** and **blocked** arrangements.
1246
- _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToStriped(T (&items)[ITEMS_PER_THREAD])
1212
+ _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToStriped(T (&items)[ItemsPerThread])
1247
1213
  {
1248
1214
  BlockedToStriped(items, items);
1249
1215
  }
1250
1216
 
1251
1217
  /// @param[in-out] items
1252
1218
  /// Items to exchange, converting between **striped** and **blocked** arrangements.
1253
- _CCCL_DEVICE _CCCL_FORCEINLINE void WarpStripedToBlocked(T (&items)[ITEMS_PER_THREAD])
1219
+ _CCCL_DEVICE _CCCL_FORCEINLINE void WarpStripedToBlocked(T (&items)[ItemsPerThread])
1254
1220
  {
1255
1221
  WarpStripedToBlocked(items, items);
1256
1222
  }
1257
1223
 
1258
1224
  /// @param[in-out] items
1259
1225
  /// Items to exchange, converting between **striped** and **blocked** arrangements.
1260
- _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToWarpStriped(T (&items)[ITEMS_PER_THREAD])
1226
+ _CCCL_DEVICE _CCCL_FORCEINLINE void BlockedToWarpStriped(T (&items)[ItemsPerThread])
1261
1227
  {
1262
1228
  BlockedToWarpStriped(items, items);
1263
1229
  }
@@ -1268,7 +1234,7 @@ public:
1268
1234
  /// @param[in] ranks
1269
1235
  /// Corresponding scatter ranks
1270
1236
  template <typename OffsetT>
1271
- _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToBlocked(T (&items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD])
1237
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToBlocked(T (&items)[ItemsPerThread], OffsetT (&ranks)[ItemsPerThread])
1272
1238
  {
1273
1239
  ScatterToBlocked(items, items, ranks);
1274
1240
  }
@@ -1278,7 +1244,7 @@ public:
1278
1244
  /// @param[in] ranks
1279
1245
  /// Corresponding scatter ranks
1280
1246
  template <typename OffsetT>
1281
- _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStriped(T (&items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD])
1247
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStriped(T (&items)[ItemsPerThread], OffsetT (&ranks)[ItemsPerThread])
1282
1248
  {
1283
1249
  ScatterToStriped(items, items, ranks);
1284
1250
  }
@@ -1289,7 +1255,7 @@ public:
1289
1255
  /// Corresponding scatter ranks
1290
1256
  template <typename OffsetT>
1291
1257
  _CCCL_DEVICE _CCCL_FORCEINLINE void
1292
- ScatterToStripedGuarded(T (&items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD])
1258
+ ScatterToStripedGuarded(T (&items)[ItemsPerThread], OffsetT (&ranks)[ItemsPerThread])
1293
1259
  {
1294
1260
  ScatterToStripedGuarded(items, items, ranks);
1295
1261
  }
@@ -1302,7 +1268,7 @@ public:
1302
1268
  /// Corresponding flag denoting item validity
1303
1269
  template <typename OffsetT, typename ValidFlag>
1304
1270
  _CCCL_DEVICE _CCCL_FORCEINLINE void ScatterToStripedFlagged(
1305
- T (&items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD], ValidFlag (&is_valid)[ITEMS_PER_THREAD])
1271
+ T (&items)[ItemsPerThread], OffsetT (&ranks)[ItemsPerThread], ValidFlag (&is_valid)[ItemsPerThread])
1306
1272
  {
1307
1273
  ScatterToStripedFlagged(items, items, ranks, is_valid);
1308
1274
  }