cuda-cccl 0.3.2__cp313-cp313-manylinux_2_24_aarch64.whl → 0.3.4__cp313-cp313-manylinux_2_24_aarch64.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (911)
  1. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +12 -38
  2. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +16 -40
  3. cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -28
  4. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +24 -56
  5. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +12 -38
  6. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +31 -56
  7. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +31 -35
  8. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +47 -48
  9. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +39 -42
  10. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +33 -60
  11. cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +18 -44
  12. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +26 -55
  13. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +22 -49
  14. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +15 -41
  15. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +9 -35
  16. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +20 -49
  17. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +14 -40
  18. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +18 -40
  19. cuda/cccl/headers/include/cub/agent/agent_topk.cuh +0 -2
  20. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +20 -46
  21. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +3 -28
  22. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +7 -31
  23. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +10 -34
  24. cuda/cccl/headers/include/cub/block/block_exchange.cuh +120 -154
  25. cuda/cccl/headers/include/cub/block/block_histogram.cuh +28 -52
  26. cuda/cccl/headers/include/cub/block/block_load.cuh +124 -146
  27. cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +0 -16
  28. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +58 -87
  29. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +81 -100
  30. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +92 -156
  31. cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +8 -32
  32. cuda/cccl/headers/include/cub/block/block_reduce.cuh +21 -46
  33. cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +51 -79
  34. cuda/cccl/headers/include/cub/block/block_scan.cuh +94 -401
  35. cuda/cccl/headers/include/cub/block/block_shuffle.cuh +10 -34
  36. cuda/cccl/headers/include/cub/block/block_store.cuh +73 -97
  37. cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +2 -29
  38. cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +5 -29
  39. cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +25 -49
  40. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +12 -34
  41. cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +10 -34
  42. cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +3 -27
  43. cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +12 -36
  44. cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +9 -33
  45. cuda/cccl/headers/include/cub/config.cuh +2 -26
  46. cuda/cccl/headers/include/cub/cub.cuh +3 -27
  47. cuda/cccl/headers/include/cub/detail/array_utils.cuh +2 -26
  48. cuda/cccl/headers/include/cub/detail/choose_offset.cuh +2 -28
  49. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +3 -27
  50. cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +0 -2
  51. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -3
  52. cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +2 -28
  53. cuda/cccl/headers/include/cub/detail/integer_utils.cuh +0 -2
  54. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +0 -2
  55. cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +0 -2
  56. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +0 -2
  57. cuda/cccl/headers/include/cub/detail/ptx-json/README.md +7 -12
  58. cuda/cccl/headers/include/cub/detail/ptx-json/array.h +6 -33
  59. cuda/cccl/headers/include/cub/detail/ptx-json/json.h +13 -36
  60. cuda/cccl/headers/include/cub/detail/ptx-json/object.h +9 -38
  61. cuda/cccl/headers/include/cub/detail/ptx-json/string.h +58 -32
  62. cuda/cccl/headers/include/cub/detail/ptx-json/value.h +51 -51
  63. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +7 -31
  64. cuda/cccl/headers/include/cub/detail/rfa.cuh +2 -27
  65. cuda/cccl/headers/include/cub/detail/strong_load.cuh +3 -29
  66. cuda/cccl/headers/include/cub/detail/strong_store.cuh +3 -29
  67. cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +2 -9
  68. cuda/cccl/headers/include/cub/detail/type_traits.cuh +0 -2
  69. cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +6 -31
  70. cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +2 -25
  71. cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +2 -26
  72. cuda/cccl/headers/include/cub/device/device_for.cuh +3 -5
  73. cuda/cccl/headers/include/cub/device/device_histogram.cuh +3 -27
  74. cuda/cccl/headers/include/cub/device/device_memcpy.cuh +2 -26
  75. cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +2 -26
  76. cuda/cccl/headers/include/cub/device/device_partition.cuh +3 -27
  77. cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3 -27
  78. cuda/cccl/headers/include/cub/device/device_reduce.cuh +10 -31
  79. cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +3 -27
  80. cuda/cccl/headers/include/cub/device/device_scan.cuh +16 -34
  81. cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +3 -27
  82. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +3 -27
  83. cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2 -26
  84. cuda/cccl/headers/include/cub/device/device_select.cuh +3 -27
  85. cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +2 -28
  86. cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +2 -27
  87. cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +0 -2
  88. cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +3 -29
  89. cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +14 -34
  90. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +5 -30
  91. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +4 -29
  92. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +5 -32
  93. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +3 -29
  94. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +2 -29
  95. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +1 -2
  96. cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +47 -59
  97. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +21 -30
  98. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +2 -27
  99. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +3 -27
  100. cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +3 -27
  101. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +0 -2
  102. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +51 -36
  103. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +3 -28
  104. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +0 -1
  105. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +27 -55
  106. cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +4 -28
  107. cuda/cccl/headers/include/cub/device/dispatch/kernels/{for_each.cuh → kernel_for_each.cuh} +0 -2
  108. cuda/cccl/headers/include/cub/device/dispatch/kernels/{histogram.cuh → kernel_histogram.cuh} +149 -157
  109. cuda/cccl/headers/include/cub/device/dispatch/kernels/{merge_sort.cuh → kernel_merge_sort.cuh} +0 -2
  110. cuda/cccl/headers/include/cub/device/dispatch/kernels/{radix_sort.cuh → kernel_radix_sort.cuh} +0 -2
  111. cuda/cccl/headers/include/cub/device/dispatch/kernels/{reduce.cuh → kernel_reduce.cuh} +2 -28
  112. cuda/cccl/headers/include/cub/device/dispatch/kernels/{scan.cuh → kernel_scan.cuh} +2 -28
  113. cuda/cccl/headers/include/cub/device/dispatch/kernels/{segmented_reduce.cuh → kernel_segmented_reduce.cuh} +3 -29
  114. cuda/cccl/headers/include/cub/device/dispatch/kernels/{segmented_sort.cuh → kernel_segmented_sort.cuh} +0 -1
  115. cuda/cccl/headers/include/cub/device/dispatch/kernels/{three_way_partition.cuh → kernel_three_way_partition.cuh} +0 -1
  116. cuda/cccl/headers/include/cub/device/dispatch/kernels/{transform.cuh → kernel_transform.cuh} +11 -11
  117. cuda/cccl/headers/include/cub/device/dispatch/kernels/{unique_by_key.cuh → kernel_unique_by_key.cuh} +0 -1
  118. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -26
  119. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -26
  120. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -28
  121. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +6 -26
  122. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -26
  123. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +5 -31
  124. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +31 -33
  125. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +15 -40
  126. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -26
  127. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -28
  128. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +20 -44
  129. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -26
  130. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +20 -45
  131. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +2 -27
  132. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +11 -36
  133. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +0 -1
  134. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +2 -27
  135. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +14 -40
  136. cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +3 -27
  137. cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +3 -27
  138. cuda/cccl/headers/include/cub/grid/grid_queue.cuh +3 -27
  139. cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +3 -27
  140. cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +3 -27
  141. cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +3 -27
  142. cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +3 -27
  143. cuda/cccl/headers/include/cub/thread/thread_load.cuh +3 -28
  144. cuda/cccl/headers/include/cub/thread/thread_operators.cuh +3 -27
  145. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +3 -26
  146. cuda/cccl/headers/include/cub/thread/thread_scan.cuh +3 -29
  147. cuda/cccl/headers/include/cub/thread/thread_search.cuh +3 -27
  148. cuda/cccl/headers/include/cub/thread/thread_simd.cuh +0 -2
  149. cuda/cccl/headers/include/cub/thread/thread_sort.cuh +2 -26
  150. cuda/cccl/headers/include/cub/thread/thread_store.cuh +3 -27
  151. cuda/cccl/headers/include/cub/util_allocator.cuh +3 -27
  152. cuda/cccl/headers/include/cub/util_arch.cuh +3 -29
  153. cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +2 -26
  154. cuda/cccl/headers/include/cub/util_debug.cuh +3 -27
  155. cuda/cccl/headers/include/cub/util_device.cuh +18 -59
  156. cuda/cccl/headers/include/cub/util_macro.cuh +4 -28
  157. cuda/cccl/headers/include/cub/util_math.cuh +2 -28
  158. cuda/cccl/headers/include/cub/util_namespace.cuh +3 -28
  159. cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +3 -27
  160. cuda/cccl/headers/include/cub/util_ptx.cuh +6 -30
  161. cuda/cccl/headers/include/cub/util_temporary_storage.cuh +3 -29
  162. cuda/cccl/headers/include/cub/util_type.cuh +5 -32
  163. cuda/cccl/headers/include/cub/util_vsmem.cuh +2 -28
  164. cuda/cccl/headers/include/cub/version.cuh +2 -26
  165. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +10 -35
  166. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +5 -30
  167. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +15 -39
  168. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +5 -35
  169. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +22 -46
  170. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +3 -27
  171. cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +2 -26
  172. cuda/cccl/headers/include/cub/warp/warp_load.cuh +4 -27
  173. cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +2 -26
  174. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +3 -22
  175. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +3 -27
  176. cuda/cccl/headers/include/cub/warp/warp_store.cuh +4 -27
  177. cuda/cccl/headers/include/cub/warp/warp_utils.cuh +0 -2
  178. cuda/cccl/headers/include/cuda/__barrier/barrier.h +1 -1
  179. cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +0 -1
  180. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +277 -235
  181. cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +0 -1
  182. cuda/cccl/headers/include/cuda/__driver/driver_api.h +13 -0
  183. cuda/cccl/headers/include/cuda/__execution/determinism.h +0 -2
  184. cuda/cccl/headers/include/cuda/__execution/output_ordering.h +0 -2
  185. cuda/cccl/headers/include/cuda/__functional/maximum.h +25 -7
  186. cuda/cccl/headers/include/cuda/__functional/minimum.h +25 -7
  187. cuda/cccl/headers/include/cuda/__functional/minimum_maximum_common.h +52 -0
  188. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +0 -2
  189. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +13 -4
  190. cuda/cccl/headers/include/cuda/__iterator/zip_function.h +4 -2
  191. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +0 -1
  192. cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +28 -7
  193. cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +1 -1
  194. cuda/cccl/headers/include/cuda/__memcpy_async/elect_one.h +52 -0
  195. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +2 -3
  196. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +1 -7
  197. cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +0 -1
  198. cuda/cccl/headers/include/cuda/__memory/get_device_address.h +1 -1
  199. cuda/cccl/headers/include/cuda/__memory/ranges_overlap.h +126 -0
  200. cuda/cccl/headers/include/cuda/__memory_resource/any_resource.h +898 -0
  201. cuda/cccl/headers/include/cuda/__memory_resource/device_memory_pool.h +149 -0
  202. cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +3 -3
  203. cuda/cccl/headers/include/cuda/__memory_resource/legacy_managed_memory_resource.h +148 -0
  204. cuda/cccl/headers/include/cuda/__memory_resource/legacy_pinned_memory_resource.h +139 -0
  205. cuda/cccl/headers/include/cuda/__memory_resource/managed_memory_pool.h +146 -0
  206. cuda/cccl/headers/include/cuda/__memory_resource/memory_resource_base.h +578 -0
  207. cuda/cccl/headers/include/cuda/__memory_resource/pinned_memory_pool.h +188 -0
  208. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +3 -3
  209. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +37 -3
  210. cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +13 -3
  211. cuda/cccl/headers/include/cuda/__numeric/div_overflow.h +150 -0
  212. cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +2 -2
  213. cuda/cccl/headers/include/cuda/__numeric/sub_overflow.h +344 -0
  214. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +0 -6
  215. cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +1 -1
  216. cuda/cccl/headers/include/cuda/__ptx/pragmas/enable_smem_spilling.h +47 -0
  217. cuda/cccl/headers/include/cuda/{std/__cuda → __runtime}/api_wrapper.h +3 -3
  218. cuda/cccl/headers/include/cuda/__stream/get_stream.h +0 -1
  219. cuda/cccl/headers/include/cuda/{__fwd/barrier_native_handle.h → __stream/internal_streams.h} +17 -15
  220. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +2 -2
  221. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +1 -0
  222. cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +1 -0
  223. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +2 -1
  224. cuda/cccl/headers/include/cuda/barrier +42 -16
  225. cuda/cccl/headers/include/cuda/memory +1 -0
  226. cuda/cccl/headers/include/cuda/memory_resource +6 -1
  227. cuda/cccl/headers/include/cuda/numeric +2 -0
  228. cuda/cccl/headers/include/cuda/pipeline +3 -2
  229. cuda/cccl/headers/include/cuda/ptx +1 -0
  230. cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +0 -2
  231. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +1 -1
  232. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +115 -58
  233. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +844 -378
  234. cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +12 -5
  235. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +31 -0
  236. cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +10 -0
  237. cuda/cccl/headers/include/cuda/std/__atomic/types.h +2 -3
  238. cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +37 -13
  239. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +0 -28
  240. cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +7 -0
  241. cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +10 -0
  242. cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +2 -45
  243. cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +0 -2
  244. cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +8 -0
  245. cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +0 -2
  246. cuda/cccl/headers/include/cuda/std/__chrono/day.h +0 -2
  247. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +13 -17
  248. cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +0 -2
  249. cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +0 -2
  250. cuda/cccl/headers/include/cuda/std/__chrono/month.h +0 -2
  251. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +0 -2
  252. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +0 -2
  253. cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +5 -8
  254. cuda/cccl/headers/include/cuda/std/__chrono/year.h +0 -2
  255. cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +4 -0
  256. cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +2 -3
  257. cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +4 -0
  258. cuda/cccl/headers/include/cuda/std/__cmath/fma.h +4 -0
  259. cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +2 -3
  260. cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +2 -3
  261. cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +2 -3
  262. cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +2 -3
  263. cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +2 -3
  264. cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +2 -3
  265. cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +2 -3
  266. cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +2 -3
  267. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +2 -3
  268. cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +2 -3
  269. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +2 -2
  270. cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +4 -0
  271. cuda/cccl/headers/include/cuda/std/__cmath/roots.h +2 -3
  272. cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +2 -3
  273. cuda/cccl/headers/include/cuda/std/__cmath/traits.h +4 -0
  274. cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +2 -3
  275. cuda/cccl/headers/include/cuda/std/__complex/complex.h +0 -6
  276. cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +2 -2
  277. cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +27 -1
  278. cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +2 -4
  279. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +15 -36
  280. cuda/cccl/headers/include/cuda/std/__exception/exception_macros.h +93 -0
  281. cuda/cccl/headers/include/cuda/std/{detail/libcxx/include/stdexcept → __exception/throw_error.h} +3 -3
  282. cuda/cccl/headers/include/cuda/std/__expected/expected.h +28 -43
  283. cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +2 -10
  284. cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +2 -2
  285. cuda/cccl/headers/include/cuda/std/__functional/bind.h +6 -6
  286. cuda/cccl/headers/include/cuda/std/__functional/function.h +2 -6
  287. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +5 -5
  288. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +5 -0
  289. cuda/cccl/headers/include/cuda/std/__fwd/array.h +2 -2
  290. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +12 -0
  291. cuda/cccl/headers/include/cuda/std/__fwd/expected.h +46 -0
  292. cuda/cccl/headers/include/cuda/std/__fwd/get.h +21 -22
  293. cuda/cccl/headers/include/cuda/std/{detail/libcxx/include/iosfwd → __fwd/ios.h} +5 -10
  294. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +19 -10
  295. cuda/cccl/headers/include/cuda/std/__fwd/optional.h +2 -2
  296. cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +5 -0
  297. cuda/cccl/headers/include/cuda/std/__fwd/span.h +2 -2
  298. cuda/cccl/headers/include/cuda/std/__fwd/string.h +7 -0
  299. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +18 -0
  300. cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +3 -0
  301. cuda/cccl/headers/include/cuda/std/__fwd/unexpected.h +40 -0
  302. cuda/cccl/headers/include/cuda/std/{__type_traits/is_reference_wrapper.h → __fwd/variant.h} +16 -15
  303. cuda/cccl/headers/include/cuda/std/__internal/features.h +14 -0
  304. cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +1 -1
  305. cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +1 -1
  306. cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +58 -40
  307. cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +1 -1
  308. cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +1 -1
  309. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +0 -5
  310. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +4 -18
  311. cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +1 -2
  312. cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +0 -2
  313. cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +0 -2
  314. cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +0 -4
  315. cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +0 -5
  316. cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +3 -10
  317. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +4 -15
  318. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +4 -4
  319. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +4 -4
  320. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +2 -4
  321. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +3 -3
  322. cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +1 -1
  323. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +1 -0
  324. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +6 -12
  325. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -5
  326. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +7 -2
  327. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +1 -0
  328. cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +5 -0
  329. cuda/cccl/headers/include/cuda/std/__new/allocate.h +5 -0
  330. cuda/cccl/headers/include/cuda/{__barrier/barrier_native_handle.h → std/__new/device_new.h} +9 -24
  331. cuda/cccl/headers/include/cuda/std/__new_ +1 -0
  332. cuda/cccl/headers/include/cuda/std/__optional/optional.h +5 -4
  333. cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +4 -4
  334. cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +1 -1
  335. cuda/cccl/headers/include/cuda/std/__random/philox_engine.h +562 -0
  336. cuda/cccl/headers/include/cuda/std/__random/seed_seq.h +204 -0
  337. cuda/cccl/headers/include/cuda/std/__random_ +2 -0
  338. cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +7 -19
  339. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -4
  340. cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +5 -4
  341. cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +1 -1
  342. cuda/cccl/headers/include/cuda/std/__string/string_view.h +5 -5
  343. cuda/cccl/headers/include/cuda/std/__tuple_dir/apply.h +82 -0
  344. cuda/cccl/headers/include/cuda/std/__tuple_dir/get.h +122 -0
  345. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +0 -160
  346. cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +123 -129
  347. cuda/cccl/headers/include/cuda/std/__tuple_dir/tie.h +55 -0
  348. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple.h +457 -0
  349. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_cat.h +158 -0
  350. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_constraints.h +286 -0
  351. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +7 -0
  352. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_leaf.h +452 -0
  353. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +1 -2
  354. cuda/cccl/headers/include/cuda/std/__type_traits/is_comparable.h +78 -0
  355. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +1 -1
  356. cuda/cccl/headers/include/cuda/std/__type_traits/is_fully_bounded_array.h +47 -0
  357. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +0 -2
  358. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +4 -24
  359. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +0 -2
  360. cuda/cccl/headers/include/cuda/std/__utility/pair.h +20 -20
  361. cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +0 -2
  362. cuda/cccl/headers/include/cuda/std/__variant/bad_variant_access.h +74 -0
  363. cuda/cccl/headers/include/cuda/std/__variant/comparison.h +207 -0
  364. cuda/cccl/headers/include/cuda/std/__variant/get.h +192 -0
  365. cuda/cccl/headers/include/cuda/std/__variant/hash.h +82 -0
  366. cuda/cccl/headers/include/cuda/std/__variant/sfinae_helpers.h +89 -0
  367. cuda/cccl/headers/include/cuda/std/__variant/variant.h +250 -0
  368. cuda/cccl/headers/include/cuda/std/__variant/variant_access.h +70 -0
  369. cuda/cccl/headers/include/cuda/std/__variant/variant_base.h +683 -0
  370. cuda/cccl/headers/include/cuda/std/__variant/variant_constraints.h +135 -0
  371. cuda/cccl/headers/include/cuda/std/__variant/variant_match.h +126 -0
  372. cuda/cccl/headers/include/cuda/std/__variant/variant_traits.h +184 -0
  373. cuda/cccl/headers/include/cuda/std/__variant/variant_visit.h +225 -0
  374. cuda/cccl/headers/include/cuda/std/__variant/visit.h +148 -0
  375. cuda/cccl/headers/include/cuda/std/array +1 -1
  376. cuda/cccl/headers/include/cuda/std/atomic +1 -1
  377. cuda/cccl/headers/include/cuda/std/bitset +2 -10
  378. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +6 -6
  379. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1 -4
  380. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +3 -6
  381. cuda/cccl/headers/include/cuda/std/functional +1 -1
  382. cuda/cccl/headers/include/cuda/std/initializer_list +8 -0
  383. cuda/cccl/headers/include/cuda/std/inplace_vector +6 -5
  384. cuda/cccl/headers/include/cuda/std/iterator +1 -1
  385. cuda/cccl/headers/include/cuda/std/numbers +0 -2
  386. cuda/cccl/headers/include/cuda/std/ratio +2 -2
  387. cuda/cccl/headers/include/cuda/std/span +2 -2
  388. cuda/cccl/headers/include/cuda/std/string_view +24 -42
  389. cuda/cccl/headers/include/cuda/std/tuple +18 -1
  390. cuda/cccl/headers/include/cuda/std/type_traits +0 -1
  391. cuda/cccl/headers/include/cuda/std/variant +8 -1
  392. cuda/cccl/headers/include/nv/target +2 -6
  393. cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +15 -2
  394. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +0 -2
  395. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +0 -1
  396. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +0 -1
  397. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +0 -2
  398. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +0 -2
  399. cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +0 -2
  400. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +0 -2
  401. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +0 -2
  402. cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +0 -4
  403. cuda/cccl/headers/include/thrust/detail/binary_search.inl +14 -2
  404. cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +2 -7
  405. cuda/cccl/headers/include/thrust/detail/complex/c99math.h +2 -8
  406. cuda/cccl/headers/include/thrust/detail/complex/catrig.h +2 -8
  407. cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +2 -8
  408. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +2 -8
  409. cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +2 -8
  410. cuda/cccl/headers/include/thrust/detail/complex/cexp.h +2 -7
  411. cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +2 -8
  412. cuda/cccl/headers/include/thrust/detail/complex/clog.h +2 -8
  413. cuda/cccl/headers/include/thrust/detail/complex/clogf.h +2 -8
  414. cuda/cccl/headers/include/thrust/detail/complex/cproj.h +2 -7
  415. cuda/cccl/headers/include/thrust/detail/complex/csinh.h +2 -8
  416. cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +2 -8
  417. cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +2 -8
  418. cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +2 -8
  419. cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +2 -8
  420. cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +2 -8
  421. cuda/cccl/headers/include/thrust/detail/complex/math_private.h +2 -8
  422. cuda/cccl/headers/include/thrust/detail/config/device_system.h +2 -0
  423. cuda/cccl/headers/include/thrust/detail/config/host_system.h +2 -0
  424. cuda/cccl/headers/include/thrust/detail/config/namespace.h +0 -1
  425. cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +0 -2
  426. cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +0 -2
  427. cuda/cccl/headers/include/thrust/detail/copy.h +0 -2
  428. cuda/cccl/headers/include/thrust/detail/copy.inl +14 -4
  429. cuda/cccl/headers/include/thrust/detail/copy_if.inl +14 -2
  430. cuda/cccl/headers/include/thrust/detail/count.inl +14 -2
  431. cuda/cccl/headers/include/thrust/detail/equal.inl +14 -2
  432. cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +4 -5
  433. cuda/cccl/headers/include/thrust/detail/extrema.inl +14 -2
  434. cuda/cccl/headers/include/thrust/detail/fill.inl +14 -2
  435. cuda/cccl/headers/include/thrust/detail/find.inl +14 -2
  436. cuda/cccl/headers/include/thrust/detail/for_each.inl +14 -2
  437. cuda/cccl/headers/include/thrust/detail/functional/actor.h +2 -5
  438. cuda/cccl/headers/include/thrust/detail/functional/operators.h +2 -5
  439. cuda/cccl/headers/include/thrust/detail/gather.inl +14 -2
  440. cuda/cccl/headers/include/thrust/detail/generate.inl +14 -2
  441. cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +0 -2
  442. cuda/cccl/headers/include/thrust/detail/inner_product.inl +14 -2
  443. cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -0
  444. cuda/cccl/headers/include/thrust/detail/logical.inl +14 -2
  445. cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +13 -1
  446. cuda/cccl/headers/include/thrust/detail/merge.inl +14 -2
  447. cuda/cccl/headers/include/thrust/detail/mismatch.inl +14 -2
  448. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +0 -4
  449. cuda/cccl/headers/include/thrust/detail/partition.inl +14 -2
  450. cuda/cccl/headers/include/thrust/detail/random_bijection.h +0 -2
  451. cuda/cccl/headers/include/thrust/detail/range/head_flags.h +0 -2
  452. cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +0 -2
  453. cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +0 -6
  454. cuda/cccl/headers/include/thrust/detail/reduce.inl +21 -3
  455. cuda/cccl/headers/include/thrust/detail/reference.h +27 -3
  456. cuda/cccl/headers/include/thrust/detail/remove.inl +14 -2
  457. cuda/cccl/headers/include/thrust/detail/replace.inl +14 -2
  458. cuda/cccl/headers/include/thrust/detail/reverse.inl +14 -2
  459. cuda/cccl/headers/include/thrust/detail/scan.inl +21 -3
  460. cuda/cccl/headers/include/thrust/detail/scatter.inl +14 -2
  461. cuda/cccl/headers/include/thrust/detail/sequence.inl +13 -1
  462. cuda/cccl/headers/include/thrust/detail/set_operations.inl +13 -1
  463. cuda/cccl/headers/include/thrust/detail/sort.inl +13 -1
  464. cuda/cccl/headers/include/thrust/detail/static_assert.h +0 -2
  465. cuda/cccl/headers/include/thrust/detail/static_map.h +0 -3
  466. cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +13 -1
  467. cuda/cccl/headers/include/thrust/detail/tabulate.inl +14 -2
  468. cuda/cccl/headers/include/thrust/detail/temporary_array.h +0 -4
  469. cuda/cccl/headers/include/thrust/detail/temporary_array.inl +0 -1
  470. cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +14 -3
  471. cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +13 -1
  472. cuda/cccl/headers/include/thrust/detail/transform_scan.inl +13 -1
  473. cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +0 -2
  474. cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +0 -2
  475. cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +2 -7
  476. cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +0 -2
  477. cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +0 -4
  478. cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +0 -4
  479. cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +14 -2
  480. cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +14 -2
  481. cuda/cccl/headers/include/thrust/detail/unique.inl +21 -3
  482. cuda/cccl/headers/include/thrust/detail/vector_base.h +0 -2
  483. cuda/cccl/headers/include/thrust/detail/vector_base.inl +0 -2
  484. cuda/cccl/headers/include/thrust/execution_policy.h +10 -9
  485. cuda/cccl/headers/include/thrust/functional.h +0 -2
  486. cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +9 -4
  487. cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +8 -4
  488. cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +0 -1
  489. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +0 -1
  490. cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +0 -1
  491. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +0 -1
  492. cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +0 -1
  493. cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +2 -6
  494. cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +0 -1
  495. cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +0 -2
  496. cuda/cccl/headers/include/thrust/mr/allocator.h +0 -2
  497. cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +9 -4
  498. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +10 -10
  499. cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +0 -2
  500. cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +0 -2
  501. cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +0 -2
  502. cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +8 -4
  503. cuda/cccl/headers/include/thrust/mr/memory_resource.h +0 -2
  504. cuda/cccl/headers/include/thrust/mr/new.h +0 -2
  505. cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +0 -2
  506. cuda/cccl/headers/include/thrust/mr/pool.h +10 -10
  507. cuda/cccl/headers/include/thrust/mr/pool_options.h +4 -6
  508. cuda/cccl/headers/include/thrust/mr/sync_pool.h +0 -2
  509. cuda/cccl/headers/include/thrust/mr/tls_pool.h +0 -2
  510. cuda/cccl/headers/include/thrust/mr/validator.h +0 -2
  511. cuda/cccl/headers/include/thrust/per_device_resource.h +13 -1
  512. cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +0 -2
  513. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +0 -2
  514. cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +2 -9
  515. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +0 -2
  516. cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +2 -9
  517. cuda/cccl/headers/include/thrust/random/detail/mod.h +2 -9
  518. cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +0 -2
  519. cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +2 -7
  520. cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +2 -9
  521. cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +0 -2
  522. cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +0 -2
  523. cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +0 -2
  524. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +0 -2
  525. cuda/cccl/headers/include/thrust/random/discard_block_engine.h +0 -2
  526. cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +0 -2
  527. cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +0 -2
  528. cuda/cccl/headers/include/thrust/random/normal_distribution.h +0 -2
  529. cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +0 -2
  530. cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +0 -2
  531. cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +0 -2
  532. cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +0 -2
  533. cuda/cccl/headers/include/thrust/random.h +0 -2
  534. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +15 -11
  535. cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +2 -7
  536. cuda/cccl/headers/include/thrust/system/cpp/memory.h +0 -1
  537. cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +0 -2
  538. cuda/cccl/headers/include/thrust/system/cpp/pointer.h +0 -2
  539. cuda/cccl/headers/include/thrust/system/cpp/vector.h +0 -1
  540. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +0 -4
  541. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +0 -1
  542. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +0 -4
  543. cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +2 -9
  544. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +4 -32
  545. cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +2 -9
  546. cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +0 -2
  547. cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +0 -2
  548. cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +23 -2
  549. cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +0 -2
  550. cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +2 -11
  551. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +2 -0
  552. cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +0 -4
  553. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +0 -1
  554. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +0 -5
  555. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +0 -1
  556. cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +0 -2
  557. cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +0 -2
  558. cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +0 -1
  559. cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +2 -8
  560. cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +0 -2
  561. cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +0 -2
  562. cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +2 -26
  563. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +7 -142
  564. cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +0 -2
  565. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +0 -4
  566. cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +0 -2
  567. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +0 -5
  568. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +0 -4
  569. cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +0 -2
  570. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +0 -1
  571. cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +0 -4
  572. cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +0 -4
  573. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +0 -3
  574. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +0 -2
  575. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +3 -5
  576. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +8 -10
  577. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +0 -2
  578. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +0 -1
  579. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +0 -4
  580. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +0 -2
  581. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +1 -7
  582. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +2 -7
  583. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +0 -3
  584. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +0 -4
  585. cuda/cccl/headers/include/thrust/system/cuda/error.h +2 -11
  586. cuda/cccl/headers/include/thrust/system/cuda/memory.h +2 -6
  587. cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +2 -9
  588. cuda/cccl/headers/include/thrust/system/cuda/pointer.h +2 -7
  589. cuda/cccl/headers/include/thrust/system/cuda/vector.h +2 -6
  590. cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +0 -2
  591. cuda/cccl/headers/include/thrust/system/detail/errno.h +0 -2
  592. cuda/cccl/headers/include/thrust/system/detail/error_category.inl +0 -4
  593. cuda/cccl/headers/include/thrust/system/detail/error_code.inl +0 -2
  594. cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +0 -2
  595. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +0 -2
  596. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +0 -2
  597. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +0 -2
  598. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +0 -3
  599. cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +0 -2
  600. cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +0 -2
  601. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +0 -2
  602. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +0 -3
  603. cuda/cccl/headers/include/thrust/system/detail/generic/count.h +0 -2
  604. cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +0 -2
  605. cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +0 -2
  606. cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +0 -2
  607. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +0 -2
  608. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +0 -3
  609. cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +0 -2
  610. cuda/cccl/headers/include/thrust/system/detail/generic/find.h +0 -2
  611. cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +0 -2
  612. cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +0 -2
  613. cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +0 -2
  614. cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +0 -2
  615. cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +0 -2
  616. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +0 -2
  617. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +0 -2
  618. cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +0 -2
  619. cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +0 -2
  620. cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +0 -3
  621. cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +0 -2
  622. cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +0 -2
  623. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +0 -2
  624. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +0 -2
  625. cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +0 -2
  626. cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +0 -2
  627. cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +0 -2
  628. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +0 -2
  629. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +0 -2
  630. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +0 -2
  631. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +0 -3
  632. cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +0 -2
  633. cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +0 -2
  634. cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +0 -2
  635. cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +0 -3
  636. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +0 -2
  637. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +0 -2
  638. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +0 -2
  639. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +0 -2
  640. cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +26 -12
  641. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +0 -2
  642. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +0 -3
  643. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +0 -2
  644. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +0 -2
  645. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +0 -1
  646. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +0 -2
  647. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +0 -2
  648. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +0 -2
  649. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +0 -2
  650. cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +0 -2
  651. cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +0 -2
  652. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +0 -2
  653. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +0 -3
  654. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +0 -2
  655. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +0 -2
  656. cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +0 -2
  657. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +0 -2
  658. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +0 -2
  659. cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +0 -2
  660. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +0 -2
  661. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +0 -2
  662. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +0 -2
  663. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +0 -2
  664. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +0 -2
  665. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +2 -4
  666. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +0 -2
  667. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +0 -3
  668. cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +0 -2
  669. cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +0 -2
  670. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +0 -2
  671. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +0 -2
  672. cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +0 -2
  673. cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +0 -2
  674. cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +0 -2
  675. cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +0 -2
  676. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +76 -5
  677. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +0 -2
  678. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +0 -2
  679. cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +0 -2
  680. cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +0 -2
  681. cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +0 -2
  682. cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +0 -3
  683. cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +0 -2
  684. cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +0 -2
  685. cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +0 -2
  686. cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +0 -2
  687. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +78 -6
  688. cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +0 -4
  689. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +0 -2
  690. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +0 -2
  691. cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +0 -2
  692. cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +0 -2
  693. cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +0 -2
  694. cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +0 -2
  695. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +67 -6
  696. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +310 -11
  697. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +78 -5
  698. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +543 -7
  699. cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +0 -2
  700. cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +0 -2
  701. cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +0 -2
  702. cuda/cccl/headers/include/thrust/system/detail/system_error.inl +0 -2
  703. cuda/cccl/headers/include/thrust/system/error_code.h +0 -4
  704. cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +5 -25
  705. cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +2 -15
  706. cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +5 -25
  707. cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +40 -29
  708. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +11 -28
  709. cuda/cccl/headers/include/thrust/system/omp/detail/count.h +2 -15
  710. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +26 -28
  711. cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +2 -15
  712. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +18 -13
  713. cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +5 -25
  714. cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +2 -15
  715. cuda/cccl/headers/include/thrust/system/omp/detail/find.h +5 -25
  716. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +47 -30
  717. cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +2 -15
  718. cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +2 -15
  719. cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +2 -15
  720. cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +2 -15
  721. cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +2 -15
  722. cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +2 -15
  723. cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +2 -15
  724. cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +2 -15
  725. cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +2 -15
  726. cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +26 -31
  727. cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +2 -15
  728. cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +2 -26
  729. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +35 -27
  730. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +13 -28
  731. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +56 -28
  732. cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +26 -31
  733. cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +2 -15
  734. cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +2 -15
  735. cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +176 -17
  736. cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +8 -15
  737. cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +2 -15
  738. cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +2 -15
  739. cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +2 -15
  740. cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +213 -28
  741. cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +2 -15
  742. cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +2 -15
  743. cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +2 -15
  744. cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +2 -15
  745. cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +2 -15
  746. cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +2 -15
  747. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +2 -15
  748. cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +2 -15
  749. cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +21 -30
  750. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +17 -29
  751. cuda/cccl/headers/include/thrust/system/omp/memory.h +51 -9
  752. cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +3 -7
  753. cuda/cccl/headers/include/thrust/system/omp/pointer.h +3 -7
  754. cuda/cccl/headers/include/thrust/system/omp/vector.h +3 -6
  755. cuda/cccl/headers/include/thrust/system/system_error.h +0 -2
  756. cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +4 -25
  757. cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +2 -15
  758. cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +2 -15
  759. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +38 -29
  760. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +91 -24
  761. cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +2 -15
  762. cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +2 -15
  763. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +17 -13
  764. cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +4 -25
  765. cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +2 -15
  766. cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +4 -25
  767. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +47 -28
  768. cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +2 -15
  769. cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +2 -15
  770. cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +2 -15
  771. cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +2 -15
  772. cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +2 -15
  773. cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +2 -15
  774. cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +2 -15
  775. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +254 -29
  776. cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +2 -15
  777. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +25 -31
  778. cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +2 -15
  779. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +95 -29
  780. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +345 -28
  781. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +4 -26
  782. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +32 -42
  783. cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +2 -15
  784. cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +2 -15
  785. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +265 -30
  786. cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +7 -17
  787. cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +2 -15
  788. cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +2 -15
  789. cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +2 -15
  790. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +244 -32
  791. cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +2 -15
  792. cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +2 -15
  793. cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +2 -15
  794. cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +2 -15
  795. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +2 -15
  796. cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +2 -15
  797. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +2 -15
  798. cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +2 -15
  799. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +23 -33
  800. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +16 -29
  801. cuda/cccl/headers/include/thrust/system/tbb/memory.h +52 -24
  802. cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +4 -22
  803. cuda/cccl/headers/include/thrust/system/tbb/pointer.h +4 -22
  804. cuda/cccl/headers/include/thrust/system/tbb/vector.h +4 -21
  805. cuda/cccl/headers/include/thrust/transform.h +14 -3
  806. cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +0 -4
  807. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +0 -1
  808. cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +0 -4
  809. cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +0 -4
  810. cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +0 -4
  811. cuda/cccl/headers/include/thrust/universal_allocator.h +8 -0
  812. cuda/cccl/headers/include/thrust/universal_vector.h +9 -0
  813. cuda/cccl/headers/include/thrust/zip_function.h +2 -28
  814. cuda/compute/__init__.py +4 -0
  815. cuda/compute/_bindings.pyi +26 -3
  816. cuda/compute/_bindings_impl.pyx +143 -1
  817. cuda/compute/algorithms/__init__.py +9 -5
  818. cuda/compute/algorithms/_sort/__init__.py +23 -0
  819. cuda/compute/algorithms/{_merge_sort.py → _sort/_merge_sort.py} +10 -10
  820. cuda/compute/algorithms/{_radix_sort.py → _sort/_radix_sort.py} +9 -58
  821. cuda/compute/algorithms/_sort/_segmented_sort.py +288 -0
  822. cuda/compute/algorithms/_sort/_sort_common.py +52 -0
  823. cuda/compute/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  824. cuda/compute/cu12/cccl/libcccl.c.parallel.so +0 -0
  825. cuda/compute/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  826. cuda/compute/cu13/cccl/libcccl.c.parallel.so +0 -0
  827. cuda_cccl-0.3.4.dist-info/METADATA +78 -0
  828. {cuda_cccl-0.3.2.dist-info → cuda_cccl-0.3.4.dist-info}/RECORD +830 -867
  829. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +0 -652
  830. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +0 -1365
  831. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +0 -2144
  832. cuda/cccl/headers/include/thrust/detail/integer_math.h +0 -113
  833. cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +0 -51
  834. cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +0 -51
  835. cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +0 -51
  836. cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +0 -51
  837. cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +0 -52
  838. cuda/cccl/headers/include/thrust/system/detail/adl/count.h +0 -51
  839. cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +0 -51
  840. cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +0 -51
  841. cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +0 -51
  842. cuda/cccl/headers/include/thrust/system/detail/adl/find.h +0 -51
  843. cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +0 -51
  844. cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +0 -51
  845. cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +0 -51
  846. cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +0 -51
  847. cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +0 -51
  848. cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +0 -51
  849. cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +0 -51
  850. cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +0 -51
  851. cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +0 -51
  852. cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +0 -51
  853. cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +0 -51
  854. cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +0 -51
  855. cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +0 -51
  856. cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +0 -51
  857. cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +0 -51
  858. cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +0 -51
  859. cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +0 -51
  860. cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +0 -51
  861. cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +0 -51
  862. cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +0 -51
  863. cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +0 -51
  864. cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +0 -51
  865. cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +0 -51
  866. cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +0 -51
  867. cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +0 -51
  868. cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +0 -51
  869. cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +0 -51
  870. cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +0 -51
  871. cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +0 -51
  872. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +0 -51
  873. cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +0 -51
  874. cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +0 -51
  875. cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +0 -51
  876. cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +0 -85
  877. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +0 -119
  878. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +0 -145
  879. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +0 -116
  880. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +0 -356
  881. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +0 -124
  882. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +0 -586
  883. cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +0 -74
  884. cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +0 -59
  885. cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +0 -65
  886. cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +0 -87
  887. cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +0 -93
  888. cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +0 -102
  889. cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +0 -78
  890. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +0 -65
  891. cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +0 -103
  892. cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +0 -87
  893. cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +0 -265
  894. cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +0 -71
  895. cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +0 -75
  896. cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +0 -73
  897. cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +0 -136
  898. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +0 -91
  899. cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +0 -94
  900. cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +0 -327
  901. cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +0 -98
  902. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +0 -137
  903. cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +0 -400
  904. cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +0 -87
  905. cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +0 -312
  906. cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +0 -295
  907. cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +0 -71
  908. cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +0 -75
  909. cuda_cccl-0.3.2.dist-info/METADATA +0 -42
  910. {cuda_cccl-0.3.2.dist-info → cuda_cccl-0.3.4.dist-info}/WHEEL +0 -0
  911. {cuda_cccl-0.3.2.dist-info → cuda_cccl-0.3.4.dist-info}/licenses/LICENSE +0 -0
@@ -1,30 +1,6 @@
1
- /******************************************************************************
2
- * Copyright (c) 2011, Duane Merrill. All rights reserved.
3
- * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
4
- *
5
- * Redistribution and use in source and binary forms, with or without
6
- * modification, are permitted provided that the following conditions are met:
7
- * * Redistributions of source code must retain the above copyright
8
- * notice, this list of conditions and the following disclaimer.
9
- * * Redistributions in binary form must reproduce the above copyright
10
- * notice, this list of conditions and the following disclaimer in the
11
- * documentation and/or other materials provided with the distribution.
12
- * * Neither the name of the NVIDIA CORPORATION nor the
13
- * names of its contributors may be used to endorse or promote products
14
- * derived from this software without specific prior written permission.
15
- *
16
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
- *
27
- ******************************************************************************/
1
+ // SPDX-FileCopyrightText: Copyright (c) 2011, Duane Merrill. All rights reserved.
2
+ // SPDX-FileCopyrightText: Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
3
+ // SPDX-License-Identifier: BSD-3
28
4
 
29
5
  //! @file
30
6
  //! block_load.cuh Operations for reading linear tiles of data into the CUDA thread block.
@@ -46,6 +22,8 @@
46
22
  #include <cub/util_ptx.cuh>
47
23
  #include <cub/util_type.cuh>
48
24
 
25
+ #include <cuda/std/__new/device_new.h>
26
+
49
27
  CUB_NAMESPACE_BEGIN
50
28
 
51
29
  //! @name Blocked arrangement I/O (direct)
@@ -61,7 +39,7 @@ CUB_NAMESPACE_BEGIN
61
39
  //! @tparam T
62
40
  //! **[inferred]** The data type to load.
63
41
  //!
64
- //! @tparam ITEMS_PER_THREAD
42
+ //! @tparam ItemsPerThread
65
43
  //! **[inferred]** The number of consecutive items partitioned onto each thread.
66
44
  //!
67
45
  //! @tparam RandomAccessIterator
@@ -76,15 +54,15 @@ CUB_NAMESPACE_BEGIN
76
54
  //!
77
55
  //! @param[out] dst_items
78
56
  //! Destination to load data into
79
- template <typename T, int ITEMS_PER_THREAD, typename RandomAccessIterator>
57
+ template <typename T, int ItemsPerThread, typename RandomAccessIterator>
80
58
  _CCCL_DEVICE _CCCL_FORCEINLINE void
81
- LoadDirectBlocked(int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD])
59
+ LoadDirectBlocked(int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread])
82
60
  {
83
61
  // Load directly in thread-blocked order
84
62
  _CCCL_PRAGMA_UNROLL_FULL()
85
- for (int i = 0; i < ITEMS_PER_THREAD; i++)
63
+ for (int i = 0; i < ItemsPerThread; i++)
86
64
  {
87
- dst_items[i] = block_src_it[linear_tid * ITEMS_PER_THREAD + i];
65
+ dst_items[i] = block_src_it[linear_tid * ItemsPerThread + i];
88
66
  }
89
67
  }
90
68
 
@@ -98,7 +76,7 @@ LoadDirectBlocked(int linear_tid, RandomAccessIterator block_src_it, T (&dst_ite
98
76
  //! @tparam T
99
77
  //! **[inferred]** The data type to load.
100
78
  //!
101
- //! @tparam ITEMS_PER_THREAD
79
+ //! @tparam ItemsPerThread
102
80
  //! **[inferred]** The number of consecutive items partitioned onto each thread.
103
81
  //!
104
82
  //! @tparam RandomAccessIterator
@@ -116,14 +94,14 @@ LoadDirectBlocked(int linear_tid, RandomAccessIterator block_src_it, T (&dst_ite
116
94
  //!
117
95
  //! @param[in] block_items_end
118
96
  //! First out-of-bounds index when loading from block_src_it
119
- template <typename T, int ITEMS_PER_THREAD, typename RandomAccessIterator>
97
+ template <typename T, int ItemsPerThread, typename RandomAccessIterator>
120
98
  _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectBlocked(
121
- int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end)
99
+ int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread], int block_items_end)
122
100
  {
123
101
  _CCCL_PRAGMA_UNROLL_FULL()
124
- for (int i = 0; i < ITEMS_PER_THREAD; i++)
102
+ for (int i = 0; i < ItemsPerThread; i++)
125
103
  {
126
- const auto src_pos = linear_tid * ITEMS_PER_THREAD + i;
104
+ const auto src_pos = linear_tid * ItemsPerThread + i;
127
105
  if (src_pos < block_items_end)
128
106
  {
129
107
  dst_items[i] = block_src_it[src_pos];
@@ -142,7 +120,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectBlocked(
142
120
  //! @tparam T
143
121
  //! **[inferred]** The data type to load.
144
122
  //!
145
- //! @tparam ITEMS_PER_THREAD
123
+ //! @tparam ItemsPerThread
146
124
  //! **[inferred]** The number of consecutive items partitioned onto each thread.
147
125
  //!
148
126
  //! @tparam RandomAccessIterator
@@ -163,16 +141,16 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectBlocked(
163
141
  //!
164
142
  //! @param[in] oob_default
165
143
  //! Default value to assign out-of-bound items
166
- template <typename T, typename DefaultT, int ITEMS_PER_THREAD, typename RandomAccessIterator>
144
+ template <typename T, typename DefaultT, int ItemsPerThread, typename RandomAccessIterator>
167
145
  _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectBlocked(
168
146
  int linear_tid,
169
147
  RandomAccessIterator block_src_it,
170
- T (&dst_items)[ITEMS_PER_THREAD],
148
+ T (&dst_items)[ItemsPerThread],
171
149
  int block_items_end,
172
150
  DefaultT oob_default)
173
151
  {
174
152
  _CCCL_PRAGMA_UNROLL_FULL()
175
- for (int i = 0; i < ITEMS_PER_THREAD; i++)
153
+ for (int i = 0; i < ItemsPerThread; i++)
176
154
  {
177
155
  dst_items[i] = oob_default;
178
156
  }
@@ -193,9 +171,9 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectBlocked(
193
171
  //!
194
172
  //! @param[out] dst_items
195
173
  //! Destination to load data into
196
- template <CacheLoadModifier MODIFIER, typename T, int ITEMS_PER_THREAD>
174
+ template <CacheLoadModifier MODIFIER, typename T, int ItemsPerThread>
197
175
  _CCCL_DEVICE _CCCL_FORCEINLINE void
198
- InternalLoadDirectBlockedVectorized(int linear_tid, const T* block_src_ptr, T (&dst_items)[ITEMS_PER_THREAD])
176
+ InternalLoadDirectBlockedVectorized(int linear_tid, const T* block_src_ptr, T (&dst_items)[ItemsPerThread])
199
177
  {
200
178
  // Find biggest memory access word that T is a whole multiple of
201
179
  using device_word_t = typename UnitWord<T>::DeviceWord;
@@ -226,7 +204,7 @@ InternalLoadDirectBlockedVectorized(int linear_tid, const T* block_src_ptr, T (&
226
204
 
227
205
  // Copy to destination
228
206
  _CCCL_PRAGMA_UNROLL_FULL()
229
- for (int i = 0; i < ITEMS_PER_THREAD; i++)
207
+ for (int i = 0; i < ItemsPerThread; i++)
230
208
  {
231
209
  dst_items[i] = *(reinterpret_cast<T*>(vec_items) + i);
232
210
  }
@@ -248,7 +226,7 @@ InternalLoadDirectBlockedVectorized(int linear_tid, const T* block_src_ptr, T (&
248
226
  //!
249
227
  //! The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT:
250
228
  //!
251
- //! - ``ITEMS_PER_THREAD`` is odd
229
+ //! - ``ItemsPerThread`` is odd
252
230
  //! - The data type ``T`` is not a built-in primitive or CUDA vector type
253
231
  //! (e.g., ``short``, ``int2``, ``double``, ``float2``, etc.)
254
232
  //!
@@ -257,7 +235,7 @@ InternalLoadDirectBlockedVectorized(int linear_tid, const T* block_src_ptr, T (&
257
235
  //! @tparam T
258
236
  //! **[inferred]** The data type to load.
259
237
  //!
260
- //! @tparam ITEMS_PER_THREAD
238
+ //! @tparam ItemsPerThread
261
239
  //! **[inferred]** The number of consecutive items partitioned onto each thread.
262
240
  //!
263
241
  //! @param[in] linear_tid
@@ -269,9 +247,9 @@ InternalLoadDirectBlockedVectorized(int linear_tid, const T* block_src_ptr, T (&
269
247
  //!
270
248
  //! @param[out] dst_items
271
249
  //! destination to load data into
272
- template <typename T, int ITEMS_PER_THREAD>
250
+ template <typename T, int ItemsPerThread>
273
251
  _CCCL_DEVICE _CCCL_FORCEINLINE void
274
- LoadDirectBlockedVectorized(int linear_tid, T* block_src_ptr, T (&dst_items)[ITEMS_PER_THREAD])
252
+ LoadDirectBlockedVectorized(int linear_tid, T* block_src_ptr, T (&dst_items)[ItemsPerThread])
275
253
  {
276
254
  InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, block_src_ptr, dst_items);
277
255
  }
@@ -287,13 +265,13 @@ LoadDirectBlockedVectorized(int linear_tid, T* block_src_ptr, T (&dst_items)[ITE
287
265
  //!
288
266
  //! @endrst
289
267
  //!
290
- //! @tparam BLOCK_THREADS
268
+ //! @tparam BlockThreads
291
269
  //! The thread block size in threads
292
270
  //!
293
271
  //! @tparam T
294
272
  //! **[inferred]** The data type to load.
295
273
  //!
296
- //! @tparam ITEMS_PER_THREAD
274
+ //! @tparam ItemsPerThread
297
275
  //! **[inferred]** The number of consecutive items partitioned onto each thread.
298
276
  //!
299
277
  //! @tparam RandomAccessIterator
@@ -308,27 +286,27 @@ LoadDirectBlockedVectorized(int linear_tid, T* block_src_ptr, T (&dst_items)[ITE
308
286
  //!
309
287
  //! @param[out] dst_items
310
288
  //! Destination to load data into
311
- template <int BLOCK_THREADS, typename T, int ITEMS_PER_THREAD, typename RandomAccessIterator>
289
+ template <int BlockThreads, typename T, int ItemsPerThread, typename RandomAccessIterator>
312
290
  _CCCL_DEVICE _CCCL_FORCEINLINE void
313
- LoadDirectStriped(int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD])
291
+ LoadDirectStriped(int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread])
314
292
  {
315
293
  _CCCL_PRAGMA_UNROLL_FULL()
316
- for (int i = 0; i < ITEMS_PER_THREAD; i++)
294
+ for (int i = 0; i < ItemsPerThread; i++)
317
295
  {
318
- dst_items[i] = block_src_it[linear_tid + i * BLOCK_THREADS];
296
+ dst_items[i] = block_src_it[linear_tid + i * BlockThreads];
319
297
  }
320
298
  }
321
299
 
322
300
  namespace detail
323
301
  {
324
- template <int BLOCK_THREADS, typename T, int ITEMS_PER_THREAD, typename RandomAccessIterator, typename TransformOpT>
302
+ template <int BlockThreads, typename T, int ItemsPerThread, typename RandomAccessIterator, typename TransformOpT>
325
303
  _CCCL_DEVICE _CCCL_FORCEINLINE void load_transform_direct_striped(
326
- int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], TransformOpT transform_op)
304
+ int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread], TransformOpT transform_op)
327
305
  {
328
306
  _CCCL_PRAGMA_UNROLL_FULL()
329
- for (int i = 0; i < ITEMS_PER_THREAD; i++)
307
+ for (int i = 0; i < ItemsPerThread; i++)
330
308
  {
331
- dst_items[i] = transform_op(block_src_it[linear_tid + i * BLOCK_THREADS]);
309
+ dst_items[i] = transform_op(block_src_it[linear_tid + i * BlockThreads]);
332
310
  }
333
311
  }
334
312
  } // namespace detail
@@ -340,13 +318,13 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void load_transform_direct_striped(
340
318
  //!
341
319
  //! @endrst
342
320
  //!
343
- //! @tparam BLOCK_THREADS
321
+ //! @tparam BlockThreads
344
322
  //! The thread block size in threads
345
323
  //!
346
324
  //! @tparam T
347
325
  //! **inferred** The data type to load.
348
326
  //!
349
- //! @tparam ITEMS_PER_THREAD
327
+ //! @tparam ItemsPerThread
350
328
  //! **inferred** The number of consecutive items partitioned onto each thread.
351
329
  //!
352
330
  //! @tparam RandomAccessIterator
@@ -364,14 +342,14 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void load_transform_direct_striped(
364
342
  //!
365
343
  //! @param[in] block_items_end
366
344
  //! Number of valid items to load
367
- template <int BLOCK_THREADS, typename T, int ITEMS_PER_THREAD, typename RandomAccessIterator>
345
+ template <int BlockThreads, typename T, int ItemsPerThread, typename RandomAccessIterator>
368
346
  _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectStriped(
369
- int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end)
347
+ int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread], int block_items_end)
370
348
  {
371
349
  _CCCL_PRAGMA_UNROLL_FULL()
372
- for (int i = 0; i < ITEMS_PER_THREAD; i++)
350
+ for (int i = 0; i < ItemsPerThread; i++)
373
351
  {
374
- const auto src_pos = linear_tid + i * BLOCK_THREADS;
352
+ const auto src_pos = linear_tid + i * BlockThreads;
375
353
  if (src_pos < block_items_end)
376
354
  {
377
355
  dst_items[i] = block_src_it[src_pos];
@@ -387,13 +365,13 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectStriped(
387
365
  //!
388
366
  //! @endrst
389
367
  //!
390
- //! @tparam BLOCK_THREADS
368
+ //! @tparam BlockThreads
391
369
  //! The thread block size in threads
392
370
  //!
393
371
  //! @tparam T
394
372
  //! **inferred** The data type to load.
395
373
  //!
396
- //! @tparam ITEMS_PER_THREAD
374
+ //! @tparam ItemsPerThread
397
375
  //! **inferred** The number of consecutive items partitioned onto each thread.
398
376
  //!
399
377
  //! @tparam RandomAccessIterator
@@ -414,21 +392,21 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectStriped(
414
392
  //!
415
393
  //! @param[in] oob_default
416
394
  //! Default value to assign out-of-bound items
417
- template <int BLOCK_THREADS, typename T, typename DefaultT, int ITEMS_PER_THREAD, typename RandomAccessIterator>
395
+ template <int BlockThreads, typename T, typename DefaultT, int ItemsPerThread, typename RandomAccessIterator>
418
396
  _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectStriped(
419
397
  int linear_tid,
420
398
  RandomAccessIterator block_src_it,
421
- T (&dst_items)[ITEMS_PER_THREAD],
399
+ T (&dst_items)[ItemsPerThread],
422
400
  int block_items_end,
423
401
  DefaultT oob_default)
424
402
  {
425
403
  _CCCL_PRAGMA_UNROLL_FULL()
426
- for (int i = 0; i < ITEMS_PER_THREAD; i++)
404
+ for (int i = 0; i < ItemsPerThread; i++)
427
405
  {
428
406
  dst_items[i] = oob_default;
429
407
  }
430
408
 
431
- LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_src_it, dst_items, block_items_end);
409
+ LoadDirectStriped<BlockThreads>(linear_tid, block_src_it, dst_items, block_items_end);
432
410
  }
433
411
 
434
412
  //! @} end member group
@@ -450,7 +428,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectStriped(
450
428
  //! @tparam T
451
429
  //! **inferred** The data type to load.
452
430
  //!
453
- //! @tparam ITEMS_PER_THREAD
431
+ //! @tparam ItemsPerThread
454
432
  //! **inferred** The number of consecutive items partitioned onto each thread.
455
433
  //!
456
434
  //! @tparam RandomAccessIterator
@@ -465,17 +443,17 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectStriped(
465
443
  //!
466
444
  //! @param[out] dst_items
467
445
  //! Destination to load data into
468
- template <typename T, int ITEMS_PER_THREAD, typename RandomAccessIterator>
446
+ template <typename T, int ItemsPerThread, typename RandomAccessIterator>
469
447
  _CCCL_DEVICE _CCCL_FORCEINLINE void
470
- LoadDirectWarpStriped(int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD])
448
+ LoadDirectWarpStriped(int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread])
471
449
  {
472
450
  const int tid = linear_tid & (detail::warp_threads - 1);
473
451
  const int wid = linear_tid >> detail::log2_warp_threads;
474
- const int warp_offset = wid * detail::warp_threads * ITEMS_PER_THREAD;
452
+ const int warp_offset = wid * detail::warp_threads * ItemsPerThread;
475
453
 
476
454
  // Load directly in warp-striped order
477
455
  _CCCL_PRAGMA_UNROLL_FULL()
478
- for (int i = 0; i < ITEMS_PER_THREAD; i++)
456
+ for (int i = 0; i < ItemsPerThread; i++)
479
457
  {
480
458
  new (&dst_items[i]) T(block_src_it[warp_offset + tid + (i * detail::warp_threads)]);
481
459
  }
@@ -496,7 +474,7 @@ LoadDirectWarpStriped(int linear_tid, RandomAccessIterator block_src_it, T (&dst
496
474
  //! @tparam T
497
475
  //! **inferred** The data type to load.
498
476
  //!
499
- //! @tparam ITEMS_PER_THREAD
477
+ //! @tparam ItemsPerThread
500
478
  //! **inferred** The number of consecutive items partitioned onto each thread.
501
479
  //!
502
480
  //! @tparam RandomAccessIterator
@@ -514,17 +492,17 @@ LoadDirectWarpStriped(int linear_tid, RandomAccessIterator block_src_it, T (&dst
514
492
  //!
515
493
  //! @param[in] block_items_end
516
494
  //! Number of valid items to load
517
- template <typename T, int ITEMS_PER_THREAD, typename RandomAccessIterator>
495
+ template <typename T, int ItemsPerThread, typename RandomAccessIterator>
518
496
  _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectWarpStriped(
519
- int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end)
497
+ int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread], int block_items_end)
520
498
  {
521
499
  const int tid = linear_tid & (detail::warp_threads - 1);
522
500
  const int wid = linear_tid >> detail::log2_warp_threads;
523
- const int warp_offset = wid * detail::warp_threads * ITEMS_PER_THREAD;
501
+ const int warp_offset = wid * detail::warp_threads * ItemsPerThread;
524
502
 
525
503
  // Load directly in warp-striped order
526
504
  _CCCL_PRAGMA_UNROLL_FULL()
527
- for (int i = 0; i < ITEMS_PER_THREAD; i++)
505
+ for (int i = 0; i < ItemsPerThread; i++)
528
506
  {
529
507
  const auto src_pos = warp_offset + tid + (i * detail::warp_threads);
530
508
  if (src_pos < block_items_end)
@@ -550,7 +528,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectWarpStriped(
550
528
  //! @tparam T
551
529
  //! **inferred** The data type to load.
552
530
  //!
553
- //! @tparam ITEMS_PER_THREAD
531
+ //! @tparam ItemsPerThread
554
532
  //! **inferred** The number of consecutive items partitioned onto each thread.
555
533
  //!
556
534
  //! @tparam RandomAccessIterator
@@ -571,17 +549,17 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectWarpStriped(
571
549
  //!
572
550
  //! @param[in] oob_default
573
551
  //! Default value to assign out-of-bound items
574
- template <typename T, typename DefaultT, int ITEMS_PER_THREAD, typename RandomAccessIterator>
552
+ template <typename T, typename DefaultT, int ItemsPerThread, typename RandomAccessIterator>
575
553
  _CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectWarpStriped(
576
554
  int linear_tid,
577
555
  RandomAccessIterator block_src_it,
578
- T (&dst_items)[ITEMS_PER_THREAD],
556
+ T (&dst_items)[ItemsPerThread],
579
557
  int block_items_end,
580
558
  DefaultT oob_default)
581
559
  {
582
560
  // Load directly in warp-striped order
583
561
  _CCCL_PRAGMA_UNROLL_FULL()
584
- for (int i = 0; i < ITEMS_PER_THREAD; i++)
562
+ for (int i = 0; i < ItemsPerThread; i++)
585
563
  {
586
564
  dst_items[i] = oob_default;
587
565
  }
@@ -629,7 +607,7 @@ enum BlockLoadAlgorithm
629
607
  //!
630
608
  //! A :ref:`blocked arrangement <flexible-data-arrangement>` of data is read from memory using CUDA's built-in
631
609
  //! vectorized loads as a coalescing optimization. For example, ``ld.global.v4.s32`` instructions will be generated
632
- //! when ``T = int`` and ``ITEMS_PER_THREAD % 4 == 0``.
610
+ //! when ``T = int`` and ``ItemsPerThread % 4 == 0``.
633
611
  //!
634
612
  //! Performance Considerations
635
613
  //! ++++++++++++++++++++++++++
@@ -639,7 +617,7 @@ enum BlockLoadAlgorithm
639
617
  //! is lower).
640
618
  //! - The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT:
641
619
  //!
642
- //! - ``ITEMS_PER_THREAD`` is odd
620
+ //! - ``ItemsPerThread`` is odd
643
621
  //! - The ``RandomAccessIterator`` is not a simple pointer type
644
622
  //! - The block input offset is not quadword-aligned
645
623
  //! - The data type ``T`` is not a built-in primitive or CUDA vector type
@@ -675,7 +653,7 @@ enum BlockLoadAlgorithm
675
653
  //! Usage Considerations
676
654
  //! ++++++++++++++++++++++++++
677
655
  //!
678
- //! - BLOCK_THREADS must be a multiple of WARP_THREADS
656
+ //! - BlockThreads must be a multiple of WARP_THREADS
679
657
  //!
680
658
  //! Performance Considerations
681
659
  //! ++++++++++++++++++++++++++
@@ -701,7 +679,7 @@ enum BlockLoadAlgorithm
701
679
  //! Usage Considerations
702
680
  //! ++++++++++++++++++++++++++
703
681
  //!
704
- //! - BLOCK_THREADS must be a multiple of WARP_THREADS
682
+ //! - BlockThreads must be a multiple of WARP_THREADS
705
683
  //!
706
684
  //! Performance Considerations
707
685
  //! ++++++++++++++++++++++++++
@@ -791,7 +769,7 @@ enum BlockLoadAlgorithm
791
769
  //! @tparam BLOCK_DIM_X
792
770
  //! The thread block length in threads along the X dimension
793
771
  //!
794
- //! @tparam ITEMS_PER_THREAD
772
+ //! @tparam ItemsPerThread
795
773
  //! The number of consecutive items partitioned onto each thread.
796
774
  //!
797
775
  //! @tparam ALGORITHM
@@ -804,20 +782,20 @@ enum BlockLoadAlgorithm
804
782
  //! **[optional]** The thread block length in threads along the Z dimension (default: 1)
805
783
  //!
806
784
  template <typename T,
807
- int BLOCK_DIM_X,
808
- int ITEMS_PER_THREAD,
809
- BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT,
810
- int BLOCK_DIM_Y = 1,
811
- int BLOCK_DIM_Z = 1>
785
+ int BlockDimX,
786
+ int ItemsPerThread,
787
+ BlockLoadAlgorithm Algorithm = BLOCK_LOAD_DIRECT,
788
+ int BlockDimY = 1,
789
+ int BlockDimZ = 1>
812
790
  class BlockLoad
813
791
  {
814
- static constexpr int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z; // total threads in the block
792
+ static constexpr int BlockThreads = BlockDimX * BlockDimY * BlockDimZ; // total threads in the block
815
793
 
816
- template <BlockLoadAlgorithm _POLICY, int DUMMY>
794
+ template <BlockLoadAlgorithm _POLICY, int Dummy>
817
795
  struct LoadInternal; // helper to dispatch the load algorithm
818
796
 
819
- template <int DUMMY>
820
- struct LoadInternal<BLOCK_LOAD_DIRECT, DUMMY>
797
+ template <int Dummy>
798
+ struct LoadInternal<BLOCK_LOAD_DIRECT, Dummy>
821
799
  {
822
800
  using TempStorage = NullType;
823
801
  int linear_tid;
@@ -827,28 +805,28 @@ class BlockLoad
827
805
  {}
828
806
 
829
807
  template <typename RandomAccessIterator>
830
- _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD])
808
+ _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread])
831
809
  {
832
810
  LoadDirectBlocked(linear_tid, block_src_it, dst_items);
833
811
  }
834
812
 
835
813
  template <typename RandomAccessIterator>
836
814
  _CCCL_DEVICE _CCCL_FORCEINLINE void
837
- Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end)
815
+ Load(RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread], int block_items_end)
838
816
  {
839
817
  LoadDirectBlocked(linear_tid, block_src_it, dst_items, block_items_end);
840
818
  }
841
819
 
842
820
  template <typename RandomAccessIterator, typename DefaultT>
843
821
  _CCCL_DEVICE _CCCL_FORCEINLINE void
844
- Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default)
822
+ Load(RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread], int block_items_end, DefaultT oob_default)
845
823
  {
846
824
  LoadDirectBlocked(linear_tid, block_src_it, dst_items, block_items_end, oob_default);
847
825
  }
848
826
  };
849
827
 
850
- template <int DUMMY>
851
- struct LoadInternal<BLOCK_LOAD_STRIPED, DUMMY>
828
+ template <int Dummy>
829
+ struct LoadInternal<BLOCK_LOAD_STRIPED, Dummy>
852
830
  {
853
831
  using TempStorage = NullType;
854
832
  int linear_tid;
@@ -858,28 +836,28 @@ class BlockLoad
858
836
  {}
859
837
 
860
838
  template <typename RandomAccessIterator>
861
- _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD])
839
+ _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread])
862
840
  {
863
- LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_src_it, dst_items);
841
+ LoadDirectStriped<BlockThreads>(linear_tid, block_src_it, dst_items);
864
842
  }
865
843
 
866
844
  template <typename RandomAccessIterator>
867
845
  _CCCL_DEVICE _CCCL_FORCEINLINE void
868
- Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end)
846
+ Load(RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread], int block_items_end)
869
847
  {
870
- LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_src_it, dst_items, block_items_end);
848
+ LoadDirectStriped<BlockThreads>(linear_tid, block_src_it, dst_items, block_items_end);
871
849
  }
872
850
 
873
851
  template <typename RandomAccessIterator, typename DefaultT>
874
852
  _CCCL_DEVICE _CCCL_FORCEINLINE void
875
- Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default)
853
+ Load(RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread], int block_items_end, DefaultT oob_default)
876
854
  {
877
- LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_src_it, dst_items, block_items_end, oob_default);
855
+ LoadDirectStriped<BlockThreads>(linear_tid, block_src_it, dst_items, block_items_end, oob_default);
878
856
  }
879
857
  };
880
858
 
881
- template <int DUMMY>
882
- struct LoadInternal<BLOCK_LOAD_VECTORIZE, DUMMY>
859
+ template <int Dummy>
860
+ struct LoadInternal<BLOCK_LOAD_VECTORIZE, Dummy>
883
861
  {
884
862
  using TempStorage = NullType;
885
863
  int linear_tid;
@@ -889,7 +867,7 @@ class BlockLoad
889
867
  {}
890
868
 
891
869
  // attempts vectorization (pointer)
892
- _CCCL_DEVICE _CCCL_FORCEINLINE void Load(const T* block_ptr, T (&dst_items)[ITEMS_PER_THREAD])
870
+ _CCCL_DEVICE _CCCL_FORCEINLINE void Load(const T* block_ptr, T (&dst_items)[ItemsPerThread])
893
871
  {
894
872
  InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, block_ptr, dst_items);
895
873
  }
@@ -898,14 +876,14 @@ class BlockLoad
898
876
  // Otherwise, when the pointer type is 'T*', the compiler will prefer the overloaded version
899
877
  // Load(RandomAccessIterator...) over Load(const T*...), which means it will never perform vectorized loading for
900
878
  // pointers to non-const types.
901
- _CCCL_DEVICE _CCCL_FORCEINLINE void Load(T* block_ptr, T (&dst_items)[ITEMS_PER_THREAD])
879
+ _CCCL_DEVICE _CCCL_FORCEINLINE void Load(T* block_ptr, T (&dst_items)[ItemsPerThread])
902
880
  {
903
881
  InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, block_ptr, dst_items);
904
882
  }
905
883
 
906
884
  // any other iterator, no vectorization
907
885
  template <typename RandomAccessIterator>
908
- _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD])
886
+ _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread])
909
887
  {
910
888
  LoadDirectBlocked(linear_tid, block_src_it, dst_items);
911
889
  }
@@ -913,7 +891,7 @@ class BlockLoad
913
891
  // attempts vectorization (cache modified iterator)
914
892
  template <CacheLoadModifier MODIFIER, typename ValueType, typename OffsetT>
915
893
  _CCCL_DEVICE _CCCL_FORCEINLINE void
916
- Load(CacheModifiedInputIterator<MODIFIER, ValueType, OffsetT> block_src_it, T (&dst_items)[ITEMS_PER_THREAD])
894
+ Load(CacheModifiedInputIterator<MODIFIER, ValueType, OffsetT> block_src_it, T (&dst_items)[ItemsPerThread])
917
895
  {
918
896
  InternalLoadDirectBlockedVectorized<MODIFIER>(linear_tid, block_src_it.ptr, dst_items);
919
897
  }
@@ -921,7 +899,7 @@ class BlockLoad
921
899
  // skips vectorization
922
900
  template <typename RandomAccessIterator>
923
901
  _CCCL_DEVICE _CCCL_FORCEINLINE void
924
- Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end)
902
+ Load(RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread], int block_items_end)
925
903
  {
926
904
  LoadDirectBlocked(linear_tid, block_src_it, dst_items, block_items_end);
927
905
  }
@@ -929,16 +907,16 @@ class BlockLoad
929
907
  // skips vectorization
930
908
  template <typename RandomAccessIterator, typename DefaultT>
931
909
  _CCCL_DEVICE _CCCL_FORCEINLINE void
932
- Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default)
910
+ Load(RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread], int block_items_end, DefaultT oob_default)
933
911
  {
934
912
  LoadDirectBlocked(linear_tid, block_src_it, dst_items, block_items_end, oob_default);
935
913
  }
936
914
  };
937
915
 
938
- template <int DUMMY>
939
- struct LoadInternal<BLOCK_LOAD_TRANSPOSE, DUMMY>
916
+ template <int Dummy>
917
+ struct LoadInternal<BLOCK_LOAD_TRANSPOSE, Dummy>
940
918
  {
941
- using BlockExchange = BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z>;
919
+ using BlockExchange = BlockExchange<T, BlockDimX, ItemsPerThread, false, BlockDimY, BlockDimZ>;
942
920
  using _TempStorage = typename BlockExchange::TempStorage;
943
921
  using TempStorage = Uninitialized<_TempStorage>;
944
922
 
@@ -951,36 +929,36 @@ class BlockLoad
951
929
  {}
952
930
 
953
931
  template <typename RandomAccessIterator>
954
- _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD])
932
+ _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread])
955
933
  {
956
- LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_src_it, dst_items);
934
+ LoadDirectStriped<BlockThreads>(linear_tid, block_src_it, dst_items);
957
935
  BlockExchange(temp_storage).StripedToBlocked(dst_items, dst_items);
958
936
  }
959
937
 
960
938
  template <typename RandomAccessIterator>
961
939
  _CCCL_DEVICE _CCCL_FORCEINLINE void
962
- Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end)
940
+ Load(RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread], int block_items_end)
963
941
  {
964
- LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_src_it, dst_items, block_items_end);
942
+ LoadDirectStriped<BlockThreads>(linear_tid, block_src_it, dst_items, block_items_end);
965
943
  BlockExchange(temp_storage).StripedToBlocked(dst_items, dst_items);
966
944
  }
967
945
 
968
946
  template <typename RandomAccessIterator, typename DefaultT>
969
947
  _CCCL_DEVICE _CCCL_FORCEINLINE void
970
- Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default)
948
+ Load(RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread], int block_items_end, DefaultT oob_default)
971
949
  {
972
- LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_src_it, dst_items, block_items_end, oob_default);
950
+ LoadDirectStriped<BlockThreads>(linear_tid, block_src_it, dst_items, block_items_end, oob_default);
973
951
  BlockExchange(temp_storage).StripedToBlocked(dst_items, dst_items);
974
952
  }
975
953
  };
976
954
 
977
- template <int DUMMY>
978
- struct LoadInternal<BLOCK_LOAD_WARP_TRANSPOSE, DUMMY>
955
+ template <int Dummy>
956
+ struct LoadInternal<BLOCK_LOAD_WARP_TRANSPOSE, Dummy>
979
957
  {
980
958
  static constexpr int WARP_THREADS = detail::warp_threads;
981
- static_assert(BLOCK_THREADS % WARP_THREADS == 0, "BLOCK_THREADS must be a multiple of WARP_THREADS");
959
+ static_assert(BlockThreads % WARP_THREADS == 0, "BlockThreads must be a multiple of WARP_THREADS");
982
960
 
983
- using BlockExchange = BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z>;
961
+ using BlockExchange = BlockExchange<T, BlockDimX, ItemsPerThread, false, BlockDimY, BlockDimZ>;
984
962
  using _TempStorage = typename BlockExchange::TempStorage;
985
963
  using TempStorage = Uninitialized<_TempStorage>;
986
964
 
@@ -993,7 +971,7 @@ class BlockLoad
993
971
  {}
994
972
 
995
973
  template <typename RandomAccessIterator>
996
- _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD])
974
+ _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread])
997
975
  {
998
976
  LoadDirectWarpStriped(linear_tid, block_src_it, dst_items);
999
977
  BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items);
@@ -1001,7 +979,7 @@ class BlockLoad
1001
979
 
1002
980
  template <typename RandomAccessIterator>
1003
981
  _CCCL_DEVICE _CCCL_FORCEINLINE void
1004
- Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end)
982
+ Load(RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread], int block_items_end)
1005
983
  {
1006
984
  LoadDirectWarpStriped(linear_tid, block_src_it, dst_items, block_items_end);
1007
985
  BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items);
@@ -1009,20 +987,20 @@ class BlockLoad
1009
987
 
1010
988
  template <typename RandomAccessIterator, typename DefaultT>
1011
989
  _CCCL_DEVICE _CCCL_FORCEINLINE void
1012
- Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default)
990
+ Load(RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread], int block_items_end, DefaultT oob_default)
1013
991
  {
1014
992
  LoadDirectWarpStriped(linear_tid, block_src_it, dst_items, block_items_end, oob_default);
1015
993
  BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items);
1016
994
  }
1017
995
  };
1018
996
 
1019
- template <int DUMMY>
1020
- struct LoadInternal<BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY>
997
+ template <int Dummy>
998
+ struct LoadInternal<BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, Dummy>
1021
999
  {
1022
1000
  static constexpr int WARP_THREADS = detail::warp_threads;
1023
- static_assert(BLOCK_THREADS % WARP_THREADS == 0, "BLOCK_THREADS must be a multiple of WARP_THREADS");
1001
+ static_assert(BlockThreads % WARP_THREADS == 0, "BlockThreads must be a multiple of WARP_THREADS");
1024
1002
 
1025
- using BlockExchange = BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, true, BLOCK_DIM_Y, BLOCK_DIM_Z>;
1003
+ using BlockExchange = BlockExchange<T, BlockDimX, ItemsPerThread, true, BlockDimY, BlockDimZ>;
1026
1004
  using _TempStorage = typename BlockExchange::TempStorage;
1027
1005
  using TempStorage = Uninitialized<_TempStorage>;
1028
1006
 
@@ -1035,7 +1013,7 @@ class BlockLoad
1035
1013
  {}
1036
1014
 
1037
1015
  template <typename RandomAccessIterator>
1038
- _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD])
1016
+ _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread])
1039
1017
  {
1040
1018
  LoadDirectWarpStriped(linear_tid, block_src_it, dst_items);
1041
1019
  BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items);
@@ -1043,7 +1021,7 @@ class BlockLoad
1043
1021
 
1044
1022
  template <typename RandomAccessIterator>
1045
1023
  _CCCL_DEVICE _CCCL_FORCEINLINE void
1046
- Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end)
1024
+ Load(RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread], int block_items_end)
1047
1025
  {
1048
1026
  LoadDirectWarpStriped(linear_tid, block_src_it, dst_items, block_items_end);
1049
1027
  BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items);
@@ -1051,14 +1029,14 @@ class BlockLoad
1051
1029
 
1052
1030
  template <typename RandomAccessIterator, typename DefaultT>
1053
1031
  _CCCL_DEVICE _CCCL_FORCEINLINE void
1054
- Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default)
1032
+ Load(RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread], int block_items_end, DefaultT oob_default)
1055
1033
  {
1056
1034
  LoadDirectWarpStriped(linear_tid, block_src_it, dst_items, block_items_end, oob_default);
1057
1035
  BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items);
1058
1036
  }
1059
1037
  };
1060
1038
 
1061
- using InternalLoad = LoadInternal<ALGORITHM, 0>; // load implementation to use
1039
+ using InternalLoad = LoadInternal<Algorithm, 0>; // load implementation to use
1062
1040
  using _TempStorage = typename InternalLoad::TempStorage;
1063
1041
 
1064
1042
  // Internal storage allocator
@@ -1081,14 +1059,14 @@ public:
1081
1059
  /// @brief Collective constructor using a private static allocation of shared memory as temporary storage.
1082
1060
  _CCCL_DEVICE _CCCL_FORCEINLINE BlockLoad()
1083
1061
  : temp_storage(PrivateStorage())
1084
- , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
1062
+ , linear_tid(RowMajorTid(BlockDimX, BlockDimY, BlockDimZ))
1085
1063
  {}
1086
1064
 
1087
1065
  /// @brief Collective constructor using the specified memory allocation as temporary storage.
1088
1066
  /// @param[in] temp_storage Reference to memory allocation having layout type TempStorage
1089
1067
  _CCCL_DEVICE _CCCL_FORCEINLINE BlockLoad(TempStorage& temp_storage)
1090
1068
  : temp_storage(temp_storage.Alias())
1091
- , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
1069
+ , linear_tid(RowMajorTid(BlockDimX, BlockDimY, BlockDimZ))
1092
1070
  {}
1093
1071
 
1094
1072
  //! @} end member group
@@ -1137,7 +1115,7 @@ public:
1137
1115
  //! @param[out] dst_items
1138
1116
  //! Destination to load data into
1139
1117
  template <typename RandomAccessIterator>
1140
- _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD])
1118
+ _CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread])
1141
1119
  {
1142
1120
  InternalLoad(temp_storage, linear_tid).Load(block_src_it, dst_items);
1143
1121
  }
@@ -1191,7 +1169,7 @@ public:
1191
1169
  //! Number of valid items to load
1192
1170
  template <typename RandomAccessIterator>
1193
1171
  _CCCL_DEVICE _CCCL_FORCEINLINE void
1194
- Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end)
1172
+ Load(RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread], int block_items_end)
1195
1173
  {
1196
1174
  InternalLoad(temp_storage, linear_tid).Load(block_src_it, dst_items, block_items_end);
1197
1175
  }
@@ -1247,7 +1225,7 @@ public:
1247
1225
  //! Default value to assign out-of-bound items
1248
1226
  template <typename RandomAccessIterator, typename DefaultT>
1249
1227
  _CCCL_DEVICE _CCCL_FORCEINLINE void
1250
- Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default)
1228
+ Load(RandomAccessIterator block_src_it, T (&dst_items)[ItemsPerThread], int block_items_end, DefaultT oob_default)
1251
1229
  {
1252
1230
  InternalLoad(temp_storage, linear_tid).Load(block_src_it, dst_items, block_items_end, oob_default);
1253
1231
  }