cuda-cccl 0.2.1__cp311-cp311-manylinux_2_26_x86_64.whl → 0.3.1__cp311-cp311-manylinux_2_26_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (418)
  1. cuda/cccl/cooperative/__init__.py +7 -1
  2. cuda/cccl/cooperative/experimental/__init__.py +21 -5
  3. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
  4. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
  5. cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
  6. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +88 -80
  7. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
  8. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +25 -5
  9. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
  10. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
  11. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
  12. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +2 -18
  13. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
  14. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +27 -0
  15. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
  16. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
  17. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
  18. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +24 -19
  19. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +14 -3
  20. cuda/cccl/headers/include/cub/agent/agent_topk.cuh +764 -0
  21. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
  22. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +94 -13
  23. cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
  24. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +20 -6
  25. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +3 -2
  26. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2 -2
  27. cuda/cccl/headers/include/cub/cub.cuh +8 -0
  28. cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +13 -32
  29. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +62 -0
  30. cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +3 -3
  31. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +18 -26
  32. cuda/cccl/headers/include/cub/detail/ptx-json/value.h +3 -2
  33. cuda/cccl/headers/include/cub/device/device_copy.cuh +116 -27
  34. cuda/cccl/headers/include/cub/device/device_for.cuh +2 -10
  35. cuda/cccl/headers/include/cub/device/device_histogram.cuh +8 -8
  36. cuda/cccl/headers/include/cub/device/device_partition.cuh +5 -1
  37. cuda/cccl/headers/include/cub/device/device_reduce.cuh +775 -163
  38. cuda/cccl/headers/include/cub/device/device_scan.cuh +306 -0
  39. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +158 -246
  40. cuda/cccl/headers/include/cub/device/device_topk.cuh +511 -0
  41. cuda/cccl/headers/include/cub/device/device_transform.cuh +11 -2
  42. cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
  43. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +47 -48
  44. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
  45. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +9 -27
  46. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -6
  47. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
  48. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +320 -262
  49. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +100 -171
  50. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +629 -0
  51. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +8 -1
  52. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +31 -29
  53. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
  54. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +6 -15
  55. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +57 -10
  56. cuda/cccl/headers/include/cub/device/dispatch/kernels/three_way_partition.cuh +80 -0
  57. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +39 -15
  58. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
  59. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
  60. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
  61. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
  62. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +5 -20
  63. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
  64. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
  65. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
  66. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
  67. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +12 -5
  68. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
  69. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +204 -55
  70. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +37 -4
  71. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +85 -0
  72. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +55 -19
  73. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
  74. cuda/cccl/headers/include/cub/util_device.cuh +72 -51
  75. cuda/cccl/headers/include/cub/util_ptx.cuh +8 -8
  76. cuda/cccl/headers/include/cub/util_type.cuh +15 -20
  77. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
  78. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
  79. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
  80. cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
  81. cuda/cccl/headers/include/cuda/__algorithm/copy.h +64 -11
  82. cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
  83. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +3 -1
  84. cuda/cccl/headers/include/cuda/__device/all_devices.h +47 -147
  85. cuda/cccl/headers/include/cuda/__device/arch_traits.h +51 -49
  86. cuda/cccl/headers/include/cuda/__device/attributes.h +177 -127
  87. cuda/cccl/headers/include/cuda/__device/device_ref.h +38 -48
  88. cuda/cccl/headers/include/cuda/__device/physical_device.h +120 -91
  89. cuda/cccl/headers/include/cuda/__driver/driver_api.h +365 -33
  90. cuda/cccl/headers/include/cuda/__event/event.h +8 -8
  91. cuda/cccl/headers/include/cuda/__event/event_ref.h +4 -5
  92. cuda/cccl/headers/include/cuda/__event/timed_event.h +5 -5
  93. cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +97 -52
  94. cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +5 -6
  95. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  96. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
  97. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +5 -0
  98. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +5 -0
  99. cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +6 -4
  100. cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +5 -0
  101. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +6 -1
  102. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +28 -27
  103. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +39 -33
  104. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +18 -11
  105. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +158 -0
  106. cuda/cccl/headers/include/cuda/__iterator/zip_function.h +10 -0
  107. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +36 -109
  108. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +593 -0
  109. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +14 -10
  110. cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +1 -1
  111. cuda/cccl/headers/include/cuda/__memory/address_space.h +28 -12
  112. cuda/cccl/headers/include/cuda/__memory/check_address.h +34 -29
  113. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +4 -3
  114. cuda/cccl/headers/include/cuda/__runtime/types.h +41 -0
  115. cuda/cccl/headers/include/cuda/__stream/stream.h +2 -3
  116. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -12
  117. cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +161 -92
  118. cuda/cccl/headers/include/cuda/__utility/__basic_any/storage.h +3 -2
  119. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +2 -2
  120. cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
  121. cuda/cccl/headers/include/cuda/algorithm +1 -1
  122. cuda/cccl/headers/include/cuda/devices +10 -0
  123. cuda/cccl/headers/include/cuda/iterator +1 -0
  124. cuda/cccl/headers/include/cuda/pipeline +2 -1
  125. cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +0 -6
  126. cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +3 -3
  127. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +1 -1
  128. cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +3 -3
  129. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +2 -2
  130. cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +1 -1
  131. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
  132. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -1
  133. cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +2 -3
  134. cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +2 -3
  135. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +2 -2
  136. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
  137. cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +6 -8
  138. cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +3 -3
  139. cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +4 -4
  140. cuda/cccl/headers/include/cuda/std/__atomic/order.h +1 -1
  141. cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +1 -1
  142. cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
  143. cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
  144. cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
  145. cuda/cccl/headers/include/cuda/std/__cccl/assert.h +18 -7
  146. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +4 -115
  147. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +2 -2
  148. cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +8 -5
  149. cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +22 -3
  150. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +4 -4
  151. cuda/cccl/headers/include/cuda/std/__cccl/os.h +6 -0
  152. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +2 -0
  153. cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +25 -0
  154. cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +54 -0
  155. cuda/cccl/headers/include/cuda/std/__chrono/day.h +162 -0
  156. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +503 -0
  157. cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +55 -0
  158. cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +46 -0
  159. cuda/cccl/headers/include/cuda/std/__chrono/month.h +187 -0
  160. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +60 -0
  161. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +80 -0
  162. cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +258 -0
  163. cuda/cccl/headers/include/cuda/std/__chrono/year.h +186 -0
  164. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +10 -5
  165. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +44 -17
  166. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +67 -0
  167. cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +1 -4
  168. cuda/cccl/headers/include/cuda/std/__cstring/memcpy.h +12 -9
  169. cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +12 -12
  170. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +1 -8
  171. cuda/cccl/headers/include/cuda/std/__expected/expected.h +31 -38
  172. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +15 -12
  173. cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +3 -0
  174. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
  175. cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +4 -4
  176. cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +2 -2
  177. cuda/cccl/headers/include/cuda/std/__functional/bind.h +2 -2
  178. cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +4 -4
  179. cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +2 -2
  180. cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +2 -2
  181. cuda/cccl/headers/include/cuda/std/__functional/function.h +10 -11
  182. cuda/cccl/headers/include/cuda/std/__functional/hash.h +5 -6
  183. cuda/cccl/headers/include/cuda/std/__functional/identity.h +4 -8
  184. cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +2 -4
  185. cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +16 -18
  186. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +2 -3
  187. cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +2 -3
  188. cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +3 -3
  189. cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +2 -2
  190. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +16 -25
  191. cuda/cccl/headers/include/cuda/std/__iterator/advance.h +3 -3
  192. cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +3 -3
  193. cuda/cccl/headers/include/cuda/std/__iterator/distance.h +1 -1
  194. cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +21 -28
  195. cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +1 -1
  196. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +28 -39
  197. cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +3 -4
  198. cuda/cccl/headers/include/cuda/std/__iterator/next.h +2 -2
  199. cuda/cccl/headers/include/cuda/std/__iterator/prev.h +2 -3
  200. cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +12 -41
  201. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +2 -2
  202. cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +3 -4
  203. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +31 -31
  204. cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +45 -45
  205. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +3 -2
  206. cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +1 -1
  207. cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +1 -1
  208. cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +1 -1
  209. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +2 -1
  210. cuda/cccl/headers/include/cuda/std/__memory/addressof.h +5 -2
  211. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +30 -30
  212. cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +1 -1
  213. cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +66 -86
  214. cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +2 -2
  215. cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +3 -3
  216. cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +5 -2
  217. cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +30 -45
  218. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +8 -12
  219. cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +21 -23
  220. cuda/cccl/headers/include/cuda/std/__new/launder.h +4 -0
  221. cuda/cccl/headers/include/cuda/std/__optional/hash.h +2 -2
  222. cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +2 -1
  223. cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +2 -1
  224. cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +1 -1
  225. cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +1 -1
  226. cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +1 -1
  227. cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +1 -1
  228. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
  229. cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +4 -13
  230. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +16 -22
  231. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +13 -18
  232. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +2 -2
  233. cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +3 -4
  234. cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +2 -2
  235. cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +7 -8
  236. cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +4 -13
  237. cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +1 -1
  238. cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +2 -0
  239. cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +3 -5
  240. cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +1 -1
  241. cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +3 -44
  242. cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +2 -28
  243. cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +9 -5
  244. cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +3 -3
  245. cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +1 -4
  246. cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +3 -34
  247. cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +3 -29
  248. cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +2 -16
  249. cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +1 -1
  250. cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +4 -21
  251. cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +1 -1
  252. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +1 -1
  253. cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +3 -3
  254. cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +4 -24
  255. cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +3 -24
  256. cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +13 -9
  257. cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +3 -18
  258. cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +21 -20
  259. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +3 -17
  260. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +4 -31
  261. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +3 -42
  262. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +5 -19
  263. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +3 -19
  264. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +3 -17
  265. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +2 -15
  266. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +13 -28
  267. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +2 -17
  268. cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +2 -16
  269. cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +3 -18
  270. cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +1 -1
  271. cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +1 -1
  272. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +4 -3
  273. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +2 -2
  274. cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +2 -16
  275. cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +2 -2
  276. cuda/cccl/headers/include/cuda/std/__utility/declval.h +17 -4
  277. cuda/cccl/headers/include/cuda/std/__utility/exchange.h +1 -1
  278. cuda/cccl/headers/include/cuda/std/__utility/forward.h +1 -1
  279. cuda/cccl/headers/include/cuda/std/__utility/in_place.h +29 -0
  280. cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +2 -2
  281. cuda/cccl/headers/include/cuda/std/__utility/move.h +1 -1
  282. cuda/cccl/headers/include/cuda/std/__utility/pair.h +8 -9
  283. cuda/cccl/headers/include/cuda/std/array +2 -2
  284. cuda/cccl/headers/include/cuda/std/atomic +20 -28
  285. cuda/cccl/headers/include/cuda/std/bitset +1 -1
  286. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +1 -32
  287. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +3 -3
  288. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +16 -1137
  289. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +12 -12
  290. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +4 -4
  291. cuda/cccl/headers/include/cuda/std/inplace_vector +9 -9
  292. cuda/cccl/headers/include/cuda/std/ratio +3 -4
  293. cuda/cccl/headers/include/cuda/std/string_view +12 -5
  294. cuda/cccl/headers/include/cuda/std/version +3 -8
  295. cuda/cccl/headers/include/thrust/advance.h +6 -8
  296. cuda/cccl/headers/include/thrust/detail/execution_policy.h +61 -21
  297. cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
  298. cuda/cccl/headers/include/thrust/detail/internal_functional.h +37 -2
  299. cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +1 -1
  300. cuda/cccl/headers/include/thrust/detail/reference.h +10 -16
  301. cuda/cccl/headers/include/thrust/detail/seq.h +37 -25
  302. cuda/cccl/headers/include/thrust/detail/vector_base.h +2 -4
  303. cuda/cccl/headers/include/thrust/detail/vector_base.inl +2 -2
  304. cuda/cccl/headers/include/thrust/distance.h +3 -3
  305. cuda/cccl/headers/include/thrust/execution_policy.h +202 -335
  306. cuda/cccl/headers/include/thrust/functional.h +1 -2
  307. cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +6 -1
  308. cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +1 -1
  309. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +35 -23
  310. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +66 -44
  311. cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +4 -99
  312. cuda/cccl/headers/include/thrust/system/cuda/config.h +7 -4
  313. cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +1 -1
  314. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +182 -38
  315. cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +1 -1
  316. cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +1 -1
  317. cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +199 -48
  318. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +2 -2
  319. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +1 -1
  320. cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +2 -2
  321. cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +1 -1
  322. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +15 -13
  323. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +22 -19
  324. cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +12 -42
  325. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +1 -1
  326. cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +16 -4
  327. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +30 -30
  328. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +0 -1
  329. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +8 -22
  330. cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +1 -1
  331. cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +29 -15
  332. cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +1 -1
  333. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +1 -1
  334. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +1 -1
  335. cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +1 -1
  336. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +1 -1
  337. cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +7 -5
  338. cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +3 -27
  339. cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +10 -2
  340. cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +16 -35
  341. cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +75 -61
  342. cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +4 -99
  343. cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +68 -51
  344. cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +2 -2
  345. cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +4 -99
  346. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +15 -48
  347. cuda/cccl/parallel/experimental/__init__.py +21 -70
  348. cuda/compute/__init__.py +77 -0
  349. cuda/compute/_bindings.py +79 -0
  350. cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +28 -0
  351. cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +177 -10
  352. cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +3 -3
  353. cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +4 -0
  354. cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
  355. cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
  356. cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
  357. cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -4
  358. cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +4 -6
  359. cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
  360. cuda/compute/algorithms/_three_way_partition.py +261 -0
  361. cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +4 -4
  362. cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
  363. cuda/compute/cu12/_bindings_impl.cpython-311-x86_64-linux-gnu.so +0 -0
  364. cuda/compute/cu12/cccl/libcccl.c.parallel.so +0 -0
  365. cuda/compute/cu13/_bindings_impl.cpython-311-x86_64-linux-gnu.so +0 -0
  366. cuda/compute/cu13/cccl/libcccl.c.parallel.so +0 -0
  367. cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +8 -8
  368. cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
  369. cuda/coop/__init__.py +8 -0
  370. cuda/{cccl/cooperative/experimental → coop}/_common.py +3 -1
  371. cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
  372. cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
  373. cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
  374. cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
  375. cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
  376. cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
  377. cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
  378. cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
  379. cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
  380. cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
  381. cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
  382. cuda/coop/warp/__init__.py +9 -0
  383. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
  384. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
  385. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
  386. {cuda_cccl-0.2.1.dist-info → cuda_cccl-0.3.1.dist-info}/METADATA +2 -3
  387. {cuda_cccl-0.2.1.dist-info → cuda_cccl-0.3.1.dist-info}/RECORD +401 -388
  388. cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
  389. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
  390. cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +0 -72
  391. cuda/cccl/headers/include/thrust/detail/util/align.h +0 -59
  392. cuda/cccl/headers/include/thrust/system/cpp/detail/par.h +0 -59
  393. cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_cross_system.h +0 -204
  394. cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_device_to_device.h +0 -92
  395. cuda/cccl/headers/include/thrust/system/cuda/detail/par.h +0 -237
  396. cuda/cccl/headers/include/thrust/system/cuda/detail/par_to_seq.h +0 -95
  397. cuda/cccl/headers/include/thrust/system/omp/detail/par.h +0 -62
  398. cuda/cccl/headers/include/thrust/system/tbb/detail/par.h +0 -62
  399. cuda/cccl/parallel/experimental/.gitignore +0 -4
  400. cuda/cccl/parallel/experimental/_bindings.py +0 -56
  401. cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-311-x86_64-linux-gnu.so +0 -0
  402. cuda/cccl/parallel/experimental/cu12/cccl/libcccl.c.parallel.so +0 -0
  403. cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-311-x86_64-linux-gnu.so +0 -0
  404. cuda/cccl/parallel/experimental/cu13/cccl/libcccl.c.parallel.so +0 -0
  405. /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
  406. /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
  407. /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
  408. /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
  409. /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
  410. /cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +0 -0
  411. /cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +0 -0
  412. /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
  413. /cuda/{cccl/parallel/experimental → compute}/numba_utils.py +0 -0
  414. /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
  415. /cuda/{cccl/parallel/experimental → compute}/typing.py +0 -0
  416. /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
  417. {cuda_cccl-0.2.1.dist-info → cuda_cccl-0.3.1.dist-info}/WHEEL +0 -0
  418. {cuda_cccl-0.2.1.dist-info → cuda_cccl-0.3.1.dist-info}/licenses/LICENSE +0 -0
@@ -1,3 +1,9 @@
1
1
  # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
2
2
  #
3
- # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
3
+ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4
+
5
+ from . import experimental
6
+
7
+ __all__ = [
8
+ "experimental",
9
+ ]
@@ -1,8 +1,24 @@
1
- # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
1
+ # Copyright (c) 2025, NVIDIA CORPORATION.
2
2
  #
3
- # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # alias for backwards compatibility
16
+
17
+ from warnings import warn
4
18
 
5
- from cuda.cccl.cooperative.experimental import block, warp
6
- from cuda.cccl.cooperative.experimental._types import StatefulFunction
19
+ from cuda.coop import * # noqa: F403
7
20
 
8
- __all__ = ["block", "warp", "StatefulFunction"]
21
+ warn(
22
+ "The module cuda.cccl.cooperative.experimental is deprecated. Use cuda.coop instead.",
23
+ FutureWarning,
24
+ )
@@ -64,9 +64,7 @@ struct AgentAdjacentDifferencePolicy
64
64
  static constexpr cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;
65
65
  };
66
66
 
67
- namespace detail
68
- {
69
- namespace adjacent_difference
67
+ namespace detail::adjacent_difference
70
68
  {
71
69
 
72
70
  template <typename Policy,
@@ -256,7 +254,6 @@ struct AgentDifferenceInit
256
254
  }
257
255
  };
258
256
 
259
- } // namespace adjacent_difference
260
- } // namespace detail
257
+ } // namespace detail::adjacent_difference
261
258
 
262
259
  CUB_NAMESPACE_END
@@ -62,9 +62,7 @@
62
62
 
63
63
  CUB_NAMESPACE_BEGIN
64
64
 
65
- namespace detail
66
- {
67
- namespace batch_memcpy
65
+ namespace detail::batch_memcpy
68
66
  {
69
67
  template <bool PTR_IS_FOUR_BYTE_ALIGNED>
70
68
  _CCCL_FORCEINLINE _CCCL_DEVICE void
@@ -1179,7 +1177,6 @@ private:
1179
1177
  // buffers
1180
1178
  BLevBlockOffsetTileState blev_block_scan_state;
1181
1179
  };
1182
- } // namespace batch_memcpy
1183
- } // namespace detail
1180
+ } // namespace detail::batch_memcpy
1184
1181
 
1185
1182
  CUB_NAMESPACE_END
@@ -42,9 +42,7 @@
42
42
 
43
43
  CUB_NAMESPACE_BEGIN
44
44
 
45
- namespace detail
46
- {
47
- namespace for_each
45
+ namespace detail::for_each
48
46
  {
49
47
 
50
48
  template <int BlockThreads, int ItemsPerThread>
@@ -78,7 +76,6 @@ struct agent_block_striped_t
78
76
  }
79
77
  };
80
78
 
81
- } // namespace for_each
82
- } // namespace detail
79
+ } // namespace detail::for_each
83
80
 
84
81
  CUB_NAMESPACE_END
@@ -25,22 +25,15 @@
25
25
  #include <cuda/std/__algorithm/min.h>
26
26
 
27
27
  CUB_NAMESPACE_BEGIN
28
- namespace detail
28
+ namespace detail::merge
29
29
  {
30
- namespace merge
31
- {
32
- template <int ThreadsPerBlock,
33
- int ItemsPerThread,
34
- BlockLoadAlgorithm LoadAlgorithm,
35
- CacheLoadModifier LoadCacheModifier,
36
- BlockStoreAlgorithm StoreAlgorithm>
30
+ template <int ThreadsPerBlock, int ItemsPerThread, CacheLoadModifier LoadCacheModifier, BlockStoreAlgorithm StoreAlgorithm>
37
31
  struct agent_policy_t
38
32
  {
39
33
  // do not change data member names, policy_wrapper_t depends on it
40
34
  static constexpr int BLOCK_THREADS = ThreadsPerBlock;
41
35
  static constexpr int ITEMS_PER_THREAD = ItemsPerThread;
42
36
  static constexpr int ITEMS_PER_TILE = BLOCK_THREADS * ITEMS_PER_THREAD;
43
- static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = LoadAlgorithm;
44
37
  static constexpr CacheLoadModifier LOAD_MODIFIER = LoadCacheModifier;
45
38
  static constexpr BlockStoreAlgorithm STORE_ALGORITHM = StoreAlgorithm;
46
39
  };
@@ -60,108 +53,124 @@ struct agent_t
60
53
  using policy = Policy;
61
54
 
62
55
  // key and value type are taken from the first input sequence (consistent with old Thrust behavior)
63
- using key_type = it_value_t<KeysIt1>;
64
- using item_type = it_value_t<ItemsIt1>;
65
-
66
- using keys_load_it1 = try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, KeysIt1>;
67
- using keys_load_it2 = try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, KeysIt2>;
68
- using items_load_it1 = try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, ItemsIt1>;
69
- using items_load_it2 = try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, ItemsIt2>;
70
-
71
- using block_load_keys1 = typename BlockLoadType<Policy, keys_load_it1>::type;
72
- using block_load_keys2 = typename BlockLoadType<Policy, keys_load_it2>::type;
73
- using block_load_items1 = typename BlockLoadType<Policy, items_load_it1>::type;
74
- using block_load_items2 = typename BlockLoadType<Policy, items_load_it2>::type;
75
-
56
+ using key_type = it_value_t<KeysIt1>;
57
+ using item_type = it_value_t<ItemsIt1>;
76
58
  using block_store_keys = typename BlockStoreType<Policy, KeysOutputIt, key_type>::type;
77
59
  using block_store_items = typename BlockStoreType<Policy, ItemsOutputIt, item_type>::type;
78
60
 
61
+ static constexpr int items_per_thread = Policy::ITEMS_PER_THREAD;
62
+ static constexpr int threads_per_block = Policy::BLOCK_THREADS;
63
+ static constexpr int items_per_tile = Policy::ITEMS_PER_TILE;
64
+
79
65
  union temp_storages
80
66
  {
81
- typename block_load_keys1::TempStorage load_keys1;
82
- typename block_load_keys2::TempStorage load_keys2;
83
- typename block_load_items1::TempStorage load_items1;
84
- typename block_load_items2::TempStorage load_items2;
85
67
  typename block_store_keys::TempStorage store_keys;
86
68
  typename block_store_items::TempStorage store_items;
87
69
 
88
- key_type keys_shared[Policy::ITEMS_PER_TILE + 1];
89
- item_type items_shared[Policy::ITEMS_PER_TILE + 1];
70
+ // We could change SerialMerge to avoid reading one item out of bounds and drop the + 1 here. But that would
71
+ // introduce more branches (about 10% slower on 2^16 problem sizes on RTX 5090 in a first attempt)
72
+ key_type keys_shared[items_per_tile + 1];
73
+ item_type items_shared[items_per_tile + 1];
90
74
  };
91
75
 
92
76
  struct TempStorage : Uninitialized<temp_storages>
93
77
  {};
94
78
 
95
- static constexpr int items_per_thread = Policy::ITEMS_PER_THREAD;
96
- static constexpr int threads_per_block = Policy::BLOCK_THREADS;
97
- static constexpr Offset items_per_tile = Policy::ITEMS_PER_TILE;
98
-
99
79
  // Per thread data
100
80
  temp_storages& storage;
101
- keys_load_it1 keys1_in;
102
- items_load_it1 items1_in;
81
+ KeysIt1 keys1_in;
82
+ ItemsIt1 items1_in;
103
83
  Offset keys1_count;
104
- keys_load_it2 keys2_in;
105
- items_load_it2 items2_in;
84
+ KeysIt2 keys2_in;
85
+ ItemsIt2 items2_in;
106
86
  Offset keys2_count;
107
87
  KeysOutputIt keys_out;
108
88
  ItemsOutputIt items_out;
109
89
  CompareOp compare_op;
110
- Offset* merge_partitions;
90
+ Offset* key1_beg_offsets;
111
91
 
112
92
  template <bool IsFullTile>
113
93
  _CCCL_DEVICE _CCCL_FORCEINLINE void consume_tile(Offset tile_idx, Offset tile_base, int num_remaining)
114
94
  {
115
- const Offset partition_beg = merge_partitions[tile_idx + 0];
116
- const Offset partition_end = merge_partitions[tile_idx + 1];
117
-
118
95
  const Offset diag0 = items_per_tile * tile_idx;
119
- const Offset diag1 = (::cuda::std::min) (keys1_count + keys2_count, diag0 + items_per_tile);
96
+ Offset diag1 = diag0 + items_per_tile;
97
+ if constexpr (IsFullTile)
98
+ {
99
+ _CCCL_ASSERT(diag1 <= keys1_count + keys2_count, "");
100
+ }
101
+ else
102
+ {
103
+ diag1 = keys1_count + keys2_count;
104
+ }
120
105
 
121
106
  // compute bounding box for keys1 & keys2
122
- const Offset keys1_beg = partition_beg;
123
- const Offset keys1_end = partition_end;
107
+ const Offset keys1_beg = key1_beg_offsets[tile_idx + 0];
108
+ const Offset keys1_end = key1_beg_offsets[tile_idx + 1];
124
109
  const Offset keys2_beg = diag0 - keys1_beg;
125
110
  const Offset keys2_end = diag1 - keys1_end;
126
111
 
127
112
  // number of keys per tile
128
- const int num_keys1 = static_cast<int>(keys1_end - keys1_beg);
129
- const int num_keys2 = static_cast<int>(keys2_end - keys2_beg);
113
+ const int keys1_count_tile = static_cast<int>(keys1_end - keys1_beg);
114
+ const int keys2_count_tile = static_cast<int>(keys2_end - keys2_beg);
115
+ if constexpr (IsFullTile)
116
+ {
117
+ _CCCL_ASSERT(keys1_count_tile + keys2_count_tile == items_per_tile, "");
118
+ }
119
+ else
120
+ {
121
+ _CCCL_ASSERT(keys1_count_tile + keys2_count_tile == num_remaining, "");
122
+ }
130
123
 
131
124
  key_type keys_loc[items_per_thread];
132
- merge_sort::gmem_to_reg<threads_per_block, IsFullTile>(
133
- keys_loc, keys1_in + keys1_beg, keys2_in + keys2_beg, num_keys1, num_keys2);
134
- merge_sort::reg_to_shared<threads_per_block>(&storage.keys_shared[0], keys_loc);
135
- __syncthreads();
125
+ {
126
+ auto keys1_in_cm = try_make_cache_modified_iterator<Policy::LOAD_MODIFIER>(keys1_in);
127
+ auto keys2_in_cm = try_make_cache_modified_iterator<Policy::LOAD_MODIFIER>(keys2_in);
128
+ merge_sort::gmem_to_reg<threads_per_block, IsFullTile>(
129
+ keys_loc, keys1_in_cm + keys1_beg, keys2_in_cm + keys2_beg, keys1_count_tile, keys2_count_tile);
130
+ merge_sort::reg_to_shared<threads_per_block>(&storage.keys_shared[0], keys_loc);
131
+ __syncthreads();
132
+ }
136
133
 
137
- // use binary search in shared memory to find merge path for each of thread.
134
+ // now find the merge path for each thread.
138
135
  // we can use int type here, because the number of items in shared memory is limited
139
- const int diag0_loc = (::cuda::std::min) (num_keys1 + num_keys2, static_cast<int>(items_per_thread * threadIdx.x));
136
+ int diag0_thread = items_per_thread * static_cast<int>(threadIdx.x);
137
+ if constexpr (IsFullTile)
138
+ {
139
+ _CCCL_ASSERT(num_remaining == items_per_tile, "");
140
+ _CCCL_ASSERT(diag0_thread < num_remaining, "");
141
+ }
142
+ else
143
+ { // for partial tiles, clamp the thread diagonal to the valid items
144
+ diag0_thread = (::cuda::std::min) (diag0_thread, num_remaining);
145
+ }
140
146
 
141
- const int keys1_beg_loc =
142
- MergePath(&storage.keys_shared[0], &storage.keys_shared[num_keys1], num_keys1, num_keys2, diag0_loc, compare_op);
143
- const int keys1_end_loc = num_keys1;
144
- const int keys2_beg_loc = diag0_loc - keys1_beg_loc;
145
- const int keys2_end_loc = num_keys2;
147
+ const int keys1_beg_thread = MergePath(
148
+ &storage.keys_shared[0],
149
+ &storage.keys_shared[keys1_count_tile],
150
+ keys1_count_tile,
151
+ keys2_count_tile,
152
+ diag0_thread,
153
+ compare_op);
154
+ const int keys2_beg_thread = diag0_thread - keys1_beg_thread;
146
155
 
147
- const int num_keys1_loc = keys1_end_loc - keys1_beg_loc;
148
- const int num_keys2_loc = keys2_end_loc - keys2_beg_loc;
156
+ const int keys1_count_thread = keys1_count_tile - keys1_beg_thread;
157
+ const int keys2_count_thread = keys2_count_tile - keys2_beg_thread;
149
158
 
150
159
  // perform serial merge
151
160
  int indices[items_per_thread];
152
- cub::SerialMerge(
161
+ SerialMerge(
153
162
  &storage.keys_shared[0],
154
- keys1_beg_loc,
155
- keys2_beg_loc + num_keys1,
156
- num_keys1_loc,
157
- num_keys2_loc,
163
+ keys1_beg_thread,
164
+ keys2_beg_thread + keys1_count_tile,
165
+ keys1_count_thread,
166
+ keys2_count_thread,
158
167
  keys_loc,
159
168
  indices,
160
169
  compare_op);
161
- __syncthreads();
162
170
 
163
171
  // write keys
164
- if (IsFullTile)
172
+ __syncthreads(); // sync after reading from SMEM before so block store can use SMEM again
173
+ if constexpr (IsFullTile)
165
174
  {
166
175
  block_store_keys{storage.store_keys}.Store(keys_out + tile_base, keys_loc);
167
176
  }
@@ -175,12 +184,15 @@ struct agent_t
175
184
  if constexpr (have_items)
176
185
  {
177
186
  item_type items_loc[items_per_thread];
178
- merge_sort::gmem_to_reg<threads_per_block, IsFullTile>(
179
- items_loc, items1_in + keys1_beg, items2_in + keys2_beg, num_keys1, num_keys2);
180
- __syncthreads(); // block_store_keys above uses shared memory, so make sure all threads are done before we write
181
- // to it
182
- merge_sort::reg_to_shared<threads_per_block>(&storage.items_shared[0], items_loc);
183
- __syncthreads();
187
+ {
188
+ auto items1_in_cm = try_make_cache_modified_iterator<Policy::LOAD_MODIFIER>(items1_in);
189
+ auto items2_in_cm = try_make_cache_modified_iterator<Policy::LOAD_MODIFIER>(items2_in);
190
+ merge_sort::gmem_to_reg<threads_per_block, IsFullTile>(
191
+ items_loc, items1_in_cm + keys1_beg, items2_in_cm + keys2_beg, keys1_count_tile, keys2_count_tile);
192
+ __syncthreads(); // block_store_keys above uses SMEM, so make sure all threads are done before we write to it
193
+ merge_sort::reg_to_shared<threads_per_block>(&storage.items_shared[0], items_loc);
194
+ __syncthreads();
195
+ }
184
196
 
185
197
  // gather items from shared mem
186
198
  _CCCL_PRAGMA_UNROLL_FULL()
@@ -191,7 +203,7 @@ struct agent_t
191
203
  __syncthreads();
192
204
 
193
205
  // write from reg to gmem
194
- if (IsFullTile)
206
+ if constexpr (IsFullTile)
195
207
  {
196
208
  block_store_items{storage.store_items}.Store(items_out + tile_base, items_loc);
197
209
  }
@@ -204,23 +216,19 @@ struct agent_t
204
216
 
205
217
  _CCCL_DEVICE _CCCL_FORCEINLINE void operator()()
206
218
  {
207
- // XXX with 8.5 changing type to Offset (or long long) results in error!
208
- // TODO(bgruber): is the above still true?
209
- const int tile_idx = static_cast<int>(blockIdx.x);
219
+ const Offset tile_idx = blockIdx.x;
210
220
  const Offset tile_base = tile_idx * items_per_tile;
211
- // TODO(bgruber): random mixing of int and Offset
212
221
  const int items_in_tile =
213
222
  static_cast<int>((::cuda::std::min) (static_cast<Offset>(items_per_tile), keys1_count + keys2_count - tile_base));
214
223
  if (items_in_tile == items_per_tile)
215
224
  {
216
- consume_tile<true>(tile_idx, tile_base, items_per_tile); // full tile
225
+ consume_tile</* IsFullTile */ true>(tile_idx, tile_base, items_per_tile);
217
226
  }
218
227
  else
219
228
  {
220
- consume_tile<false>(tile_idx, tile_base, items_in_tile); // partial tile
229
+ consume_tile</* IsFullTile */ false>(tile_idx, tile_base, items_in_tile);
221
230
  }
222
231
  }
223
232
  };
224
- } // namespace merge
225
- } // namespace detail
233
+ } // namespace detail::merge
226
234
  CUB_NAMESPACE_END
@@ -66,9 +66,28 @@ struct AgentMergeSortPolicy
66
66
  static constexpr cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;
67
67
  };
68
68
 
69
+ #if defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
69
70
  namespace detail
70
71
  {
71
- namespace merge_sort
72
+ // Only define this when needed.
73
+ // Because of overload woes, this depends on C++20 concepts. util_device.h checks that concepts are available when
74
+ // either runtime policies or PTX JSON information are enabled, so if they are, this is always valid. The generic
75
+ // version is always defined, and that's the only one needed for regular CUB operations.
76
+ //
77
+ // TODO: enable this unconditionally once concepts are always available
78
+ CUB_DETAIL_POLICY_WRAPPER_DEFINE(
79
+ MergeSortAgentPolicy,
80
+ (GenericAgentPolicy),
81
+ (BLOCK_THREADS, BlockThreads, int),
82
+ (ITEMS_PER_THREAD, ItemsPerThread, int),
83
+ (ITEMS_PER_TILE, ItemsPerTile, int),
84
+ (LOAD_ALGORITHM, LoadAlgorithm, cub::BlockLoadAlgorithm),
85
+ (LOAD_MODIFIER, LoadModifier, cub::CacheLoadModifier),
86
+ (STORE_ALGORITHM, StoreAlgorithm, cub::BlockStoreAlgorithm))
87
+ } // namespace detail
88
+ #endif // defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
89
+
90
+ namespace detail::merge_sort
72
91
  {
73
92
 
74
93
  template <typename Policy,
@@ -724,7 +743,6 @@ struct AgentMerge
724
743
  }
725
744
  };
726
745
 
727
- } // namespace merge_sort
728
- } // namespace detail
746
+ } // namespace detail::merge_sort
729
747
 
730
748
  CUB_NAMESPACE_END
@@ -51,6 +51,7 @@
51
51
  #include <cub/block/radix_rank_sort_operations.cuh>
52
52
  #include <cub/iterator/cache_modified_input_iterator.cuh>
53
53
  #include <cub/thread/thread_load.cuh>
54
+ #include <cub/util_device.cuh>
54
55
  #include <cub/util_type.cuh>
55
56
 
56
57
  #include <cuda/std/cstdint>
@@ -119,13 +120,33 @@ struct AgentRadixSortDownsweepPolicy : ScalingType
119
120
  static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM;
120
121
  };
121
122
 
123
+ #if defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
124
+ namespace detail
125
+ {
126
+ // Only define this when needed.
127
+ // Because of overload woes, this depends on C++20 concepts. util_device.h checks that concepts are available when
128
+ // either runtime policies or PTX JSON information are enabled, so if they are, this is always valid. The generic
129
+ // version is always defined, and that's the only one needed for regular CUB operations.
130
+ //
131
+ // TODO: enable this unconditionally once concepts are always available
132
+ CUB_DETAIL_POLICY_WRAPPER_DEFINE(
133
+ RadixSortDownsweepAgentPolicy,
134
+ (GenericAgentPolicy),
135
+ (BLOCK_THREADS, BlockThreads, int),
136
+ (ITEMS_PER_THREAD, ItemsPerThread, int),
137
+ (RADIX_BITS, RadixBits, int),
138
+ (LOAD_ALGORITHM, LoadAlgorithm, cub::BlockLoadAlgorithm),
139
+ (LOAD_MODIFIER, LoadModifier, cub::CacheLoadModifier),
140
+ (RANK_ALGORITHM, RankAlgorithm, cub::RadixRankAlgorithm),
141
+ (SCAN_ALGORITHM, ScanAlgorithm, cub::BlockScanAlgorithm))
142
+ } // namespace detail
143
+ #endif // defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
144
+
122
145
  /******************************************************************************
123
146
  * Thread block abstractions
124
147
  ******************************************************************************/
125
148
 
126
- namespace detail
127
- {
128
- namespace radix_sort
149
+ namespace detail::radix_sort
129
150
  {
130
151
 
131
152
  /**
@@ -760,7 +781,6 @@ struct AgentRadixSortDownsweep
760
781
  }
761
782
  };
762
783
 
763
- } // namespace radix_sort
764
- } // namespace detail
784
+ } // namespace detail::radix_sort
765
785
 
766
786
  CUB_NAMESPACE_END
@@ -85,9 +85,7 @@ struct AgentRadixSortExclusiveSumPolicy
85
85
  };
86
86
  };
87
87
 
88
- namespace detail
89
- {
90
- namespace radix_sort
88
+ namespace detail::radix_sort
91
89
  {
92
90
 
93
91
  template <typename AgentRadixSortHistogramPolicy,
@@ -283,7 +281,6 @@ struct AgentRadixSortHistogram
283
281
  }
284
282
  };
285
283
 
286
- } // namespace radix_sort
287
- } // namespace detail
284
+ } // namespace detail::radix_sort
288
285
 
289
286
  CUB_NAMESPACE_END
@@ -100,9 +100,7 @@ struct AgentRadixSortOnesweepPolicy : ScalingType
100
100
  static constexpr RadixSortStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;
101
101
  };
102
102
 
103
- namespace detail
104
- {
105
- namespace radix_sort
103
+ namespace detail::radix_sort
106
104
  {
107
105
 
108
106
  template <typename AgentRadixSortOnesweepPolicy,
@@ -700,7 +698,6 @@ struct AgentRadixSortOnesweep
700
698
  }
701
699
  };
702
700
 
703
- } // namespace radix_sort
704
- } // namespace detail
701
+ } // namespace detail::radix_sort
705
702
 
706
703
  CUB_NAMESPACE_END
@@ -103,9 +103,7 @@ struct AgentRadixSortUpsweepPolicy : ScalingType
103
103
  * Thread block abstractions
104
104
  ******************************************************************************/
105
105
 
106
- namespace detail
107
- {
108
- namespace radix_sort
106
+ namespace detail::radix_sort
109
107
  {
110
108
 
111
109
  /**
@@ -552,7 +550,6 @@ struct AgentRadixSortUpsweep
552
550
  }
553
551
  };
554
552
 
555
- } // namespace radix_sort
556
- } // namespace detail
553
+ } // namespace detail::radix_sort
557
554
 
558
555
  CUB_NAMESPACE_END
@@ -50,10 +50,10 @@
50
50
  #include <cub/util_device.cuh>
51
51
  #include <cub/util_type.cuh>
52
52
 
53
- #include <cuda/__memory/is_aligned.h>
54
53
  #include <cuda/std/__algorithm/min.h>
55
54
  #include <cuda/std/__functional/identity.h>
56
55
  #include <cuda/std/__functional/operations.h>
56
+ #include <cuda/std/__memory/is_sufficiently_aligned.h>
57
57
  #include <cuda/std/__type_traits/conditional.h>
58
58
  #include <cuda/std/__type_traits/is_pointer.h>
59
59
 
@@ -175,9 +175,6 @@ namespace detail::reduce
175
175
  * @tparam InputIteratorT
176
176
  * Random-access iterator type for input
177
177
  *
178
- * @tparam OutputIteratorT
179
- * Random-access iterator type for output
180
- *
181
178
  * @tparam OffsetT
182
179
  * Signed integer type for global offsets
183
180
  *
@@ -202,7 +199,6 @@ namespace detail::reduce
202
199
  */
203
200
  template <typename AgentReducePolicy,
204
201
  typename InputIteratorT,
205
- typename OutputIteratorT,
206
202
  typename OffsetT,
207
203
  typename ReductionOp,
208
204
  typename AccumT,
@@ -274,7 +270,7 @@ struct AgentReduceImpl
274
270
  {
275
271
  if constexpr (AttemptVectorization)
276
272
  {
277
- return ::cuda::is_aligned(d_in, sizeof(VectorT));
273
+ return ::cuda::std::is_sufficiently_aligned<alignof(VectorT)>(d_in);
278
274
  }
279
275
  else
280
276
  {
@@ -506,9 +502,6 @@ private:
506
502
  * @tparam InputIteratorT
507
503
  * Random-access iterator type for input
508
504
  *
509
- * @tparam OutputIteratorT
510
- * Random-access iterator type for output
511
- *
512
505
  * @tparam OffsetT
513
506
  * Signed integer type for global offsets
514
507
  *
@@ -524,7 +517,6 @@ private:
524
517
  */
525
518
  template <typename AgentReducePolicy,
526
519
  typename InputIteratorT,
527
- typename OutputIteratorT,
528
520
  typename OffsetT,
529
521
  typename ReductionOp,
530
522
  typename AccumT,
@@ -532,7 +524,6 @@ template <typename AgentReducePolicy,
532
524
  struct AgentReduce
533
525
  : AgentReduceImpl<AgentReducePolicy,
534
526
  InputIteratorT,
535
- OutputIteratorT,
536
527
  OffsetT,
537
528
  ReductionOp,
538
529
  AccumT,
@@ -543,7 +534,6 @@ struct AgentReduce
543
534
  using base_t =
544
535
  AgentReduceImpl<AgentReducePolicy,
545
536
  InputIteratorT,
546
- OutputIteratorT,
547
537
  OffsetT,
548
538
  ReductionOp,
549
539
  AccumT,
@@ -574,9 +564,6 @@ struct AgentReduce
574
564
  * @tparam InputIteratorT
575
565
  * Random-access iterator type for input
576
566
  *
577
- * @tparam OutputIteratorT
578
- * Random-access iterator type for output
579
- *
580
567
  * @tparam OffsetT
581
568
  * Signed integer type for global offsets
582
569
  *
@@ -592,7 +579,6 @@ struct AgentReduce
592
579
  */
593
580
  template <typename AgentReducePolicy,
594
581
  typename InputIteratorT,
595
- typename OutputIteratorT,
596
582
  typename OffsetT,
597
583
  typename ReductionOp,
598
584
  typename AccumT,
@@ -600,7 +586,6 @@ template <typename AgentReducePolicy,
600
586
  struct AgentWarpReduce
601
587
  : AgentReduceImpl<AgentReducePolicy,
602
588
  InputIteratorT,
603
- OutputIteratorT,
604
589
  OffsetT,
605
590
  ReductionOp,
606
591
  AccumT,
@@ -612,7 +597,6 @@ struct AgentWarpReduce
612
597
  using base_t =
613
598
  AgentReduceImpl<AgentReducePolicy,
614
599
  InputIteratorT,
615
- OutputIteratorT,
616
600
  OffsetT,
617
601
  ReductionOp,
618
602
  AccumT,
@@ -134,9 +134,7 @@ struct AgentRlePolicy
134
134
  * Thread block abstractions
135
135
  ******************************************************************************/
136
136
 
137
- namespace detail
138
- {
139
- namespace rle
137
+ namespace detail::rle
140
138
  {
141
139
 
142
140
  /**
@@ -1121,7 +1119,6 @@ struct AgentRle
1121
1119
  }
1122
1120
  };
1123
1121
 
1124
- } // namespace rle
1125
- } // namespace detail
1122
+ } // namespace detail::rle
1126
1123
 
1127
1124
  CUB_NAMESPACE_END