cuda-cccl 0.3.0-cp313-cp313-manylinux_2_24_aarch64.whl → 0.3.2-cp313-cp313-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of cuda-cccl might be problematic.
Files changed (294)
  1. cuda/cccl/cooperative/__init__.py +7 -1
  2. cuda/cccl/cooperative/experimental/__init__.py +21 -5
  3. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
  4. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
  5. cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
  6. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
  7. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
  8. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
  9. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
  10. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
  11. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
  12. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
  13. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
  14. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
  15. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
  16. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
  17. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
  18. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
  19. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
  20. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
  21. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
  22. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
  23. cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
  24. cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
  25. cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
  26. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +4 -2
  27. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
  28. cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
  29. cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
  30. cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
  31. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
  32. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
  33. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
  34. cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
  35. cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
  36. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +170 -260
  37. cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
  38. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +6 -7
  39. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
  40. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +12 -29
  41. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +2 -7
  42. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
  43. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
  44. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
  45. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
  46. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
  47. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
  48. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
  49. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
  50. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
  51. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
  52. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
  53. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
  54. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
  55. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
  56. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
  57. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
  58. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
  59. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
  60. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
  61. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
  62. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
  63. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
  64. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
  65. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
  66. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
  67. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
  68. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
  69. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
  70. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
  71. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
  72. cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
  73. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
  74. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
  75. cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
  76. cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
  77. cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
  78. cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
  79. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
  80. cuda/cccl/headers/include/cuda/__cccl_config +1 -0
  81. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
  82. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
  83. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
  84. cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
  85. cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
  86. cuda/cccl/headers/include/cuda/__device/arch_traits.h +247 -323
  87. cuda/cccl/headers/include/cuda/__device/attributes.h +174 -123
  88. cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
  89. cuda/cccl/headers/include/cuda/__device/device_ref.h +27 -49
  90. cuda/cccl/headers/include/cuda/__device/physical_device.h +100 -96
  91. cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
  92. cuda/cccl/headers/include/cuda/__event/event.h +27 -26
  93. cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
  94. cuda/cccl/headers/include/cuda/__event/timed_event.h +10 -7
  95. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  96. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
  97. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
  98. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
  99. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
  100. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
  101. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
  102. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
  103. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
  104. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +21 -137
  105. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
  106. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
  107. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
  108. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
  109. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
  110. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
  111. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
  112. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
  113. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
  114. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
  115. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
  116. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +9 -7
  117. cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
  118. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -16
  119. cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
  120. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  121. cuda/cccl/headers/include/cuda/algorithm +1 -1
  122. cuda/cccl/headers/include/cuda/cmath +1 -0
  123. cuda/cccl/headers/include/cuda/devices +13 -0
  124. cuda/cccl/headers/include/cuda/iterator +1 -0
  125. cuda/cccl/headers/include/cuda/memory +1 -0
  126. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
  127. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
  128. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
  129. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
  130. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
  131. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
  132. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
  133. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
  134. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
  135. cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
  136. cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
  137. cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
  138. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  139. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
  140. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
  141. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
  142. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  143. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  144. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
  145. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
  146. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
  147. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
  148. cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
  149. cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
  150. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
  151. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
  152. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
  153. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
  154. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
  155. cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
  156. cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
  157. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
  158. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
  159. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
  160. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
  161. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
  162. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
  163. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
  164. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
  165. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
  166. cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
  167. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
  168. cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
  169. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
  170. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
  171. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
  172. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
  173. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
  174. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
  175. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
  176. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
  177. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
  178. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
  179. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
  180. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
  181. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
  182. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
  183. cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
  184. cuda/cccl/headers/include/cuda/std/bitset +1 -1
  185. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
  186. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
  187. cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
  188. cuda/cccl/headers/include/cuda/std/numbers +5 -0
  189. cuda/cccl/headers/include/cuda/std/string_view +155 -13
  190. cuda/cccl/headers/include/cuda/std/version +1 -4
  191. cuda/cccl/headers/include/cuda/stream_ref +5 -0
  192. cuda/cccl/headers/include/cuda/utility +1 -0
  193. cuda/cccl/headers/include/nv/target +7 -2
  194. cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
  195. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
  196. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
  197. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
  198. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
  199. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
  200. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
  201. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
  202. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
  203. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
  204. cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
  205. cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
  206. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
  207. cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
  208. cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
  209. cuda/cccl/headers/include/thrust/device_delete.h +18 -3
  210. cuda/cccl/headers/include/thrust/device_free.h +16 -3
  211. cuda/cccl/headers/include/thrust/device_new.h +29 -8
  212. cuda/cccl/headers/include/thrust/host_vector.h +1 -1
  213. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
  214. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
  215. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
  216. cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
  217. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
  218. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
  219. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
  220. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
  221. cuda/cccl/parallel/experimental/__init__.py +21 -74
  222. cuda/compute/__init__.py +79 -0
  223. cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +43 -1
  224. cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +157 -8
  225. cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
  226. cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
  227. cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
  228. cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
  229. cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +112 -40
  230. cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
  231. cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
  232. cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +36 -15
  233. cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
  234. cuda/compute/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  235. cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
  236. cuda/compute/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  237. cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
  238. cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +2 -0
  239. cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +36 -8
  240. cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +206 -1
  241. cuda/{cccl/parallel/experimental → compute}/numba_utils.py +2 -2
  242. cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
  243. cuda/{cccl/parallel/experimental → compute}/typing.py +2 -0
  244. cuda/coop/__init__.py +8 -0
  245. cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
  246. cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
  247. cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
  248. cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
  249. cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
  250. cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
  251. cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
  252. cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
  253. cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
  254. cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
  255. cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
  256. cuda/coop/warp/__init__.py +9 -0
  257. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
  258. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
  259. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
  260. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
  261. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +275 -276
  262. cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
  263. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
  264. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
  265. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
  266. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
  267. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
  268. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
  269. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
  270. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
  271. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
  272. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
  273. cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
  274. cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
  275. cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
  276. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
  277. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
  278. cuda/cccl/parallel/experimental/.gitignore +0 -4
  279. cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  280. cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  281. /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
  282. /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
  283. /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
  284. /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
  285. /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
  286. /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
  287. /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
  288. /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
  289. /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
  290. /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
  291. /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
  292. /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
  293. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
  294. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh
@@ -47,9 +47,7 @@
 
 CUB_NAMESPACE_BEGIN
 
-namespace detail
-{
-namespace three_way_partition
+namespace detail::three_way_partition
 {
 
 template <typename PolicyT, typename = void>
@@ -437,7 +435,6 @@ struct policy_hub
 
   using MaxPolicy = Policy1000;
 };
-} // namespace three_way_partition
-} // namespace detail
+} // namespace detail::three_way_partition
 
 CUB_NAMESPACE_END
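
The namespace change above is a straight C++17 modernization: a nested namespace definition opens the same namespace as the two separate blocks it replaces. A minimal self-contained illustration (the names demo and inner are hypothetical, not from CCCL):

namespace demo { namespace inner { constexpr int a = 1; } } // pre-C++17 spelling
namespace demo::inner { constexpr int b = a + 1; }          // C++17 spelling; 'a' is visible

static_assert(demo::inner::b == 2, "both blocks open the same namespace");
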
cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh
@@ -113,11 +113,11 @@ CUB_DETAIL_POLICY_WRAPPER_DEFINE(
   (max_items_per_thread, MaxItemsPerThread, int),
   (not_a_vectorized_policy, NotAVectorizedPolicy, int) ) // TODO: remove with C++20
 
-template <int BlockThreads, int ItemsPerThread, int LoadStoreWordSize>
-struct vectorized_policy_t : prefetch_policy_t<BlockThreads>
+template <typename Tuning>
+struct vectorized_policy_t : prefetch_policy_t<Tuning::block_threads>
 {
-  static constexpr int items_per_thread_vectorized = ItemsPerThread;
-  static constexpr int load_store_word_size = LoadStoreWordSize;
+  static constexpr int items_per_thread_vectorized = Tuning::items_per_thread;
+  static constexpr int vec_size = Tuning::vec_size;
 
   using not_a_vectorized_policy = void; // TODO: remove with C++20, shadows the variable in prefetch_policy_t
 };
@@ -130,7 +130,7 @@ CUB_DETAIL_POLICY_WRAPPER_DEFINE(
   (min_items_per_thread, MinItemsPerThread, int),
   (max_items_per_thread, MaxItemsPerThread, int),
   (items_per_thread_vectorized, ItemsPerThreadVectorized, int),
-  (load_store_word_size, LoadStoreWordSize, int) )
+  (vec_size, VecSize, int) )
 
 template <int BlockThreads, int BulkCopyAlignment>
 struct async_copy_policy_t
@@ -282,47 +282,6 @@ _CCCL_HOST_DEVICE constexpr int arch_to_min_bytes_in_flight(int sm_arch)
   return 12 * 1024; // V100 and below
 }
 
-template <typename H, typename... Ts>
-_CCCL_HOST_DEVICE constexpr bool all_nonzero_equal(H head, Ts... values)
-{
-  size_t first = 0;
-  for (size_t v : ::cuda::std::array<H, 1 + sizeof...(Ts)>{head, values...})
-  {
-    if (v == 0)
-    {
-      continue;
-    }
-    if (first == 0)
-    {
-      first = v;
-    }
-    else if (v != first)
-    {
-      return false;
-    }
-  }
-  return true;
-}
-
-_CCCL_HOST_DEVICE constexpr bool all_nonzero_equal()
-{
-  return true;
-}
-
-template <typename H, typename... Ts>
-_CCCL_HOST_DEVICE constexpr auto first_nonzero_value(H head, Ts... values)
-{
-  for (auto v : ::cuda::std::array<H, 1 + sizeof...(Ts)>{head, values...})
-  {
-    if (v != 0)
-    {
-      return v;
-    }
-  }
-  // we only reach here when all input are not contiguous and the output has a void value type
-  return H{1};
-}
-
 template <typename T>
 inline constexpr size_t size_of = sizeof(T);
 
@@ -337,6 +296,47 @@ _CCCL_HOST_DEVICE static constexpr auto make_sizes_alignments()
     {{sizeof(it_value_t<RandomAccessIteratorsIn>), alignof(it_value_t<RandomAccessIteratorsIn>)}...}};
 }
 
+template <int PtxVersion, int StoreSize, int... LoadSizes>
+struct tuning_vec
+{
+  // defaults from fill on RTX 5090, but can be changed
+  static constexpr int block_threads = 256;
+  static constexpr int vec_size = 4;
+  static constexpr int items_per_thread = 8;
+};
+
+// manually tuned fill on A100
+template <int StoreSize>
+struct tuning_vec<800, StoreSize>
+{
+  static constexpr int block_threads = 256;
+  static constexpr int vec_size = ::cuda::std::max(8 / StoreSize, 1); // 64-bit instructions
+  static constexpr int items_per_thread = 8;
+};
+
+// manually tuned fill on H200
+template <int StoreSize>
+struct tuning_vec<900, StoreSize>
+{
+  static constexpr int block_threads = StoreSize > 4 ? 128 : 256;
+  static constexpr int vec_size = ::cuda::std::max(8 / StoreSize, 1); // 64-bit instructions
+  static constexpr int items_per_thread = 16;
+};
+
+// manually tuned fill on B200, same as H200
+template <int StoreSize>
+struct tuning_vec<1000, StoreSize> : tuning_vec<900, StoreSize>
+{};
+
+// manually tuned fill on RTX 5090
+template <int StoreSize>
+struct tuning_vec<1200, StoreSize>
+{
+  static constexpr int block_threads = 256;
+  static constexpr int vec_size = 4;
+  static constexpr int items_per_thread = 8;
+};
+
 template <bool RequiresStableAddress,
           bool DenseOutput,
           typename RandomAccessIteratorTupleIn,
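
The tuning_vec hierarchy added above selects constants by PTX version and store size through partial specialization. A small host-side sketch that reproduces the primary template and the SM80 specialization so the selection can be checked with static_assert; the local cmax() stands in for ::cuda::std::max, and the checks are illustrative assumptions, not CCCL tests:

// Stand-in for ::cuda::std::max used by the real header.
constexpr int cmax(int a, int b)
{
  return a > b ? a : b;
}

// Primary template: defaults (tuned for fill on RTX 5090).
template <int PtxVersion, int StoreSize, int... LoadSizes>
struct tuning_vec
{
  static constexpr int block_threads    = 256;
  static constexpr int vec_size         = 4;
  static constexpr int items_per_thread = 8;
};

// SM80 specialization: pick a vector width that yields 64-bit stores.
template <int StoreSize>
struct tuning_vec<800, StoreSize>
{
  static constexpr int block_threads    = 256;
  static constexpr int vec_size         = cmax(8 / StoreSize, 1);
  static constexpr int items_per_thread = 8;
};

// 4-byte values vectorize by 2 (2 x 4 B = 8 B); 8-byte values by 1.
static_assert(tuning_vec<800, 4>::vec_size == 2);
static_assert(tuning_vec<800, 8>::vec_size == 1);
// PTX versions without a specialization fall back to the primary template.
static_assert(tuning_vec<750, 4>::vec_size == 4);

int main() {}
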
@@ -367,29 +367,12 @@ struct policy_hub<RequiresStableAddress,
        || THRUST_NS_QUALIFIER::is_trivially_relocatable_v<it_value_t<RandomAccessIteratorsIn>>)
       && ...);
 
-  // for vectorized policy:
-  static constexpr bool all_contiguous_input_values_same_size = all_nonzero_equal(
-    (sizeof(it_value_t<RandomAccessIteratorsIn>)
-     * THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorsIn>) ...);
-  static constexpr int load_store_word_size = 8; // TODO(bgruber): make this 16, and 32 on Blackwell+
-  // find the value type size of the first contiguous iterator. if there are no inputs, we take the size of the output
-  // value type
-  static constexpr int contiguous_value_type_size = first_nonzero_value(
-    (int{sizeof(it_value_t<RandomAccessIteratorsIn>)}
-     * THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorsIn>) ...,
-    int{size_of<it_value_t<RandomAccessIteratorOut>>});
-  static constexpr bool value_type_divides_load_store_size =
-    load_store_word_size % contiguous_value_type_size == 0; // implicitly checks that value_type_size <=
-                                                            // load_store_word_size
-  static constexpr int target_bytes_per_thread =
-    no_input_streams ? 16 /* by experiment on RTX 5090 */ : 32 /* guestimate by gevtushenko for loading */;
-  static constexpr int items_per_thread_vec =
-    ::cuda::round_up(target_bytes_per_thread, load_store_word_size) / contiguous_value_type_size;
-  using default_vectorized_policy_t = vectorized_policy_t<256, items_per_thread_vec, load_store_word_size>;
+  static constexpr bool all_value_types_have_power_of_two_size =
+    (::cuda::is_power_of_two(sizeof(it_value_t<RandomAccessIteratorsIn>)) && ...)
+    && ::cuda::is_power_of_two(size_of<it_value_t<RandomAccessIteratorOut>>);
 
   static constexpr bool fallback_to_prefetch =
-    RequiresStableAddress || !can_memcpy_contiguous_inputs || !all_contiguous_input_values_same_size
-    || !value_type_divides_load_store_size || !DenseOutput;
+    RequiresStableAddress || !can_memcpy_contiguous_inputs || !all_value_types_have_power_of_two_size || !DenseOutput;
 
   // TODO(bgruber): consider a separate kernel for just filling
 
@@ -398,12 +381,16 @@ struct policy_hub<RequiresStableAddress,
     static constexpr int min_bif = arch_to_min_bytes_in_flight(300);
     // TODO(bgruber): we don't need algo, because we can just detect the type of algo_policy
     static constexpr auto algorithm = fallback_to_prefetch ? Algorithm::prefetch : Algorithm::vectorized;
-    using algo_policy = ::cuda::std::_If<fallback_to_prefetch, prefetch_policy_t<256>, default_vectorized_policy_t>;
+    using vec_policy_t = vectorized_policy_t<
+      tuning_vec<500, size_of<it_value_t<RandomAccessIteratorOut>>, sizeof(it_value_t<RandomAccessIteratorsIn>)...>>;
+    using algo_policy = ::cuda::std::_If<fallback_to_prefetch, prefetch_policy_t<256>, vec_policy_t>;
   };
 
   struct policy800 : ChainedPolicy<800, policy800, policy300>
   {
   private:
+    using vec_policy_t = vectorized_policy_t<
+      tuning_vec<800, size_of<it_value_t<RandomAccessIteratorOut>>, sizeof(it_value_t<RandomAccessIteratorsIn>)...>>;
     static constexpr int block_threads = 256;
     using async_policy = async_copy_policy_t<block_threads, ldgsts_size_and_align>;
     // We cannot use the architecture-specific amount of SMEM here instead of max_smem_per_block, because this is not
@@ -427,13 +414,17 @@ struct policy_hub<RequiresStableAddress,
     using algo_policy =
       ::cuda::std::_If<fallback_to_prefetch,
                        prefetch_policy_t<block_threads>,
-                       ::cuda::std::_If<fallback_to_vectorized, default_vectorized_policy_t, async_policy>>;
+                       ::cuda::std::_If<fallback_to_vectorized, vec_policy_t, async_policy>>;
   };
 
   template <int AsyncBlockSize, int PtxVersion>
   struct bulk_copy_policy_base
   {
   private:
+    using vec_policy_t =
+      vectorized_policy_t<tuning_vec<PtxVersion,
+                                     size_of<it_value_t<RandomAccessIteratorOut>>,
+                                     sizeof(it_value_t<RandomAccessIteratorsIn>)...>>;
     static constexpr int alignment = bulk_copy_alignment(PtxVersion);
     using async_policy = async_copy_policy_t<AsyncBlockSize, alignment>;
     // We cannot use the architecture-specific amount of SMEM here instead of max_smem_per_block, because this is not
@@ -469,7 +460,7 @@ struct policy_hub<RequiresStableAddress,
     using algo_policy =
       ::cuda::std::_If<fallback_to_prefetch,
                        prefetch_policy_t<256>,
-                       ::cuda::std::_If<fallback_to_vectorized, default_vectorized_policy_t, async_policy>>;
+                       ::cuda::std::_If<fallback_to_vectorized, vec_policy_t, async_policy>>;
   };
 
   struct policy900
cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh
@@ -788,6 +788,16 @@ struct UniqueByKeyPolicyWrapper<StaticPolicyT,
   {
     return cub::detail::MakePolicyWrapper(typename StaticPolicyT::UniqueByKeyPolicyT());
   }
+
+#if defined(CUB_ENABLE_POLICY_PTX_JSON)
+  _CCCL_DEVICE static constexpr auto EncodedPolicy()
+  {
+    using namespace ptx_json;
+    return object<key<"UniqueByKeyPolicyT">() = UniqueByKey().EncodedPolicy(),
+                  key<"DelayConstructor">() =
+                    StaticPolicyT::UniqueByKeyPolicyT::detail::delay_constructor_t::EncodedConstructor()>();
+  }
+#endif
 };
 
 template <typename PolicyT>
cuda/cccl/headers/include/cub/thread/thread_reduce.cuh
@@ -136,6 +136,7 @@ CUB_NAMESPACE_BEGIN
 //!    {
 //!      int array[4] = {1, 2, 3, 4};
 //!      int sum = cub::ThreadReduce(array, ::cuda::std::plus<>{}); // sum = 10
+//!    }
 
 //! @endrst
 //!
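
The added //! } closes the previously unterminated code block in the ThreadReduce docs. A complete, compilable version of that snippet as a kernel; the kernel name and output pointer are illustrative, not from the library:

#include <cub/thread/thread_reduce.cuh>

#include <cuda/std/functional>

__global__ void thread_reduce_demo(int* d_out)
{
  // Each thread reduces its own private array; no cross-thread communication.
  int array[4] = {1, 2, 3, 4};
  int sum      = cub::ThreadReduce(array, ::cuda::std::plus<>{}); // sum = 10
  if (threadIdx.x == 0)
  {
    *d_out = sum;
  }
}
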
@@ -437,10 +438,13 @@ template <typename Input, typename ReductionOp, typename ValueT, typename AccumT
                 "Input must support the subscript operator[] and have a compile-time size");
   static_assert(has_binary_call_operator<ReductionOp, ValueT>::value,
                 "ReductionOp must have the binary call operator: operator(ValueT, ValueT)");
-  if constexpr (static_size_v<Input> == 1)
+
+  static constexpr auto length = static_size_v<Input>;
+  if constexpr (length == 1)
   {
     return static_cast<AccumT>(input[0]);
   }
+
   using PromT = ::cuda::std::_If<enable_min_max_promotion_v<ReductionOp, ValueT>, int, AccumT>;
   // TODO: should be part of the tuning policy
   if constexpr ((!is_simd_enabled_cuda_operator<ReductionOp, ValueT> && !is_simd_operator_v<ReductionOp>)
@@ -449,38 +453,41 @@ template <typename Input, typename ReductionOp, typename ValueT, typename AccumT
     return ThreadReduceSequential<AccumT>(input, reduction_op);
   }
 
-  constexpr auto length = static_size_v<Input>;
-  if constexpr (::cuda::std::is_same_v<Input, AccumT> && enable_sm90_simd_reduction_v<Input, ReductionOp, length>)
+  if constexpr (::cuda::std::is_same_v<ValueT, AccumT> && enable_sm90_simd_reduction_v<ValueT, ReductionOp, length>)
   {
     NV_IF_TARGET(NV_PROVIDES_SM_90, (return ThreadReduceSimd(input, reduction_op);))
   }
 
-  if constexpr (::cuda::std::is_same_v<Input, AccumT> && enable_sm80_simd_reduction_v<Input, ReductionOp, length>)
+  if constexpr (::cuda::std::is_same_v<ValueT, AccumT> && enable_sm80_simd_reduction_v<ValueT, ReductionOp, length>)
   {
     NV_IF_TARGET(NV_PROVIDES_SM_80, (return ThreadReduceSimd(input, reduction_op);))
   }
 
-  if constexpr (::cuda::std::is_same_v<Input, AccumT> && enable_sm70_simd_reduction_v<Input, ReductionOp, length>)
+  if constexpr (::cuda::std::is_same_v<ValueT, AccumT> && enable_sm70_simd_reduction_v<ValueT, ReductionOp, length>)
   {
     NV_IF_TARGET(NV_PROVIDES_SM_70, (return ThreadReduceSimd(input, reduction_op);))
   }
 
-  if constexpr (enable_ternary_reduction_sm90_v<Input, ReductionOp>)
+  if constexpr (length >= 6)
   {
-    // with the current tuning policies, SM90/int32/+ uses too many registers (TODO: fix tuning policy)
-    if constexpr ((is_one_of_v<ReductionOp, ::cuda::std::plus<>, ::cuda::std::plus<PromT>>
-                   && is_one_of_v<PromT, int32_t, uint32_t>)
-                  // the compiler generates bad code for int8/uint8 and min/max for SM90
-                  || (is_cuda_minimum_maximum_v<ReductionOp, ValueT> && is_one_of_v<PromT, int8_t, uint8_t>) )
+    // apply SM90 min/max ternary reduction only if the input is natively int32/uint32
+    if constexpr (enable_ternary_reduction_sm90_v<ValueT, ReductionOp>)
     {
-      NV_IF_TARGET(NV_PROVIDES_SM_90, (return ThreadReduceSequential<PromT>(input, reduction_op);));
+      // with the current tuning policies, SM90/int32/+ uses too many registers (TODO: fix tuning policy)
+      if constexpr ((is_one_of_v<ReductionOp, ::cuda::std::plus<>, ::cuda::std::plus<PromT>>
+                     && is_one_of_v<PromT, int32_t, uint32_t>)
+                    // the compiler generates bad code for int8/uint8 and min/max for SM90
+                    || (is_cuda_minimum_maximum_v<ReductionOp, ValueT> && is_one_of_v<PromT, int8_t, uint8_t>) )
+      {
+        NV_IF_TARGET(NV_PROVIDES_SM_90, (return ThreadReduceSequential<PromT>(input, reduction_op);));
+      }
+      NV_IF_TARGET(NV_PROVIDES_SM_90, (return ThreadReduceTernaryTree<PromT>(input, reduction_op);));
     }
-    NV_IF_TARGET(NV_PROVIDES_SM_90, (return ThreadReduceTernaryTree<PromT>(input, reduction_op);));
-  }
 
-  if constexpr (enable_ternary_reduction_sm50_v<Input, ReductionOp>)
-  {
-    NV_IF_TARGET(NV_PROVIDES_SM_50, (return ThreadReduceSequential<PromT>(input, reduction_op);));
+    if constexpr (enable_ternary_reduction_sm50_v<ValueT, ReductionOp>)
+    {
+      NV_IF_TARGET(NV_PROVIDES_SM_50, (return ThreadReduceSequential<PromT>(input, reduction_op);));
+    }
   }
 
   return ThreadReduceBinaryTree<PromT>(input, reduction_op);
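
The rewrite above hoists `length` to the top of the function and nests all ternary-reduction paths under a single `length >= 6` guard, so short inputs skip those branches entirely at compile time. A reduced sketch of the idiom (the function reduce_demo is hypothetical; a plain loop stands in for the architecture-specific fast path):

#include <nv/target>

template <int N>
__device__ int reduce_demo(const int (&in)[N])
{
  if constexpr (N == 1) // trivial case, resolved at compile time
  {
    return in[0];
  }
  if constexpr (N >= 6) // only long inputs consider the fast path
  {
    // Emitted only when compiling for SM90 or newer.
    NV_IF_TARGET(NV_PROVIDES_SM_90, (
      int acc = in[0];
      for (int i = 1; i < N; ++i)
      {
        acc += in[i];
      }
      return acc;
    ))
  }
  int acc = in[0]; // generic fallback
  for (int i = 1; i < N; ++i)
  {
    acc += in[i];
  }
  return acc;
}
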
cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh
@@ -51,6 +51,7 @@
 #include <cuda/__functional/maximum.h>
 #include <cuda/__functional/minimum.h>
 #include <cuda/__ptx/instructions/get_sreg.h>
+#include <cuda/std/__bit/countr.h>
 #include <cuda/std/__functional/operations.h>
 #include <cuda/std/__type_traits/enable_if.h>
 #include <cuda/std/__type_traits/integral_constant.h>
@@ -701,7 +702,7 @@ struct WarpReduceShfl
   _CCCL_DEVICE _CCCL_FORCEINLINE T SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op)
   {
     // Get the start flags for each thread in the warp.
-    int warp_flags = __ballot_sync(member_mask, flag);
+    unsigned warp_flags = __ballot_sync(member_mask, flag);
 
     // Convert to tail-segmented
     if (HEAD_SEGMENTED)
@@ -722,7 +723,7 @@ struct WarpReduceShfl
     warp_flags |= 1u << (LOGICAL_WARP_THREADS - 1);
 
     // Find the next set flag
-    int last_lane = __clz(__brev(warp_flags));
+    int last_lane = ::cuda::std::countr_zero(warp_flags);
 
     T output = input;
 
     // Template-iterate reduction steps
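
The switch from __clz(__brev(warp_flags)) to ::cuda::std::countr_zero(warp_flags) is a drop-in equivalence: reversing a word turns its trailing zeros into leading zeros. A host-side C++20 check of the identity, using the std:: counterparts and a hand-rolled brev() standing in for the device __brev() intrinsic:

#include <bit>
#include <cassert>
#include <cstdint>

// Host-side stand-in for the CUDA __brev() intrinsic (32-bit reversal).
constexpr std::uint32_t brev(std::uint32_t x)
{
  std::uint32_t r = 0;
  for (int i = 0; i < 32; ++i)
  {
    r = (r << 1) | ((x >> i) & 1u);
  }
  return r;
}

int main()
{
  for (std::uint32_t flags : {0x1u, 0x8u, 0xF0u, 0x80000000u, 0x12345678u})
  {
    // Leading zeros of the reversed word == trailing zeros of the original.
    assert(std::countl_zero(brev(flags)) == std::countr_zero(flags));
  }
}
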
cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh
@@ -49,6 +49,7 @@
 #include <cub/util_type.cuh>
 
 #include <cuda/__ptx/instructions/get_sreg.h>
+#include <cuda/std/__bit/countr.h>
 #include <cuda/std/__type_traits/integral_constant.h>
 
 CUB_NAMESPACE_BEGIN
@@ -215,7 +216,7 @@ struct WarpReduceSmem
   SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op, ::cuda::std::true_type /*has_ballot*/)
   {
     // Get the start flags for each thread in the warp.
-    int warp_flags = __ballot_sync(member_mask, flag);
+    unsigned warp_flags = __ballot_sync(member_mask, flag);
 
     if (!HEAD_SEGMENTED)
     {
@@ -232,7 +233,7 @@ struct WarpReduceSmem
     }
 
     // Find next flag
-    int next_flag = __clz(__brev(warp_flags));
+    int next_flag = ::cuda::std::countr_zero(warp_flags);
 
     // Clip the next segment at the warp boundary if necessary
     if (LOGICAL_WARP_THREADS != 32)
cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh
@@ -50,8 +50,8 @@
 
 #include <cuda/__ptx/instructions/get_sreg.h>
 #include <cuda/std/__algorithm/clamp.h>
-#include <cuda/std/__algorithm/max.h>
 #include <cuda/std/__bit/has_single_bit.h>
+#include <cuda/std/__bit/integral.h>
 #include <cuda/std/__functional/operations.h>
 #include <cuda/std/__type_traits/integral_constant.h>
 #include <cuda/std/__type_traits/is_integral.h>
@@ -630,7 +630,7 @@ struct WarpScanShfl
     ballot = ballot & ::cuda::ptx::get_sreg_lanemask_le();
 
     // Find index of first set bit
-    int segment_first_lane = ::cuda::std::max(0, 31 - __clz(ballot));
+    int segment_first_lane = ::cuda::std::__bit_log2(ballot);
 
     // Iterate scan steps
     _CCCL_PRAGMA_UNROLL_FULL()
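
For nonzero x, 31 - __clz(x) is the index of the most significant set bit, i.e. floor(log2 x), which is exactly what __bit_log2 computes, so the max(0, ...) clamp becomes redundant for nonzero ballots. A host-side check of the identity using std::bit_width (the test masks are arbitrary):

#include <bit>
#include <cassert>
#include <cstdint>

int main()
{
  for (std::uint32_t x : {0x1u, 0x2u, 0x0000FF00u, 0x80000000u, 0x12345678u})
  {
    // Device: 31 - __clz(x); host equivalent via countl_zero.
    int const msb_index = 31 - std::countl_zero(x);
    assert(msb_index == std::bit_width(x) - 1); // == floor(log2(x)) for x != 0
  }
}
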
cuda/cccl/headers/include/cub/warp/warp_load.cuh
@@ -191,8 +191,8 @@ enum WarpLoadAlgorithm
 //!
 //!    // Load a segment of consecutive items that are blocked across threads
 //!    int thread_data[items_per_thread];
-//!    WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size,
-//!                                          thread_data);
+//!    WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size, thread_data);
+//!    }
 
 //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``.
 //! The set of ``thread_data`` across the first logical warp of threads in those
@@ -484,8 +484,8 @@ public:
 //!
 //!    // Load a segment of consecutive items that are blocked across threads
 //!    int thread_data[items_per_thread];
-//!    WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size,
-//!                                          thread_data);
+//!    WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size, thread_data);
+//!    }
 
 //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``,
 //! The set of ``thread_data`` across the first logical warp of threads in those
@@ -533,9 +533,9 @@ public:
 //!
 //!    // Load a segment of consecutive items that are blocked across threads
 //!    int thread_data[items_per_thread];
-//!    WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size,
-//!                                          thread_data,
+//!    WarpLoadT(temp_storage[warp_id]).Load(d_data + warp_id * tile_size, thread_data,
 //!                                          valid_items);
+//!    }
 
 //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...`` and ``valid_items`` is ``5``.
 //! The set of ``thread_data`` across the first logical warp of threads in those threads will be:
cuda/cccl/headers/include/cub/warp/warp_reduce.cuh
@@ -105,6 +105,7 @@ CUB_NAMESPACE_BEGIN
 //!    // Return the warp-wide sums to each lane0 (threads 0, 32, 64, and 96)
 //!    int warp_id = threadIdx.x / 32;
 //!    int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data);
+//!    }
 
 //! Suppose the set of input ``thread_data`` across the block of threads is ``{0, 1, 2, 3, ..., 127}``.
 //! The corresponding output ``aggregate`` in threads 0, 32, 64, and 96 will be
@@ -130,6 +131,8 @@
 //!        int thread_data = ...
 //!        // Return the warp-wide sum to lane0
 //!        int aggregate = WarpReduce(temp_storage).Sum(thread_data);
+//!      }
+//!    }
 
 //! Suppose the set of input ``thread_data`` across the warp of threads is ``{0, 1, 2, 3, ..., 31}``.
 //! The corresponding output ``aggregate`` in thread0 will be ``496`` (and is undefined in other threads).
@@ -218,6 +221,7 @@ public:
 //!    // Return the warp-wide sums to each lane0
 //!    int warp_id = threadIdx.x / 32;
 //!    int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data);
+//!    }
 
 //! Suppose the set of input ``thread_data`` across the block of threads is ``{0, 1, 2, 3, ..., 127}``.
 //! The corresponding output ``aggregate`` in threads 0, 32, 64, and 96 will ``496``, ``1520``, ``2544``, and
@@ -299,8 +303,8 @@ public:
 //!        thread_data = d_data[threadIdx.x];
 //!
 //!    // Return the warp-wide sums to each lane0
-//!    int aggregate = WarpReduce(temp_storage).Sum(
-//!        thread_data, valid_items);
+//!    int aggregate = WarpReduce(temp_storage).Sum(thread_data, valid_items);
+//!    }
 
 //! Suppose the input ``d_data`` is ``{0, 1, 2, 3, 4, ...`` and ``valid_items`` is ``4``.
 //! The corresponding output ``aggregate`` in *lane*\ :sub:`0` is ``6``
@@ -363,6 +367,7 @@ public:
 //!    // Return the warp-wide sums to each lane0
 //!    int aggregate = WarpReduce(temp_storage).HeadSegmentedSum(
 //!        thread_data, head_flag);
+//!    }
 
 //! Suppose the set of input ``thread_data`` and ``head_flag`` across the block of threads
 //! is ``{0, 1, 2, 3, ..., 31`` and is ``{1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0``,
cuda/cccl/headers/include/cub/warp/warp_scan.cuh
@@ -114,6 +114,7 @@ CUB_NAMESPACE_BEGIN
 //!    // Compute warp-wide prefix sums
 //!    int warp_id = threadIdx.x / 32;
 //!    WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data);
+//!    }
 
 //! Suppose the set of input ``thread_data`` across the block of threads is
 //! ``{1, 1, 1, 1, ...}``. The corresponding output ``thread_data`` in each of the four warps of
@@ -143,6 +144,8 @@
 //!
 //!        // Compute warp-wide prefix sums
 //!        WarpScan(temp_storage).ExclusiveSum(thread_data, thread_data);
+//!      }
+//!    }
 
 //! Suppose the set of input ``thread_data`` across the warp of threads is
 //! ``{1, 1, 1, 1, ...}``. The corresponding output ``thread_data`` will be
@@ -248,6 +251,7 @@ public:
 //!    // Compute inclusive warp-wide prefix sums
 //!    int warp_id = threadIdx.x / 32;
 //!    WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data);
+//!    }
 
 //! Suppose the set of input ``thread_data`` across the block of threads is
 //! ``{1, 1, 1, 1, ...}``. The corresponding output ``thread_data`` in each of the four warps
@@ -294,9 +298,8 @@ public:
 //!    // Compute inclusive warp-wide prefix sums
 //!    int warp_aggregate;
 //!    int warp_id = threadIdx.x / 32;
-//!    WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data,
-//!                                                 thread_data,
-//!                                                 warp_aggregate);
+//!    WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data, warp_aggregate);
+//!    }
 
 //! Suppose the set of input ``thread_data`` across the block of threads is
 //! ``{1, 1, 1, 1, ...}``. The corresponding output ``thread_data`` in each of the four warps
@@ -352,6 +355,7 @@ public:
 //!    // Compute exclusive warp-wide prefix sums
 //!    int warp_id = threadIdx.x / 32;
 //!    WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data);
+//!    }
 
 //! Suppose the set of input ``thread_data`` across the block of threads is
 //! ``{1, 1, 1, 1, ...}``. The corresponding output ``thread_data`` in each of the four warps
cuda/cccl/headers/include/cub/warp/warp_store.cuh
@@ -201,6 +201,7 @@ enum WarpStoreAlgorithm
 //!
 //!    // Store items to linear memory
 //!    WarpStoreT(temp_storage[warp_id]).Store(d_data + warp_id * tile_size, thread_data);
+//!    }
 
 //! Suppose the set of ``thread_data`` across the warp threads is
 //! ``{ [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }``.
cuda/cccl/headers/include/cuda/__algorithm/common.h
@@ -11,7 +11,7 @@
 #ifndef __CUDA___ALGORITHM_COMMON
 #define __CUDA___ALGORITHM_COMMON
 
-#include <cuda/__cccl_config>
+#include <cuda/std/detail/__config>
 
 #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
 #  pragma GCC system_header
cuda/cccl/headers/include/cuda/__algorithm/copy.h
@@ -11,7 +11,7 @@
 #ifndef __CUDA___ALGORITHM_COPY_H
 #define __CUDA___ALGORITHM_COPY_H
 
-#include <cuda/__cccl_config>
+#include <cuda/std/detail/__config>
 
 #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
 #  pragma GCC system_header
cuda/cccl/headers/include/cuda/__algorithm/fill.h
@@ -11,7 +11,7 @@
 #ifndef __CUDA___ALGORITHM_FILL
 #define __CUDA___ALGORITHM_FILL
 
-#include <cuda/__cccl_config>
+#include <cuda/std/detail/__config>
 
 #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
 #  pragma GCC system_header
cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h
@@ -26,6 +26,7 @@
 #if _CCCL_CUDA_COMPILATION()
 #  include <cuda/__ptx/instructions/get_sreg.h>
 #  include <cuda/__ptx/instructions/mbarrier_arrive.h>
+#  include <cuda/__ptx/instructions/mbarrier_wait.h>
 #  include <cuda/__ptx/ptx_dot_variants.h>
 #  include <cuda/__ptx/ptx_helper_functions.h>
 #endif // _CCCL_CUDA_COMPILATION()
@@ -381,12 +382,30 @@ private:
 public:
   _CCCL_API inline void wait(arrival_token&& __phase) const
   {
+    // no need to back off on a barrier in SMEM on SM90+, SYNCS unit is taking care of this
+    NV_IF_TARGET(NV_PROVIDES_SM_90,
+                 (if (::cuda::device::is_object_from(__barrier, ::cuda::device::address_space::shared)) {
+                   while (!::cuda::ptx::mbarrier_try_wait(
+                     reinterpret_cast<uint64_t*>(const_cast<__barrier_base*>(&__barrier)), __phase))
+                     ;
+                   return;
+                 }))
+    // fallback implementation
     ::cuda::std::__cccl_thread_poll_with_backoff(
       ::cuda::std::__barrier_poll_tester_phase<barrier>(this, ::cuda::std::move(__phase)));
   }
 
   _CCCL_API inline void wait_parity(bool __phase_parity) const
   {
+    // no need to back off on a barrier in SMEM on SM90+, SYNCS unit is taking care of this
+    NV_IF_TARGET(NV_PROVIDES_SM_90,
+                 (if (::cuda::device::is_object_from(__barrier, ::cuda::device::address_space::shared)) {
+                   while (!::cuda::ptx::mbarrier_try_wait_parity(
+                     reinterpret_cast<uint64_t*>(const_cast<__barrier_base*>(&__barrier)), __phase_parity))
+                     ;
+                   return;
+                 }))
+    // fallback implementation
     ::cuda::std::__cccl_thread_poll_with_backoff(
       ::cuda::std::__barrier_poll_tester_parity<barrier>(this, __phase_parity));
   }
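
The new SM90+ fast path only changes how wait() and wait_parity() poll a shared-memory barrier; the public cuda::barrier interface is unchanged. A minimal usage sketch (the kernel is illustrative, not library code):

#include <cuda/barrier>
#include <cuda/std/utility>

__global__ void barrier_demo()
{
  // A block-scoped barrier in shared memory: the case the mbarrier.try_wait
  // path above accelerates.
  __shared__ cuda::barrier<cuda::thread_scope_block> bar;
  if (threadIdx.x == 0)
  {
    init(&bar, blockDim.x); // found by ADL; sets the expected arrival count
  }
  __syncthreads();

  auto token = bar.arrive();        // arrive at the current phase
  bar.wait(cuda::std::move(token)); // polls; no backoff needed on SM90+
}
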
cuda/cccl/headers/include/cuda/__cccl_config
@@ -23,6 +23,7 @@
 #include <cuda/std/__cccl/exceptions.h> // IWYU pragma: export
 #include <cuda/std/__cccl/execution_space.h> // IWYU pragma: export
 #include <cuda/std/__cccl/extended_data_types.h> // IWYU pragma: export
+#include <cuda/std/__cccl/host_std_lib.h> // IWYU pragma: export
 #include <cuda/std/__cccl/os.h> // IWYU pragma: export
 #include <cuda/std/__cccl/preprocessor.h> // IWYU pragma: export
 #include <cuda/std/__cccl/ptx_isa.h> // IWYU pragma: export