cuda-cccl 0.3.0-cp310-cp310-manylinux_2_24_aarch64.whl → 0.3.2-cp310-cp310-manylinux_2_24_aarch64.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries; it is provided for informational purposes only.

Potentially problematic release: this version of cuda-cccl might be problematic.

Files changed (294)
  1. cuda/cccl/cooperative/__init__.py +7 -1
  2. cuda/cccl/cooperative/experimental/__init__.py +21 -5
  3. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
  4. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
  5. cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
  6. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
  7. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
  8. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
  9. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
  10. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
  11. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
  12. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
  13. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
  14. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
  15. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
  16. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
  17. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
  18. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
  19. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
  20. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
  21. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
  22. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
  23. cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
  24. cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
  25. cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
  26. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +4 -2
  27. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
  28. cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
  29. cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
  30. cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
  31. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
  32. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
  33. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
  34. cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
  35. cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
  36. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +170 -260
  37. cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
  38. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +6 -7
  39. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
  40. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +12 -29
  41. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +2 -7
  42. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
  43. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
  44. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
  45. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
  46. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
  47. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
  48. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
  49. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
  50. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
  51. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
  52. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
  53. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
  54. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
  55. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
  56. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
  57. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
  58. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
  59. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
  60. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
  61. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
  62. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
  63. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
  64. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
  65. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
  66. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
  67. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
  68. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
  69. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
  70. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
  71. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
  72. cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
  73. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
  74. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
  75. cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
  76. cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
  77. cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
  78. cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
  79. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
  80. cuda/cccl/headers/include/cuda/__cccl_config +1 -0
  81. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
  82. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
  83. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
  84. cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
  85. cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
  86. cuda/cccl/headers/include/cuda/__device/arch_traits.h +247 -323
  87. cuda/cccl/headers/include/cuda/__device/attributes.h +174 -123
  88. cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
  89. cuda/cccl/headers/include/cuda/__device/device_ref.h +27 -49
  90. cuda/cccl/headers/include/cuda/__device/physical_device.h +100 -96
  91. cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
  92. cuda/cccl/headers/include/cuda/__event/event.h +27 -26
  93. cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
  94. cuda/cccl/headers/include/cuda/__event/timed_event.h +10 -7
  95. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  96. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
  97. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
  98. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
  99. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
  100. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
  101. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
  102. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
  103. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
  104. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +21 -137
  105. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
  106. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
  107. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
  108. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
  109. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
  110. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
  111. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
  112. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
  113. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
  114. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
  115. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
  116. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +9 -7
  117. cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
  118. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -16
  119. cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
  120. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  121. cuda/cccl/headers/include/cuda/algorithm +1 -1
  122. cuda/cccl/headers/include/cuda/cmath +1 -0
  123. cuda/cccl/headers/include/cuda/devices +13 -0
  124. cuda/cccl/headers/include/cuda/iterator +1 -0
  125. cuda/cccl/headers/include/cuda/memory +1 -0
  126. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
  127. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
  128. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
  129. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
  130. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
  131. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
  132. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
  133. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
  134. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
  135. cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
  136. cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
  137. cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
  138. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  139. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
  140. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
  141. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
  142. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  143. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  144. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
  145. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
  146. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
  147. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
  148. cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
  149. cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
  150. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
  151. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
  152. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
  153. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
  154. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
  155. cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
  156. cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
  157. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
  158. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
  159. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
  160. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
  161. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
  162. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
  163. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
  164. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
  165. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
  166. cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
  167. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
  168. cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
  169. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
  170. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
  171. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
  172. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
  173. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
  174. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
  175. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
  176. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
  177. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
  178. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
  179. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
  180. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
  181. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
  182. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
  183. cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
  184. cuda/cccl/headers/include/cuda/std/bitset +1 -1
  185. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
  186. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
  187. cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
  188. cuda/cccl/headers/include/cuda/std/numbers +5 -0
  189. cuda/cccl/headers/include/cuda/std/string_view +155 -13
  190. cuda/cccl/headers/include/cuda/std/version +1 -4
  191. cuda/cccl/headers/include/cuda/stream_ref +5 -0
  192. cuda/cccl/headers/include/cuda/utility +1 -0
  193. cuda/cccl/headers/include/nv/target +7 -2
  194. cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
  195. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
  196. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
  197. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
  198. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
  199. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
  200. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
  201. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
  202. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
  203. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
  204. cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
  205. cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
  206. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
  207. cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
  208. cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
  209. cuda/cccl/headers/include/thrust/device_delete.h +18 -3
  210. cuda/cccl/headers/include/thrust/device_free.h +16 -3
  211. cuda/cccl/headers/include/thrust/device_new.h +29 -8
  212. cuda/cccl/headers/include/thrust/host_vector.h +1 -1
  213. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
  214. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
  215. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
  216. cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
  217. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
  218. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
  219. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
  220. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
  221. cuda/cccl/parallel/experimental/__init__.py +21 -74
  222. cuda/compute/__init__.py +79 -0
  223. cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +43 -1
  224. cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +157 -8
  225. cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
  226. cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
  227. cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
  228. cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
  229. cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +112 -40
  230. cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
  231. cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
  232. cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +36 -15
  233. cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
  234. cuda/compute/cu12/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  235. cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
  236. cuda/compute/cu13/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  237. cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
  238. cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +2 -0
  239. cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +36 -8
  240. cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +206 -1
  241. cuda/{cccl/parallel/experimental → compute}/numba_utils.py +2 -2
  242. cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
  243. cuda/{cccl/parallel/experimental → compute}/typing.py +2 -0
  244. cuda/coop/__init__.py +8 -0
  245. cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
  246. cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
  247. cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
  248. cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
  249. cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
  250. cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
  251. cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
  252. cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
  253. cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
  254. cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
  255. cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
  256. cuda/coop/warp/__init__.py +9 -0
  257. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
  258. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
  259. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
  260. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
  261. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +275 -276
  262. cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
  263. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
  264. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
  265. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
  266. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
  267. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
  268. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
  269. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
  270. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
  271. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
  272. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
  273. cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
  274. cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
  275. cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
  276. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
  277. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
  278. cuda/cccl/parallel/experimental/.gitignore +0 -4
  279. cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  280. cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  281. /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
  282. /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
  283. /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
  284. /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
  285. /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
  286. /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
  287. /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
  288. /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
  289. /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
  290. /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
  291. /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
  292. /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
  293. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
  294. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
@@ -53,14 +53,8 @@ struct agent_t
  using policy = Policy;

  // key and value type are taken from the first input sequence (consistent with old Thrust behavior)
- using key_type = it_value_t<KeysIt1>;
- using item_type = it_value_t<ItemsIt1>;
-
- using keys_load_it1 = try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, KeysIt1>;
- using keys_load_it2 = try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, KeysIt2>;
- using items_load_it1 = try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, ItemsIt1>;
- using items_load_it2 = try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, ItemsIt2>;
-
+ using key_type = it_value_t<KeysIt1>;
+ using item_type = it_value_t<ItemsIt1>;
  using block_store_keys = typename BlockStoreType<Policy, KeysOutputIt, key_type>::type;
  using block_store_items = typename BlockStoreType<Policy, ItemsOutputIt, item_type>::type;

@@ -84,11 +78,11 @@ struct agent_t

  // Per thread data
  temp_storages& storage;
- keys_load_it1 keys1_in;
- items_load_it1 items1_in;
+ KeysIt1 keys1_in;
+ ItemsIt1 items1_in;
  Offset keys1_count;
- keys_load_it2 keys2_in;
- items_load_it2 items2_in;
+ KeysIt2 keys2_in;
+ ItemsIt2 items2_in;
  Offset keys2_count;
  KeysOutputIt keys_out;
  ItemsOutputIt items_out;
@@ -128,10 +122,14 @@ struct agent_t
  }

  key_type keys_loc[items_per_thread];
- merge_sort::gmem_to_reg<threads_per_block, IsFullTile>(
- keys_loc, keys1_in + keys1_beg, keys2_in + keys2_beg, keys1_count_tile, keys2_count_tile);
- merge_sort::reg_to_shared<threads_per_block>(&storage.keys_shared[0], keys_loc);
- __syncthreads();
+ {
+ auto keys1_in_cm = try_make_cache_modified_iterator<Policy::LOAD_MODIFIER>(keys1_in);
+ auto keys2_in_cm = try_make_cache_modified_iterator<Policy::LOAD_MODIFIER>(keys2_in);
+ merge_sort::gmem_to_reg<threads_per_block, IsFullTile>(
+ keys_loc, keys1_in_cm + keys1_beg, keys2_in_cm + keys2_beg, keys1_count_tile, keys2_count_tile);
+ merge_sort::reg_to_shared<threads_per_block>(&storage.keys_shared[0], keys_loc);
+ __syncthreads();
+ }

  // now find the merge path for each of thread.
  // we can use int type here, because the number of items in shared memory is limited
@@ -186,11 +184,15 @@ struct agent_t
  if constexpr (have_items)
  {
  item_type items_loc[items_per_thread];
- merge_sort::gmem_to_reg<threads_per_block, IsFullTile>(
- items_loc, items1_in + keys1_beg, items2_in + keys2_beg, keys1_count_tile, keys2_count_tile);
- __syncthreads(); // block_store_keys above uses SMEM, so make sure all threads are done before we write to it
- merge_sort::reg_to_shared<threads_per_block>(&storage.items_shared[0], items_loc);
- __syncthreads();
+ {
+ auto items1_in_cm = try_make_cache_modified_iterator<Policy::LOAD_MODIFIER>(items1_in);
+ auto items2_in_cm = try_make_cache_modified_iterator<Policy::LOAD_MODIFIER>(items2_in);
+ merge_sort::gmem_to_reg<threads_per_block, IsFullTile>(
+ items_loc, items1_in_cm + keys1_beg, items2_in_cm + keys2_beg, keys1_count_tile, keys2_count_tile);
+ __syncthreads(); // block_store_keys above uses SMEM, so make sure all threads are done before we write to it
+ merge_sort::reg_to_shared<threads_per_block>(&storage.items_shared[0], items_loc);
+ __syncthreads();
+ }

  // gather items from shared mem
  _CCCL_PRAGMA_UNROLL_FULL()
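Note on the agent_t hunks above: the cache-modified iterator types are no longer stored as member aliases and fields; the wrappers are built locally, right where the tile is loaded. A minimal, hypothetical sketch of the same pattern using the public cub::CacheModifiedInputIterator follows (the agent itself uses the internal try_make_cache_modified_iterator helper; load_tile and the fixed tile size are illustrative assumptions, not code from the diff):

#include <cub/iterator/cache_modified_input_iterator.cuh>

// Sketch only: wrap the raw pointer at the point of use so that member fields
// can keep the plain iterator types (KeysIt1, ItemsIt1, ...).
__device__ void load_tile(const int* keys_in, int (&keys_loc)[4], int tile_base)
{
  // The wrapper applies the chosen cache modifier (here LDG) on each dereference.
  cub::CacheModifiedInputIterator<cub::LOAD_LDG, int> keys_ldg(keys_in);
  for (int i = 0; i < 4; ++i)
  {
    keys_loc[i] = keys_ldg[tile_base + i];
  }
}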
@@ -66,9 +66,28 @@ struct AgentMergeSortPolicy
  static constexpr cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;
  };

+ #if defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
  namespace detail
  {
- namespace merge_sort
+ // Only define this when needed.
+ // Because of overload woes, this depends on C++20 concepts. util_device.h checks that concepts are available when
+ // either runtime policies or PTX JSON information are enabled, so if they are, this is always valid. The generic
+ // version is always defined, and that's the only one needed for regular CUB operations.
+ //
+ // TODO: enable this unconditionally once concepts are always available
+ CUB_DETAIL_POLICY_WRAPPER_DEFINE(
+ MergeSortAgentPolicy,
+ (GenericAgentPolicy),
+ (BLOCK_THREADS, BlockThreads, int),
+ (ITEMS_PER_THREAD, ItemsPerThread, int),
+ (ITEMS_PER_TILE, ItemsPerTile, int),
+ (LOAD_ALGORITHM, LoadAlgorithm, cub::BlockLoadAlgorithm),
+ (LOAD_MODIFIER, LoadModifier, cub::CacheLoadModifier),
+ (STORE_ALGORITHM, StoreAlgorithm, cub::BlockStoreAlgorithm))
+ } // namespace detail
+ #endif // defined(CUB_DEFINE_RUNTIME_POLICIES
+
+ namespace detail::merge_sort
  {

  template <typename Policy,
@@ -724,7 +743,6 @@ struct AgentMerge
  }
  };

- } // namespace merge_sort
- } // namespace detail
+ } // namespace detail::merge_sort

  CUB_NAMESPACE_END
@@ -146,9 +146,7 @@ CUB_DETAIL_POLICY_WRAPPER_DEFINE(
  * Thread block abstractions
  ******************************************************************************/

- namespace detail
- {
- namespace radix_sort
+ namespace detail::radix_sort
  {

  /**
@@ -783,7 +781,6 @@ struct AgentRadixSortDownsweep
  }
  };

- } // namespace radix_sort
- } // namespace detail
+ } // namespace detail::radix_sort

  CUB_NAMESPACE_END
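The same namespace cleanup repeats across most agent headers in this release: the nested detail namespaces are collapsed into C++17 nested namespace definitions. Purely as an illustration (the tag structs below are made up), the removed and added spellings declare names in exactly the same namespace:

// Pre-C++17 spelling, as removed by these hunks:
namespace detail { namespace radix_sort {
struct old_spelling_tag {};
}} // namespace radix_sort, namespace detail

// C++17 nested namespace definition, as added:
namespace detail::radix_sort {
struct new_spelling_tag {};
} // namespace detail::radix_sort

// Both tags live in detail::radix_sort:
static_assert(sizeof(detail::radix_sort::old_spelling_tag) == sizeof(detail::radix_sort::new_spelling_tag), "");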
@@ -85,9 +85,7 @@ struct AgentRadixSortExclusiveSumPolicy
  };
  };

- namespace detail
- {
- namespace radix_sort
+ namespace detail::radix_sort
  {

  template <typename AgentRadixSortHistogramPolicy,
@@ -283,7 +281,6 @@ struct AgentRadixSortHistogram
  }
  };

- } // namespace radix_sort
- } // namespace detail
+ } // namespace detail::radix_sort

  CUB_NAMESPACE_END
@@ -100,9 +100,7 @@ struct AgentRadixSortOnesweepPolicy : ScalingType
  static constexpr RadixSortStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;
  };

- namespace detail
- {
- namespace radix_sort
+ namespace detail::radix_sort
  {

  template <typename AgentRadixSortOnesweepPolicy,
@@ -700,7 +698,6 @@ struct AgentRadixSortOnesweep
  }
  };

- } // namespace radix_sort
- } // namespace detail
+ } // namespace detail::radix_sort

  CUB_NAMESPACE_END
@@ -103,9 +103,7 @@ struct AgentRadixSortUpsweepPolicy : ScalingType
  * Thread block abstractions
  ******************************************************************************/

- namespace detail
- {
- namespace radix_sort
+ namespace detail::radix_sort
  {

  /**
@@ -552,7 +550,6 @@ struct AgentRadixSortUpsweep
  }
  };

- } // namespace radix_sort
- } // namespace detail
+ } // namespace detail::radix_sort

  CUB_NAMESPACE_END
@@ -134,9 +134,7 @@ struct AgentRlePolicy
  * Thread block abstractions
  ******************************************************************************/

- namespace detail
- {
- namespace rle
+ namespace detail::rle
  {

  /**
@@ -1121,7 +1119,6 @@ struct AgentRle
  }
  };

- } // namespace rle
- } // namespace detail
+ } // namespace detail::rle

  CUB_NAMESPACE_END
@@ -51,6 +51,10 @@
  #include <cub/iterator/cache_modified_input_iterator.cuh>
  #include <cub/util_device.cuh>

+ #if defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
+ # include <cub/agent/agent_unique_by_key.cuh> // for UniqueByKeyAgentPolicy
+ #endif
+
  #include <cuda/std/__type_traits/conditional.h>
  #include <cuda/std/__type_traits/is_pointer.h>
  #include <cuda/std/__type_traits/is_same.h>
@@ -123,7 +127,7 @@ namespace detail
  // TODO: enable this unconditionally once concepts are always available
  CUB_DETAIL_POLICY_WRAPPER_DEFINE(
  ScanAgentPolicy,
- (GenericAgentPolicy),
+ (UniqueByKeyAgentPolicy),
  (BLOCK_THREADS, BlockThreads, int),
  (ITEMS_PER_THREAD, ItemsPerThread, int),
  (LOAD_ALGORITHM, LoadAlgorithm, cub::BlockLoadAlgorithm),
@@ -96,9 +96,7 @@ struct AgentScanByKeyPolicy
  * Thread block abstractions
  ******************************************************************************/

- namespace detail
- {
- namespace scan_by_key
+ namespace detail::scan_by_key
  {

  /**
@@ -471,7 +469,6 @@ struct AgentScanByKey
  }
  };

- } // namespace scan_by_key
- } // namespace detail
+ } // namespace detail::scan_by_key

  CUB_NAMESPACE_END
@@ -45,9 +45,7 @@

  CUB_NAMESPACE_BEGIN

- namespace detail
- {
- namespace radix_sort
+ namespace detail::radix_sort
  {

  /**
@@ -286,7 +284,6 @@ struct AgentSegmentedRadixSort
  }
  };

- } // namespace radix_sort
- } // namespace detail
+ } // namespace detail::radix_sort

  CUB_NAMESPACE_END
@@ -126,9 +126,7 @@ struct AgentSelectIfPolicy
  * Thread block abstractions
  ******************************************************************************/

- namespace detail
- {
- namespace select
+ namespace detail::select
  {

  template <typename EqualityOpT>
@@ -1114,7 +1112,6 @@ struct AgentSelectIf
  }
  };

- } // namespace select
- } // namespace detail
+ } // namespace detail::select

  CUB_NAMESPACE_END
@@ -84,9 +84,7 @@ CUB_DETAIL_POLICY_WRAPPER_DEFINE(
  } // namespace detail
  #endif // defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)

- namespace detail
- {
- namespace sub_warp_merge_sort
+ namespace detail::sub_warp_merge_sort
  {

  /**
@@ -343,7 +341,6 @@ private:
  }
  };

- } // namespace sub_warp_merge_sort
- } // namespace detail
+ } // namespace detail::sub_warp_merge_sort

  CUB_NAMESPACE_END
@@ -91,9 +91,7 @@ CUB_DETAIL_POLICY_WRAPPER_DEFINE(
  } // namespace detail
  #endif // defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)

- namespace detail
- {
- namespace three_way_partition
+ namespace detail::three_way_partition
  {

  template <class OffsetT>
@@ -603,7 +601,6 @@ struct AgentThreeWayPartition
  }
  };

- } // namespace three_way_partition
- } // namespace detail
+ } // namespace detail::three_way_partition

  CUB_NAMESPACE_END
@@ -85,13 +85,31 @@ struct AgentUniqueByKeyPolicy
  };
  };

+ #if defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
+ namespace detail
+ {
+ // Only define this when needed.
+ // Because of overload woes, this depends on C++20 concepts. util_device.h checks that concepts are available when
+ // either runtime policies or PTX JSON information are enabled, so if they are, this is always valid. The generic
+ // version is always defined, and that's the only one needed for regular CUB operations.
+ //
+ // TODO: enable this unconditionally once concepts are always available
+ CUB_DETAIL_POLICY_WRAPPER_DEFINE(
+ UniqueByKeyAgentPolicy,
+ (GenericAgentPolicy),
+ (BLOCK_THREADS, BlockThreads, int),
+ (ITEMS_PER_THREAD, ItemsPerThread, int),
+ (LOAD_ALGORITHM, LoadAlgorithm, cub::BlockLoadAlgorithm),
+ (LOAD_MODIFIER, LoadModifier, cub::CacheLoadModifier),
+ (SCAN_ALGORITHM, ScanAlgorithm, cub::BlockScanAlgorithm))
+ } // namespace detail
+ #endif // defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
+
  /******************************************************************************
  * Thread block abstractions
  ******************************************************************************/

- namespace detail
- {
- namespace unique_by_key
+ namespace detail::unique_by_key
  {

  /**
@@ -608,7 +626,6 @@ struct AgentUniqueByKey
  }
  };

- } // namespace unique_by_key
- } // namespace detail
+ } // namespace detail::unique_by_key

  CUB_NAMESPACE_END
@@ -111,10 +111,9 @@ CUB_NAMESPACE_BEGIN
  //! // Collectively compute adjacent_difference
  //! int result[4];
  //!
- //! BlockAdjacentDifferenceT(temp_storage).SubtractLeft(
- //! thread_data,
- //! result,
- //! CustomDifference());
+ //! BlockAdjacentDifferenceT(temp_storage).SubtractLeft(thread_data, result,
+ //! CustomDifference());
+ //! }
  //!
  //! Suppose the set of input `thread_data` across the block of threads is
  //! ``{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }``.
@@ -283,10 +282,9 @@ public:
  //! ...
  //!
  //! // Collectively compute adjacent_difference
- //! BlockAdjacentDifferenceT(temp_storage).SubtractLeft(
- //! thread_data,
- //! thread_data,
- //! CustomDifference());
+ //! BlockAdjacentDifferenceT(temp_storage).SubtractLeft(thread_data, thread_data,
+ //! CustomDifference());
+ //! }
  //!
  //! Suppose the set of input ``thread_data`` across the block of threads is
  //! ``{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }``.
@@ -96,6 +96,7 @@ CUB_NAMESPACE_BEGIN
  //! // Collectively compute head flags for discontinuities in the segment
  //! int head_flags[4];
  //! BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
+ //! }
  //!
  //! Suppose the set of input ``thread_data`` across the block of threads is
  //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }``.
@@ -387,6 +388,7 @@ public:
  //! // Collectively compute head flags for discontinuities in the segment
  //! int head_flags[4];
  //! BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
+ //! }
  //!
  //! Suppose the set of input ``thread_data`` across the block of threads is
  //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }``.
@@ -463,8 +465,9 @@ public:
  //!
  //! // Collectively compute head flags for discontinuities in the segment
  //! int head_flags[4];
- //! BlockDiscontinuity(temp_storage).FlagHeads(
- //! head_flags, thread_data, cub::Inequality(), tile_predecessor_item);
+ //! BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data,
+ //! cub::Inequality(), tile_predecessor_item);
+ //! }
  //!
  //! Suppose the set of input ``thread_data`` across the block of threads is
  //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }``,
@@ -549,6 +552,7 @@ public:
  //! // Collectively compute tail flags for discontinuities in the segment
  //! int tail_flags[4];
  //! BlockDiscontinuity(temp_storage).FlagTails(tail_flags, thread_data, cub::Inequality());
+ //! }
  //!
  //! Suppose the set of input ``thread_data`` across the block of threads is
  //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``.
@@ -640,8 +644,9 @@ public:
  //!
  //! // Collectively compute tail flags for discontinuities in the segment
  //! int tail_flags[4];
- //! BlockDiscontinuity(temp_storage).FlagTails(
- //! tail_flags, thread_data, cub::Inequality(), tile_successor_item);
+ //! BlockDiscontinuity(temp_storage).FlagTails(tail_flags, thread_data,
+ //! cub::Inequality(), tile_successor_item);
+ //! }
  //!
  //! Suppose the set of input ``thread_data`` across the block of threads is
  //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``
@@ -742,8 +747,9 @@ public:
  //! // Collectively compute head and flags for discontinuities in the segment
  //! int head_flags[4];
  //! int tail_flags[4];
- //! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
- //! head_flags, tail_flags, thread_data, cub::Inequality());
+ //! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(head_flags, tail_flags, thread_data,
+ //! cub::Inequality());
+ //! }
  //!
  //! Suppose the set of input ``thread_data`` across the block of threads is
  //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``
@@ -864,8 +870,10 @@ public:
  //! // Collectively compute head and flags for discontinuities in the segment
  //! int head_flags[4];
  //! int tail_flags[4];
- //! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
- //! head_flags, tail_flags, tile_successor_item, thread_data, cub::Inequality());
+ //! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(head_flags, tail_flags,
+ //! tile_successor_item, thread_data,
+ //! cub::Inequality());
+ //! }
  //!
  //! Suppose the set of input ``thread_data`` across the block of threads is
  //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``
@@ -997,9 +1005,10 @@ public:
  //! // Collectively compute head and flags for discontinuities in the segment
  //! int head_flags[4];
  //! int tail_flags[4];
- //! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
- //! head_flags, tile_predecessor_item, tail_flags, tile_successor_item,
- //! thread_data, cub::Inequality());
+ //! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(head_flags, tile_predecessor_item,
+ //! tail_flags, tile_successor_item,
+ //! thread_data, cub::Inequality());
+ //! }
  //!
  //! Suppose the set of input ``thread_data`` across the block of threads is
  //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``,
@@ -1126,9 +1135,10 @@ public:
  //! // Collectively compute head and flags for discontinuities in the segment
  //! int head_flags[4];
  //! int tail_flags[4];
- //! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
- //! head_flags, tile_predecessor_item, tail_flags, tile_successor_item,
- //! thread_data, cub::Inequality());
+ //! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(head_flags, tile_predecessor_item,
+ //! tail_flags, tile_successor_item,
+ //! thread_data, cub::Inequality());
+ //! }
  //!
  //! Suppose the set of input ``thread_data`` across the block of threads is
  //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``,
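The BlockDiscontinuity hunks above only reflow the documentation snippets and add the closing brace of the example kernel. For reference, a complete form of the first snippet looks roughly like the sketch below; the 128-thread block, 4 items per thread, and the placeholder initializer are assumptions of this sketch, not taken from the diff:

#include <cub/block/block_discontinuity.cuh>

__global__ void ExampleKernel()
{
    // Specialize BlockDiscontinuity for a 1D block of 128 threads of type int
    using BlockDiscontinuity = cub::BlockDiscontinuity<int, 128>;

    // Allocate shared memory for BlockDiscontinuity
    __shared__ typename BlockDiscontinuity::TempStorage temp_storage;

    // Obtain a segment of consecutive items that are blocked across threads
    int thread_data[4] = {0, 0, 1, 1}; // placeholder; a real kernel would load these

    // Collectively compute head flags for discontinuities in the segment
    int head_flags[4];
    BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
}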
@@ -101,6 +101,7 @@ CUB_NAMESPACE_BEGIN
  //!
  //! // Collectively exchange data into a blocked arrangement across threads
  //! BlockExchange(temp_storage).StripedToBlocked(thread_data);
+ //! }
  //!
  //! Suppose the set of striped input ``thread_data`` across the block of threads is ``{ [0,128,256,384],
  //! [1,129,257,385], ..., [127,255,383,511] }``. The corresponding output ``thread_data`` in those threads will be
@@ -883,6 +884,7 @@ public:
  //!
  //! // Collectively exchange data into a blocked arrangement across threads
  //! BlockExchange(temp_storage).StripedToBlocked(thread_data, thread_data);
+ //! }
  //!
  //! Suppose the set of striped input ``thread_data`` across the block of threads is ``{ [0,128,256,384],
  //! [1,129,257,385], ..., [127,255,383,511] }`` after loading from device-accessible memory. The corresponding output
@@ -933,6 +935,7 @@ public:
  //!
  //! // Store data striped across block threads into an ordered tile
  //! cub::StoreDirectStriped<STORE_DEFAULT, 128>(threadIdx.x, d_data, thread_data);
+ //! }
  //!
  //! Suppose the set of blocked input ``thread_data`` across the block of threads is ``{ [0,1,2,3], [4,5,6,7],
  //! [8,9,10,11], ..., [508,509,510,511] }``. The corresponding output ``thread_data`` in those threads will be
@@ -983,6 +986,7 @@ public:
  //!
  //! // Collectively exchange data into a blocked arrangement across threads
  //! BlockExchange(temp_storage).WarpStripedToBlocked(thread_data);
+ //! }
  //!
  //! Suppose the set of warp-striped input ``thread_data`` across the block of threads is ``{ [0,32,64,96],
  //! [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }`` after loading from device-accessible memory. (The first 128
@@ -1037,6 +1041,7 @@ public:
  //!
  //! // Store data striped across warp threads into an ordered tile
  //! cub::StoreDirectStriped<STORE_DEFAULT, 128>(threadIdx.x, d_data, thread_data);
+ //! }
  //!
  //! Suppose the set of blocked input ``thread_data`` across the block of threads is ``{ [0,1,2,3], [4,5,6,7],
  //! [8,9,10,11], ..., [508,509,510,511] }``. The corresponding output ``thread_data`` in those threads will be
@@ -140,6 +140,7 @@ enum BlockHistogramAlgorithm
  //!
  //! // Compute the block-wide histogram
  //! BlockHistogram(temp_storage).Histogram(data, smem_histogram);
+ //! }
  //!
  //! Performance and Usage Considerations
  //! +++++++++++++++++++++++++++++++++++++++++++++
@@ -281,6 +282,7 @@ public:
  //!
  //! // Update the block-wide histogram
  //! BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram);
+ //! }
  //!
  //! @endrst
  //!
@@ -338,6 +340,7 @@ public:
  //!
  //! // Compute the block-wide histogram
  //! BlockHistogram(temp_storage).Histogram(thread_samples, smem_histogram);
+ //! }
  //!
  //! @endrst
  //!
@@ -399,6 +402,7 @@ public:
  //!
  //! // Update the block-wide histogram
  //! BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram);
+ //! }
  //!
  //! @endrst
  //!
@@ -771,6 +771,7 @@ enum BlockLoadAlgorithm
  //! // Load a segment of consecutive items that are blocked across threads
  //! int thread_data[4];
  //! BlockLoad(temp_storage).Load(d_data, thread_data);
+ //! }
  //!
  //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``. The set of ``thread_data`` across the block of threads in
  //! those threads will be ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``.
@@ -1123,6 +1124,7 @@ public:
  //! // Load a segment of consecutive items that are blocked across threads
  //! int thread_data[4];
  //! BlockLoad(temp_storage).Load(d_data, thread_data);
+ //! }
  //!
  //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``. The set of ``thread_data`` across the block of threads
  //! in those threads will be ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``.
@@ -1170,6 +1172,7 @@ public:
  //! // Load a segment of consecutive items that are blocked across threads
  //! int thread_data[4];
  //! BlockLoad(temp_storage).Load(d_data, thread_data, block_items_end);
+ //! }
  //!
  //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, 6...`` and ``block_items_end`` is ``5``. The set of
  //! ``thread_data`` across the block of threads in those threads will be ``{ [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }``,
@@ -1222,6 +1225,7 @@ public:
  //! // Load a segment of consecutive items that are blocked across threads
  //! int thread_data[4];
  //! BlockLoad(temp_storage).Load(d_data, thread_data, block_items_end, -1);
+ //! }
  //!
  //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, 6...``, ``block_items_end`` is ``5``, and the out-of-bounds
  //! default is ``-1``. The set of ``thread_data`` across the block of threads in those threads will be
@@ -50,6 +50,7 @@

  #include <cuda/__ptx/instructions/get_sreg.h>
  #include <cuda/std/__algorithm/max.h>
+ #include <cuda/std/__bit/integral.h>
  #include <cuda/std/__functional/operations.h>
  #include <cuda/std/__type_traits/conditional.h>
  #include <cuda/std/__type_traits/is_same.h>
@@ -168,6 +169,7 @@ struct warp_in_block_matcher_t<Bits, 0, PartialWarpId>
  //! block_radix_rank(temp_storage).RankKeys(keys, ranks, extractor);
  //!
  //! ...
+ //! }
  //!
  //! Suppose the set of input ``keys`` across the block of threads is ``{ [16,10], [9,11] }``.
  //! The corresponding output ``ranks`` in those threads will be ``{ [3,1], [0,2] }``.
@@ -1072,7 +1074,7 @@ struct BlockRadixRankMatchEarlyCounts
  atomicOr(p_match_mask, lane_mask);
  __syncwarp(WARP_MASK);
  int bin_mask = *p_match_mask;
- int leader = (WARP_THREADS - 1) - __clz(bin_mask);
+ int leader = ::cuda::std::__bit_log2(static_cast<unsigned>(bin_mask));
  int warp_offset = 0;
  int popc = __popc(bin_mask & ::cuda::ptx::get_sreg_lanemask_le());
  if (lane == leader)
@@ -1102,7 +1104,7 @@
  ::cuda::std::uint32_t bin = Digit(keys[u]);
  int bin_mask =
  detail::warp_in_block_matcher_t<RADIX_BITS, PARTIAL_WARP_THREADS, BLOCK_WARPS - 1>::match_any(bin, warp);
- int leader = (WARP_THREADS - 1) - __clz(bin_mask);
+ int leader = ::cuda::std::__bit_log2(static_cast<unsigned>(bin_mask));
  int warp_offset = 0;
  int popc = __popc(bin_mask & ::cuda::ptx::get_sreg_lanemask_le());
  if (lane == leader)
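The last two hunks replace the intrinsic-based leader election, (WARP_THREADS - 1) - __clz(bin_mask), with ::cuda::std::__bit_log2, i.e. the index of the highest set bit of the warp match mask. __bit_log2 is an internal libcu++ helper; as an illustration only (this is not the code the diff uses), the same value can be computed with the public <cuda/std/bit> API:

#include <cuda/std/bit>

// For a non-zero 32-bit mask, the leader is the highest set lane index.
// bit_width(m) == 1 + floor(log2(m)), so bit_width(m) - 1 == floor(log2(m)),
// which matches 31 - __clz(m) for a full 32-thread warp.
__device__ int leader_lane(unsigned bin_mask)
{
  return static_cast<int>(cuda::std::bit_width(bin_mask)) - 1;
}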