cuda-cccl 0.3.0__cp310-cp310-manylinux_2_24_aarch64.whl → 0.3.2__cp310-cp310-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (294) hide show
  1. cuda/cccl/cooperative/__init__.py +7 -1
  2. cuda/cccl/cooperative/experimental/__init__.py +21 -5
  3. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
  4. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
  5. cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
  6. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
  7. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
  8. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
  9. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
  10. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
  11. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
  12. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
  13. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
  14. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
  15. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
  16. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
  17. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
  18. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
  19. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
  20. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
  21. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
  22. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
  23. cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
  24. cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
  25. cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
  26. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +4 -2
  27. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
  28. cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
  29. cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
  30. cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
  31. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
  32. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
  33. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
  34. cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
  35. cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
  36. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +170 -260
  37. cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
  38. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +6 -7
  39. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
  40. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +12 -29
  41. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +2 -7
  42. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
  43. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
  44. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
  45. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
  46. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
  47. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
  48. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
  49. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
  50. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
  51. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
  52. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
  53. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
  54. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
  55. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
  56. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
  57. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
  58. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
  59. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
  60. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
  61. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
  62. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
  63. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
  64. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
  65. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
  66. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
  67. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
  68. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
  69. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
  70. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
  71. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
  72. cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
  73. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
  74. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
  75. cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
  76. cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
  77. cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
  78. cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
  79. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
  80. cuda/cccl/headers/include/cuda/__cccl_config +1 -0
  81. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
  82. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
  83. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
  84. cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
  85. cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
  86. cuda/cccl/headers/include/cuda/__device/arch_traits.h +247 -323
  87. cuda/cccl/headers/include/cuda/__device/attributes.h +174 -123
  88. cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
  89. cuda/cccl/headers/include/cuda/__device/device_ref.h +27 -49
  90. cuda/cccl/headers/include/cuda/__device/physical_device.h +100 -96
  91. cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
  92. cuda/cccl/headers/include/cuda/__event/event.h +27 -26
  93. cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
  94. cuda/cccl/headers/include/cuda/__event/timed_event.h +10 -7
  95. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  96. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
  97. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
  98. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
  99. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
  100. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
  101. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
  102. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
  103. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
  104. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +21 -137
  105. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
  106. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
  107. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
  108. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
  109. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
  110. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
  111. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
  112. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
  113. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
  114. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
  115. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
  116. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +9 -7
  117. cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
  118. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -16
  119. cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
  120. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  121. cuda/cccl/headers/include/cuda/algorithm +1 -1
  122. cuda/cccl/headers/include/cuda/cmath +1 -0
  123. cuda/cccl/headers/include/cuda/devices +13 -0
  124. cuda/cccl/headers/include/cuda/iterator +1 -0
  125. cuda/cccl/headers/include/cuda/memory +1 -0
  126. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
  127. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
  128. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
  129. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
  130. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
  131. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
  132. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
  133. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
  134. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
  135. cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
  136. cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
  137. cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
  138. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  139. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
  140. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
  141. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
  142. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  143. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  144. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
  145. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
  146. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
  147. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
  148. cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
  149. cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
  150. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
  151. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
  152. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
  153. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
  154. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
  155. cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
  156. cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
  157. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
  158. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
  159. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
  160. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
  161. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
  162. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
  163. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
  164. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
  165. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
  166. cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
  167. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
  168. cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
  169. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
  170. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
  171. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
  172. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
  173. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
  174. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
  175. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
  176. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
  177. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
  178. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
  179. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
  180. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
  181. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
  182. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
  183. cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
  184. cuda/cccl/headers/include/cuda/std/bitset +1 -1
  185. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
  186. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
  187. cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
  188. cuda/cccl/headers/include/cuda/std/numbers +5 -0
  189. cuda/cccl/headers/include/cuda/std/string_view +155 -13
  190. cuda/cccl/headers/include/cuda/std/version +1 -4
  191. cuda/cccl/headers/include/cuda/stream_ref +5 -0
  192. cuda/cccl/headers/include/cuda/utility +1 -0
  193. cuda/cccl/headers/include/nv/target +7 -2
  194. cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
  195. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
  196. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
  197. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
  198. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
  199. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
  200. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
  201. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
  202. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
  203. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
  204. cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
  205. cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
  206. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
  207. cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
  208. cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
  209. cuda/cccl/headers/include/thrust/device_delete.h +18 -3
  210. cuda/cccl/headers/include/thrust/device_free.h +16 -3
  211. cuda/cccl/headers/include/thrust/device_new.h +29 -8
  212. cuda/cccl/headers/include/thrust/host_vector.h +1 -1
  213. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
  214. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
  215. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
  216. cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
  217. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
  218. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
  219. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
  220. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
  221. cuda/cccl/parallel/experimental/__init__.py +21 -74
  222. cuda/compute/__init__.py +79 -0
  223. cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +43 -1
  224. cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +157 -8
  225. cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
  226. cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
  227. cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
  228. cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
  229. cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +112 -40
  230. cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
  231. cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
  232. cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +36 -15
  233. cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
  234. cuda/compute/cu12/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  235. cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
  236. cuda/compute/cu13/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  237. cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
  238. cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +2 -0
  239. cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +36 -8
  240. cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +206 -1
  241. cuda/{cccl/parallel/experimental → compute}/numba_utils.py +2 -2
  242. cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
  243. cuda/{cccl/parallel/experimental → compute}/typing.py +2 -0
  244. cuda/coop/__init__.py +8 -0
  245. cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
  246. cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
  247. cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
  248. cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
  249. cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
  250. cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
  251. cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
  252. cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
  253. cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
  254. cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
  255. cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
  256. cuda/coop/warp/__init__.py +9 -0
  257. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
  258. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
  259. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
  260. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
  261. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +275 -276
  262. cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
  263. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
  264. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
  265. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
  266. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
  267. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
  268. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
  269. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
  270. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
  271. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
  272. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
  273. cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
  274. cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
  275. cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
  276. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
  277. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
  278. cuda/cccl/parallel/experimental/.gitignore +0 -4
  279. cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  280. cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  281. /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
  282. /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
  283. /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
  284. /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
  285. /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
  286. /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
  287. /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
  288. /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
  289. /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
  290. /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
  291. /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
  292. /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
  293. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
  294. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
@@ -387,15 +387,13 @@ struct DispatchTopK
387
387
  return error;
388
388
  }
389
389
 
390
- _CubLog("Invoking topk_kernel<<<{%d,%d,%d}, %d, 0, "
390
+ _CubLog("Invoking topk_kernel<<<%d, %d, 0, "
391
391
  "%lld>>>(), %d items per thread, %d SM occupancy\n",
392
- topk_grid_size.x,
393
- topk_grid_size.y,
394
- topk_grid_size.z,
392
+ topk_grid_size,
395
393
  block_threads,
396
394
  (long long) stream,
397
395
  items_per_thread,
398
- topk_blocks_per_sm);
396
+ main_kernel_blocks_per_sm);
399
397
  }
400
398
  #endif // CUB_DEBUG_LOG
401
399
 
@@ -109,8 +109,9 @@ struct TransformKernelSource<Offset,
109
109
  return detail::transform::make_aligned_base_ptr_kernel_arg(it, align);
110
110
  }
111
111
 
112
+ private:
112
113
  template <typename T>
113
- CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static auto IsPointerAligned(T it, [[maybe_unused]] int alignment)
114
+ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static auto is_pointer_aligned(T it, [[maybe_unused]] int alignment)
114
115
  {
115
116
  if constexpr (THRUST_NS_QUALIFIER::is_contiguous_iterator_v<decltype(it)>)
116
117
  {
@@ -121,6 +122,14 @@ struct TransformKernelSource<Offset,
121
122
  return true; // fancy iterators are aligned, since the vectorized kernel chooses a different code path
122
123
  }
123
124
  }
125
+
126
+ public:
127
+ CUB_RUNTIME_FUNCTION constexpr static bool
128
+ CanVectorize(int vec_size, const RandomAccessIteratorOut& out, const RandomAccessIteratorsIn&... in)
129
+ {
130
+ return is_pointer_aligned(out, sizeof(it_value_t<RandomAccessIteratorOut>) * vec_size)
131
+ && (is_pointer_aligned(in, sizeof(it_value_t<RandomAccessIteratorsIn>) * vec_size) && ...);
132
+ }
124
133
  };
125
134
 
126
135
  enum class requires_stable_address
@@ -384,7 +393,7 @@ struct dispatch_t<StableAddress,
384
393
  }
385
394
 
386
395
  CUB_DEFINE_SFINAE_GETTER(items_per_thread_no_input, prefetch, ItemsPerThreadNoInput)
387
- CUB_DEFINE_SFINAE_GETTER(load_store_word_size, vectorized, LoadStoreWordSize)
396
+ CUB_DEFINE_SFINAE_GETTER(vec_size, vectorized, VecSize)
388
397
  CUB_DEFINE_SFINAE_GETTER(items_per_thread_vectorized, vectorized, ItemsPerThreadVectorized)
389
398
 
390
399
  #undef CUB_DEFINE_SFINAE_GETTER
@@ -441,9 +450,8 @@ struct dispatch_t<StableAddress,
441
450
  // the policy already handles the compile-time checks if we can vectorize. Do the remaining alignment check here
442
451
  if CUB_DETAIL_CONSTEXPR_ISH (Algorithm::vectorized == wrapped_policy.Algorithm())
443
452
  {
444
- const int alignment = load_store_word_size(wrapped_policy.AlgorithmPolicy());
445
- can_vectorize = (kernel_source.IsPointerAligned(::cuda::std::get<Is>(in), alignment) && ...)
446
- && kernel_source.IsPointerAligned(out, alignment);
453
+ const int vs = vec_size(wrapped_policy.AlgorithmPolicy());
454
+ can_vectorize = kernel_source.CanVectorize(vs, out, ::cuda::std::get<Is>(in)...);
447
455
  }
448
456
 
449
457
  int ipt = 0;
@@ -14,19 +14,17 @@
14
14
  #endif // no system header
15
15
 
16
16
  #include <cub/agent/agent_for.cuh>
17
- #include <cub/detail/fast_modulo_division.cuh> // fast_div_mod
18
17
  #include <cub/detail/mdspan_utils.cuh> // is_sub_size_static
19
18
  #include <cub/detail/type_traits.cuh> // implicit_prom_t
20
19
 
21
- #include <cuda/std/__fwd/span.h>
22
20
  #include <cuda/std/__type_traits/enable_if.h>
23
21
  #include <cuda/std/__type_traits/integral_constant.h>
24
22
  #include <cuda/std/__type_traits/is_convertible.h>
25
23
  #include <cuda/std/__type_traits/is_reference.h>
26
24
  #include <cuda/std/__type_traits/is_trivially_constructible.h>
27
- #include <cuda/std/__type_traits/is_trivially_copy_constructible.h>
25
+ #include <cuda/std/__type_traits/is_trivially_copy_assignable.h>
28
26
  #include <cuda/std/__type_traits/is_trivially_destructible.h>
29
- #include <cuda/std/__type_traits/is_trivially_move_constructible.h>
27
+ #include <cuda/std/__type_traits/is_trivially_move_assignable.h>
30
28
  #include <cuda/std/__type_traits/make_unsigned.h>
31
29
  #include <cuda/std/__utility/integer_sequence.h>
32
30
  #include <cuda/std/cstddef> // size_t
@@ -140,16 +138,21 @@ __launch_bounds__(ChainedPolicyT::ActivePolicy::for_policy_t::block_threads) //
140
138
  * ForEachInExtents
141
139
  **********************************************************************************************************************/
142
140
 
143
- // Returns the extent at the given rank. If the extents is static, returns it, otherwise returns the precomputed value
144
- template <int Rank, typename ExtentType, typename FastDivModType>
145
- _CCCL_DEVICE _CCCL_FORCEINLINE auto extent_at(ExtentType extents, FastDivModType dynamic_extent)
141
+ // Retrieves the extent (dimension size) at a specific position in a multi-dimensional array
142
+ //
143
+ // This function efficiently returns the extent at the given position, optimizing for static extents by returning
144
+ // compile-time constants when possible. For dynamic extents, it returns the precomputed value to avoid runtime
145
+ // computation overhead.
146
+ template <int Position, typename ExtentType, typename FastDivModType>
147
+ _CCCL_DEVICE_API auto extent_at(ExtentType extents, FastDivModType dynamic_extent)
146
148
  {
147
- if constexpr (ExtentType::static_extent(Rank) != ::cuda::std::dynamic_extent)
149
+ if constexpr (ExtentType::static_extent(Position) != ::cuda::std::dynamic_extent)
148
150
  {
149
151
  using extent_index_type = typename ExtentType::index_type;
150
152
  using index_type = implicit_prom_t<extent_index_type>;
151
153
  using unsigned_index_type = ::cuda::std::make_unsigned_t<index_type>;
152
- return static_cast<unsigned_index_type>(extents.static_extent(Rank));
154
+ constexpr auto extent = extents.static_extent(Position);
155
+ return static_cast<unsigned_index_type>(extent);
153
156
  }
154
157
  else
155
158
  {
@@ -157,17 +160,22 @@ _CCCL_DEVICE _CCCL_FORCEINLINE auto extent_at(ExtentType extents, FastDivModType
157
160
  }
158
161
  }
159
162
 
160
- // Returns the product of all extents from position Rank. If the result is static, returns it, otherwise returns the
161
- // precomputed value
162
- template <int Rank, typename ExtentType, typename FastDivModType>
163
- _CCCL_DEVICE _CCCL_FORCEINLINE auto get_extents_sub_size(ExtentType extents, FastDivModType extent_sub_size)
163
+ // Computes the product of extents in a specified range for multi-dimensional indexing.
164
+ // This function calculates the product of all extent dimensions from Start (inclusive) to End (exclusive).
165
+ //
166
+ // Performance characteristics:
167
+ // - Static extents in range: Product computed at compile-time, zero runtime cost
168
+ // - Dynamic extents present: Returns precomputed value, avoiding runtime multiplication
169
+ template <int Start, int End, typename ExtentType, typename FastDivModType>
170
+ _CCCL_DEVICE_API auto get_extents_sub_size(ExtentType extents, FastDivModType extent_sub_size)
164
171
  {
165
- if constexpr (cub::detail::is_sub_size_static<Rank + 1, ExtentType>())
172
+ if constexpr (cub::detail::are_extents_in_range_static<ExtentType>(Start, End))
166
173
  {
167
174
  using extent_index_type = typename ExtentType::index_type;
168
175
  using index_type = implicit_prom_t<extent_index_type>;
169
176
  using unsigned_index_type = ::cuda::std::make_unsigned_t<index_type>;
170
- return static_cast<unsigned_index_type>(cub::detail::sub_size<Rank + 1>(extents));
177
+ auto sub_size = cub::detail::size_range(extents, Start, End);
178
+ return static_cast<unsigned_index_type>(sub_size);
171
179
  }
172
180
  else
173
181
  {
@@ -175,49 +183,76 @@ _CCCL_DEVICE _CCCL_FORCEINLINE auto get_extents_sub_size(ExtentType extents, Fas
175
183
  }
176
184
  }
177
185
 
178
- template <int Rank, typename IndexType, typename ExtentType, typename FastDivModType>
179
- _CCCL_DEVICE _CCCL_FORCEINLINE auto
186
+ // Converts a linear index to a multi-dimensional coordinate at a specific position.
187
+ //
188
+ // This function performs the mathematical conversion from a linear (flat) index to the coordinate value at a specific
189
+ // position in a multi-dimensional array. It supports both row-major (layout_right) and column-major (layout_left)
190
+ // memory layouts, which affects the indexing calculation order.
191
+ //
192
+ // The mathematical formulation depends on the layout:
193
+ // - Right layout (row-major): index_i = (index / product(extent[j] for j in [i+1, rank-1])) % extent[i]
194
+ // - Left layout (column-major): index_i = (index / product(extent[j] for j in [0, i-1])) % extent[i]
195
+ //
196
+ // This function leverages precomputed fast division and modulo operations to minimize runtime arithmetic overhead.
197
+ template <bool IsLayoutRight, int Position, typename IndexType, typename ExtentType, typename FastDivModType>
198
+ _CCCL_DEVICE_API auto
180
199
  coordinate_at(IndexType index, ExtentType extents, FastDivModType extent_sub_size, FastDivModType dynamic_extent)
181
200
  {
182
201
  using cub::detail::for_each::extent_at;
183
202
  using cub::detail::for_each::get_extents_sub_size;
184
203
  using extent_index_type = typename ExtentType::index_type;
185
- return static_cast<extent_index_type>(
186
- (index / get_extents_sub_size<Rank>(extents, extent_sub_size)) % extent_at<Rank>(extents, dynamic_extent));
204
+ constexpr auto start = IsLayoutRight ? Position + 1 : 0;
205
+ constexpr auto end = IsLayoutRight ? ExtentType::rank() : Position;
206
+ return static_cast<extent_index_type>((index / get_extents_sub_size<start, end>(extents, extent_sub_size))
207
+ % extent_at<Position>(extents, dynamic_extent));
187
208
  }
188
209
 
189
- template <typename OpT, typename ExtentsT, typename FastDivModArrayT>
210
+ // Function object wrapper for applying operations with multi-dimensional coordinate conversion.
211
+ //
212
+ // The wrapped operation will be called with signature: `op(linear_index, coord_0, coord_1, ..., coord_{rank-1})`
213
+ // where the number of coordinate parameters matches the rank of the extents object.
214
+ //
215
+ // This wrapper is used internally by DeviceFor::ForEachInLayout/ForEachInExtents
216
+ template <typename OpT, typename ExtentsType, bool IsLayoutRight, typename FastDivModArrayT>
190
217
  struct op_wrapper_extents_t
191
218
  {
192
- OpT op;
193
- ExtentsT extents;
194
- FastDivModArrayT sub_sizes_div_array;
195
- FastDivModArrayT extents_mod_array;
196
-
197
- template <typename OffsetT, size_t... Ranks>
198
- _CCCL_DEVICE _CCCL_FORCEINLINE void impl(OffsetT i, ::cuda::std::index_sequence<Ranks...>)
219
+ OpT op; ///< The user-provided operation to be called with coordinates
220
+ ExtentsType extents; ///< The multi-dimensional extents defining array dimensions
221
+ FastDivModArrayT sub_sizes_div_array; ///< Precomputed fast division values for extent sub-products
222
+ FastDivModArrayT extents_mod_array; ///< Precomputed fast modulo values for individual extents
223
+
224
+ // Internal implementation that converts linear index to coordinates and calls the user operation
225
+ template <typename IndexType, size_t... Positions>
226
+ _CCCL_DEVICE_API void impl(IndexType i, ::cuda::std::index_sequence<Positions...>)
199
227
  {
200
228
  using cub::detail::for_each::coordinate_at;
201
- op(i, coordinate_at<Ranks>(i, extents, sub_sizes_div_array[Ranks], extents_mod_array[Ranks])...);
229
+ op(i,
230
+ coordinate_at<IsLayoutRight, Positions>(
231
+ i, extents, sub_sizes_div_array[Positions], extents_mod_array[Positions])...);
202
232
  }
203
233
 
204
- template <typename OffsetT, size_t... Ranks>
205
- _CCCL_DEVICE _CCCL_FORCEINLINE void impl(OffsetT i, ::cuda::std::index_sequence<Ranks...>) const
234
+ // Internal implementation that converts linear index to coordinates and calls the user operation
235
+ template <typename IndexType, size_t... Positions>
236
+ _CCCL_DEVICE_API void impl(IndexType i, ::cuda::std::index_sequence<Positions...>) const
206
237
  {
207
238
  using cub::detail::for_each::coordinate_at;
208
- op(i, coordinate_at<Ranks>(i, extents, sub_sizes_div_array[Ranks], extents_mod_array[Ranks])...);
239
+ op(i,
240
+ coordinate_at<IsLayoutRight, Positions>(
241
+ i, extents, sub_sizes_div_array[Positions], extents_mod_array[Positions])...);
209
242
  }
210
243
 
211
- template <typename OffsetT>
212
- _CCCL_DEVICE _CCCL_FORCEINLINE void operator()(OffsetT i)
244
+ // Function call operator that processes a linear index by converting it to multi-dimensional coordinates
245
+ template <typename IndexType>
246
+ _CCCL_DEVICE_API void operator()(IndexType i)
213
247
  {
214
- impl(i, ::cuda::std::make_index_sequence<ExtentsT::rank()>{});
248
+ impl(i, ::cuda::std::make_index_sequence<ExtentsType::rank()>{});
215
249
  }
216
250
 
217
- template <typename OffsetT>
218
- _CCCL_DEVICE _CCCL_FORCEINLINE void operator()(OffsetT i) const
251
+ // Function call operator that processes a linear index by converting it to multi-dimensional coordinates
252
+ template <typename IndexType>
253
+ _CCCL_DEVICE_API void operator()(IndexType i) const
219
254
  {
220
- impl(i, ::cuda::std::make_index_sequence<ExtentsT::rank()>{});
255
+ impl(i, ::cuda::std::make_index_sequence<ExtentsType::rank()>{});
221
256
  }
222
257
  };
223
258
 
@@ -47,9 +47,7 @@
47
47
 
48
48
  CUB_NAMESPACE_BEGIN
49
49
 
50
- namespace detail
51
- {
52
- namespace reduce
50
+ namespace detail::reduce
53
51
  {
54
52
 
55
53
  /**
@@ -580,7 +578,6 @@ CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(int(
580
578
  }
581
579
  }
582
580
 
583
- } // namespace reduce
584
- } // namespace detail
581
+ } // namespace detail::reduce
585
582
 
586
583
  CUB_NAMESPACE_END
@@ -42,9 +42,7 @@
42
42
 
43
43
  CUB_NAMESPACE_BEGIN
44
44
 
45
- namespace detail
46
- {
47
- namespace scan
45
+ namespace detail::scan
48
46
  {
49
47
 
50
48
  /******************************************************************************
@@ -186,7 +184,6 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS))
186
184
  AgentScanT(temp_storage, d_in, d_out, scan_op, real_init_value).ConsumeRange(num_items, tile_state, start_tile);
187
185
  }
188
186
 
189
- } // namespace scan
190
- } // namespace detail
187
+ } // namespace detail::scan
191
188
 
192
189
  CUB_NAMESPACE_END
@@ -43,9 +43,7 @@
43
43
 
44
44
  CUB_NAMESPACE_BEGIN
45
45
 
46
- namespace detail
47
- {
48
- namespace reduce
46
+ namespace detail::reduce
49
47
  {
50
48
 
51
49
  /// Normalize input iterator to segment offset
@@ -318,7 +316,6 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)
318
316
  }
319
317
  }
320
318
 
321
- } // namespace reduce
322
- } // namespace detail
319
+ } // namespace detail::reduce
323
320
 
324
321
  CUB_NAMESPACE_END
@@ -217,6 +217,7 @@ _CCCL_DEVICE void transform_kernel_vectorized(
217
217
  {
218
218
  constexpr int block_dim = VectorizedPolicy::block_threads;
219
219
  constexpr int items_per_thread = VectorizedPolicy::items_per_thread_vectorized;
220
+ constexpr int vec_size = VectorizedPolicy::vec_size;
220
221
  _CCCL_ASSERT(!can_vectorize || (items_per_thread == num_elem_per_thread_prefetch), "");
221
222
  constexpr int tile_size = block_dim * items_per_thread;
222
223
  const Offset offset = static_cast<Offset>(blockIdx.x) * tile_size;
@@ -241,23 +242,13 @@ _CCCL_DEVICE void transform_kernel_vectorized(
241
242
  out += offset;
242
243
  }
243
244
 
244
- constexpr int load_store_size = VectorizedPolicy::load_store_word_size;
245
- using load_store_t = decltype(load_store_type<load_store_size>());
246
- using output_t = it_value_t<RandomAccessIteratorOut>;
245
+ using output_t = it_value_t<RandomAccessIteratorOut>;
247
246
  using result_t = ::cuda::std::decay_t<::cuda::std::invoke_result_t<F, const it_value_t<RandomAccessIteratorsIn>&...>>;
248
- // picks output type size if there are no inputs
249
- constexpr int element_size = int{first_nonzero_value(
250
- (sizeof(it_value_t<RandomAccessIteratorsIn>)
251
- * THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorsIn>) ...,
252
- size_of<output_t>)};
253
- constexpr int load_store_count = (items_per_thread * element_size) / load_store_size;
247
+ constexpr int load_store_count = items_per_thread / vec_size;
248
+ static_assert(items_per_thread % vec_size == 0, "The items per thread must be a multiple of the vector size");
254
249
 
255
- static_assert((items_per_thread * element_size) % load_store_size == 0);
256
- static_assert(load_store_size % element_size == 0);
257
-
258
- constexpr bool can_vectorize_store =
259
- THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorOut>
260
- && THRUST_NS_QUALIFIER::is_trivially_relocatable_v<output_t> && size_of<output_t> == element_size;
250
+ constexpr bool can_vectorize_store = THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorOut>
251
+ && THRUST_NS_QUALIFIER::is_trivially_relocatable_v<output_t>;
261
252
 
262
253
  // if we can vectorize, we convert f's return type to the output type right away, so we can reinterpret later
263
254
  using THRUST_NS_QUALIFIER::cuda_cub::core::detail::uninitialized_array;
@@ -266,10 +257,15 @@ _CCCL_DEVICE void transform_kernel_vectorized(
266
257
  auto provide_array = [&](auto... inputs) {
267
258
  // load inputs
268
259
  [[maybe_unused]] auto load_tile = [](auto in, auto& input) {
260
+ using it_t = decltype(in);
261
+ using value_t = it_value_t<it_t>;
269
262
  if constexpr (THRUST_NS_QUALIFIER::is_contiguous_iterator_v<decltype(in)>)
270
263
  {
271
- auto in_vec = reinterpret_cast<const load_store_t*>(in) + threadIdx.x;
272
- auto input_vec = reinterpret_cast<load_store_t*>(input.data());
264
+ // TODO(bgruber): we could add a max_load_store_size to the policy to avoid huge load types and huge alignment
265
+ // requirements
266
+ using load_t = decltype(load_store_type<sizeof(value_t) * vec_size>());
267
+ auto in_vec = reinterpret_cast<const load_t*>(in) + threadIdx.x;
268
+ auto input_vec = reinterpret_cast<load_t*>(input.data());
273
269
  _CCCL_PRAGMA_UNROLL_FULL()
274
270
  for (int i = 0; i < load_store_count; ++i)
275
271
  {
@@ -278,15 +274,14 @@ _CCCL_DEVICE void transform_kernel_vectorized(
278
274
  }
279
275
  else
280
276
  {
281
- constexpr int elems = load_store_size / element_size;
282
- in += threadIdx.x * elems;
277
+ in += threadIdx.x * vec_size;
283
278
  _CCCL_PRAGMA_UNROLL_FULL()
284
279
  for (int i = 0; i < load_store_count; ++i)
285
280
  {
286
281
  _CCCL_PRAGMA_UNROLL_FULL()
287
- for (int j = 0; j < elems; ++j)
282
+ for (int j = 0; j < vec_size; ++j)
288
283
  {
289
- input[i * elems + j] = in[i * elems * VectorizedPolicy::block_threads + j];
284
+ input[i * vec_size + j] = in[i * vec_size * VectorizedPolicy::block_threads + j];
290
285
  }
291
286
  }
292
287
  }
@@ -310,8 +305,9 @@ _CCCL_DEVICE void transform_kernel_vectorized(
310
305
  if constexpr (can_vectorize_store)
311
306
  {
312
307
  // vector path
313
- auto output_vec = reinterpret_cast<const load_store_t*>(output.data());
314
- auto out_vec = reinterpret_cast<load_store_t*>(out) + threadIdx.x;
308
+ using store_t = decltype(load_store_type<sizeof(output_t) * vec_size>());
309
+ auto output_vec = reinterpret_cast<const store_t*>(output.data());
310
+ auto out_vec = reinterpret_cast<store_t*>(out) + threadIdx.x;
315
311
  _CCCL_PRAGMA_UNROLL_FULL()
316
312
  for (int i = 0; i < load_store_count; ++i)
317
313
  {
@@ -321,15 +317,14 @@ _CCCL_DEVICE void transform_kernel_vectorized(
321
317
  else
322
318
  {
323
319
  // serial path
324
- constexpr int elems = load_store_size / element_size;
325
- out += threadIdx.x * elems;
320
+ out += threadIdx.x * vec_size;
326
321
  _CCCL_PRAGMA_UNROLL_FULL()
327
322
  for (int i = 0; i < load_store_count; ++i)
328
323
  {
329
324
  _CCCL_PRAGMA_UNROLL_FULL()
330
- for (int j = 0; j < elems; ++j)
325
+ for (int j = 0; j < vec_size; ++j)
331
326
  {
332
- out[i * elems * VectorizedPolicy::block_threads + j] = output[i * elems + j];
327
+ out[i * vec_size * VectorizedPolicy::block_threads + j] = output[i * vec_size + j];
333
328
  }
334
329
  }
335
330
  }
@@ -43,9 +43,7 @@
43
43
 
44
44
  CUB_NAMESPACE_BEGIN
45
45
 
46
- namespace detail
47
- {
48
- namespace adjacent_difference
46
+ namespace detail::adjacent_difference
49
47
  {
50
48
  template <typename InputIteratorT, bool MayAlias>
51
49
  struct policy_hub
@@ -64,7 +62,6 @@ struct policy_hub
64
62
 
65
63
  using MaxPolicy = Policy500;
66
64
  };
67
- } // namespace adjacent_difference
68
- } // namespace detail
65
+ } // namespace detail::adjacent_difference
69
66
 
70
67
  CUB_NAMESPACE_END
@@ -43,9 +43,7 @@
43
43
 
44
44
  CUB_NAMESPACE_BEGIN
45
45
 
46
- namespace detail
47
- {
48
- namespace batch_memcpy
46
+ namespace detail::batch_memcpy
49
47
  {
50
48
  /**
51
49
  * Parameterizable tuning policy type for AgentBatchMemcpy
@@ -115,7 +113,6 @@ struct policy_hub
115
113
 
116
114
  using MaxPolicy = Policy700;
117
115
  };
118
- } // namespace batch_memcpy
119
- } // namespace detail
116
+ } // namespace detail::batch_memcpy
120
117
 
121
118
  CUB_NAMESPACE_END
@@ -42,9 +42,7 @@
42
42
 
43
43
  CUB_NAMESPACE_BEGIN
44
44
 
45
- namespace detail
46
- {
47
- namespace for_each
45
+ namespace detail::for_each
48
46
  {
49
47
 
50
48
  struct policy_hub_t
@@ -57,7 +55,6 @@ struct policy_hub_t
57
55
  using MaxPolicy = policy_500_t;
58
56
  };
59
57
 
60
- } // namespace for_each
61
- } // namespace detail
58
+ } // namespace detail::for_each
62
59
 
63
60
  CUB_NAMESPACE_END
@@ -46,9 +46,7 @@
46
46
 
47
47
  CUB_NAMESPACE_BEGIN
48
48
 
49
- namespace detail
50
- {
51
- namespace histogram
49
+ namespace detail::histogram
52
50
  {
53
51
  enum class primitive_sample
54
52
  {
@@ -272,7 +270,6 @@ struct policy_hub
272
270
 
273
271
  using MaxPolicy = Policy1000;
274
272
  };
275
- } // namespace histogram
276
- } // namespace detail
273
+ } // namespace detail::histogram
277
274
 
278
275
  CUB_NAMESPACE_END
@@ -42,9 +42,7 @@
42
42
 
43
43
  CUB_NAMESPACE_BEGIN
44
44
 
45
- namespace detail
46
- {
47
- namespace merge
45
+ namespace detail::merge
48
46
  {
49
47
  template <typename KeyT, typename ValueT>
50
48
  struct policy_hub
@@ -73,7 +71,6 @@ struct policy_hub
73
71
 
74
72
  using max_policy = policy600;
75
73
  };
76
- } // namespace merge
77
- } // namespace detail
74
+ } // namespace detail::merge
78
75
 
79
76
  CUB_NAMESPACE_END
@@ -62,6 +62,14 @@ struct MergeSortPolicyWrapper<StaticPolicyT, ::cuda::std::void_t<decltype(Static
62
62
  {}
63
63
 
64
64
  CUB_DEFINE_SUB_POLICY_GETTER(MergeSort);
65
+
66
+ #if defined(CUB_ENABLE_POLICY_PTX_JSON)
67
+ _CCCL_DEVICE static constexpr auto EncodedPolicy()
68
+ {
69
+ using namespace ptx_json;
70
+ return object<key<"MergeSortPolicy">() = MergeSort().EncodedPolicy()>();
71
+ }
72
+ #endif
65
73
  };
66
74
 
67
75
  template <typename PolicyT>
@@ -46,9 +46,7 @@
46
46
 
47
47
  CUB_NAMESPACE_BEGIN
48
48
 
49
- namespace detail
50
- {
51
- namespace radix
49
+ namespace detail::radix
52
50
  {
53
51
  // sm90 default
54
52
  template <size_t KeySize, size_t ValueSize, size_t OffsetSize>
@@ -1062,7 +1060,6 @@ struct policy_hub
1062
1060
  using MaxPolicy = Policy1000;
1063
1061
  };
1064
1062
 
1065
- } // namespace radix
1066
- } // namespace detail
1063
+ } // namespace detail::radix
1067
1064
 
1068
1065
  CUB_NAMESPACE_END
@@ -50,9 +50,7 @@
50
50
 
51
51
  CUB_NAMESPACE_BEGIN
52
52
 
53
- namespace detail
54
- {
55
- namespace reduce_by_key
53
+ namespace detail::reduce_by_key
56
54
  {
57
55
  enum class primitive_key
58
56
  {
@@ -939,7 +937,6 @@ struct policy_hub
939
937
  };
940
938
  using MaxPolicy = Policy1000;
941
939
  };
942
- } // namespace reduce_by_key
943
- } // namespace detail
940
+ } // namespace detail::reduce_by_key
944
941
 
945
942
  CUB_NAMESPACE_END
@@ -52,9 +52,7 @@
52
52
 
53
53
  CUB_NAMESPACE_BEGIN
54
54
 
55
- namespace detail
56
- {
57
- namespace rle
55
+ namespace detail::rle
58
56
  {
59
57
  enum class primitive_key
60
58
  {
@@ -670,7 +668,6 @@ struct policy_hub
670
668
  using MaxPolicy = Policy1000;
671
669
  };
672
670
  } // namespace non_trivial_runs
673
- } // namespace rle
674
- } // namespace detail
671
+ } // namespace detail::rle
675
672
 
676
673
  CUB_NAMESPACE_END
@@ -53,9 +53,7 @@
53
53
 
54
54
  CUB_NAMESPACE_BEGIN
55
55
 
56
- namespace detail
57
- {
58
- namespace scan
56
+ namespace detail::scan
59
57
  {
60
58
  enum class keep_rejects
61
59
  {
@@ -615,7 +613,6 @@ struct policy_hub
615
613
 
616
614
  using MaxPolicy = Policy1000;
617
615
  };
618
- } // namespace scan
619
- } // namespace detail
616
+ } // namespace detail::scan
620
617
 
621
618
  CUB_NAMESPACE_END
@@ -49,9 +49,7 @@
49
49
 
50
50
  CUB_NAMESPACE_BEGIN
51
51
 
52
- namespace detail
53
- {
54
- namespace scan_by_key
52
+ namespace detail::scan_by_key
55
53
  {
56
54
  enum class primitive_accum
57
55
  {
@@ -1007,7 +1005,6 @@ struct policy_hub
1007
1005
 
1008
1006
  using MaxPolicy = Policy1000;
1009
1007
  };
1010
- } // namespace scan_by_key
1011
- } // namespace detail
1008
+ } // namespace detail::scan_by_key
1012
1009
 
1013
1010
  CUB_NAMESPACE_END
@@ -43,9 +43,7 @@
43
43
 
44
44
  CUB_NAMESPACE_BEGIN
45
45
 
46
- namespace detail
47
- {
48
- namespace segmented_sort
46
+ namespace detail::segmented_sort
49
47
  {
50
48
 
51
49
  template <typename PolicyT, typename = void>
@@ -395,7 +393,6 @@ struct policy_hub
395
393
 
396
394
  using MaxPolicy = Policy860;
397
395
  };
398
- } // namespace segmented_sort
399
- } // namespace detail
396
+ } // namespace detail::segmented_sort
400
397
 
401
398
  CUB_NAMESPACE_END