cuda_cccl-0.3.0-cp313-cp313-manylinux_2_24_aarch64.whl → cuda_cccl-0.3.2-cp313-cp313-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of cuda-cccl has been flagged as potentially problematic.

Files changed (294)
  1. cuda/cccl/cooperative/__init__.py +7 -1
  2. cuda/cccl/cooperative/experimental/__init__.py +21 -5
  3. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
  4. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
  5. cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
  6. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
  7. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
  8. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
  9. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
  10. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
  11. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
  12. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
  13. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
  14. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
  15. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
  16. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
  17. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
  18. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
  19. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
  20. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
  21. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
  22. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
  23. cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
  24. cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
  25. cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
  26. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +4 -2
  27. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
  28. cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
  29. cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
  30. cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
  31. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
  32. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
  33. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
  34. cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
  35. cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
  36. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +170 -260
  37. cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
  38. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +6 -7
  39. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
  40. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +12 -29
  41. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +2 -7
  42. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
  43. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
  44. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
  45. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
  46. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
  47. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
  48. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
  49. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
  50. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
  51. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
  52. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
  53. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
  54. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
  55. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
  56. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
  57. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
  58. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
  59. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
  60. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
  61. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
  62. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
  63. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
  64. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
  65. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
  66. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
  67. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
  68. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
  69. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
  70. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
  71. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
  72. cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
  73. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
  74. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
  75. cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
  76. cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
  77. cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
  78. cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
  79. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
  80. cuda/cccl/headers/include/cuda/__cccl_config +1 -0
  81. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
  82. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
  83. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
  84. cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
  85. cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
  86. cuda/cccl/headers/include/cuda/__device/arch_traits.h +247 -323
  87. cuda/cccl/headers/include/cuda/__device/attributes.h +174 -123
  88. cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
  89. cuda/cccl/headers/include/cuda/__device/device_ref.h +27 -49
  90. cuda/cccl/headers/include/cuda/__device/physical_device.h +100 -96
  91. cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
  92. cuda/cccl/headers/include/cuda/__event/event.h +27 -26
  93. cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
  94. cuda/cccl/headers/include/cuda/__event/timed_event.h +10 -7
  95. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  96. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
  97. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
  98. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
  99. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
  100. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
  101. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
  102. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
  103. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
  104. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +21 -137
  105. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
  106. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
  107. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
  108. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
  109. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
  110. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
  111. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
  112. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
  113. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
  114. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
  115. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
  116. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +9 -7
  117. cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
  118. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -16
  119. cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
  120. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  121. cuda/cccl/headers/include/cuda/algorithm +1 -1
  122. cuda/cccl/headers/include/cuda/cmath +1 -0
  123. cuda/cccl/headers/include/cuda/devices +13 -0
  124. cuda/cccl/headers/include/cuda/iterator +1 -0
  125. cuda/cccl/headers/include/cuda/memory +1 -0
  126. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
  127. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
  128. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
  129. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
  130. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
  131. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
  132. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
  133. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
  134. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
  135. cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
  136. cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
  137. cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
  138. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  139. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
  140. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
  141. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
  142. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  143. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  144. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
  145. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
  146. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
  147. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
  148. cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
  149. cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
  150. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
  151. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
  152. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
  153. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
  154. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
  155. cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
  156. cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
  157. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
  158. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
  159. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
  160. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
  161. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
  162. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
  163. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
  164. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
  165. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
  166. cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
  167. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
  168. cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
  169. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
  170. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
  171. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
  172. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
  173. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
  174. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
  175. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
  176. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
  177. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
  178. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
  179. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
  180. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
  181. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
  182. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
  183. cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
  184. cuda/cccl/headers/include/cuda/std/bitset +1 -1
  185. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
  186. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
  187. cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
  188. cuda/cccl/headers/include/cuda/std/numbers +5 -0
  189. cuda/cccl/headers/include/cuda/std/string_view +155 -13
  190. cuda/cccl/headers/include/cuda/std/version +1 -4
  191. cuda/cccl/headers/include/cuda/stream_ref +5 -0
  192. cuda/cccl/headers/include/cuda/utility +1 -0
  193. cuda/cccl/headers/include/nv/target +7 -2
  194. cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
  195. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
  196. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
  197. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
  198. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
  199. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
  200. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
  201. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
  202. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
  203. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
  204. cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
  205. cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
  206. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
  207. cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
  208. cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
  209. cuda/cccl/headers/include/thrust/device_delete.h +18 -3
  210. cuda/cccl/headers/include/thrust/device_free.h +16 -3
  211. cuda/cccl/headers/include/thrust/device_new.h +29 -8
  212. cuda/cccl/headers/include/thrust/host_vector.h +1 -1
  213. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
  214. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
  215. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
  216. cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
  217. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
  218. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
  219. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
  220. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
  221. cuda/cccl/parallel/experimental/__init__.py +21 -74
  222. cuda/compute/__init__.py +79 -0
  223. cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +43 -1
  224. cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +157 -8
  225. cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
  226. cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
  227. cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
  228. cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
  229. cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +112 -40
  230. cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
  231. cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
  232. cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +36 -15
  233. cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
  234. cuda/compute/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  235. cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
  236. cuda/compute/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  237. cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
  238. cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +2 -0
  239. cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +36 -8
  240. cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +206 -1
  241. cuda/{cccl/parallel/experimental → compute}/numba_utils.py +2 -2
  242. cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
  243. cuda/{cccl/parallel/experimental → compute}/typing.py +2 -0
  244. cuda/coop/__init__.py +8 -0
  245. cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
  246. cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
  247. cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
  248. cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
  249. cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
  250. cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
  251. cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
  252. cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
  253. cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
  254. cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
  255. cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
  256. cuda/coop/warp/__init__.py +9 -0
  257. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
  258. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
  259. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
  260. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
  261. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +275 -276
  262. cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
  263. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
  264. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
  265. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
  266. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
  267. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
  268. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
  269. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
  270. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
  271. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
  272. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
  273. cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
  274. cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
  275. cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
  276. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
  277. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
  278. cuda/cccl/parallel/experimental/.gitignore +0 -4
  279. cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  280. cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  281. /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
  282. /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
  283. /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
  284. /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
  285. /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
  286. /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
  287. /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
  288. /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
  289. /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
  290. /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
  291. /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
  292. /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
  293. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
  294. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
@@ -174,10 +174,12 @@ CUB_NAMESPACE_BEGIN
 //!
 //! .. code-block:: python
 //!
-//! import cuda.cccl.cooperative.experimental as cudax
+//! from cuda import coop
+//! from pynvjitlink import patch
+//! patch.patch_numba_linker(lto=True)
 //!
 //! # Specialize radix sort for a 1D block of 128 threads owning 4 integer items each
-//! block_radix_sort = cudax.block.radix_sort_keys(numba.int32, 128, 4)
+//! block_radix_sort = coop.block.radix_sort_keys(numba.int32, 128, 4)
 //! temp_storage_bytes = block_radix_sort.temp_storage_bytes
 //!
 //! @cuda.jit(link=block_radix_sort.files)
@@ -425,6 +425,7 @@ public:
 //!
 //! // Compute the block-wide max for thread0
 //! int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cuda::maximum<>{});
+//! }
 //!
 //! @endrst
 //!
@@ -190,6 +190,7 @@ enum BlockScanAlgorithm
 //!
 //! // Collectively compute the block-wide exclusive prefix sum
 //! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
+//! }
 //!
 //! Suppose the set of input ``thread_data`` across the block of threads is
 //! ``{[1,1,1,1], [1,1,1,1], ..., [1,1,1,1]}``.
@@ -333,6 +334,7 @@ public:
 //!
 //! // Collectively compute the block-wide exclusive prefix sum
 //! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
+//! }
 //!
 //! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``.
 //! The corresponding output ``thread_data`` in those threads will be ``0, 1, ..., 127``.
@@ -386,6 +388,7 @@ public:
 //! // Collectively compute the block-wide exclusive prefix sum
 //! int block_aggregate;
 //! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate);
+//! }
 //!
 //! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``.
 //! The corresponding output ``thread_data`` in those threads will be ``0, 1, ..., 127``.
@@ -479,6 +482,7 @@ public:
 //! // Store scanned items to output segment
 //! d_data[block_offset + threadIdx.x] = thread_data;
 //! }
+//! }
 //!
 //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``.
 //! The corresponding output for the first segment will be ``0, 1, ..., 127``.
@@ -545,6 +549,7 @@ public:
 //!
 //! // Collectively compute the block-wide exclusive prefix sum
 //! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
+//! }
 //!
 //! Suppose the set of input ``thread_data`` across the block of threads is
 //! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``.
@@ -606,6 +611,7 @@ public:
 //! // Collectively compute the block-wide exclusive prefix sum
 //! int block_aggregate;
 //! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate);
+//! }
 //!
 //! Suppose the set of input ``thread_data`` across the block of threads is
 //! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``.
@@ -720,6 +726,7 @@ public:
 //! BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
 //! __syncthreads();
 //! }
+//! }
 //!
 //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``.
 //! The corresponding output for the first segment will be ``0, 1, 2, 3, ..., 510, 511``.
@@ -788,6 +795,7 @@ public:
 //!
 //! // Collectively compute the block-wide exclusive prefix max scan
 //! BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cuda::maximum<>{});
+//! }
 //!
 //! Suppose the set of input ``thread_data`` across the block of threads is ``0, -1, 2, -3, ..., 126, -127``.
 //! The corresponding output ``thread_data`` in those threads will be ``INT_MIN, 0, 0, 2, ..., 124, 126``.
@@ -849,8 +857,9 @@ public:
 //!
 //! // Collectively compute the block-wide exclusive prefix max scan
 //! int block_aggregate;
-//! BlockScan(temp_storage).ExclusiveScan(
-//!   thread_data, thread_data, INT_MIN, cuda::maximum<>{}, block_aggregate);
+//! BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data,
+//!   INT_MIN, cuda::maximum<>{}, block_aggregate);
+//! }
 //!
 //! Suppose the set of input ``thread_data`` across the block of threads is ``0, -1, 2, -3, ..., 126, -127``.
 //! The corresponding output ``thread_data`` in those threads will be ``INT_MIN, 0, 0, 2, ..., 124, 126``.
@@ -960,6 +969,7 @@ public:
 //! // Store scanned items to output segment
 //! d_data[block_offset + threadIdx.x] = thread_data;
 //! }
+//! }
 //!
 //! Suppose the input ``d_data`` is ``0, -1, 2, -3, 4, -5, ...``.
 //! The corresponding output for the first segment will be ``INT_MIN, 0, 0, 2, ..., 124, 126``.
@@ -616,6 +616,7 @@ enum BlockStoreAlgorithm
 //!
 //! // Store items to linear memory
 //! BlockStore(temp_storage).Store(d_data, thread_data);
+//! }
 //!
 //! Suppose the set of ``thread_data`` across the block of threads is
 //! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``.
@@ -1156,8 +1157,8 @@ public:
 //! ...
 //!
 //! // Store items to linear memory
-//! int thread_data[4];
 //! BlockStore(temp_storage).Store(d_data, thread_data);
+//! }
 //!
 //! Suppose the set of ``thread_data`` across the block of threads is
 //! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``.
@@ -1208,8 +1209,8 @@ public:
 //! ...
 //!
 //! // Store items to linear memory
-//! int thread_data[4];
 //! BlockStore(temp_storage).Store(d_data, thread_data, valid_items);
+//! }
 //!
 //! Suppose the set of ``thread_data`` across the block of threads is
 //! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }`` and ``valid_items`` is ``5``.
@@ -14,6 +14,7 @@
 #endif // no system header

 #include <cuda/__stream/stream_ref.h>
+#include <cuda/std/__cuda/api_wrapper.h>
 #include <cuda/std/cstdint>

 CUB_NAMESPACE_BEGIN
@@ -15,71 +15,76 @@

 #include <cub/detail/fast_modulo_division.cuh> // fast_div_mod

+#include <cuda/std/__mdspan/extents.h>
 #include <cuda/std/__type_traits/make_unsigned.h>
 #include <cuda/std/__utility/integer_sequence.h>
 #include <cuda/std/array>
 #include <cuda/std/cstddef>
-#include <cuda/std/mdspan>

 CUB_NAMESPACE_BEGIN
-
 namespace detail
 {

+_CCCL_DIAG_PUSH
+_CCCL_DIAG_SUPPRESS_MSVC(4702) // unreachable code (even if there are no branches!)
+
 // Compute the submdspan size of a given rank
-template <size_t Rank, typename IndexType, size_t Extent0, size_t... Extents>
-[[nodiscard]] _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr ::cuda::std::make_unsigned_t<IndexType>
-sub_size(const ::cuda::std::extents<IndexType, Extent0, Extents...>& ext)
+template <typename IndexType, size_t... Extents>
+[[nodiscard]] _CCCL_API constexpr ::cuda::std::make_unsigned_t<IndexType>
+size_range(const ::cuda::std::extents<IndexType, Extents...>& ext, int start, int end)
 {
+  _CCCL_ASSERT(start >= 0 && end <= static_cast<int>(ext.rank()), "invalid start or end");
   ::cuda::std::make_unsigned_t<IndexType> s = 1;
-  for (IndexType i = Rank; i < IndexType{1 + sizeof...(Extents)}; i++) // <- pointless comparison with zero-rank extent
+  for (auto i = start; i < end; i++)
   {
     s *= ext.extent(i);
   }
   return s;
 }

-// avoid pointless comparison of unsigned integer with zero (nvcc 11.x doesn't support nv_diag warning suppression)
-template <size_t Rank, typename IndexType>
-[[nodiscard]] _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr ::cuda::std::make_unsigned_t<IndexType>
-sub_size(const ::cuda::std::extents<IndexType>&)
+_CCCL_DIAG_POP // MSVC(4702)
+
+template <typename IndexType, size_t... Extents>
+[[nodiscard]] _CCCL_API constexpr ::cuda::std::make_unsigned_t<IndexType>
+size(const ::cuda::std::extents<IndexType, Extents...>& ext)
 {
-  return ::cuda::std::make_unsigned_t<IndexType>{1};
+  return cub::detail::size_range(ext, 0, static_cast<int>(ext.rank()));
 }

-// TODO: move to cuda::std
-template <typename IndexType, size_t... Extents>
-[[nodiscard]] _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr ::cuda::std::make_unsigned_t<IndexType>
-size(const ::cuda::std::extents<IndexType, Extents...>& ext)
+template <bool IsLayoutRight, int Position, typename IndexType, size_t... E>
+[[nodiscard]] _CCCL_API auto sub_size_fast_div_mod_impl(const ::cuda::std::extents<IndexType, E...>& ext)
 {
-  return cub::detail::sub_size<0>(ext);
+  using fast_mod_div_t = fast_div_mod<IndexType>;
+  constexpr auto start = IsLayoutRight ? Position + 1 : 0;
+  constexpr auto end   = IsLayoutRight ? sizeof...(E) : Position;
+  return fast_mod_div_t(cub::detail::size_range(ext, start, end));
 }

 // precompute modulo/division for each submdspan size (by rank)
-template <typename IndexType, size_t... E, size_t... Ranks>
-[[nodiscard]] _CCCL_HOST_DEVICE _CCCL_FORCEINLINE auto
-sub_sizes_fast_div_mod(const ::cuda::std::extents<IndexType, E...>& ext, ::cuda::std::index_sequence<Ranks...> = {})
+template <bool IsLayoutRight, typename IndexType, size_t... E, size_t... Positions>
+[[nodiscard]] _CCCL_API auto
+sub_sizes_fast_div_mod(const ::cuda::std::extents<IndexType, E...>& ext, ::cuda::std::index_sequence<Positions...> = {})
 {
-  // deduction guides don't work with nvcc 11.x
   using fast_mod_div_t = fast_div_mod<IndexType>;
-  return ::cuda::std::array<fast_mod_div_t, sizeof...(Ranks)>{fast_mod_div_t(sub_size<Ranks + 1>(ext))...};
+  using array_t        = ::cuda::std::array<fast_mod_div_t, sizeof...(Positions)>;
+  return array_t{cub::detail::sub_size_fast_div_mod_impl<IsLayoutRight, Positions>(ext)...};
 }

 // precompute modulo/division for each mdspan extent
-template <typename IndexType, size_t... E, size_t... Ranks>
-[[nodiscard]] _CCCL_HOST_DEVICE _CCCL_FORCEINLINE auto
-extents_fast_div_mod(const ::cuda::std::extents<IndexType, E...>& ext, ::cuda::std::index_sequence<Ranks...> = {})
+template <typename IndexType, size_t... E, size_t... Positions>
+[[nodiscard]] _CCCL_API auto
+extents_fast_div_mod(const ::cuda::std::extents<IndexType, E...>& ext, ::cuda::std::index_sequence<Positions...> = {})
 {
   using fast_mod_div_t = fast_div_mod<IndexType>;
-  return ::cuda::std::array<fast_mod_div_t, sizeof...(Ranks)>{fast_mod_div_t(ext.extent(Ranks))...};
+  using array_t        = ::cuda::std::array<fast_mod_div_t, sizeof...(Positions)>;
+  return array_t{fast_mod_div_t(ext.extent(Positions))...};
 }

 // GCC <= 9 constexpr workaround: Extent must be passed as type only, even const Extent& doesn't work
-template <int Rank, typename Extents>
-[[nodiscard]] _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr bool is_sub_size_static()
+template <typename Extents>
+[[nodiscard]] _CCCL_API constexpr bool are_extents_in_range_static(int start, int end)
 {
-  using index_type = typename Extents::index_type;
-  for (index_type i = Rank; i < Extents::rank(); i++)
+  for (auto i = start; i < end; i++)
   {
     if (Extents::static_extent(i) == ::cuda::std::dynamic_extent)
     {
@@ -106,5 +111,4 @@ template <typename MappingTypeLhs, typename MappingTypeRhs>
 }

 } // namespace detail
-
 CUB_NAMESPACE_END
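
The refactor above replaces the recursive sub_size<Rank> helpers with a single size_range(ext, start, end) that multiplies the extents in a half-open index range. A minimal standalone sketch of that contract (plain constexpr C++; an array-based stand-in is used instead of cuda::std::extents, and all names here are hypothetical):

#include <cstddef>

// Product of the extents in [start, end), mirroring cub::detail::size_range above.
template <std::size_t N>
constexpr unsigned size_range(const unsigned (&extents)[N], int start, int end)
{
  unsigned s = 1;
  for (int i = start; i < end; ++i)
  {
    s *= extents[i];
  }
  return s;
}

constexpr unsigned ext[] = {2, 3, 4};
static_assert(size_range(ext, 0, 3) == 24, "total size of a 2x3x4 extent");
static_assert(size_range(ext, 1, 3) == 12, "sub-size used to split a linear index into coordinates");

An empty range (start == end) yields 1, which is how the new code subsumes the old zero-rank sub_size overload that existed only to avoid a pointless-comparison warning.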
@@ -29,7 +29,7 @@

 #include <cub/config.cuh>

-#include <thrust/detail/algorithm_wrapper.h>
+#include <cuda/std/__cccl/algorithm_wrapper.h>

 #include <format>
 #include <string_view>
@@ -1,29 +1,5 @@
-/******************************************************************************
- * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
+// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause

 #pragma once

@@ -41,24 +17,23 @@
 #include <cub/util_namespace.cuh>

 #include <thrust/detail/raw_reference_cast.h>
-#include <thrust/distance.h>
 #include <thrust/type_traits/is_contiguous_iterator.h>
 #include <thrust/type_traits/unwrap_contiguous_iterator.h>

 #include <cuda/__cmath/ceil_div.h>
+#include <cuda/std/__concepts/concept_macros.h>
+#include <cuda/std/__fwd/mdspan.h>
 #include <cuda/std/__iterator/distance.h>
 #include <cuda/std/__mdspan/extents.h>
+#include <cuda/std/__mdspan/layout_left.h>
+#include <cuda/std/__mdspan/layout_right.h>
 #include <cuda/std/__memory/is_sufficiently_aligned.h>
 #include <cuda/std/__type_traits/is_integral.h>
-#include <cuda/std/__utility/integer_sequence.h>
 #include <cuda/std/array>

 CUB_NAMESPACE_BEGIN

-namespace detail
-{
-
-namespace for_each
+namespace detail::for_each
 {

 /**
@@ -122,8 +97,7 @@ struct op_wrapper_vectorized_t
 }
 };

-} // namespace for_each
-} // namespace detail
+} // namespace detail::for_each

 struct DeviceFor
 {
@@ -568,6 +542,10 @@ public:
 {
   _CCCL_NVTX_RANGE_SCOPE("cub::DeviceFor::Bulk");
   static_assert(::cuda::std::is_integral_v<ShapeT>, "ShapeT must be an integral type");
+  if (shape == 0)
+  {
+    return cudaSuccess;
+  }
   using offset_t = ShapeT;
   return detail::for_each::dispatch_t<offset_t, OpT>::dispatch(static_cast<offset_t>(shape), op, stream);
 }
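
The added guard makes a zero-sized Bulk call a no-op. A hedged sketch of how that is observable from the public entry point (cub::DeviceFor::Bulk is the existing API; the functor and buffer names here are hypothetical):

#include <cub/device/device_for.cuh>

struct Increment
{
  int* d_data;
  __device__ void operator()(int i) const
  {
    d_data[i] += 1;
  }
};

// Applies Increment to every index in [0, n). With this change, n == 0
// returns cudaSuccess immediately instead of dispatching a kernel launch.
cudaError_t bump_all(int* d_data, int n, cudaStream_t stream = 0)
{
  return cub::DeviceFor::Bulk(n, Increment{d_data}, stream);
}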
@@ -833,7 +811,8 @@ public:
 //! Overview
 //! +++++++++++++++++++++++++++++++++++++++++++++
 //!
-//! Iterate through a multi-dimensional extents into
+//! Iterate through a multi-dimensional extents into a single linear index and a list of indices for each extent
+//! dimension.
 //!
 //! - a single linear index that represents the current iteration
 //! - indices of each extent dimension
@@ -899,8 +878,6 @@ public:
 OpType op,
 cudaStream_t stream = {})
 {
-  // TODO: check dimensions overflows
-  // TODO: check tha arity of OpType is equal to sizeof...(ExtentsType)
   if (d_temp_storage == nullptr)
   {
     temp_storage_bytes = 1;
@@ -967,19 +944,120 @@
 template <typename IndexType, size_t... Extents, typename OpType>
 CUB_RUNTIME_FUNCTION static cudaError_t
 ForEachInExtents(const ::cuda::std::extents<IndexType, Extents...>& extents, OpType op, cudaStream_t stream = {})
+{
+  using extents_type = ::cuda::std::extents<IndexType, Extents...>;
+  return cub::DeviceFor::ForEachInLayout(::cuda::std::layout_right::mapping<extents_type>{extents}, op, stream);
+}
+
+/*********************************************************************************************************************
+ * ForEachInLayout
+ ********************************************************************************************************************/
+
+//! @rst
+//! Overview
+//! +++++++++++++++++++++++++++++++++++++++++++++
+//!
+//! Iterate through multi-dimensional extents using a specific mdspan layout, applying a function object for each
+//! element, passing
+//!
+//! - a single linear index that represents the current iteration
+//! - a list of indices containing the coordinates for each extent dimension
+//!
+//! The iteration order depends on the layout type:
+//!
+//! - ``layout_right``: Iterates in row-major order (rightmost index varies fastest)
+//! - ``layout_left``: Iterates in column-major order (leftmost index varies fastest)
+//!
+//! - The return value of ``op``, if any, is ignored.
+//!
+//! A Simple Example
+//! +++++++++++++++++++++++++++++++++++++++++++++
+//!
+//! The following code snippet demonstrates how to use ``ForEachInLayout`` to iterate through a 2D matrix in
+//! column-major order using ``layout_left``.
+//!
+//! .. literalinclude:: ../../../cub/test/catch2_test_device_for_each_in_layout_api.cu
+//!    :language: c++
+//!    :dedent:
+//!    :start-after: example-begin for-each-in-layout-op
+//!    :end-before: example-end for-each-in-layout-op
+//!
+//! .. literalinclude:: ../../../cub/test/catch2_test_device_for_each_in_layout_api.cu
+//!    :language: c++
+//!    :dedent:
+//!    :start-after: example-begin for-each-in-layout-example
+//!    :end-before: example-end for-each-in-layout-example
+//!
+//! @endrst
+//!
+//! @tparam Layout
+//!   **[inferred]** The mdspan layout type, must be either ``cuda::std::layout_left`` or ``cuda::std::layout_right``
+//!
+//! @tparam IndexType
+//!   **[inferred]** An integral type that represents the extent index space
+//!
+//! @tparam Extents
+//!   **[inferred]** The extent sizes for each rank index
+//!
+//! @tparam OpType
+//!   **[inferred]** A function object with arity equal to the number of extents + 1 for the linear index (iteration).
+//!   The first parameter is the linear index, followed by one parameter for each dimension coordinate.
+//!
+//! @param[in] layout
+//!   Layout object that determines the iteration order (layout_left for column-major, layout_right for row-major)
+//!
+//! @param[in] extents
+//!   Extents object that represents a multi-dimensional index space
+//!
+//! @param[in] op
+//!   Function object to apply to each linear index (iteration) and multi-dimensional coordinates.
+//!   Called as ``op(linear_index, coord_0, coord_1, ..., coord_n)``
+//!
+//! @param[in] stream
+//!   CUDA stream to launch kernels within. Default stream is `nullptr`
+//!
+//! @return cudaError_t
+//!   error status
+_CCCL_TEMPLATE(typename LayoutMapping, typename OpType)
+_CCCL_REQUIRES(::cuda::std::__is_any_mdspan_layout_mapping_left_or_right_v<LayoutMapping>)
+[[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t
+ForEachInLayout(const LayoutMapping& layout_mapping, OpType op, cudaStream_t stream = {})
 {
   using namespace cub::detail;
-  using extents_type = ::cuda::std::extents<IndexType, Extents...>;
+  using extents_type = typename LayoutMapping::extents_type;
   using extent_index_type = typename extents_type::index_type;
   using fast_mod_array_t = ::cuda::std::array<fast_div_mod<extent_index_type>, extents_type::rank()>;
   _CCCL_NVTX_RANGE_SCOPE("cub::DeviceFor::ForEachInExtents");
   static constexpr auto seq = ::cuda::std::make_index_sequence<extents_type::rank()>{};
-  fast_mod_array_t sub_sizes_div_array = cub::detail::sub_sizes_fast_div_mod(extents, seq);
+  constexpr bool is_layout_right = ::cuda::std::__is_any_mdspan_layout_mapping_right_v<LayoutMapping>;
+  auto extents = layout_mapping.extents();
+  fast_mod_array_t sub_sizes_div_array = cub::detail::sub_sizes_fast_div_mod<is_layout_right>(extents, seq);
   fast_mod_array_t extents_div_array = cub::detail::extents_fast_div_mod(extents, seq);
-  for_each::op_wrapper_extents_t<OpType, extents_type, fast_mod_array_t> op_wrapper{
+  for_each::op_wrapper_extents_t<OpType, extents_type, is_layout_right, fast_mod_array_t> op_wrapper{
     op, extents, sub_sizes_div_array, extents_div_array};
   return Bulk(static_cast<implicit_prom_t<extent_index_type>>(cub::detail::size(extents)), op_wrapper, stream);
 }
+
+#ifndef _CCCL_DOXYGEN_INVOKED
+
+_CCCL_TEMPLATE(typename LayoutMapping, typename OpType)
+_CCCL_REQUIRES(::cuda::std::__is_any_mdspan_layout_mapping_left_or_right_v<LayoutMapping>)
+[[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t ForEachInLayout(
+  void* d_temp_storage,
+  size_t& temp_storage_bytes,
+  const LayoutMapping& layout_mapping,
+  OpType op,
+  cudaStream_t stream = {})
+{
+  if (d_temp_storage == nullptr)
+  {
+    temp_storage_bytes = 1;
+    return cudaSuccess;
+  }
+  return ForEachInLayout(layout_mapping, op, stream);
+}
+
+#endif // !_CCCL_DOXYGEN_INVOKED
 };

 CUB_NAMESPACE_END
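
Based on the documentation added in this hunk, a hedged sketch of calling the new ForEachInLayout overload (the functor, sizes, and buffer names here are hypothetical):

#include <cub/device/device_for.cuh>
#include <cuda/std/mdspan>

struct StoreCoords
{
  int* d_out;
  // Called as op(linear_index, coord_0, coord_1); iteration order follows the layout.
  __device__ void operator()(int idx, int row, int col) const
  {
    d_out[idx] = row * 100 + col;
  }
};

cudaError_t fill_column_major(int* d_out, cudaStream_t stream = 0)
{
  using extents_t = cuda::std::dextents<int, 2>;
  // layout_left: the leftmost index (row) varies fastest, i.e. column-major traversal.
  cuda::std::layout_left::mapping<extents_t> mapping{extents_t{4, 3}};
  return cub::DeviceFor::ForEachInLayout(mapping, StoreCoords{d_out}, stream);
}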
@@ -52,15 +52,15 @@
 #include <cub/thread/thread_operators.cuh>
 #include <cub/util_type.cuh>

-#include <thrust/iterator/tabulate_output_iterator.h>
-
 #include <cuda/__execution/determinism.h>
 #include <cuda/__execution/require.h>
 #include <cuda/__execution/tune.h>
 #include <cuda/__functional/maximum.h>
 #include <cuda/__functional/minimum.h>
+#include <cuda/__iterator/tabulate_output_iterator.h>
 #include <cuda/__memory_resource/get_memory_resource.h>
 #include <cuda/__stream/get_stream.h>
+#include <cuda/__stream/stream_ref.h>
 #include <cuda/std/__execution/env.h>
 #include <cuda/std/__functional/identity.h>
 #include <cuda/std/__functional/invoke.h>
@@ -70,7 +70,6 @@
 #include <cuda/std/__type_traits/is_same.h>
 #include <cuda/std/cstdint>
 #include <cuda/std/limits>
-#include <cuda/stream_ref>

 CUB_NAMESPACE_BEGIN

@@ -1215,7 +1214,7 @@
 OutputExtremumT initial_value{::cuda::std::numeric_limits<InputValueT>::max()};

 // Tabulate output iterator that unzips the result and writes it to the user-provided output iterators
-auto out_it = THRUST_NS_QUALIFIER::make_tabulate_output_iterator(
+auto out_it = ::cuda::make_tabulate_output_iterator(
   detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_min_out, d_index_out});

 return detail::reduce::dispatch_streaming_arg_reduce_t<
@@ -1341,7 +1340,7 @@
 OutputExtremumT initial_value{::cuda::std::numeric_limits<InputValueT>::max()};

 // Tabulate output iterator that unzips the result and writes it to the user-provided output iterators
-auto out_it = THRUST_NS_QUALIFIER::make_tabulate_output_iterator(
+auto out_it = ::cuda::make_tabulate_output_iterator(
   detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_min_out, d_index_out});

 // Query the required temporary storage size
@@ -1883,7 +1882,7 @@
 OutputExtremumT initial_value{::cuda::std::numeric_limits<InputValueT>::lowest()};

 // Tabulate output iterator that unzips the result and writes it to the user-provided output iterators
-auto out_it = THRUST_NS_QUALIFIER::make_tabulate_output_iterator(
+auto out_it = ::cuda::make_tabulate_output_iterator(
   detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_max_out, d_index_out});

 return detail::reduce::dispatch_streaming_arg_reduce_t<
@@ -2133,7 +2132,7 @@
 OutputExtremumT initial_value{::cuda::std::numeric_limits<InputValueT>::max()};

 // Tabulate output iterator that unzips the result and writes it to the user-provided output iterators
-auto out_it = THRUST_NS_QUALIFIER::make_tabulate_output_iterator(
+auto out_it = ::cuda::make_tabulate_output_iterator(
   detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_max_out, d_index_out});

 // Query the required temporary storage size
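
These four hunks swap Thrust's tabulate output iterator factory for the cuda:: one added in <cuda/__iterator/tabulate_output_iterator.h>. For context, a hedged sketch of the iterator's contract, assuming it mirrors Thrust's tabulate output iterator (assignment through the iterator invokes the functor with the element index and the assigned value; the functor, umbrella include, and names below are assumptions, not taken from the diff):

#include <cuda/iterator>

struct ScaleAndStore
{
  int* out;
  // Invoked on assignment through the iterator: receives the element's
  // index and the value being written.
  __host__ __device__ void operator()(int index, int value) const
  {
    out[index] = 2 * value;
  }
};

__global__ void demo(int* out)
{
  auto it = cuda::make_tabulate_output_iterator(ScaleAndStore{out});
  it[0] = 10; // calls ScaleAndStore{out}(0, 10), so out[0] becomes 20
  it[3] = 7;  // calls ScaleAndStore{out}(3, 7), so out[3] becomes 14
}

This is the same unzip-and-write pattern the reduce code above relies on: the reduction produces one (extremum, index) result, and the tabulate output iterator routes its pieces to the two user-provided output iterators.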