cuda-cccl: cuda_cccl-0.3.0-cp310-cp310-manylinux_2_24_aarch64.whl → cuda_cccl-0.3.2-cp310-cp310-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (294)
  1. cuda/cccl/cooperative/__init__.py +7 -1
  2. cuda/cccl/cooperative/experimental/__init__.py +21 -5
  3. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
  4. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
  5. cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
  6. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
  7. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
  8. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
  9. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
  10. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
  11. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
  12. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
  13. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
  14. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
  15. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
  16. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
  17. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
  18. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
  19. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
  20. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
  21. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
  22. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
  23. cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
  24. cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
  25. cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
  26. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +4 -2
  27. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
  28. cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
  29. cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
  30. cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
  31. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
  32. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
  33. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
  34. cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
  35. cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
  36. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +170 -260
  37. cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
  38. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +6 -7
  39. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
  40. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +12 -29
  41. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +2 -7
  42. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
  43. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
  44. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
  45. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
  46. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
  47. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
  48. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
  49. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
  50. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
  51. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
  52. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
  53. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
  54. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
  55. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
  56. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
  57. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
  58. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
  59. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
  60. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
  61. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
  62. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
  63. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
  64. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
  65. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
  66. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
  67. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
  68. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
  69. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
  70. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
  71. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
  72. cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
  73. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
  74. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
  75. cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
  76. cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
  77. cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
  78. cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
  79. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
  80. cuda/cccl/headers/include/cuda/__cccl_config +1 -0
  81. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
  82. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
  83. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
  84. cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
  85. cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
  86. cuda/cccl/headers/include/cuda/__device/arch_traits.h +247 -323
  87. cuda/cccl/headers/include/cuda/__device/attributes.h +174 -123
  88. cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
  89. cuda/cccl/headers/include/cuda/__device/device_ref.h +27 -49
  90. cuda/cccl/headers/include/cuda/__device/physical_device.h +100 -96
  91. cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
  92. cuda/cccl/headers/include/cuda/__event/event.h +27 -26
  93. cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
  94. cuda/cccl/headers/include/cuda/__event/timed_event.h +10 -7
  95. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  96. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
  97. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
  98. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
  99. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
  100. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
  101. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
  102. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
  103. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
  104. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +21 -137
  105. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
  106. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
  107. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
  108. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
  109. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
  110. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
  111. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
  112. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
  113. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
  114. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
  115. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
  116. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +9 -7
  117. cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
  118. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -16
  119. cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
  120. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  121. cuda/cccl/headers/include/cuda/algorithm +1 -1
  122. cuda/cccl/headers/include/cuda/cmath +1 -0
  123. cuda/cccl/headers/include/cuda/devices +13 -0
  124. cuda/cccl/headers/include/cuda/iterator +1 -0
  125. cuda/cccl/headers/include/cuda/memory +1 -0
  126. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
  127. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
  128. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
  129. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
  130. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
  131. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
  132. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
  133. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
  134. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
  135. cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
  136. cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
  137. cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
  138. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  139. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
  140. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
  141. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
  142. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  143. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  144. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
  145. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
  146. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
  147. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
  148. cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
  149. cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
  150. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
  151. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
  152. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
  153. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
  154. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
  155. cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
  156. cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
  157. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
  158. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
  159. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
  160. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
  161. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
  162. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
  163. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
  164. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
  165. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
  166. cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
  167. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
  168. cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
  169. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
  170. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
  171. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
  172. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
  173. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
  174. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
  175. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
  176. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
  177. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
  178. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
  179. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
  180. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
  181. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
  182. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
  183. cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
  184. cuda/cccl/headers/include/cuda/std/bitset +1 -1
  185. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
  186. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
  187. cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
  188. cuda/cccl/headers/include/cuda/std/numbers +5 -0
  189. cuda/cccl/headers/include/cuda/std/string_view +155 -13
  190. cuda/cccl/headers/include/cuda/std/version +1 -4
  191. cuda/cccl/headers/include/cuda/stream_ref +5 -0
  192. cuda/cccl/headers/include/cuda/utility +1 -0
  193. cuda/cccl/headers/include/nv/target +7 -2
  194. cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
  195. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
  196. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
  197. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
  198. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
  199. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
  200. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
  201. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
  202. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
  203. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
  204. cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
  205. cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
  206. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
  207. cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
  208. cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
  209. cuda/cccl/headers/include/thrust/device_delete.h +18 -3
  210. cuda/cccl/headers/include/thrust/device_free.h +16 -3
  211. cuda/cccl/headers/include/thrust/device_new.h +29 -8
  212. cuda/cccl/headers/include/thrust/host_vector.h +1 -1
  213. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
  214. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
  215. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
  216. cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
  217. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
  218. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
  219. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
  220. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
  221. cuda/cccl/parallel/experimental/__init__.py +21 -74
  222. cuda/compute/__init__.py +79 -0
  223. cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +43 -1
  224. cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +157 -8
  225. cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
  226. cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
  227. cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
  228. cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
  229. cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +112 -40
  230. cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
  231. cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
  232. cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +36 -15
  233. cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
  234. cuda/compute/cu12/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  235. cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
  236. cuda/compute/cu13/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  237. cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
  238. cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +2 -0
  239. cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +36 -8
  240. cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +206 -1
  241. cuda/{cccl/parallel/experimental → compute}/numba_utils.py +2 -2
  242. cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
  243. cuda/{cccl/parallel/experimental → compute}/typing.py +2 -0
  244. cuda/coop/__init__.py +8 -0
  245. cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
  246. cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
  247. cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
  248. cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
  249. cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
  250. cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
  251. cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
  252. cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
  253. cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
  254. cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
  255. cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
  256. cuda/coop/warp/__init__.py +9 -0
  257. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
  258. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
  259. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
  260. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
  261. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +275 -276
  262. cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
  263. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
  264. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
  265. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
  266. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
  267. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
  268. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
  269. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
  270. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
  271. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
  272. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
  273. cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
  274. cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
  275. cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
  276. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
  277. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
  278. cuda/cccl/parallel/experimental/.gitignore +0 -4
  279. cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  280. cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  281. /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
  282. /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
  283. /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
  284. /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
  285. /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
  286. /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
  287. /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
  288. /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
  289. /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
  290. /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
  291. /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
  292. /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
  293. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
  294. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
@@ -18,6 +18,8 @@
18
18
  #include <cub/util_namespace.cuh>
19
19
 
20
20
  #include <cuda/__functional/address_stability.h>
21
+ #include <cuda/__stream/get_stream.h>
22
+ #include <cuda/std/__execution/env.h>
21
23
  #include <cuda/std/tuple>
22
24
 
23
25
  CUB_NAMESPACE_BEGIN
@@ -49,13 +51,20 @@ CUB_NAMESPACE_BEGIN
49
51
  struct DeviceTransform
50
52
  {
51
53
  private:
52
- template <typename... RandomAccessIteratorsIn, typename RandomAccessIteratorOut, typename NumItemsT, typename TransformOp>
54
+ template <typename... RandomAccessIteratorsIn,
55
+ typename RandomAccessIteratorOut,
56
+ typename NumItemsT,
57
+ typename Predicate,
58
+ typename TransformOp,
59
+ typename StableAddress = cuda::std::false_type>
53
60
  CUB_RUNTIME_FUNCTION static cudaError_t TransformInternal(
54
61
  ::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
55
62
  RandomAccessIteratorOut output,
56
63
  NumItemsT num_items,
64
+ Predicate predicate,
57
65
  TransformOp transform_op,
58
- cudaStream_t stream = nullptr)
66
+ cudaStream_t stream,
67
+ StableAddress = {})
59
68
  {
60
69
  using choose_offset_t = detail::choose_signed_offset<NumItemsT>;
61
70
  using offset_t = typename choose_offset_t::type;
@@ -66,18 +75,28 @@ private:
66
75
  return error;
67
76
  }
68
77
 
69
- return detail::transform::dispatch_t<
70
- detail::transform::requires_stable_address::no,
71
- offset_t,
72
- ::cuda::std::tuple<RandomAccessIteratorsIn...>,
73
- RandomAccessIteratorOut,
74
- detail::transform::always_true_predicate,
75
- TransformOp>::dispatch(::cuda::std::move(inputs),
76
- ::cuda::std::move(output),
77
- num_items,
78
- detail::transform::always_true_predicate{},
79
- ::cuda::std::move(transform_op),
80
- stream);
78
+ return detail::transform::dispatch_t < StableAddress::value
79
+ ? detail::transform::requires_stable_address::yes
80
+ : detail::transform::requires_stable_address::no,
81
+ offset_t, ::cuda::std::tuple<RandomAccessIteratorsIn...>, RandomAccessIteratorOut, Predicate,
82
+ TransformOp > ::dispatch(
83
+ ::cuda::std::move(inputs),
84
+ ::cuda::std::move(output),
85
+ num_items,
86
+ ::cuda::std::move(predicate),
87
+ ::cuda::std::move(transform_op),
88
+ stream);
89
+ }
90
+
91
+ template <typename Env>
92
+ CUB_RUNTIME_FUNCTION static auto get_stream(Env env) -> cudaStream_t
93
+ {
94
+ return ::cuda::std::execution::__query_or(env, ::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}}).get();
95
+ }
96
+
97
+ CUB_RUNTIME_FUNCTION static auto get_stream(cudaStream_t stream) -> cudaStream_t
98
+ {
99
+ return stream;
81
100
  }
82
101
 
83
102
  public:
@@ -108,18 +127,28 @@ public:
108
127
  //! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value
109
128
  //! types must be convertible to the parameters of the function object's call operator. The return type of the call
110
129
  //! operator must be assignable to the dereferenced output iterator.
111
- //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
112
- template <typename... RandomAccessIteratorsIn, typename RandomAccessIteratorOut, typename NumItemsT, typename TransformOp>
130
+ //! @param env Execution environment, or cudaStream_t. Default is ``cuda::std::execution::env{}``, which will run on
131
+ //! stream\ :sub:`0`
132
+ template <typename... RandomAccessIteratorsIn,
133
+ typename RandomAccessIteratorOut,
134
+ typename NumItemsT,
135
+ typename TransformOp,
136
+ typename Env = ::cuda::std::execution::env<>>
113
137
  CUB_RUNTIME_FUNCTION static cudaError_t Transform(
114
138
  ::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
115
139
  RandomAccessIteratorOut output,
116
140
  NumItemsT num_items,
117
141
  TransformOp transform_op,
118
- cudaStream_t stream = nullptr)
142
+ Env env = {})
119
143
  {
120
144
  _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTransform::Transform");
121
145
  return TransformInternal(
122
- ::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream);
146
+ ::cuda::std::move(inputs),
147
+ ::cuda::std::move(output),
148
+ num_items,
149
+ detail::transform::always_true_predicate{},
150
+ ::cuda::std::move(transform_op),
151
+ get_stream(env));
123
152
  }
124
153
 
125
154
  #ifndef _CCCL_DOXYGEN_INVOKED // Do not document
@@ -160,21 +189,26 @@ public:
160
189
  //! @param transform_op A unary function object. The input iterator's value type must be convertible to the parameter
161
190
  //! of the function object's call operator. The return type of the call operator must be assignable to the
162
191
  //! dereferenced output iterator.
163
- //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
164
- template <typename RandomAccessIteratorIn, typename RandomAccessIteratorOut, typename NumItemsT, typename TransformOp>
192
+ //! @param env Execution environment, or cudaStream_t. Default is ``cuda::std::execution::env{}``, which will run on
193
+ //! stream\ :sub:`0`
194
+ template <typename RandomAccessIteratorIn,
195
+ typename RandomAccessIteratorOut,
196
+ typename NumItemsT,
197
+ typename TransformOp,
198
+ typename Env = ::cuda::std::execution::env<>>
165
199
  CUB_RUNTIME_FUNCTION static cudaError_t Transform(
166
200
  RandomAccessIteratorIn input,
167
201
  RandomAccessIteratorOut output,
168
202
  NumItemsT num_items,
169
203
  TransformOp transform_op,
170
- cudaStream_t stream = nullptr)
204
+ Env env = {})
171
205
  {
172
206
  return Transform(
173
207
  ::cuda::std::make_tuple(::cuda::std::move(input)),
174
208
  ::cuda::std::move(output),
175
209
  num_items,
176
210
  ::cuda::std::move(transform_op),
177
- stream);
211
+ ::cuda::std::move(env));
178
212
  }
179
213
 
180
214
  #ifndef _CCCL_DOXYGEN_INVOKED // Do not document
@@ -215,10 +249,14 @@ public:
215
249
  //! @param num_items The number of elements to write to the output sequence.
216
250
  //! @param generator A nullary function object. The return type of the call operator must be assignable to the
217
251
  //! dereferenced output iterator.
218
- //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
219
- template <typename RandomAccessIteratorOut, typename NumItemsT, typename Generator>
252
+ //! @param env Execution environment, or cudaStream_t. Default is ``cuda::std::execution::env{}``, which will run on
253
+ //! stream\ :sub:`0`
254
+ template <typename RandomAccessIteratorOut,
255
+ typename NumItemsT,
256
+ typename Generator,
257
+ typename Env = ::cuda::std::execution::env<>>
220
258
  CUB_RUNTIME_FUNCTION static cudaError_t
221
- Generate(RandomAccessIteratorOut output, NumItemsT num_items, Generator generator, cudaStream_t stream = nullptr)
259
+ Generate(RandomAccessIteratorOut output, NumItemsT num_items, Generator generator, Env env = {})
222
260
  {
223
261
  static_assert(::cuda::std::is_invocable_v<Generator>, "The passed generator must be a nullary function object");
224
262
  static_assert(
@@ -228,7 +266,12 @@ public:
228
266
 
229
267
  _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTransform::Generate");
230
268
  return TransformInternal(
231
- ::cuda::std::make_tuple(), ::cuda::std::move(output), num_items, ::cuda::std::move(generator), stream);
269
+ ::cuda::std::make_tuple(),
270
+ ::cuda::std::move(output),
271
+ num_items,
272
+ detail::transform::always_true_predicate{},
273
+ ::cuda::std::move(generator),
274
+ get_stream(env));
232
275
  }
233
276
 
234
277
  #ifndef _CCCL_DOXYGEN_INVOKED // Do not document
@@ -262,10 +305,14 @@ public:
262
305
  //! @param output An iterator to the output sequence where num_items results are written to.
263
306
  //! @param num_items The number of elements to write to the output sequence.
264
307
  //! @param value The value to write. Must be assignable to the dereferenced output iterator.
265
- //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
266
- template <typename RandomAccessIteratorOut, typename NumItemsT, typename Value>
308
+ //! @param env Execution environment, or cudaStream_t. Default is ``cuda::std::execution::env{}``, which will run on
309
+ //! stream\ :sub:`0`
310
+ template <typename RandomAccessIteratorOut,
311
+ typename NumItemsT,
312
+ typename Value,
313
+ typename Env = ::cuda::std::execution::env<>>
267
314
  CUB_RUNTIME_FUNCTION static cudaError_t
268
- Fill(RandomAccessIteratorOut output, NumItemsT num_items, Value value, cudaStream_t stream = nullptr)
315
+ Fill(RandomAccessIteratorOut output, NumItemsT num_items, Value value, Env env = {})
269
316
  {
270
317
  static_assert(::cuda::std::is_assignable_v<detail::it_reference_t<RandomAccessIteratorOut>, Value>,
271
318
  "The passed value must be assignable to the dereferenced output iterator");
@@ -275,8 +322,9 @@ public:
275
322
  ::cuda::std::make_tuple(),
276
323
  ::cuda::std::move(output),
277
324
  num_items,
325
+ detail::transform::always_true_predicate{},
278
326
  detail::__return_constant<Value>{::cuda::std::move(value)},
279
- stream);
327
+ get_stream(env));
280
328
  }
281
329
 
282
330
  #ifndef _CCCL_DOXYGEN_INVOKED // Do not document
@@ -296,8 +344,7 @@ public:
296
344
  return cudaSuccess;
297
345
  }
298
346
 
299
- return Generate(
300
- ::cuda::std::move(output), num_items, detail::__return_constant<Value>{::cuda::std::move(value)}, stream);
347
+ return Fill(::cuda::std::move(output), num_items, ::cuda::std::move(value), stream);
301
348
  }
302
349
  #endif // _CCCL_DOXYGEN_INVOKED
303
350
 
@@ -333,43 +380,30 @@ public:
333
380
  //! types must be convertible to the parameters of the function object's call operator. The return type of the call
334
381
  //! operator must be assignable to the dereferenced output iterator. Will only be invoked if \p predicate returns
335
382
  //! true.
336
- //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
383
+ //! @param env Execution environment, or cudaStream_t. Default is ``cuda::std::execution::env{}``, which will run on
384
+ //! stream\ :sub:`0`
337
385
  template <typename... RandomAccessIteratorsIn,
338
386
  typename RandomAccessIteratorOut,
339
387
  typename NumItemsT,
340
388
  typename Predicate,
341
- typename TransformOp>
389
+ typename TransformOp,
390
+ typename Env = ::cuda::std::execution::env<>>
342
391
  CUB_RUNTIME_FUNCTION static cudaError_t TransformIf(
343
392
  ::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
344
393
  RandomAccessIteratorOut output,
345
394
  NumItemsT num_items,
346
395
  Predicate predicate,
347
396
  TransformOp transform_op,
348
- cudaStream_t stream = nullptr)
397
+ Env env = {})
349
398
  {
350
399
  _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTransform::TransformIf");
351
-
352
- using choose_offset_t = detail::choose_signed_offset<NumItemsT>;
353
- using offset_t = typename choose_offset_t::type;
354
-
355
- // Check if the number of items exceeds the range covered by the selected signed offset type
356
- if (const cudaError_t error = choose_offset_t::is_exceeding_offset_type(num_items); error != cudaSuccess)
357
- {
358
- return error;
359
- }
360
-
361
- return detail::transform::dispatch_t<
362
- detail::transform::requires_stable_address::no,
363
- offset_t,
364
- ::cuda::std::tuple<RandomAccessIteratorsIn...>,
365
- RandomAccessIteratorOut,
366
- Predicate,
367
- TransformOp>::dispatch(::cuda::std::move(inputs),
368
- ::cuda::std::move(output),
369
- num_items,
370
- ::cuda::std::move(predicate),
371
- ::cuda::std::move(transform_op),
372
- stream);
400
+ return TransformInternal(
401
+ ::cuda::std::move(inputs),
402
+ ::cuda::std::move(output),
403
+ num_items,
404
+ ::cuda::std::move(predicate),
405
+ ::cuda::std::move(transform_op),
406
+ get_stream(env));
373
407
  }
374
408
 
375
409
  #ifndef _CCCL_DOXYGEN_INVOKED // Do not document
@@ -435,19 +469,21 @@ public:
435
469
  //! @param transform_op A unary function object. The input iterator's value type must be convertible to the
436
470
  //! parameter of the function object's call operator. The return type of the call operator must be assignable to the
437
471
  //! dereferenced output iterator. Will only be invoked if \p predicate returns true.
438
- //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
472
+ //! @param env Execution environment, or cudaStream_t. Default is ``cuda::std::execution::env{}``, which will run on
473
+ //! stream\ :sub:`0`
439
474
  template <typename RandomAccessIteratorIn,
440
475
  typename RandomAccessIteratorOut,
441
476
  typename NumItemsT,
442
477
  typename Predicate,
443
- typename TransformOp>
478
+ typename TransformOp,
479
+ typename Env = ::cuda::std::execution::env<>>
444
480
  CUB_RUNTIME_FUNCTION static cudaError_t TransformIf(
445
481
  RandomAccessIteratorIn input,
446
482
  RandomAccessIteratorOut output,
447
483
  NumItemsT num_items,
448
484
  Predicate predicate,
449
485
  TransformOp transform_op,
450
- cudaStream_t stream = nullptr)
486
+ Env env = {})
451
487
  {
452
488
  return TransformIf(
453
489
  ::cuda::std::make_tuple(::cuda::std::move(input)),
@@ -455,7 +491,7 @@ public:
455
491
  num_items,
456
492
  ::cuda::std::move(predicate),
457
493
  ::cuda::std::move(transform_op),
458
- stream);
494
+ get_stream(env));
459
495
  }
460
496
 
461
497
  #ifndef _CCCL_DOXYGEN_INVOKED // Do not document
@@ -518,39 +554,29 @@ public:
518
554
  //! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value
519
555
  //! types must be convertible to the parameters of the function object's call operator. The return type of the call
520
556
  //! operator must be assignable to the dereferenced output iterator.
521
- //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
522
- template <typename... RandomAccessIteratorsIn, typename RandomAccessIteratorOut, typename NumItemsT, typename TransformOp>
557
+ //! @param env Execution environment, or cudaStream_t. Default is ``cuda::std::execution::env{}``, which will run on
558
+ //! stream\ :sub:`0`
559
+ template <typename... RandomAccessIteratorsIn,
560
+ typename RandomAccessIteratorOut,
561
+ typename NumItemsT,
562
+ typename TransformOp,
563
+ typename Env = ::cuda::std::execution::env<>>
523
564
  CUB_RUNTIME_FUNCTION static cudaError_t TransformStableArgumentAddresses(
524
565
  ::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
525
566
  RandomAccessIteratorOut output,
526
567
  NumItemsT num_items,
527
568
  TransformOp transform_op,
528
- cudaStream_t stream = nullptr)
569
+ Env env = {})
529
570
  {
530
571
  _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTransform::TransformStableArgumentAddresses");
531
-
532
- using choose_offset_t = detail::choose_signed_offset<NumItemsT>;
533
- using offset_t = typename choose_offset_t::type;
534
-
535
- // Check if the number of items exceeds the range covered by the selected signed offset type
536
- cudaError_t error = choose_offset_t::is_exceeding_offset_type(num_items);
537
- if (error)
538
- {
539
- return error;
540
- }
541
-
542
- return detail::transform::dispatch_t<
543
- detail::transform::requires_stable_address::yes,
544
- offset_t,
545
- ::cuda::std::tuple<RandomAccessIteratorsIn...>,
546
- RandomAccessIteratorOut,
547
- detail::transform::always_true_predicate,
548
- TransformOp>::dispatch(::cuda::std::move(inputs),
549
- ::cuda::std::move(output),
550
- num_items,
551
- detail::transform::always_true_predicate{},
552
- ::cuda::std::move(transform_op),
553
- stream);
572
+ return TransformInternal(
573
+ ::cuda::std::move(inputs),
574
+ ::cuda::std::move(output),
575
+ num_items,
576
+ detail::transform::always_true_predicate{},
577
+ ::cuda::std::move(transform_op),
578
+ get_stream(env),
579
+ ::cuda::std::true_type{});
554
580
  }
555
581
 
556
582
  #ifndef _CCCL_DOXYGEN_INVOKED // Do not document
@@ -590,21 +616,26 @@ public:
590
616
  //! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value
591
617
  //! types must be convertible to the parameters of the function object's call operator. The return type of the call
592
618
  //! operator must be assignable to the dereferenced output iterator.
593
- //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
594
- template <typename RandomAccessIteratorIn, typename RandomAccessIteratorOut, typename NumItemsT, typename TransformOp>
619
+ //! @param env Execution environment, or cudaStream_t. Default is ``cuda::std::execution::env{}``, which will run on
620
+ //! stream\ :sub:`0`
621
+ template <typename RandomAccessIteratorIn,
622
+ typename RandomAccessIteratorOut,
623
+ typename NumItemsT,
624
+ typename TransformOp,
625
+ typename Env = ::cuda::std::execution::env<>>
595
626
  CUB_RUNTIME_FUNCTION static cudaError_t TransformStableArgumentAddresses(
596
627
  RandomAccessIteratorIn input,
597
628
  RandomAccessIteratorOut output,
598
629
  NumItemsT num_items,
599
630
  TransformOp transform_op,
600
- cudaStream_t stream = nullptr)
631
+ Env env = {})
601
632
  {
602
633
  return TransformStableArgumentAddresses(
603
634
  ::cuda::std::make_tuple(::cuda::std::move(input)),
604
635
  ::cuda::std::move(output),
605
636
  num_items,
606
637
  ::cuda::std::move(transform_op),
607
- stream);
638
+ get_stream(env));
608
639
  }
609
640
 
610
641
  #ifndef _CCCL_DOXYGEN_INVOKED // Do not document
@@ -122,9 +122,8 @@ __launch_bounds__(
122
122
  {
123
123
  // the merge agent loads keys into a local array of KeyIt1::value_type, on which the comparisons are performed
124
124
  using key_t = it_value_t<KeyIt1>;
125
- static_assert(::cuda::std::__invocable<CompareOp, key_t, key_t>::value,
126
- "Comparison operator cannot compare two keys");
127
- static_assert(::cuda::std::is_convertible_v<typename ::cuda::std::__invoke_of<CompareOp, key_t, key_t>::type, bool>,
125
+ static_assert(::cuda::std::is_invocable_v<CompareOp, key_t, key_t>, "Comparison operator cannot compare two keys");
126
+ static_assert(::cuda::std::is_convertible_v<::cuda::std::invoke_result_t<CompareOp, key_t, key_t>, bool>,
128
127
  "Comparison operator must be convertible to bool");
129
128
 
130
129
  using MergeAgent = typename choose_merge_agent<
@@ -144,11 +143,11 @@ __launch_bounds__(
144
143
  auto& temp_storage = vsmem_helper_t::get_temp_storage(shared_temp_storage, global_temp_storage);
145
144
  MergeAgent{
146
145
  temp_storage.Alias(),
147
- try_make_cache_modified_iterator<MergePolicy::LOAD_MODIFIER>(keys1),
148
- try_make_cache_modified_iterator<MergePolicy::LOAD_MODIFIER>(items1),
146
+ keys1,
147
+ items1,
149
148
  num_keys1,
150
- try_make_cache_modified_iterator<MergePolicy::LOAD_MODIFIER>(keys2),
151
- try_make_cache_modified_iterator<MergePolicy::LOAD_MODIFIER>(items2),
149
+ keys2,
150
+ items2,
152
151
  num_keys2,
153
152
  keys_result,
154
153
  items_result,
@@ -44,7 +44,6 @@
44
44
  # pragma system_header
45
45
  #endif // no system header
46
46
 
47
- #include <cub/device/dispatch/dispatch_advance_iterators.cuh>
48
47
  #include <cub/device/dispatch/kernels/radix_sort.cuh>
49
48
  #include <cub/device/dispatch/tuning/tuning_radix_sort.cuh>
50
49
  #include <cub/util_debug.cuh>
@@ -1379,14 +1378,6 @@ struct DispatchSegmentedRadixSort
1379
1378
  // Number of radix sort invocations until all segments have been processed
1380
1379
  const auto num_invocations = ::cuda::ceil_div(num_segments, max_num_segments_per_invocation);
1381
1380
 
1382
- // If d_begin_offsets and d_end_offsets do not support operator+ then we can't have more than
1383
- // max_num_segments_per_invocation segments per invocation
1384
- if (num_invocations > 1
1385
- && !detail::all_iterators_support_add_assign_operator(::cuda::std::int64_t{}, d_begin_offsets, d_end_offsets))
1386
- {
1387
- return cudaErrorInvalidValue;
1388
- }
1389
-
1390
1381
  BeginOffsetIteratorT begin_offsets_current_it = d_begin_offsets;
1391
1382
  EndOffsetIteratorT end_offsets_current_it = d_end_offsets;
1392
1383
 
@@ -1435,8 +1426,8 @@ struct DispatchSegmentedRadixSort
1435
1426
 
1436
1427
  if (invocation_index + 1 < num_invocations)
1437
1428
  {
1438
- detail::advance_iterators_inplace_if_supported(begin_offsets_current_it, num_current_segments);
1439
- detail::advance_iterators_inplace_if_supported(end_offsets_current_it, num_current_segments);
1429
+ begin_offsets_current_it += num_current_segments;
1430
+ end_offsets_current_it += num_current_segments;
1440
1431
  }
1441
1432
 
1442
1433
  // Sync the stream if specified to flush runtime errors
@@ -46,7 +46,6 @@
46
46
 
47
47
  #include <cub/detail/launcher/cuda_runtime.cuh>
48
48
  #include <cub/detail/type_traits.cuh> // for cub::detail::invoke_result_t
49
- #include <cub/device/dispatch/dispatch_advance_iterators.cuh>
50
49
  #include <cub/device/dispatch/kernels/reduce.cuh>
51
50
  #include <cub/device/dispatch/kernels/segmented_reduce.cuh>
52
51
  #include <cub/device/dispatch/tuning/tuning_reduce.cuh>
@@ -791,7 +790,7 @@ struct DispatchSegmentedReduce
791
790
  * Function type of cub::DeviceSegmentedReduceKernel
792
791
  *
793
792
  * @param[in] segmented_reduce_kernel
794
- * Kernel function pointer to parameterization of
793
+ * Kernel function pointer to instantiation of
795
794
  * cub::DeviceSegmentedReduceKernel
796
795
  */
797
796
  template <typename ActivePolicyT, typename DeviceSegmentedReduceKernelT>
@@ -810,7 +809,8 @@ struct DispatchSegmentedReduce
810
809
  return cudaSuccess;
811
810
  }
812
811
 
813
- // Init kernel configuration
812
+ // Init kernel configuration (computes kernel occupancy)
813
+ // maybe only used inside CUB_DEBUG_LOG code sections
814
814
  [[maybe_unused]] detail::KernelConfig segmented_reduce_config;
815
815
  error =
816
816
  CubDebug(segmented_reduce_config.Init(segmented_reduce_kernel, policy.SegmentedReduce(), launcher_factory));
@@ -823,17 +823,6 @@ struct DispatchSegmentedReduce
823
823
  static_cast<::cuda::std::int64_t>(::cuda::std::numeric_limits<::cuda::std::int32_t>::max());
824
824
  const ::cuda::std::int64_t num_invocations = ::cuda::ceil_div(num_segments, num_segments_per_invocation);
825
825
 
826
- // If we need multiple passes over the segments but the iterators do not support the + operator, we cannot use the
827
- // streaming approach and have to fail, returning cudaErrorInvalidValue. This is because c.parallel passes
828
- // indirect_arg_t as the iterator type, which does not support the + operator.
829
- // TODO (elstehle): Remove this check once https://github.com/NVIDIA/cccl/issues/4148 is resolved.
830
- if (num_invocations > 1
831
- && !detail::all_iterators_support_add_assign_operator(
832
- ::cuda::std::int64_t{}, d_out, d_begin_offsets, d_end_offsets))
833
- {
834
- return cudaErrorInvalidValue;
835
- }
836
-
837
826
  for (::cuda::std::int64_t invocation_index = 0; invocation_index < num_invocations; invocation_index++)
838
827
  {
839
828
  const auto current_seg_offset = invocation_index * num_segments_per_invocation;
@@ -851,7 +840,7 @@ struct DispatchSegmentedReduce
851
840
  segmented_reduce_config.sm_occupancy);
852
841
  #endif // CUB_DEBUG_LOG
853
842
 
854
- // Invoke DeviceReduceKernel
843
+ // Invoke DeviceSegmentedReduceKernel
855
844
  launcher_factory(
856
845
  static_cast<::cuda::std::uint32_t>(num_current_segments), policy.SegmentedReduce().BlockThreads(), 0, stream)
857
846
  .doit(segmented_reduce_kernel, d_in, d_out, d_begin_offsets, d_end_offsets, reduction_op, init);
@@ -865,9 +854,9 @@ struct DispatchSegmentedReduce
865
854
 
866
855
  if (invocation_index + 1 < num_invocations)
867
856
  {
868
- detail::advance_iterators_inplace_if_supported(d_out, num_current_segments);
869
- detail::advance_iterators_inplace_if_supported(d_begin_offsets, num_current_segments);
870
- detail::advance_iterators_inplace_if_supported(d_end_offsets, num_current_segments);
857
+ d_out += num_current_segments;
858
+ d_begin_offsets += num_current_segments;
859
+ d_end_offsets += num_current_segments;
871
860
  }
872
861
 
873
862
  // Sync the stream if specified to flush runtime errors
@@ -1182,15 +1171,6 @@ struct DispatchFixedSizeSegmentedReduce
1182
1171
 
1183
1172
  const ::cuda::std::int64_t num_invocations = ::cuda::ceil_div(num_segments, num_segments_per_invocation);
1184
1173
 
1185
- // If we need multiple passes over the segments but the iterators do not support the + operator, we cannot use the
1186
- // streaming approach and have to fail, returning cudaErrorInvalidValue. This is because c.parallel passes
1187
- // indirect_arg_t as the iterator type, which does not support the + operator.
1188
- // TODO (srinivas/elstehle): Remove this check once https://github.com/NVIDIA/cccl/issues/4148 is resolved.
1189
- if (num_invocations > 1 && !detail::all_iterators_support_plus_operator(::cuda::std::int64_t{}, d_in, d_out))
1190
- {
1191
- return cudaErrorInvalidValue;
1192
- }
1193
-
1194
1174
  cudaError error = cudaSuccess;
1195
1175
  for (::cuda::std::int64_t invocation_index = 0; invocation_index < num_invocations; invocation_index++)
1196
1176
  {
@@ -1204,13 +1184,16 @@ struct DispatchFixedSizeSegmentedReduce
1204
1184
  launcher_factory(
1205
1185
  static_cast<::cuda::std::int32_t>(num_current_blocks), ActivePolicyT::ReducePolicy::BLOCK_THREADS, 0, stream)
1206
1186
  .doit(fixed_size_segmented_reduce_kernel,
1207
- detail::advance_iterators_if_supported(d_in, current_seg_offset * segment_size),
1208
- detail::advance_iterators_if_supported(d_out, current_seg_offset),
1187
+ d_in,
1188
+ d_out,
1209
1189
  segment_size,
1210
1190
  static_cast<::cuda::std::int32_t>(num_current_segments),
1211
1191
  reduction_op,
1212
1192
  init);
1213
1193
 
1194
+ d_in += num_segments_per_invocation * segment_size;
1195
+ d_out += num_segments_per_invocation;
1196
+
1214
1197
  error = CubDebug(cudaPeekAtLastError());
1215
1198
  if (cudaSuccess != error)
1216
1199
  {
@@ -77,7 +77,7 @@ namespace rfa
77
77
  {
78
78
 
79
79
  template <typename Invocable, typename InputT>
80
- using transformed_input_t = ::cuda::std::decay_t<typename ::cuda::std::__invoke_of<Invocable, InputT>::type>;
80
+ using transformed_input_t = ::cuda::std::decay_t<::cuda::std::invoke_result_t<Invocable, InputT>>;
81
81
 
82
82
  template <typename InitT, typename InputIteratorT, typename TransformOpT>
83
83
  using accum_t =
@@ -328,11 +328,6 @@ struct DispatchReduceDeterministic
328
328
  // Alias the allocation for the privatized per-block reductions
329
329
  deterministic_accum_t* d_block_reductions = (deterministic_accum_t*) allocations[0];
330
330
 
331
- if (num_chunks > 1 && !detail::all_iterators_support_add_assign_operator(::cuda::std::int32_t{}, d_in))
332
- {
333
- return cudaErrorInvalidValue;
334
- }
335
-
336
331
  auto d_chunk_block_reductions = d_block_reductions;
337
332
  for (int chunk_index = 0; chunk_index < num_chunks; chunk_index++)
338
333
  {
@@ -372,7 +367,7 @@ struct DispatchReduceDeterministic
372
367
 
373
368
  if (chunk_index + 1 < num_chunks)
374
369
  {
375
- detail::advance_iterators_inplace_if_supported(d_in, num_current_items);
370
+ d_in += num_current_items;
376
371
  d_chunk_block_reductions += current_grid_size;
377
372
  }
378
373
 
@@ -20,7 +20,6 @@
20
20
 
21
21
  #include <cub/detail/launcher/cuda_runtime.cuh>
22
22
  #include <cub/detail/type_traits.cuh> // for cub::detail::invoke_result_t
23
- #include <cub/device/dispatch/dispatch_advance_iterators.cuh>
24
23
  #include <cub/device/dispatch/kernels/reduce.cuh>
25
24
  #include <cub/device/dispatch/tuning/tuning_reduce.cuh>
26
25
  #include <cub/grid/grid_even_share.cuh>
@@ -40,7 +40,6 @@
40
40
  #include <cub/detail/device_double_buffer.cuh>
41
41
  #include <cub/detail/temporary_storage.cuh>
42
42
  #include <cub/device/device_partition.cuh>
43
- #include <cub/device/dispatch/dispatch_advance_iterators.cuh>
44
43
  #include <cub/device/dispatch/kernels/segmented_sort.cuh>
45
44
  #include <cub/device/dispatch/tuning/tuning_segmented_sort.cuh>
46
45
  #include <cub/util_debug.cuh>
@@ -764,8 +763,8 @@ private:
764
763
  BeginOffsetIteratorT current_begin_offset = d_begin_offsets;
765
764
  EndOffsetIteratorT current_end_offset = d_end_offsets;
766
765
 
767
- detail::advance_iterators_inplace_if_supported(current_begin_offset, current_seg_offset);
768
- detail::advance_iterators_inplace_if_supported(current_end_offset, current_seg_offset);
766
+ current_begin_offset += current_seg_offset;
767
+ current_end_offset += current_seg_offset;
769
768
 
770
769
  auto medium_indices_iterator =
771
770
  ::cuda::std::make_reverse_iterator(large_and_medium_segments_indices.get() + current_num_segments);
@@ -18,8 +18,8 @@
18
18
 
19
19
  #include <thrust/iterator/constant_iterator.h>
20
20
  #include <thrust/iterator/iterator_adaptor.h>
21
- #include <thrust/iterator/tabulate_output_iterator.h>
22
21
 
22
+ #include <cuda/__iterator/tabulate_output_iterator.h>
23
23
  #include <cuda/std/__functional/identity.h>
24
24
  #include <cuda/std/__utility/swap.h>
25
25
  #include <cuda/std/limits>
@@ -217,8 +217,7 @@ struct dispatch_streaming_arg_reduce_t
217
217
 
218
218
  // The output iterator that implements the logic to accumulate per-partition result to a global aggregate and,
219
219
  // eventually, write to the user-provided output iterators
220
- using accumulating_transform_out_it_t =
221
- THRUST_NS_QUALIFIER::tabulate_output_iterator<accumulating_transform_output_op_t>;
220
+ using accumulating_transform_out_it_t = ::cuda::tabulate_output_iterator<accumulating_transform_output_op_t>;
222
221
 
223
222
  // Empty problem initialization type
224
223
  using empty_problem_init_t = empty_problem_init_t<per_partition_accum_t>;
@@ -270,7 +269,7 @@ struct dispatch_streaming_arg_reduce_t
270
269
  nullptr,
271
270
  allocation_sizes[0],
272
271
  d_indexed_offset_in,
273
- THRUST_NS_QUALIFIER::make_tabulate_output_iterator(accumulating_out_op),
272
+ ::cuda::make_tabulate_output_iterator(accumulating_out_op),
274
273
  static_cast<PerPartitionOffsetT>(largest_partition_size),
275
274
  reduce_op,
276
275
  initial_value,
@@ -315,7 +314,7 @@ struct dispatch_streaming_arg_reduce_t
315
314
  d_temp_storage,
316
315
  temp_storage_bytes,
317
316
  d_indexed_offset_in,
318
- THRUST_NS_QUALIFIER::make_tabulate_output_iterator(accumulating_out_op),
317
+ ::cuda::make_tabulate_output_iterator(accumulating_out_op),
319
318
  static_cast<PerPartitionOffsetT>(current_num_items),
320
319
  reduce_op,
321
320
  initial_value,
@@ -23,7 +23,6 @@
23
23
  #include <cub/util_type.cuh>
24
24
 
25
25
  #include <thrust/iterator/offset_iterator.h>
26
- #include <thrust/iterator/tabulate_output_iterator.h>
27
26
  #include <thrust/iterator/transform_iterator.h>
28
27
  #include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
29
28