cuda-cccl 0.3.0__cp311-cp311-manylinux_2_24_aarch64.whl → 0.3.2__cp311-cp311-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (294) hide show
  1. cuda/cccl/cooperative/__init__.py +7 -1
  2. cuda/cccl/cooperative/experimental/__init__.py +21 -5
  3. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
  4. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
  5. cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
  6. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
  7. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
  8. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
  9. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
  10. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
  11. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
  12. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
  13. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
  14. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
  15. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
  16. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
  17. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
  18. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
  19. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
  20. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
  21. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
  22. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
  23. cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
  24. cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
  25. cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
  26. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +4 -2
  27. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
  28. cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
  29. cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
  30. cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
  31. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
  32. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
  33. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
  34. cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
  35. cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
  36. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +170 -260
  37. cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
  38. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +6 -7
  39. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
  40. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +12 -29
  41. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +2 -7
  42. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
  43. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
  44. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
  45. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
  46. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
  47. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
  48. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
  49. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
  50. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
  51. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
  52. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
  53. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
  54. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
  55. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
  56. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
  57. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
  58. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
  59. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
  60. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
  61. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
  62. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
  63. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
  64. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
  65. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
  66. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
  67. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
  68. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
  69. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
  70. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
  71. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
  72. cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
  73. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
  74. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
  75. cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
  76. cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
  77. cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
  78. cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
  79. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
  80. cuda/cccl/headers/include/cuda/__cccl_config +1 -0
  81. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
  82. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
  83. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
  84. cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
  85. cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
  86. cuda/cccl/headers/include/cuda/__device/arch_traits.h +247 -323
  87. cuda/cccl/headers/include/cuda/__device/attributes.h +174 -123
  88. cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
  89. cuda/cccl/headers/include/cuda/__device/device_ref.h +27 -49
  90. cuda/cccl/headers/include/cuda/__device/physical_device.h +100 -96
  91. cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
  92. cuda/cccl/headers/include/cuda/__event/event.h +27 -26
  93. cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
  94. cuda/cccl/headers/include/cuda/__event/timed_event.h +10 -7
  95. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  96. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
  97. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
  98. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
  99. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
  100. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
  101. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
  102. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
  103. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
  104. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +21 -137
  105. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
  106. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
  107. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
  108. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
  109. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
  110. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
  111. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
  112. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
  113. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
  114. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
  115. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
  116. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +9 -7
  117. cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
  118. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -16
  119. cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
  120. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  121. cuda/cccl/headers/include/cuda/algorithm +1 -1
  122. cuda/cccl/headers/include/cuda/cmath +1 -0
  123. cuda/cccl/headers/include/cuda/devices +13 -0
  124. cuda/cccl/headers/include/cuda/iterator +1 -0
  125. cuda/cccl/headers/include/cuda/memory +1 -0
  126. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
  127. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
  128. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
  129. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
  130. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
  131. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
  132. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
  133. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
  134. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
  135. cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
  136. cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
  137. cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
  138. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  139. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
  140. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
  141. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
  142. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  143. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  144. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
  145. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
  146. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
  147. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
  148. cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
  149. cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
  150. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
  151. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
  152. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
  153. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
  154. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
  155. cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
  156. cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
  157. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
  158. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
  159. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
  160. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
  161. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
  162. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
  163. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
  164. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
  165. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
  166. cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
  167. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
  168. cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
  169. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
  170. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
  171. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
  172. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
  173. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
  174. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
  175. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
  176. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
  177. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
  178. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
  179. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
  180. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
  181. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
  182. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
  183. cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
  184. cuda/cccl/headers/include/cuda/std/bitset +1 -1
  185. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
  186. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
  187. cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
  188. cuda/cccl/headers/include/cuda/std/numbers +5 -0
  189. cuda/cccl/headers/include/cuda/std/string_view +155 -13
  190. cuda/cccl/headers/include/cuda/std/version +1 -4
  191. cuda/cccl/headers/include/cuda/stream_ref +5 -0
  192. cuda/cccl/headers/include/cuda/utility +1 -0
  193. cuda/cccl/headers/include/nv/target +7 -2
  194. cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
  195. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
  196. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
  197. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
  198. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
  199. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
  200. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
  201. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
  202. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
  203. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
  204. cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
  205. cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
  206. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
  207. cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
  208. cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
  209. cuda/cccl/headers/include/thrust/device_delete.h +18 -3
  210. cuda/cccl/headers/include/thrust/device_free.h +16 -3
  211. cuda/cccl/headers/include/thrust/device_new.h +29 -8
  212. cuda/cccl/headers/include/thrust/host_vector.h +1 -1
  213. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
  214. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
  215. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
  216. cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
  217. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
  218. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
  219. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
  220. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
  221. cuda/cccl/parallel/experimental/__init__.py +21 -74
  222. cuda/compute/__init__.py +79 -0
  223. cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +43 -1
  224. cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +157 -8
  225. cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
  226. cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
  227. cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
  228. cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
  229. cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +112 -40
  230. cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
  231. cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
  232. cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +36 -15
  233. cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
  234. cuda/compute/cu12/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
  235. cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
  236. cuda/compute/cu13/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
  237. cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
  238. cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +2 -0
  239. cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +36 -8
  240. cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +206 -1
  241. cuda/{cccl/parallel/experimental → compute}/numba_utils.py +2 -2
  242. cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
  243. cuda/{cccl/parallel/experimental → compute}/typing.py +2 -0
  244. cuda/coop/__init__.py +8 -0
  245. cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
  246. cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
  247. cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
  248. cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
  249. cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
  250. cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
  251. cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
  252. cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
  253. cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
  254. cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
  255. cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
  256. cuda/coop/warp/__init__.py +9 -0
  257. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
  258. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
  259. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
  260. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
  261. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +275 -276
  262. cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
  263. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
  264. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
  265. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
  266. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
  267. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
  268. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
  269. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
  270. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
  271. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
  272. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
  273. cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
  274. cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
  275. cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
  276. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
  277. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
  278. cuda/cccl/parallel/experimental/.gitignore +0 -4
  279. cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
  280. cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
  281. /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
  282. /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
  283. /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
  284. /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
  285. /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
  286. /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
  287. /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
  288. /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
  289. /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
  290. /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
  291. /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
  292. /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
  293. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
  294. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
@@ -84,70 +84,6 @@ CUB_NAMESPACE_BEGIN
84
84
  //! @endrst
85
85
  struct DeviceSegmentedReduce
86
86
  {
87
- private:
88
- template <typename InputIteratorT,
89
- typename OutputIteratorT,
90
- typename BeginOffsetIteratorT,
91
- typename EndOffsetIteratorT,
92
- typename OffsetT,
93
- typename ReductionOpT,
94
- typename InitT,
95
- typename... Ts>
96
- CUB_RUNTIME_FUNCTION static cudaError_t segmented_reduce(
97
- ::cuda::std::false_type,
98
- void* d_temp_storage,
99
- size_t& temp_storage_bytes,
100
- InputIteratorT d_in,
101
- OutputIteratorT d_out,
102
- ::cuda::std::int64_t num_segments,
103
- BeginOffsetIteratorT d_begin_offsets,
104
- EndOffsetIteratorT d_end_offsets,
105
- ReductionOpT reduction_op,
106
- InitT initial_value,
107
- cudaStream_t stream);
108
-
109
- template <typename InputIteratorT,
110
- typename OutputIteratorT,
111
- typename BeginOffsetIteratorT,
112
- typename EndOffsetIteratorT,
113
- typename OffsetT,
114
- typename ReductionOpT,
115
- typename InitT,
116
- typename... Ts>
117
- CUB_RUNTIME_FUNCTION static cudaError_t segmented_reduce(
118
- ::cuda::std::true_type,
119
- void* d_temp_storage,
120
- size_t& temp_storage_bytes,
121
- InputIteratorT d_in,
122
- OutputIteratorT d_out,
123
- ::cuda::std::int64_t num_segments,
124
- BeginOffsetIteratorT d_begin_offsets,
125
- EndOffsetIteratorT d_end_offsets,
126
- ReductionOpT reduction_op,
127
- InitT initial_value,
128
- cudaStream_t stream)
129
- {
130
- return DispatchSegmentedReduce<
131
- InputIteratorT,
132
- OutputIteratorT,
133
- BeginOffsetIteratorT,
134
- EndOffsetIteratorT,
135
- OffsetT,
136
- ReductionOpT,
137
- InitT,
138
- Ts...>::Dispatch(d_temp_storage,
139
- temp_storage_bytes,
140
- d_in,
141
- d_out,
142
- num_segments,
143
- d_begin_offsets,
144
- d_end_offsets,
145
- reduction_op,
146
- initial_value,
147
- stream);
148
- }
149
-
150
- public:
151
87
  //! @rst
152
88
  //! Computes a device-wide segmented reduction using the specified
153
89
  //! binary ``reduction_op`` functor.
@@ -220,14 +156,14 @@ public:
220
156
  //! @rst
221
157
  //! Random-access input iterator to the sequence of beginning offsets of
222
158
  //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
223
- //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
159
+ //! element of the *i*\ :sup:`th` data segment in ``d_in``
224
160
  //! @endrst
225
161
  //!
226
162
  //! @param[in] d_end_offsets
227
163
  //! @rst
228
164
  //! Random-access input iterator to the sequence of ending offsets of length
229
165
  //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
230
- //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
166
+ //! the *i*\ :sup:`th` data segment in ``d_in``.
231
167
  //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` segment is considered empty.
232
168
  //! @endrst
233
169
  //!
@@ -261,24 +197,29 @@ public:
261
197
  {
262
198
  _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::Reduce");
263
199
 
264
- // Integer type for global offsets
265
- using OffsetT = detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
266
- using integral_offset_check = ::cuda::std::is_integral<OffsetT>;
267
-
268
- static_assert(integral_offset_check::value, "Offset iterator value type should be integral.");
269
-
270
- return segmented_reduce<InputIteratorT, OutputIteratorT, BeginOffsetIteratorT, EndOffsetIteratorT, OffsetT, ReductionOpT>(
271
- integral_offset_check{},
272
- d_temp_storage,
273
- temp_storage_bytes,
274
- d_in,
275
- d_out,
276
- num_segments,
277
- d_begin_offsets,
278
- d_end_offsets,
279
- reduction_op,
280
- initial_value, // zero-initialize
281
- stream);
200
+ using OffsetT = detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
201
+ static_assert(::cuda::std::is_integral_v<OffsetT>, "Offset iterator value type should be integral.");
202
+ if constexpr (::cuda::std::is_integral_v<OffsetT>)
203
+ {
204
+ return DispatchSegmentedReduce<
205
+ InputIteratorT,
206
+ OutputIteratorT,
207
+ BeginOffsetIteratorT,
208
+ EndOffsetIteratorT,
209
+ OffsetT,
210
+ ReductionOpT,
211
+ T>::Dispatch(d_temp_storage,
212
+ temp_storage_bytes,
213
+ d_in,
214
+ d_out,
215
+ num_segments,
216
+ d_begin_offsets,
217
+ d_end_offsets,
218
+ reduction_op,
219
+ initial_value, // zero-initialize
220
+ stream);
221
+ }
222
+ _CCCL_UNREACHABLE();
282
223
  }
283
224
 
284
225
  //! @rst
@@ -431,15 +372,14 @@ public:
431
372
  //! @rst
432
373
  //! Random-access input iterator to the sequence of beginning offsets of
433
374
  //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
434
- //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and
435
- //! ``d_values_*``
375
+ //! element of the *i*\ :sup:`th` data segment in ``d_in``
436
376
  //! @endrst
437
377
  //!
438
378
  //! @param[in] d_end_offsets
439
379
  //! @rst
440
380
  //! Random-access input iterator to the sequence of ending offsets of length
441
381
  //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
442
- //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
382
+ //! the *i*\ :sup:`th` data segment in ``d_in``.
443
383
  //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` segment is considered empty.
444
384
  //! @endrst
445
385
  //!
@@ -465,32 +405,31 @@ public:
465
405
  {
466
406
  _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::Sum");
467
407
 
468
- // Integer type for global offsets
469
408
  using OffsetT = detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
470
-
471
- // The output value type
472
- using OutputT = cub::detail::non_void_value_t<OutputIteratorT, cub::detail::it_value_t<InputIteratorT>>;
473
- using integral_offset_check = ::cuda::std::is_integral<OffsetT>;
474
-
475
- static_assert(integral_offset_check::value, "Offset iterator value type should be integral.");
476
-
477
- return segmented_reduce<InputIteratorT,
478
- OutputIteratorT,
479
- BeginOffsetIteratorT,
480
- EndOffsetIteratorT,
481
- OffsetT,
482
- ::cuda::std::plus<>>(
483
- integral_offset_check{},
484
- d_temp_storage,
485
- temp_storage_bytes,
486
- d_in,
487
- d_out,
488
- num_segments,
489
- d_begin_offsets,
490
- d_end_offsets,
491
- ::cuda::std::plus<>{},
492
- OutputT(), // zero-initialize
493
- stream);
409
+ using OutputT = detail::non_void_value_t<OutputIteratorT, detail::it_value_t<InputIteratorT>>;
410
+ using init_t = OutputT;
411
+ static_assert(::cuda::std::is_integral_v<OffsetT>, "Offset iterator value type should be integral.");
412
+ if constexpr (::cuda::std::is_integral_v<OffsetT>)
413
+ {
414
+ return DispatchSegmentedReduce<
415
+ InputIteratorT,
416
+ OutputIteratorT,
417
+ BeginOffsetIteratorT,
418
+ EndOffsetIteratorT,
419
+ OffsetT,
420
+ ::cuda::std::plus<>,
421
+ init_t>::Dispatch(d_temp_storage,
422
+ temp_storage_bytes,
423
+ d_in,
424
+ d_out,
425
+ num_segments,
426
+ d_begin_offsets,
427
+ d_end_offsets,
428
+ ::cuda::std::plus<>{},
429
+ init_t{}, // zero-initialize
430
+ stream);
431
+ }
432
+ _CCCL_UNREACHABLE();
494
433
  }
495
434
 
496
435
  //! @rst
@@ -556,9 +495,7 @@ public:
556
495
  // `offset_t` a.k.a `SegmentSizeT` is fixed to `int` type now, but later can be changed to accept
557
496
  // integral constant or larger integral types
558
497
  using offset_t = int;
559
-
560
- // The output value type
561
- using output_t = cub::detail::non_void_value_t<OutputIteratorT, cub::detail::it_value_t<InputIteratorT>>;
498
+ using output_t = detail::non_void_value_t<OutputIteratorT, detail::it_value_t<InputIteratorT>>;
562
499
 
563
500
  return detail::reduce::
564
501
  DispatchFixedSizeSegmentedReduce<InputIteratorT, OutputIteratorT, offset_t, ::cuda::std::plus<>, output_t>::Dispatch(
@@ -640,14 +577,14 @@ public:
640
577
  //! @rst
641
578
  //! Random-access input iterator to the sequence of beginning offsets of
642
579
  //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
643
- //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
580
+ //! element of the *i*\ :sup:`th` data segment in ``d_in``
644
581
  //! @endrst
645
582
  //!
646
583
  //! @param[in] d_end_offsets
647
584
  //! @rst
648
585
  //! Random-access input iterator to the sequence of ending offsets of length
649
586
  //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
650
- //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
587
+ //! the *i*\ :sup:`th` data segment in ``d_in``.
651
588
  //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` segment is considered empty.
652
589
  //! @endrst
653
590
  //!
@@ -673,32 +610,31 @@ public:
673
610
  {
674
611
  _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::Min");
675
612
 
676
- // Integer type for global offsets
677
613
  using OffsetT = detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
678
-
679
- // The input value type
680
- using InputT = cub::detail::it_value_t<InputIteratorT>;
681
- using integral_offset_check = ::cuda::std::is_integral<OffsetT>;
682
-
683
- static_assert(integral_offset_check::value, "Offset iterator value type should be integral.");
684
-
685
- return segmented_reduce<InputIteratorT,
686
- OutputIteratorT,
687
- BeginOffsetIteratorT,
688
- EndOffsetIteratorT,
689
- OffsetT,
690
- ::cuda::minimum<>>(
691
- integral_offset_check{},
692
- d_temp_storage,
693
- temp_storage_bytes,
694
- d_in,
695
- d_out,
696
- num_segments,
697
- d_begin_offsets,
698
- d_end_offsets,
699
- ::cuda::minimum<>{},
700
- ::cuda::std::numeric_limits<InputT>::max(),
701
- stream);
614
+ using InputT = detail::it_value_t<InputIteratorT>;
615
+ using init_t = InputT;
616
+ static_assert(::cuda::std::is_integral_v<OffsetT>, "Offset iterator value type should be integral.");
617
+ if constexpr (::cuda::std::is_integral_v<OffsetT>)
618
+ {
619
+ return DispatchSegmentedReduce<
620
+ InputIteratorT,
621
+ OutputIteratorT,
622
+ BeginOffsetIteratorT,
623
+ EndOffsetIteratorT,
624
+ OffsetT,
625
+ ::cuda::minimum<>,
626
+ init_t>::Dispatch(d_temp_storage,
627
+ temp_storage_bytes,
628
+ d_in,
629
+ d_out,
630
+ num_segments,
631
+ d_begin_offsets,
632
+ d_end_offsets,
633
+ ::cuda::minimum<>{},
634
+ ::cuda::std::numeric_limits<init_t>::max(),
635
+ stream);
636
+ }
637
+ _CCCL_UNREACHABLE();
702
638
  }
703
639
 
704
640
  //! @rst
@@ -769,9 +705,7 @@ public:
769
705
  // `offset_t` a.k.a `SegmentSizeT` is fixed to `int` type now, but later can be changed to accept
770
706
  // integral constant or larger integral types
771
707
  using offset_t = int;
772
-
773
- // The input value type
774
- using input_t = cub::detail::it_value_t<InputIteratorT>;
708
+ using input_t = detail::it_value_t<InputIteratorT>;
775
709
 
776
710
  return detail::reduce::
777
711
  DispatchFixedSizeSegmentedReduce<InputIteratorT, OutputIteratorT, offset_t, ::cuda::minimum<>, input_t>::Dispatch(
@@ -857,14 +791,14 @@ public:
857
791
  //! @rst
858
792
  //! Random-access input iterator to the sequence of beginning offsets of
859
793
  //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
860
- //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
794
+ //! element of the *i*\ :sup:`th` data segment in ``d_in``
861
795
  //! @endrst
862
796
  //!
863
797
  //! @param[in] d_end_offsets
864
798
  //! @rst
865
799
  //! Random-access input iterator to the sequence of ending offsets of length
866
800
  //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
867
- //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
801
+ //! the *i*\ :sup:`th` data segment in ``d_in``.
868
802
  //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` segment is considered empty.
869
803
  //! @endrst
870
804
  //!
@@ -890,54 +824,45 @@ public:
890
824
  {
891
825
  _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::ArgMin");
892
826
 
893
- // Integer type for global offsets
894
827
  // Using common iterator value type is a breaking change, see:
895
828
  // https://github.com/NVIDIA/cccl/pull/414#discussion_r1330632615
896
829
  using OffsetT = int; // detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
897
830
 
898
- // The input type
899
- using InputValueT = cub::detail::it_value_t<InputIteratorT>;
900
-
901
- // The output tuple type
902
- using OutputTupleT = cub::detail::non_void_value_t<OutputIteratorT, KeyValuePair<OffsetT, InputValueT>>;
903
-
904
- // The output value type
831
+ using InputValueT = detail::it_value_t<InputIteratorT>;
832
+ using OutputTupleT = detail::non_void_value_t<OutputIteratorT, KeyValuePair<OffsetT, InputValueT>>;
905
833
  using OutputValueT = typename OutputTupleT::Value;
906
-
907
- using AccumT = OutputTupleT;
908
-
909
- using InitT = detail::reduce::empty_problem_init_t<AccumT>;
834
+ using AccumT = OutputTupleT;
835
+ using InitT = detail::reduce::empty_problem_init_t<AccumT>;
910
836
 
911
837
  // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
912
838
  using ArgIndexInputIteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
913
-
914
839
  ArgIndexInputIteratorT d_indexed_in(d_in);
915
840
 
916
- // Initial value
917
841
  InitT initial_value{AccumT(1, ::cuda::std::numeric_limits<InputValueT>::max())};
918
842
 
919
- using integral_offset_check = ::cuda::std::is_integral<OffsetT>;
920
- static_assert(integral_offset_check::value, "Offset iterator value type should be integral.");
921
-
922
- return segmented_reduce<ArgIndexInputIteratorT,
923
- OutputIteratorT,
924
- BeginOffsetIteratorT,
925
- EndOffsetIteratorT,
926
- OffsetT,
927
- cub::ArgMin,
928
- InitT,
929
- AccumT>(
930
- integral_offset_check{},
931
- d_temp_storage,
932
- temp_storage_bytes,
933
- d_indexed_in,
934
- d_out,
935
- num_segments,
936
- d_begin_offsets,
937
- d_end_offsets,
938
- cub::ArgMin(),
939
- initial_value,
940
- stream);
843
+ static_assert(::cuda::std::is_integral_v<OffsetT>, "Offset iterator value type should be integral.");
844
+ if constexpr (::cuda::std::is_integral_v<OffsetT>)
845
+ {
846
+ return DispatchSegmentedReduce<
847
+ ArgIndexInputIteratorT,
848
+ OutputIteratorT,
849
+ BeginOffsetIteratorT,
850
+ EndOffsetIteratorT,
851
+ OffsetT,
852
+ cub::ArgMin,
853
+ InitT,
854
+ AccumT>::Dispatch(d_temp_storage,
855
+ temp_storage_bytes,
856
+ d_indexed_in,
857
+ d_out,
858
+ num_segments,
859
+ d_begin_offsets,
860
+ d_end_offsets,
861
+ cub::ArgMin{},
862
+ initial_value,
863
+ stream);
864
+ }
865
+ _CCCL_UNREACHABLE();
941
866
  }
942
867
 
943
868
  //! @rst
@@ -1111,14 +1036,14 @@ public:
1111
1036
  //! @rst
1112
1037
  //! Random-access input iterator to the sequence of beginning offsets of
1113
1038
  //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
1114
- //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
1039
+ //! element of the *i*\ :sup:`th` data segment in ``d_in``
1115
1040
  //! @endrst
1116
1041
  //!
1117
1042
  //! @param[in] d_end_offsets
1118
1043
  //! @rst
1119
1044
  //! Random-access input iterator to the sequence of ending offsets of length
1120
1045
  //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
1121
- //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
1046
+ //! the *i*\ :sup:`th` data segment in ``d_in``.
1122
1047
  //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` segment is considered empty.
1123
1048
  //! @endrst
1124
1049
  //!
@@ -1144,27 +1069,32 @@ public:
1144
1069
  {
1145
1070
  _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::Max");
1146
1071
 
1147
- // Integer type for global offsets
1148
1072
  using OffsetT = detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
1149
-
1150
- // The input value type
1151
- using InputT = cub::detail::it_value_t<InputIteratorT>;
1152
-
1153
- using integral_offset_check = ::cuda::std::is_integral<OffsetT>;
1154
- static_assert(integral_offset_check::value, "Offset iterator value type should be integral.");
1155
-
1156
- return segmented_reduce<InputIteratorT, OutputIteratorT, BeginOffsetIteratorT, EndOffsetIteratorT, OffsetT>(
1157
- integral_offset_check{},
1158
- d_temp_storage,
1159
- temp_storage_bytes,
1160
- d_in,
1161
- d_out,
1162
- num_segments,
1163
- d_begin_offsets,
1164
- d_end_offsets,
1165
- ::cuda::maximum<>{},
1166
- ::cuda::std::numeric_limits<InputT>::lowest(),
1167
- stream);
1073
+ using InputT = cub::detail::it_value_t<InputIteratorT>;
1074
+ using init_t = InputT;
1075
+
1076
+ static_assert(::cuda::std::is_integral_v<OffsetT>, "Offset iterator value type should be integral.");
1077
+ if constexpr (::cuda::std::is_integral_v<OffsetT>)
1078
+ {
1079
+ return DispatchSegmentedReduce<
1080
+ InputIteratorT,
1081
+ OutputIteratorT,
1082
+ BeginOffsetIteratorT,
1083
+ EndOffsetIteratorT,
1084
+ OffsetT,
1085
+ ::cuda::maximum<>,
1086
+ init_t>::Dispatch(d_temp_storage,
1087
+ temp_storage_bytes,
1088
+ d_in,
1089
+ d_out,
1090
+ num_segments,
1091
+ d_begin_offsets,
1092
+ d_end_offsets,
1093
+ ::cuda::maximum<>{},
1094
+ ::cuda::std::numeric_limits<init_t>::lowest(),
1095
+ stream);
1096
+ }
1097
+ _CCCL_UNREACHABLE();
1168
1098
  }
1169
1099
 
1170
1100
  //! @rst
@@ -1229,9 +1159,7 @@ public:
1229
1159
  // `offset_t` a.k.a `SegmentSizeT` is fixed to `int` type now, but later can be changed to accept
1230
1160
  // integral constant or larger integral types
1231
1161
  using offset_t = int;
1232
-
1233
- // The input value type
1234
- using input_t = cub::detail::it_value_t<InputIteratorT>;
1162
+ using input_t = detail::it_value_t<InputIteratorT>;
1235
1163
 
1236
1164
  return detail::reduce::
1237
1165
  DispatchFixedSizeSegmentedReduce<InputIteratorT, OutputIteratorT, offset_t, ::cuda::maximum<>, input_t>::Dispatch(
@@ -1320,14 +1248,14 @@ public:
1320
1248
  //! @rst
1321
1249
  //! Random-access input iterator to the sequence of beginning offsets of
1322
1250
  //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
1323
- //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
1251
+ //! element of the *i*\ :sup:`th` data segment in ``d_in``
1324
1252
  //! @endrst
1325
1253
  //!
1326
1254
  //! @param[in] d_end_offsets
1327
1255
  //! @rst
1328
1256
  //! Random-access input iterator to the sequence of ending offsets of length
1329
1257
  //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
1330
- //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
1258
+ //! the *i*\ :sup:`th` data segment in ``d_in``.
1331
1259
  //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` segment is considered empty.
1332
1260
  //! @endrst
1333
1261
  //!
@@ -1353,54 +1281,45 @@ public:
1353
1281
  {
1354
1282
  _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::ArgMax");
1355
1283
 
1356
- // Integer type for global offsets
1357
1284
  // Using common iterator value type is a breaking change, see:
1358
1285
  // https://github.com/NVIDIA/cccl/pull/414#discussion_r1330632615
1359
1286
  using OffsetT = int; // detail::common_iterator_value_t<BeginOffsetIteratorT, EndOffsetIteratorT>;
1360
1287
 
1361
- // The input type
1362
- using InputValueT = cub::detail::it_value_t<InputIteratorT>;
1363
-
1364
- // The output tuple type
1288
+ using InputValueT = cub::detail::it_value_t<InputIteratorT>;
1365
1289
  using OutputTupleT = cub::detail::non_void_value_t<OutputIteratorT, KeyValuePair<OffsetT, InputValueT>>;
1366
-
1367
- using AccumT = OutputTupleT;
1368
-
1369
- using InitT = detail::reduce::empty_problem_init_t<AccumT>;
1370
-
1371
- // The output value type
1290
+ using AccumT = OutputTupleT;
1291
+ using InitT = detail::reduce::empty_problem_init_t<AccumT>;
1372
1292
  using OutputValueT = typename OutputTupleT::Value;
1373
1293
 
1374
1294
  // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
1375
1295
  using ArgIndexInputIteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;
1376
-
1377
1296
  ArgIndexInputIteratorT d_indexed_in(d_in);
1378
1297
 
1379
- // Initial value
1380
1298
  InitT initial_value{AccumT(1, ::cuda::std::numeric_limits<InputValueT>::lowest())};
1381
1299
 
1382
- using integral_offset_check = ::cuda::std::is_integral<OffsetT>;
1383
- static_assert(integral_offset_check::value, "Offset iterator value type should be integral.");
1384
-
1385
- return segmented_reduce<ArgIndexInputIteratorT,
1386
- OutputIteratorT,
1387
- BeginOffsetIteratorT,
1388
- EndOffsetIteratorT,
1389
- OffsetT,
1390
- cub::ArgMax,
1391
- InitT,
1392
- AccumT>(
1393
- integral_offset_check{},
1394
- d_temp_storage,
1395
- temp_storage_bytes,
1396
- d_indexed_in,
1397
- d_out,
1398
- num_segments,
1399
- d_begin_offsets,
1400
- d_end_offsets,
1401
- cub::ArgMax(),
1402
- initial_value,
1403
- stream);
1300
+ static_assert(::cuda::std::is_integral_v<OffsetT>, "Offset iterator value type should be integral.");
1301
+ if constexpr (::cuda::std::is_integral_v<OffsetT>)
1302
+ {
1303
+ return DispatchSegmentedReduce<
1304
+ ArgIndexInputIteratorT,
1305
+ OutputIteratorT,
1306
+ BeginOffsetIteratorT,
1307
+ EndOffsetIteratorT,
1308
+ OffsetT,
1309
+ cub::ArgMax,
1310
+ InitT,
1311
+ AccumT>::Dispatch(d_temp_storage,
1312
+ temp_storage_bytes,
1313
+ d_indexed_in,
1314
+ d_out,
1315
+ num_segments,
1316
+ d_begin_offsets,
1317
+ d_end_offsets,
1318
+ cub::ArgMax{},
1319
+ initial_value,
1320
+ stream);
1321
+ }
1322
+ _CCCL_UNREACHABLE();
1404
1323
  }
1405
1324
 
1406
1325
  //! @rst
@@ -1476,34 +1395,25 @@ public:
1476
1395
  // integral constant or larger integral types
1477
1396
  using input_t = int;
1478
1397
 
1479
- // The input type
1480
- using input_value_t = cub::detail::it_value_t<InputIteratorT>;
1481
-
1482
- // The output tuple type
1483
- using output_tuple_t = cub::detail::non_void_value_t<OutputIteratorT, ::cuda::std::pair<input_t, input_value_t>>;
1484
-
1485
- using accum_t = output_tuple_t;
1486
-
1487
- using init_t = detail::reduce::empty_problem_init_t<accum_t>;
1488
-
1489
- // The output value type
1398
+ using input_value_t = detail::it_value_t<InputIteratorT>;
1399
+ using output_tuple_t = detail::non_void_value_t<OutputIteratorT, ::cuda::std::pair<input_t, input_value_t>>;
1400
+ using accum_t = output_tuple_t;
1401
+ using init_t = detail::reduce::empty_problem_init_t<accum_t>;
1490
1402
  using output_value_t = typename output_tuple_t::second_type;
1491
1403
 
1492
1404
  // Wrapped input iterator to produce index-value <input_t, InputT> tuples
1493
1405
  auto d_indexed_in = THRUST_NS_QUALIFIER::make_transform_iterator(
1494
1406
  THRUST_NS_QUALIFIER::counting_iterator<::cuda::std::int64_t>{0},
1495
1407
  detail::reduce::generate_idx_value<InputIteratorT, output_value_t>(d_in, segment_size));
1496
-
1497
1408
  using arg_index_input_iterator_t = decltype(d_indexed_in);
1498
1409
 
1499
- // Initial value
1500
1410
  init_t initial_value{accum_t(1, ::cuda::std::numeric_limits<input_value_t>::lowest())};
1501
1411
 
1502
1412
  return detail::reduce::DispatchFixedSizeSegmentedReduce<
1503
1413
  arg_index_input_iterator_t,
1504
1414
  OutputIteratorT,
1505
1415
  input_t,
1506
- cub::detail::arg_max,
1416
+ detail::arg_max,
1507
1417
  init_t,
1508
1418
  accum_t>::Dispatch(d_temp_storage,
1509
1419
  temp_storage_bytes,
@@ -1511,7 +1421,7 @@ public:
1511
1421
  d_out,
1512
1422
  num_segments,
1513
1423
  segment_size,
1514
- cub::detail::arg_max(),
1424
+ detail::arg_max(),
1515
1425
  initial_value,
1516
1426
  stream);
1517
1427
  }