cuda-cccl 0.1.3.2.0.dev271__cp312-cp312-manylinux_2_26_x86_64.whl → 0.2.1__cp312-cp312-manylinux_2_26_x86_64.whl

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (342)
  1. cuda/cccl/_cuda_version_utils.py +0 -22
  2. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +6 -2
  3. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +3 -1
  4. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +5 -2
  5. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +4 -2
  6. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +3 -2
  7. cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +6 -3
  8. cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +4 -2
  9. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +6 -3
  10. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +3 -2
  11. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +5 -1
  12. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +8 -2
  13. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +4 -1
  14. cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +5 -4
  15. cuda/cccl/headers/include/cub/block/block_exchange.cuh +3 -1
  16. cuda/cccl/headers/include/cub/block/block_histogram.cuh +1 -1
  17. cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +23 -24
  18. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +5 -3
  19. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +5 -2
  20. cuda/cccl/headers/include/cub/block/block_reduce.cuh +2 -1
  21. cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +0 -3
  22. cuda/cccl/headers/include/cub/block/block_scan.cuh +2 -1
  23. cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +9 -5
  24. cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +1 -1
  25. cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +2 -2
  26. cuda/cccl/headers/include/cub/detail/array_utils.cuh +6 -5
  27. cuda/cccl/headers/include/cub/detail/choose_offset.cuh +8 -2
  28. cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +11 -4
  29. cuda/cccl/headers/include/cub/detail/integer_utils.cuh +7 -3
  30. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +4 -4
  31. cuda/cccl/headers/include/cub/detail/ptx-json/json.h +5 -4
  32. cuda/cccl/headers/include/cub/detail/ptx-json/object.h +1 -1
  33. cuda/cccl/headers/include/cub/detail/ptx-json/string.h +0 -18
  34. cuda/cccl/headers/include/cub/detail/ptx-json/value.h +4 -3
  35. cuda/cccl/headers/include/cub/detail/rfa.cuh +9 -2
  36. cuda/cccl/headers/include/cub/detail/type_traits.cuh +15 -7
  37. cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +3 -2
  38. cuda/cccl/headers/include/cub/device/device_for.cuh +5 -2
  39. cuda/cccl/headers/include/cub/device/device_histogram.cuh +3 -1
  40. cuda/cccl/headers/include/cub/device/device_memcpy.cuh +1 -1
  41. cuda/cccl/headers/include/cub/device/device_merge.cuh +2 -1
  42. cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3 -1
  43. cuda/cccl/headers/include/cub/device/device_reduce.cuh +10 -1
  44. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +9 -2
  45. cuda/cccl/headers/include/cub/device/device_select.cuh +5 -1
  46. cuda/cccl/headers/include/cub/device/device_transform.cuh +109 -26
  47. cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +2 -1
  48. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +3 -1
  49. cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +4 -3
  50. cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +0 -2
  51. cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +2 -1
  52. cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +7 -3
  53. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +3 -2
  54. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +6 -2
  55. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +13 -2
  56. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +0 -1
  57. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +79 -40
  58. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +4 -3
  59. cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +5 -2
  60. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +6 -3
  61. cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +6 -2
  62. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +5 -4
  63. cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +6 -2
  64. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +3 -2
  65. cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +6 -1
  66. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +8 -12
  67. cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +3 -1
  68. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +11 -3
  69. cuda/cccl/headers/include/cub/device/dispatch/kernels/radix_sort.cuh +5 -1
  70. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +7 -17
  71. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +0 -2
  72. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +26 -4
  73. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +3 -2
  74. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +4 -2
  75. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +2 -1
  76. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +10 -4
  77. cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +3 -2
  78. cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +3 -5
  79. cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +6 -7
  80. cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +4 -6
  81. cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +4 -6
  82. cuda/cccl/headers/include/cub/thread/thread_load.cuh +3 -1
  83. cuda/cccl/headers/include/cub/thread/thread_operators.cuh +7 -3
  84. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +11 -17
  85. cuda/cccl/headers/include/cub/thread/thread_search.cuh +2 -1
  86. cuda/cccl/headers/include/cub/thread/thread_simd.cuh +7 -13
  87. cuda/cccl/headers/include/cub/thread/thread_sort.cuh +3 -2
  88. cuda/cccl/headers/include/cub/thread/thread_store.cuh +2 -1
  89. cuda/cccl/headers/include/cub/util_device.cuh +9 -9
  90. cuda/cccl/headers/include/cub/util_macro.cuh +0 -2
  91. cuda/cccl/headers/include/cub/util_math.cuh +4 -1
  92. cuda/cccl/headers/include/cub/util_type.cuh +18 -29
  93. cuda/cccl/headers/include/cub/util_vsmem.cuh +5 -3
  94. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +1 -1
  95. cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +1 -1
  96. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +9 -2
  97. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -1
  98. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +8 -6
  99. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +5 -3
  100. cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +1 -1
  101. cuda/cccl/headers/include/cub/warp/warp_load.cuh +1 -1
  102. cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +2 -2
  103. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +5 -3
  104. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +3 -2
  105. cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -1
  106. cuda/cccl/headers/include/cub/warp/warp_utils.cuh +3 -3
  107. cuda/cccl/headers/include/cuda/__complex/complex.h +238 -0
  108. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +93 -0
  109. cuda/cccl/headers/include/cuda/__complex/traits.h +64 -0
  110. cuda/cccl/headers/include/cuda/__complex_ +28 -0
  111. cuda/cccl/headers/include/cuda/__execution/output_ordering.h +89 -0
  112. cuda/cccl/headers/include/cuda/__fwd/complex.h +48 -0
  113. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +98 -60
  114. cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +132 -114
  115. cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +46 -36
  116. cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +86 -56
  117. cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +66 -29
  118. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +123 -63
  119. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +71 -62
  120. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +95 -99
  121. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +98 -99
  122. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +101 -99
  123. cuda/cccl/headers/include/cuda/__iterator/zip_function.h +14 -6
  124. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +198 -103
  125. cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +2 -2
  126. cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +3 -3
  127. cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +1 -2
  128. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
  129. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
  130. cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +1 -1
  131. cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +27 -0
  132. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +8 -0
  133. cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +3 -3
  134. cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +200 -0
  135. cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +60 -0
  136. cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +118 -0
  137. cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +15 -0
  138. cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +105 -153
  139. cuda/cccl/headers/include/cuda/std/__cmath/nan.h +54 -0
  140. cuda/cccl/headers/include/cuda/std/__complex/complex.h +5 -7
  141. cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +1 -0
  142. cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +186 -119
  143. cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +5 -3
  144. cuda/cccl/headers/include/cuda/std/__floating_point/decompose.h +69 -0
  145. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -0
  146. cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +6 -0
  147. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +32 -0
  148. cuda/cccl/headers/include/cuda/std/__internal/features.h +6 -0
  149. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +3 -3
  150. cuda/cccl/headers/include/cuda/std/__iterator/advance.h +2 -2
  151. cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +2 -3
  152. cuda/cccl/headers/include/cuda/std/__iterator/distance.h +1 -1
  153. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +26 -100
  154. cuda/cccl/headers/include/cuda/std/__iterator/next.h +2 -2
  155. cuda/cccl/headers/include/cuda/std/__iterator/prev.h +2 -2
  156. cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +2 -2
  157. cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +2 -3
  158. cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +6 -0
  159. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +10 -12
  160. cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +1 -1
  161. cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +8 -8
  162. cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +0 -1
  163. cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +0 -2
  164. cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +0 -1
  165. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +0 -1
  166. cuda/cccl/headers/include/cuda/std/cmath +63 -1
  167. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1 -2
  168. cuda/cccl/headers/include/cuda/std/inplace_vector +9 -9
  169. cuda/cccl/headers/include/cuda/std/numbers +0 -1
  170. cuda/cccl/headers/include/thrust/detail/pointer.h +1 -1
  171. cuda/cccl/headers/include/thrust/detail/vector_base.h +2 -2
  172. cuda/cccl/headers/include/thrust/detail/vector_base.inl +2 -4
  173. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +9 -0
  174. cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +4 -1
  175. cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +39 -56
  176. cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +6 -10
  177. cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +1 -2
  178. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +12 -1
  179. cuda/cccl/headers/include/thrust/iterator/{detail/iterator_traversal_tags.h → iterator_traversal_tags.h} +14 -0
  180. cuda/cccl/headers/include/thrust/iterator/retag.h +5 -5
  181. cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +7 -7
  182. cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +2 -2
  183. cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +77 -107
  184. cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +2 -5
  185. cuda/cccl/headers/include/thrust/system/cpp/detail/par.h +2 -5
  186. cuda/cccl/headers/include/thrust/system/cpp/memory.h +2 -5
  187. cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +2 -5
  188. cuda/cccl/headers/include/thrust/system/cpp/pointer.h +2 -5
  189. cuda/cccl/headers/include/thrust/system/cpp/vector.h +2 -5
  190. cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +0 -16
  191. cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +36 -18
  192. cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +26 -51
  193. cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +25 -14
  194. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +1 -0
  195. cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +18 -21
  196. cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +0 -1
  197. cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +19 -23
  198. cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +0 -11
  199. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +7 -4
  200. cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +15 -1
  201. cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +0 -1
  202. cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +2 -5
  203. cuda/cccl/headers/include/thrust/system/detail/errno.h +2 -7
  204. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +2 -8
  205. cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +2 -8
  206. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +2 -8
  207. cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +3 -10
  208. cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +2 -8
  209. cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +2 -8
  210. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +2 -8
  211. cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +2 -8
  212. cuda/cccl/headers/include/thrust/system/detail/generic/count.h +2 -8
  213. cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +2 -8
  214. cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +2 -8
  215. cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +2 -8
  216. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +2 -8
  217. cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +2 -8
  218. cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +2 -8
  219. cuda/cccl/headers/include/thrust/system/detail/generic/find.h +2 -8
  220. cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +2 -8
  221. cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +2 -8
  222. cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +2 -8
  223. cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +2 -8
  224. cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +2 -8
  225. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +2 -8
  226. cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +2 -8
  227. cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +2 -8
  228. cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +2 -8
  229. cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +2 -8
  230. cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +2 -8
  231. cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +2 -8
  232. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +2 -8
  233. cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +2 -8
  234. cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +2 -8
  235. cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +2 -8
  236. cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +2 -8
  237. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +2 -8
  238. cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +2 -8
  239. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +2 -8
  240. cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +2 -8
  241. cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +2 -8
  242. cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +2 -8
  243. cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +2 -8
  244. cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +2 -8
  245. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +2 -8
  246. cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +2 -8
  247. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +2 -17
  248. cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +2 -17
  249. cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +2 -8
  250. cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +2 -8
  251. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +2 -8
  252. cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +2 -8
  253. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +2 -8
  254. cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +2 -8
  255. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +2 -8
  256. cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +2 -8
  257. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +2 -8
  258. cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +2 -8
  259. cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +2 -8
  260. cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +2 -8
  261. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +2 -8
  262. cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +2 -8
  263. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +2 -8
  264. cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +2 -8
  265. cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +2 -8
  266. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +2 -8
  267. cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +2 -8
  268. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +2 -8
  269. cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +2 -8
  270. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +2 -8
  271. cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +2 -8
  272. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +2 -8
  273. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +2 -8
  274. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +2 -8
  275. cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +2 -8
  276. cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +2 -8
  277. cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +2 -8
  278. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +2 -8
  279. cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +2 -8
  280. cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +2 -8
  281. cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +2 -8
  282. cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +2 -8
  283. cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +2 -8
  284. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +2 -8
  285. cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +2 -8
  286. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +2 -8
  287. cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +2 -8
  288. cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +2 -9
  289. cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +2 -8
  290. cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +2 -8
  291. cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +2 -8
  292. cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +2 -8
  293. cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +2 -8
  294. cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +2 -8
  295. cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +2 -8
  296. cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +3 -9
  297. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +2 -8
  298. cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +2 -8
  299. cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +2 -10
  300. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +2 -8
  301. cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +2 -8
  302. cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +2 -8
  303. cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +2 -8
  304. cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +2 -8
  305. cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +2 -8
  306. cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +2 -8
  307. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +2 -8
  308. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +2 -8
  309. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +2 -8
  310. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +3 -9
  311. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +2 -8
  312. cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +3 -9
  313. cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +2 -8
  314. cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +2 -8
  315. cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +2 -8
  316. cuda/cccl/headers/include_paths.py +6 -9
  317. cuda/cccl/parallel/experimental/__init__.py +2 -4
  318. cuda/cccl/parallel/experimental/_cccl_interop.py +53 -27
  319. cuda/cccl/parallel/experimental/algorithms/_histogram.py +2 -2
  320. cuda/cccl/parallel/experimental/algorithms/_merge_sort.py +4 -4
  321. cuda/cccl/parallel/experimental/algorithms/_radix_sort.py +4 -4
  322. cuda/cccl/parallel/experimental/algorithms/_reduce.py +2 -2
  323. cuda/cccl/parallel/experimental/algorithms/_scan.py +4 -4
  324. cuda/cccl/parallel/experimental/algorithms/_segmented_reduce.py +4 -4
  325. cuda/cccl/parallel/experimental/algorithms/_transform.py +5 -5
  326. cuda/cccl/parallel/experimental/algorithms/_unique_by_key.py +5 -5
  327. cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-312-x86_64-linux-gnu.so +0 -0
  328. cuda/cccl/parallel/experimental/cu12/cccl/libcccl.c.parallel.so +0 -0
  329. cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-312-x86_64-linux-gnu.so +0 -0
  330. cuda/cccl/parallel/experimental/cu13/cccl/libcccl.c.parallel.so +0 -0
  331. cuda/cccl/parallel/experimental/iterators/__init__.py +2 -4
  332. cuda/cccl/parallel/experimental/iterators/_factories.py +28 -51
  333. cuda/cccl/parallel/experimental/iterators/_iterators.py +189 -204
  334. cuda/cccl/parallel/experimental/iterators/_zip_iterator.py +4 -12
  335. cuda/cccl/parallel/experimental/numba_utils.py +47 -0
  336. {cuda_cccl-0.1.3.2.0.dev271.dist-info → cuda_cccl-0.2.1.dist-info}/METADATA +8 -5
  337. {cuda_cccl-0.1.3.2.0.dev271.dist-info → cuda_cccl-0.2.1.dist-info}/RECORD +339 -332
  338. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/cmath +0 -520
  339. cuda/cccl/headers/include/thrust/detail/mpl/math.h +0 -164
  340. cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_discard_iterator.h +0 -44
  341. {cuda_cccl-0.1.3.2.0.dev271.dist-info → cuda_cccl-0.2.1.dist-info}/WHEEL +0 -0
  342. {cuda_cccl-0.1.3.2.0.dev271.dist-info → cuda_cccl-0.2.1.dist-info}/licenses/LICENSE +0 -0
@@ -6,9 +6,6 @@
6
6
  CUDA version detection utilities shared across the cccl package.
7
7
  """
8
8
 
9
- import os
10
- import shutil
11
- from pathlib import Path
12
9
  from typing import Optional
13
10
 
14
11
  import cuda.bindings
@@ -19,25 +16,6 @@ def detect_cuda_version() -> Optional[int]:
19
16
  return int(cuda_version.split(".")[0])
20
17
 
21
18
 
22
- def get_cuda_path() -> Optional[Path]:
23
- """Get the CUDA installation path."""
24
- cuda_path_str = os.environ.get("CUDA_PATH")
25
- if cuda_path_str:
26
- cuda_path = Path(cuda_path_str)
27
- if cuda_path.exists():
28
- return cuda_path
29
-
30
- nvcc_path = shutil.which("nvcc")
31
- if nvcc_path:
32
- return Path(nvcc_path).parent.parent
33
-
34
- default_path = Path("/usr/local/cuda")
35
- if default_path.exists():
36
- return default_path
37
-
38
- return None
39
-
40
-
41
19
  def get_recommended_extra(cuda_version: Optional[int]) -> str:
42
20
  """Get the recommended pip extra for the detected CUDA version."""
43
21
  if cuda_version == 13:
@@ -52,9 +52,13 @@
52
52
  #include <cub/util_ptx.cuh>
53
53
  #include <cub/util_type.cuh>
54
54
 
55
- #include <cuda/cmath>
55
+ #include <cuda/__cmath/ceil_div.h>
56
+ #include <cuda/__cmath/round_up.h>
57
+ #include <cuda/std/__functional/operations.h>
58
+ #include <cuda/std/__type_traits/conditional.h>
59
+ #include <cuda/std/__type_traits/enable_if.h>
60
+ #include <cuda/std/__type_traits/type_identity.h>
56
61
  #include <cuda/std/cstdint>
57
- #include <cuda/std/type_traits>
58
62
 
59
63
  CUB_NAMESPACE_BEGIN
60
64
 
@@ -49,7 +49,9 @@
49
49
  #include <cub/iterator/cache_modified_input_iterator.cuh>
50
50
  #include <cub/util_type.cuh>
51
51
 
52
- #include <cuda/std/type_traits>
52
+ #include <cuda/std/__type_traits/conditional.h>
53
+ #include <cuda/std/__type_traits/integral_constant.h>
54
+ #include <cuda/std/__type_traits/is_pointer.h>
53
55
 
54
56
  CUB_NAMESPACE_BEGIN
55
57
 
@@ -50,8 +50,11 @@
50
50
  #include <cub/util_math.cuh>
51
51
  #include <cub/util_type.cuh>
52
52
 
53
- #include <cuda/ptx>
54
- #include <cuda/std/__algorithm_>
53
+ #include <cuda/__cmath/ceil_div.h>
54
+ #include <cuda/__ptx/instructions/get_sreg.h>
55
+ #include <cuda/std/__algorithm/max.h>
56
+ #include <cuda/std/__algorithm/min.h>
57
+ #include <cuda/std/__functional/operations.h>
55
58
 
56
59
  CUB_NAMESPACE_BEGIN
57
60
 
@@ -49,8 +49,10 @@
49
49
  #include <cub/util_ptx.cuh>
50
50
  #include <cub/util_type.cuh>
51
51
 
52
- #include <cuda/ptx>
53
- #include <cuda/std/type_traits>
52
+ #include <cuda/__ptx/instructions/get_sreg.h>
53
+ #include <cuda/std/__type_traits/conditional.h>
54
+ #include <cuda/std/__type_traits/integral_constant.h>
55
+ #include <cuda/std/__type_traits/is_same.h>
54
56
 
55
57
  CUB_NAMESPACE_BEGIN
56
58
 
@@ -52,8 +52,9 @@
52
52
  #include <cub/util_type.cuh>
53
53
  #include <cub/warp/warp_reduce.cuh>
54
54
 
55
- #include <cuda/ptx>
56
- #include <cuda/std/__algorithm_>
55
+ #include <cuda/__ptx/instructions/get_sreg.h>
56
+ #include <cuda/std/__algorithm/max.h>
57
+ #include <cuda/std/__algorithm/min.h>
57
58
 
58
59
  CUB_NAMESPACE_BEGIN
59
60
 
@@ -50,9 +50,12 @@
50
50
  #include <cub/util_device.cuh>
51
51
  #include <cub/util_type.cuh>
52
52
 
53
- #include <cuda/memory>
54
- #include <cuda/std/functional>
55
- #include <cuda/std/type_traits>
53
+ #include <cuda/__memory/is_aligned.h>
54
+ #include <cuda/std/__algorithm/min.h>
55
+ #include <cuda/std/__functional/identity.h>
56
+ #include <cuda/std/__functional/operations.h>
57
+ #include <cuda/std/__type_traits/conditional.h>
58
+ #include <cuda/std/__type_traits/is_pointer.h>
56
59
 
57
60
  CUB_NAMESPACE_BEGIN
58
61
 
@@ -50,8 +50,10 @@
50
50
  #include <cub/block/block_store.cuh>
51
51
  #include <cub/iterator/cache_modified_input_iterator.cuh>
52
52
 
53
- #include <cuda/std/type_traits>
54
-
53
+ #include <cuda/std/__functional/operations.h>
54
+ #include <cuda/std/__type_traits/conditional.h>
55
+ #include <cuda/std/__type_traits/is_pointer.h>
56
+ #include <cuda/std/__type_traits/is_same.h>
55
57
  CUB_NAMESPACE_BEGIN
56
58
 
57
59
  /******************************************************************************
@@ -50,11 +50,14 @@
50
50
  #include <cub/block/block_load.cuh>
51
51
  #include <cub/block/block_scan.cuh>
52
52
  #include <cub/block/block_store.cuh>
53
- #include <cub/grid/grid_queue.cuh>
54
53
  #include <cub/iterator/cache_modified_input_iterator.cuh>
55
54
 
56
- #include <cuda/ptx>
57
- #include <cuda/std/type_traits>
55
+ #include <cuda/__ptx/instructions/get_sreg.h>
56
+ #include <cuda/std/__functional/operations.h>
57
+ #include <cuda/std/__type_traits/conditional.h>
58
+ #include <cuda/std/__type_traits/integral_constant.h>
59
+ #include <cuda/std/__type_traits/is_pointer.h>
60
+ #include <cuda/std/__type_traits/is_same.h>
58
61
 
59
62
  CUB_NAMESPACE_BEGIN
60
63
 
@@ -47,10 +47,11 @@
47
47
  #include <cub/block/block_load.cuh>
48
48
  #include <cub/block/block_scan.cuh>
49
49
  #include <cub/block/block_store.cuh>
50
- #include <cub/grid/grid_queue.cuh>
51
50
  #include <cub/iterator/cache_modified_input_iterator.cuh>
52
51
 
53
- #include <cuda/std/type_traits>
52
+ #include <cuda/std/__type_traits/conditional.h>
53
+ #include <cuda/std/__type_traits/is_pointer.h>
54
+ #include <cuda/std/__type_traits/is_same.h>
54
55
 
55
56
  CUB_NAMESPACE_BEGIN
56
57
 
@@ -50,7 +50,11 @@
50
50
  #include <cub/iterator/cache_modified_input_iterator.cuh>
51
51
  #include <cub/util_type.cuh>
52
52
 
53
- #include <cuda/std/type_traits>
53
+ #include <cuda/std/__type_traits/conditional.h>
54
+ #include <cuda/std/__type_traits/enable_if.h>
55
+ #include <cuda/std/__type_traits/integral_constant.h>
56
+ #include <cuda/std/__type_traits/is_pointer.h>
57
+ #include <cuda/std/__type_traits/is_same.h>
54
58
 
55
59
  CUB_NAMESPACE_BEGIN
56
60
 
@@ -50,11 +50,17 @@
50
50
  #include <cub/block/block_scan.cuh>
51
51
  #include <cub/block/block_store.cuh>
52
52
  #include <cub/device/dispatch/dispatch_common.cuh>
53
- #include <cub/grid/grid_queue.cuh>
54
53
  #include <cub/iterator/cache_modified_input_iterator.cuh>
55
54
  #include <cub/util_type.cuh>
56
55
 
57
- #include <cuda/std/type_traits>
56
+ #include <cuda/std/__functional/operations.h>
57
+ #include <cuda/std/__type_traits/conditional.h>
58
+ #include <cuda/std/__type_traits/enable_if.h>
59
+ #include <cuda/std/__type_traits/integral_constant.h>
60
+ #include <cuda/std/__type_traits/is_callable.h>
61
+ #include <cuda/std/__type_traits/is_pointer.h>
62
+ #include <cuda/std/__type_traits/is_same.h>
63
+ #include <cuda/std/cstdint>
58
64
 
59
65
  CUB_NAMESPACE_BEGIN
60
66
 
@@ -45,7 +45,10 @@
45
45
  #include <cub/block/block_store.cuh>
46
46
  #include <cub/iterator/cache_modified_input_iterator.cuh>
47
47
 
48
- #include <cuda/std/type_traits>
48
+ #include <cuda/std/__functional/operations.h>
49
+ #include <cuda/std/__type_traits/conditional.h>
50
+ #include <cuda/std/__type_traits/enable_if.h>
51
+ #include <cuda/std/__type_traits/is_pointer.h>
49
52
 
50
53
  CUB_NAMESPACE_BEGIN
51
54
 
@@ -51,7 +51,8 @@
51
51
  #include <cub/util_temporary_storage.cuh>
52
52
  #include <cub/warp/warp_reduce.cuh>
53
53
 
54
- #include <cuda/std/type_traits>
54
+ #include <cuda/std/__type_traits/conditional.h>
55
+ #include <cuda/std/__type_traits/enable_if.h>
55
56
 
56
57
  #include <nv/target>
57
58
 
@@ -1178,13 +1179,13 @@ struct ReduceByKeyScanTileState<ValueT, KeyT, true>
1178
1179
  #endif // _CCCL_DOXYGEN_INVOKED
1179
1180
 
1180
1181
  /******************************************************************************
1181
- * Prefix call-back operator for coupling local block scan within a
1182
+ * Prefix callback operator for coupling local block scan within a
1182
1183
  * block-cooperative scan
1183
1184
  ******************************************************************************/
1184
1185
 
1185
1186
  /**
1186
- * Stateful block-scan prefix functor. Provides the the running prefix for
1187
- * the current tile by using the call-back warp to wait on on
1187
+ * Stateful block-scan prefix functor. Provides the running prefix for
1188
+ * the current tile by using the callback warp to wait for
1188
1189
  * aggregates/prefixes from predecessor tiles to become available.
1189
1190
  *
1190
1191
  * @tparam DelayConstructorT
@@ -47,7 +47,9 @@
47
47
  #include <cub/util_type.cuh>
48
48
  #include <cub/warp/warp_exchange.cuh>
49
49
 
50
- #include <cuda/ptx>
50
+ #include <cuda/__ptx/instructions/get_sreg.h>
51
+ #include <cuda/std/__algorithm/min.h>
52
+ #include <cuda/std/__type_traits/integral_constant.h>
51
53
 
52
54
  CUB_NAMESPACE_BEGIN
53
55
 
@@ -48,7 +48,7 @@
48
48
  #include <cub/block/specializations/block_histogram_sort.cuh>
49
49
  #include <cub/util_ptx.cuh>
50
50
 
51
- #include <cuda/std/type_traits>
51
+ #include <cuda/std/__type_traits/conditional.h>
52
52
 
53
53
  CUB_NAMESPACE_BEGIN
54
54
 
@@ -43,9 +43,8 @@
43
43
  #include <cub/util_ptx.cuh>
44
44
  #include <cub/util_type.cuh>
45
45
 
46
- #include <cuda/std/__algorithm/max.h>
47
46
  #include <cuda/std/__algorithm/min.h>
48
- #include <cuda/std/type_traits>
47
+ #include <cuda/std/__type_traits/is_same.h>
49
48
 
50
49
  CUB_NAMESPACE_BEGIN
51
50
 
@@ -384,7 +383,7 @@ public:
384
383
  int valid_items,
385
384
  KeyT oob_default)
386
385
  {
387
- if (IS_LAST_TILE)
386
+ if constexpr (IS_LAST_TILE)
388
387
  {
389
388
  // if last tile, find valid max_key
390
389
  // and fill the remaining keys with it
@@ -418,8 +417,8 @@ public:
418
417
  for (int target_merged_threads_number = 2; target_merged_threads_number <= NUM_THREADS;
419
418
  target_merged_threads_number *= 2)
420
419
  {
421
- int merged_threads_number = target_merged_threads_number / 2;
422
- int mask = target_merged_threads_number - 1;
420
+ const int merged_threads_number = target_merged_threads_number / 2;
421
+ const int mask = target_merged_threads_number - 1;
423
422
 
424
423
  Sync();
425
424
 
@@ -436,23 +435,23 @@ public:
436
435
 
437
436
  int indices[ITEMS_PER_THREAD];
438
437
 
439
- int first_thread_idx_in_thread_group_being_merged = ~mask & linear_tid;
440
- int start = ITEMS_PER_THREAD * first_thread_idx_in_thread_group_being_merged;
441
- int size = ITEMS_PER_THREAD * merged_threads_number;
438
+ const int first_thread_idx_in_thread_group_being_merged = ~mask & linear_tid;
439
+ const int start = ITEMS_PER_THREAD * first_thread_idx_in_thread_group_being_merged;
440
+ const int size = ITEMS_PER_THREAD * merged_threads_number;
442
441
 
443
- int thread_idx_in_thread_group_being_merged = mask & linear_tid;
442
+ const int thread_idx_in_thread_group_being_merged = mask & linear_tid;
444
443
 
445
- int diag = (::cuda::std::min) (valid_items, ITEMS_PER_THREAD * thread_idx_in_thread_group_being_merged);
444
+ const int diag = (::cuda::std::min) (valid_items, ITEMS_PER_THREAD * thread_idx_in_thread_group_being_merged);
446
445
 
447
- int keys1_beg = (::cuda::std::min) (valid_items, start);
448
- int keys1_end = (::cuda::std::min) (valid_items, keys1_beg + size);
449
- int keys2_beg = keys1_end;
450
- int keys2_end = (::cuda::std::min) (valid_items, keys2_beg + size);
446
+ const int keys1_beg = (::cuda::std::min) (valid_items, start);
447
+ const int keys1_end = (::cuda::std::min) (valid_items, keys1_beg + size);
448
+ const int keys2_beg = keys1_end;
449
+ const int keys2_end = (::cuda::std::min) (valid_items, keys2_beg + size);
451
450
 
452
- int keys1_count = keys1_end - keys1_beg;
453
- int keys2_count = keys2_end - keys2_beg;
451
+ const int keys1_count = keys1_end - keys1_beg;
452
+ const int keys2_count = keys2_end - keys2_beg;
454
453
 
455
- int partition_diag = MergePath(
454
+ const int partition_diag = MergePath(
456
455
  &temp_storage.keys_shared[keys1_beg],
457
456
  &temp_storage.keys_shared[keys2_beg],
458
457
  keys1_count,
@@ -460,12 +459,12 @@ public:
460
459
  diag,
461
460
  compare_op);
462
461
 
463
- int keys1_beg_loc = keys1_beg + partition_diag;
464
- int keys1_end_loc = keys1_end;
465
- int keys2_beg_loc = keys2_beg + diag - partition_diag;
466
- int keys2_end_loc = keys2_end;
467
- int keys1_count_loc = keys1_end_loc - keys1_beg_loc;
468
- int keys2_count_loc = keys2_end_loc - keys2_beg_loc;
462
+ const int keys1_beg_loc = keys1_beg + partition_diag;
463
+ const int keys1_end_loc = keys1_end;
464
+ const int keys2_beg_loc = keys2_beg + diag - partition_diag;
465
+ const int keys2_end_loc = keys2_end;
466
+ const int keys1_count_loc = keys1_end_loc - keys1_beg_loc;
467
+ const int keys2_count_loc = keys2_end_loc - keys2_beg_loc;
469
468
  SerialMerge(
470
469
  &temp_storage.keys_shared[0],
471
470
  keys1_beg_loc,
@@ -477,7 +476,7 @@ public:
477
476
  compare_op,
478
477
  oob_default);
479
478
 
480
- if (!KEYS_ONLY)
479
+ if constexpr (!KEYS_ONLY)
481
480
  {
482
481
  Sync();
483
482
 
@@ -48,12 +48,14 @@
48
48
  #include <cub/util_ptx.cuh>
49
49
  #include <cub/util_type.cuh>
50
50
 
51
- #include <cuda/ptx>
52
- #include <cuda/std/__algorithm_>
51
+ #include <cuda/__ptx/instructions/get_sreg.h>
52
+ #include <cuda/std/__algorithm/max.h>
53
+ #include <cuda/std/__functional/operations.h>
54
+ #include <cuda/std/__type_traits/conditional.h>
55
+ #include <cuda/std/__type_traits/is_same.h>
53
56
  #include <cuda/std/cstdint>
54
57
  #include <cuda/std/limits>
55
58
  #include <cuda/std/span>
56
- #include <cuda/std/type_traits>
57
59
 
58
60
  CUB_NAMESPACE_BEGIN
59
61
 
@@ -50,8 +50,11 @@
50
50
  #include <cub/util_ptx.cuh>
51
51
  #include <cub/util_type.cuh>
52
52
 
53
- #include <cuda/std/__algorithm_>
54
- #include <cuda/std/type_traits>
53
+ #include <cuda/std/__algorithm/min.h>
54
+ #include <cuda/std/__type_traits/enable_if.h>
55
+ #include <cuda/std/__type_traits/integral_constant.h>
56
+ #include <cuda/std/__type_traits/is_convertible.h>
57
+ #include <cuda/std/__type_traits/is_same.h>
55
58
 
56
59
  CUB_NAMESPACE_BEGIN
57
60
 
@@ -49,7 +49,8 @@
49
49
  #include <cub/util_ptx.cuh>
50
50
  #include <cub/util_type.cuh>
51
51
 
52
- #include <cuda/std/type_traits>
52
+ #include <cuda/std/__functional/operations.h>
53
+ #include <cuda/std/__type_traits/conditional.h>
53
54
 
54
55
  CUB_NAMESPACE_BEGIN
55
56
 
@@ -44,10 +44,7 @@
44
44
  #include <cub/util_ptx.cuh>
45
45
  #include <cub/util_type.cuh>
46
46
 
47
- #include <cuda/std/__algorithm/max.h>
48
47
  #include <cuda/std/__algorithm/min.h>
49
- #include <cuda/std/limits>
50
- #include <cuda/std/type_traits>
51
48
 
52
49
  CUB_NAMESPACE_BEGIN
53
50
 
@@ -47,7 +47,8 @@
47
47
  #include <cub/util_ptx.cuh>
48
48
  #include <cub/util_type.cuh>
49
49
 
50
- #include <cuda/std/type_traits>
50
+ #include <cuda/std/__functional/operations.h>
51
+ #include <cuda/std/__type_traits/conditional.h>
51
52
 
52
53
  CUB_NAMESPACE_BEGIN
53
54
 
@@ -49,14 +49,18 @@
49
49
 
50
50
  #include <thrust/type_traits/integer_sequence.h>
51
51
 
52
- #include <cuda/bit>
53
- #include <cuda/functional>
54
- #include <cuda/std/__algorithm/max.h>
52
+ #include <cuda/__bit/bitfield.h>
53
+ #include <cuda/__type_traits/is_floating_point.h>
54
+ #include <cuda/__utility/static_for.h>
55
55
  #include <cuda/std/__algorithm/min.h>
56
+ #include <cuda/std/__functional/invoke.h>
57
+ #include <cuda/std/__type_traits/enable_if.h>
58
+ #include <cuda/std/__type_traits/integral_constant.h>
59
+ #include <cuda/std/__type_traits/is_same.h>
60
+ #include <cuda/std/__type_traits/remove_cv.h>
61
+ #include <cuda/std/__type_traits/void_t.h>
56
62
  #include <cuda/std/cstdint>
57
63
  #include <cuda/std/tuple>
58
- #include <cuda/std/type_traits>
59
- #include <cuda/type_traits>
60
64
 
61
65
  CUB_NAMESPACE_BEGIN
62
66
 
@@ -49,8 +49,8 @@
49
49
  #include <cub/warp/warp_reduce.cuh>
50
50
 
51
51
  #include <cuda/__cmath/ceil_div.h>
52
+ #include <cuda/__ptx/instructions/get_sreg.h>
52
53
  #include <cuda/atomic>
53
- #include <cuda/ptx>
54
54
  #include <cuda/std/__algorithm/min.h>
55
55
 
56
56
  CUB_NAMESPACE_BEGIN
@@ -47,8 +47,8 @@
47
47
  #include <cub/util_ptx.cuh>
48
48
  #include <cub/warp/warp_scan.cuh>
49
49
 
50
- #include <cuda/cmath>
51
- #include <cuda/ptx>
50
+ #include <cuda/__cmath/ceil_div.h>
51
+ #include <cuda/__ptx/instructions/get_sreg.h>
52
52
 
53
53
  CUB_NAMESPACE_BEGIN
54
54
  namespace detail
@@ -38,11 +38,12 @@
38
38
  #include <cub/detail/type_traits.cuh> // static_size_v
39
39
  #include <cub/util_namespace.cuh>
40
40
 
41
- #include <cuda/std/array> // array
42
- #include <cuda/std/cstddef> // size_t
43
- #include <cuda/std/iterator> // ::cuda::std::iter_value_t
44
- #include <cuda/std/type_traits> // _If
45
- #include <cuda/std/utility> // index_sequence
41
+ #include <cuda/std/__iterator/iterator_traits.h>
42
+ #include <cuda/std/__type_traits/conditional.h>
43
+ #include <cuda/std/__type_traits/is_same.h>
44
+ #include <cuda/std/__utility/integer_sequence.h>
45
+ #include <cuda/std/array>
46
+ #include <cuda/std/cstddef>
46
47
 
47
48
  CUB_NAMESPACE_BEGIN
48
49
  namespace detail
@@ -37,9 +37,15 @@
37
37
  # pragma system_header
38
38
  #endif // no system header
39
39
 
40
+ #include <cuda/std/__iterator/iterator_traits.h>
41
+ #include <cuda/std/__type_traits/common_type.h>
42
+ #include <cuda/std/__type_traits/conditional.h>
43
+ #include <cuda/std/__type_traits/is_integral.h>
44
+ #include <cuda/std/__type_traits/is_same.h>
45
+ #include <cuda/std/__type_traits/is_unsigned.h>
46
+ #include <cuda/std/__type_traits/remove_cv.h>
40
47
  #include <cuda/std/cstdint>
41
- #include <cuda/std/iterator>
42
- #include <cuda/std/type_traits>
48
+ #include <cuda/std/limits>
43
49
 
44
50
  CUB_NAMESPACE_BEGIN
45
51
 
@@ -40,12 +40,19 @@
40
40
  #include <cub/detail/type_traits.cuh> // implicit_prom_t
41
41
  #include <cub/util_type.cuh> // _CCCL_HAS_INT128()
42
42
 
43
- #include <cuda/cmath> // cuda::std::ceil_div
44
- #include <cuda/std/bit> // cuda::std::has_single_bit
43
+ #include <cuda/__cmath/ceil_div.h>
44
+ #include <cuda/std/__bit/has_single_bit.h>
45
+ #include <cuda/std/__bit/integral.h>
46
+ #include <cuda/std/__type_traits/conditional.h>
47
+ #include <cuda/std/__type_traits/enable_if.h>
48
+ #include <cuda/std/__type_traits/integral_constant.h>
49
+ #include <cuda/std/__type_traits/is_integral.h>
50
+ #include <cuda/std/__type_traits/is_same.h>
51
+ #include <cuda/std/__type_traits/is_signed.h>
52
+ #include <cuda/std/__type_traits/make_unsigned.h>
45
53
  #include <cuda/std/climits> // CHAR_BIT
46
54
  #include <cuda/std/cstdint> // uint64_t
47
- #include <cuda/std/limits> // numeric_limits
48
- #include <cuda/std/type_traits> // ::cuda::std::is_integral
55
+ #include <cuda/std/limits>
49
56
 
50
57
  #if defined(CCCL_ENABLE_DEVICE_ASSERTIONS)
51
58
  _CCCL_BEGIN_NV_DIAG_SUPPRESS(186) // pointless comparison of unsigned integer with zero
@@ -15,9 +15,13 @@
15
15
  #include <cub/detail/unsafe_bitcast.cuh>
16
16
  #include <cub/thread/thread_operators.cuh> // is_cuda_minimum_maximum_v
17
17
 
18
- #include <cuda/std/cmath> // isnan
19
- #include <cuda/std/limits> // numeric_limits
20
- #include <cuda/type_traits> // is_floating_point_v
18
+ #include <cuda/__type_traits/is_floating_point.h>
19
+ #include <cuda/std/__cmath/isnan.h>
20
+ #include <cuda/std/__type_traits/is_integer.h>
21
+ #include <cuda/std/__type_traits/is_signed.h>
22
+ #include <cuda/std/__type_traits/make_nbit_int.h>
23
+ #include <cuda/std/__type_traits/make_unsigned.h>
24
+ #include <cuda/std/limits>
21
25
 
22
26
  CUB_NAMESPACE_BEGIN
23
27
 
@@ -39,11 +39,11 @@
39
39
 
40
40
  #include <cub/detail/fast_modulo_division.cuh> // fast_div_mod
41
41
 
42
- #include <cuda/std/array> // cuda::std::array
43
- #include <cuda/std/cstddef> // size_t
42
+ #include <cuda/std/__type_traits/make_unsigned.h>
43
+ #include <cuda/std/__utility/integer_sequence.h>
44
+ #include <cuda/std/array>
45
+ #include <cuda/std/cstddef>
44
46
  #include <cuda/std/mdspan>
45
- #include <cuda/std/type_traits> // make_unsigned_t
46
- #include <cuda/std/utility> // index_sequence
47
47
 
48
48
  CUB_NAMESPACE_BEGIN
49
49
 
@@ -32,9 +32,9 @@
32
32
  #include <cub/detail/ptx-json/string.h>
33
33
  #include <cub/detail/ptx-json/value.h>
34
34
 
35
+ #include <cuda/std/__type_traits/enable_if.h>
36
+ #include <cuda/std/__utility/integer_sequence.h>
35
37
  #include <cuda/std/cstddef>
36
- #include <cuda/std/type_traits>
37
- #include <cuda/std/utility>
38
38
 
39
39
  namespace ptx_json
40
40
  {
@@ -47,9 +47,10 @@ struct tagged_json<T, cuda::std::index_sequence<Is...>>
47
47
  template <typename V, typename = cuda::std::enable_if_t<is_object<V>::value || is_array<V>::value>>
48
48
  __noinline__ __device__ void operator=(V)
49
49
  {
50
- asm volatile("cccl.ptx_json.begin(%0)\n\n" ::"C"(storage_helper<T.str[Is]...>::value) : "memory");
50
+ static constexpr char str[]{T.str[Is]...};
51
+ asm volatile("cccl.ptx_json.begin(%0)\n\n" ::"C"(str) : "memory");
51
52
  V::emit();
52
- asm volatile("\ncccl.ptx_json.end(%0)" ::"C"(storage_helper<T.str[Is]...>::value) : "memory");
53
+ asm volatile("\ncccl.ptx_json.end(%0)" ::"C"(str) : "memory");
53
54
  }
54
55
  };
55
56
 
@@ -30,7 +30,7 @@
30
30
  #include <cub/detail/ptx-json/string.h>
31
31
  #include <cub/detail/ptx-json/value.h>
32
32
 
33
- #include <cuda/std/type_traits>
33
+ #include <cuda/std/__type_traits/integral_constant.h>
34
34
 
35
35
  namespace ptx_json
36
36
  {
@@ -50,22 +50,4 @@ __forceinline__ __device__ void comma()
50
50
  {
51
51
  asm volatile("," ::: "memory");
52
52
  }
53
-
54
- #pragma nv_diag_suppress 177
55
- template <char... Cs>
56
- struct storage_helper
57
- {
58
- // This, and the dance to invoke this through value_traits elsewhere, is necessary because the "C" inline assembly
59
- // constraint supported by NVCC requires that its argument is a pointer to a constant array of type char; NVCC also
60
- // doesn't allow passing raw character literals as pointer template arguments; and *also* it seems to look at the type
61
- // of a containing object, not a subobject it is given, when passed in a pointer to an array inside a literal type.
62
- // All of this means that we can't just pass strings, and *also* we can't just use the string<N>::array member above
63
- // as the string literal; therefore, using the fact that the length of the string is a core constant expression in the
64
- // definition of value_traits, we can generate a variadic pack that allows us to expand the contents of
65
- // string<N>::array into a comma separated list of N chars. We can then plug that in as template arguments to
66
- // storage_helper, which then can, as below, turn that into its own char array that NVCC accepts as an argument for a
67
- // "C" inline assembly constraint.
68
- static const constexpr char value[] = {Cs...};
69
- };
70
- #pragma nv_diag_default 177
71
53
  } // namespace ptx_json