cuda-cccl 0.3.0__cp313-cp313-manylinux_2_24_aarch64.whl → 0.3.2__cp313-cp313-manylinux_2_24_aarch64.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (294)
  1. cuda/cccl/cooperative/__init__.py +7 -1
  2. cuda/cccl/cooperative/experimental/__init__.py +21 -5
  3. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
  4. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
  5. cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
  6. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
  7. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
  8. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
  9. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
  10. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
  11. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
  12. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
  13. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
  14. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
  15. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
  16. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
  17. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
  18. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
  19. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
  20. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
  21. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
  22. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
  23. cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
  24. cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
  25. cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
  26. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +4 -2
  27. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
  28. cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
  29. cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
  30. cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
  31. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
  32. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
  33. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
  34. cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
  35. cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
  36. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +170 -260
  37. cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
  38. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +6 -7
  39. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
  40. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +12 -29
  41. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +2 -7
  42. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
  43. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
  44. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
  45. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
  46. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
  47. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
  48. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
  49. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
  50. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
  51. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
  52. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
  53. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
  54. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
  55. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
  56. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
  57. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
  58. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
  59. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
  60. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
  61. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
  62. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
  63. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
  64. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
  65. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
  66. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
  67. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
  68. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
  69. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
  70. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
  71. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
  72. cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
  73. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
  74. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
  75. cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
  76. cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
  77. cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
  78. cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
  79. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
  80. cuda/cccl/headers/include/cuda/__cccl_config +1 -0
  81. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
  82. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
  83. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
  84. cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
  85. cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
  86. cuda/cccl/headers/include/cuda/__device/arch_traits.h +247 -323
  87. cuda/cccl/headers/include/cuda/__device/attributes.h +174 -123
  88. cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
  89. cuda/cccl/headers/include/cuda/__device/device_ref.h +27 -49
  90. cuda/cccl/headers/include/cuda/__device/physical_device.h +100 -96
  91. cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
  92. cuda/cccl/headers/include/cuda/__event/event.h +27 -26
  93. cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
  94. cuda/cccl/headers/include/cuda/__event/timed_event.h +10 -7
  95. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  96. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
  97. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
  98. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
  99. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
  100. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
  101. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
  102. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
  103. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
  104. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +21 -137
  105. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
  106. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
  107. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
  108. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
  109. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
  110. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
  111. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
  112. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
  113. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
  114. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
  115. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
  116. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +9 -7
  117. cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
  118. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -16
  119. cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
  120. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  121. cuda/cccl/headers/include/cuda/algorithm +1 -1
  122. cuda/cccl/headers/include/cuda/cmath +1 -0
  123. cuda/cccl/headers/include/cuda/devices +13 -0
  124. cuda/cccl/headers/include/cuda/iterator +1 -0
  125. cuda/cccl/headers/include/cuda/memory +1 -0
  126. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
  127. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
  128. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
  129. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
  130. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
  131. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
  132. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
  133. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
  134. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
  135. cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
  136. cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
  137. cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
  138. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  139. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
  140. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
  141. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
  142. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  143. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  144. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
  145. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
  146. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
  147. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
  148. cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
  149. cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
  150. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
  151. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
  152. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
  153. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
  154. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
  155. cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
  156. cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
  157. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
  158. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
  159. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
  160. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
  161. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
  162. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
  163. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
  164. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
  165. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
  166. cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
  167. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
  168. cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
  169. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
  170. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
  171. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
  172. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
  173. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
  174. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
  175. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
  176. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
  177. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
  178. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
  179. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
  180. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
  181. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
  182. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
  183. cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
  184. cuda/cccl/headers/include/cuda/std/bitset +1 -1
  185. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
  186. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
  187. cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
  188. cuda/cccl/headers/include/cuda/std/numbers +5 -0
  189. cuda/cccl/headers/include/cuda/std/string_view +155 -13
  190. cuda/cccl/headers/include/cuda/std/version +1 -4
  191. cuda/cccl/headers/include/cuda/stream_ref +5 -0
  192. cuda/cccl/headers/include/cuda/utility +1 -0
  193. cuda/cccl/headers/include/nv/target +7 -2
  194. cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
  195. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
  196. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
  197. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
  198. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
  199. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
  200. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
  201. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
  202. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
  203. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
  204. cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
  205. cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
  206. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
  207. cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
  208. cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
  209. cuda/cccl/headers/include/thrust/device_delete.h +18 -3
  210. cuda/cccl/headers/include/thrust/device_free.h +16 -3
  211. cuda/cccl/headers/include/thrust/device_new.h +29 -8
  212. cuda/cccl/headers/include/thrust/host_vector.h +1 -1
  213. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
  214. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
  215. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
  216. cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
  217. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
  218. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
  219. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
  220. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
  221. cuda/cccl/parallel/experimental/__init__.py +21 -74
  222. cuda/compute/__init__.py +79 -0
  223. cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +43 -1
  224. cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +157 -8
  225. cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
  226. cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
  227. cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
  228. cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
  229. cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +112 -40
  230. cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
  231. cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
  232. cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +36 -15
  233. cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
  234. cuda/compute/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  235. cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
  236. cuda/compute/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  237. cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
  238. cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +2 -0
  239. cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +36 -8
  240. cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +206 -1
  241. cuda/{cccl/parallel/experimental → compute}/numba_utils.py +2 -2
  242. cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
  243. cuda/{cccl/parallel/experimental → compute}/typing.py +2 -0
  244. cuda/coop/__init__.py +8 -0
  245. cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
  246. cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
  247. cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
  248. cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
  249. cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
  250. cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
  251. cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
  252. cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
  253. cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
  254. cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
  255. cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
  256. cuda/coop/warp/__init__.py +9 -0
  257. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
  258. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
  259. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
  260. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
  261. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +275 -276
  262. cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
  263. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
  264. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
  265. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
  266. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
  267. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
  268. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
  269. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
  270. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
  271. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
  272. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
  273. cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
  274. cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
  275. cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
  276. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
  277. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
  278. cuda/cccl/parallel/experimental/.gitignore +0 -4
  279. cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  280. cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  281. /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
  282. /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
  283. /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
  284. /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
  285. /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
  286. /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
  287. /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
  288. /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
  289. /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
  290. /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
  291. /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
  292. /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
  293. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
  294. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
@@ -23,6 +23,7 @@
23
23
 
24
24
  #include <cuda/__cmath/ceil_div.h>
25
25
  #include <cuda/__cmath/ilog.h>
26
+ #include <cuda/__cmath/mul_hi.h>
26
27
  #include <cuda/__cmath/pow2.h>
27
28
  #include <cuda/std/__type_traits/common_type.h>
28
29
  #include <cuda/std/__type_traits/is_integer.h>
@@ -30,7 +31,6 @@
30
31
  #include <cuda/std/__type_traits/make_nbit_int.h>
31
32
  #include <cuda/std/__type_traits/make_unsigned.h>
32
33
  #include <cuda/std/__type_traits/num_bits.h>
33
- #include <cuda/std/__type_traits/promote.h>
34
34
  #include <cuda/std/__utility/pair.h>
35
35
  #include <cuda/std/cstdint>
36
36
  #include <cuda/std/limits>
@@ -39,78 +39,6 @@
39
39
 
40
40
  _CCCL_BEGIN_NAMESPACE_CUDA
41
41
 
42
- /***********************************************************************************************************************
43
- * Extract higher bits after multiplication
44
- **********************************************************************************************************************/
45
-
46
- template <typename _Tp, typename _Lhs>
47
- [[nodiscard]] _CCCL_API constexpr ::cuda::std::common_type_t<_Tp, _Lhs>
48
- __multiply_extract_higher_bits_fallback(_Tp __x, _Lhs __y)
49
- {
50
- using __ret_t = ::cuda::std::common_type_t<_Tp, _Lhs>;
51
- constexpr int __shift = ::cuda::std::__num_bits_v<__ret_t> / 2;
52
- using __half_bits_t = ::cuda::std::__make_nbit_uint_t<::cuda::std::__num_bits_v<__ret_t>>;
53
- auto __x_high = static_cast<__half_bits_t>(__x >> __shift);
54
- auto __x_low = static_cast<__half_bits_t>(__x);
55
- auto __y_high = static_cast<__half_bits_t>(__y >> __shift);
56
- auto __y_low = static_cast<__half_bits_t>(__y);
57
- auto __p0 = __x_low * __y_low;
58
- auto __p1 = __x_low * __y_high;
59
- auto __p2 = __x_high * __y_low;
60
- auto __p3 = __x_high * __y_high;
61
- auto __mid = __p1 + __p2;
62
- __half_bits_t __carry = (__mid < __p1);
63
- auto __po_half = __p0 >> __shift;
64
- __mid = __mid + __po_half;
65
- __carry += (__mid < __po_half);
66
- return __p3 + (__mid >> __shift) + (__carry << __shift);
67
- }
68
-
69
- template <typename _Tp, typename _Lhs>
70
- [[nodiscard]] _CCCL_API constexpr ::cuda::std::common_type_t<_Tp, _Lhs> __multiply_extract_higher_bits(_Tp __x, _Lhs __y)
71
- {
72
- using ::cuda::std::__cccl_is_integer_v;
73
- using ::cuda::std::__num_bits_v;
74
- using ::cuda::std::is_signed_v;
75
- static_assert(__cccl_is_integer_v<_Tp>, "__multiply_extract_higher_bits: T is required to be an integer type");
76
- static_assert(__cccl_is_integer_v<_Lhs>, "__multiply_extract_higher_bits: T is required to be an integer type");
77
- if constexpr (is_signed_v<_Tp>)
78
- {
79
- _CCCL_ASSERT(__x >= 0, "__x must be non-negative");
80
- _CCCL_ASSUME(__x >= 0);
81
- }
82
- if constexpr (is_signed_v<_Lhs>)
83
- {
84
- _CCCL_ASSERT(__y >= 0, "__y must be non-negative");
85
- _CCCL_ASSUME(__y >= 0);
86
- }
87
- using __ret_t = ::cuda::std::common_type_t<_Tp, _Lhs>;
88
- if (!::cuda::std::__cccl_default_is_constant_evaluated())
89
- {
90
- if constexpr (sizeof(_Tp) == sizeof(uint32_t) && sizeof(_Lhs) == sizeof(uint32_t))
91
- {
92
- NV_IF_TARGET(NV_IS_DEVICE, (return ::__umulhi(static_cast<uint32_t>(__x), static_cast<uint32_t>(__y));));
93
- }
94
- #if !_CCCL_HAS_INT128()
95
- else if constexpr (sizeof(_Tp) == sizeof(uint64_t) && sizeof(_Lhs) == sizeof(uint64_t))
96
- {
97
- NV_DISPATCH_TARGET(NV_IS_DEVICE, (return ::__umul64hi(static_cast<uint64_t>(__x), static_cast<uint64_t>(__y));));
98
- }
99
- #endif // !_CCCL_HAS_INT128()
100
- }
101
- if constexpr (sizeof(__ret_t) < sizeof(uint64_t) || (sizeof(__ret_t) == sizeof(uint64_t) && _CCCL_HAS_INT128()))
102
- {
103
- constexpr auto __mul_bits = ::cuda::next_power_of_two(__num_bits_v<_Tp> + __num_bits_v<_Lhs>);
104
- using __larger_t = ::cuda::std::__make_nbit_uint_t<__mul_bits>;
105
- auto __ret = (static_cast<__larger_t>(__x) * __y) >> (__mul_bits / 2);
106
- return static_cast<__ret_t>(__ret);
107
- }
108
- else
109
- {
110
- return ::cuda::__multiply_extract_higher_bits_fallback(__x, __y);
111
- }
112
- }
113
-
114
42
  /***********************************************************************************************************************
115
43
  * Fast Modulo/Division based on Precomputation
116
44
  **********************************************************************************************************************/
@@ -184,6 +112,7 @@ public:
184
112
  _CCCL_ASSERT(__dividend >= 0, "dividend must be non-negative");
185
113
  }
186
114
  using __common_t = ::cuda::std::common_type_t<_Tp, _Lhs>;
115
+ using __ucommon_t = ::cuda::std::make_unsigned_t<__common_t>;
187
116
  using _Up = ::cuda::std::make_unsigned_t<_Lhs>;
188
117
  const auto __div = __divisor1.__divisor; // cannot use structure binding because of clang-14
189
118
  const auto __mul = __divisor1.__multiplier;
@@ -205,7 +134,7 @@ public:
205
134
  {
206
135
  return static_cast<__common_t>(__dividend);
207
136
  }
208
- auto __higher_bits = ::cuda::__multiply_extract_higher_bits(__udividend, __mul);
137
+ auto __higher_bits = ::cuda::mul_hi(static_cast<__ucommon_t>(__udividend), static_cast<__ucommon_t>(__mul));
209
138
  auto __quotient = static_cast<__common_t>(__higher_bits >> __shift_);
210
139
  _CCCL_ASSERT(__quotient == static_cast<__common_t>(__dividend / __div), "wrong __quotient");
211
140
  return __quotient;
@@ -0,0 +1,146 @@
1
+ //===----------------------------------------------------------------------===//
2
+ //
3
+ // Part of libcu++, the C++ Standard Library for your entire system,
4
+ // under the Apache License v2.0 with LLVM Exceptions.
5
+ // See https://llvm.org/LICENSE.txt for license information.
6
+ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7
+ // SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
8
+ //
9
+ //===----------------------------------------------------------------------===//
10
+
11
+ #ifndef _CUDA___CMATH_MUL_HI_H
12
+ #define _CUDA___CMATH_MUL_HI_H
13
+
14
+ #include <cuda/std/detail/__config>
15
+
16
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17
+ # pragma GCC system_header
18
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
19
+ # pragma clang system_header
20
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
21
+ # pragma system_header
22
+ #endif // no system header
23
+
24
+ #include <cuda/std/__type_traits/is_constant_evaluated.h>
25
+ #include <cuda/std/__type_traits/is_integer.h>
26
+ #include <cuda/std/__type_traits/is_signed.h>
27
+ #include <cuda/std/__type_traits/make_nbit_int.h>
28
+ #include <cuda/std/__type_traits/make_unsigned.h>
29
+ #include <cuda/std/__type_traits/num_bits.h>
30
+ #include <cuda/std/cstdint>
31
+
32
+ #if _CCCL_COMPILER(MSVC)
33
+ # include <intrin.h>
34
+ #endif // _CCCL_COMPILER(MSVC)
35
+
36
+ #include <cuda/std/__cccl/prologue.h>
37
+
38
+ _CCCL_BEGIN_NAMESPACE_CUDA
39
+
40
+ /***********************************************************************************************************************
41
+ * Extract higher bits after multiplication
42
+ **********************************************************************************************************************/
43
+
44
+ template <typename _Tp>
45
+ [[nodiscard]] _CCCL_API constexpr _Tp __mul_hi_fallback(_Tp __lhs, _Tp __rhs) noexcept
46
+ {
47
+ static_assert(::cuda::std::is_unsigned_v<_Tp>, "__mul_hi_fallback: T is required to be a unsigned integer type");
48
+ constexpr int __half_bits = ::cuda::std::__num_bits_v<_Tp> / 2;
49
+ using __half_bits_t = ::cuda::std::__make_nbit_uint_t<__half_bits>;
50
+ const auto __lhs_low = static_cast<__half_bits_t>(__lhs); // 32-bit
51
+ const auto __lhs_high = static_cast<__half_bits_t>(__lhs >> __half_bits); // 32-bit
52
+ const auto __rhs_low = static_cast<__half_bits_t>(__rhs); // 32-bit
53
+ const auto __rhs_high = static_cast<__half_bits_t>(__rhs >> __half_bits); // 32-bit
54
+ const auto __po_half = (static_cast<_Tp>(__lhs_low) * __rhs_low) >> __half_bits;
55
+ const auto __p1 = static_cast<_Tp>(__lhs_low) * __rhs_high; // 64-bit
56
+ const auto __p2 = static_cast<_Tp>(__lhs_high) * __rhs_low; // 64-bit
57
+ const auto __p3 = static_cast<_Tp>(__lhs_high) * __rhs_high; // 64-bit
58
+ const auto __p1_half = static_cast<__half_bits_t>(__p1); // 32-bit
59
+ const auto __p2_half = static_cast<__half_bits_t>(__p2); // 32-bit
60
+ const auto __carry = (__po_half + __p1_half + __p2_half) >> __half_bits; // 64-bit
61
+ return __p3 + (__p1 >> __half_bits) + (__p2 >> __half_bits) + __carry;
62
+ }
63
+
64
+ _CCCL_TEMPLATE(typename _Tp)
65
+ _CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp>)
66
+ [[nodiscard]]
67
+ _CCCL_API constexpr _Tp mul_hi(_Tp __lhs, _Tp __rhs) noexcept
68
+ {
69
+ using ::cuda::std::int64_t;
70
+ using ::cuda::std::is_signed_v;
71
+ if (!::cuda::std::__cccl_default_is_constant_evaluated())
72
+ {
73
+ if constexpr (sizeof(_Tp) == sizeof(int))
74
+ {
75
+ if constexpr (is_signed_v<_Tp>)
76
+ {
77
+ [[maybe_unused]] const auto __lhs1 = static_cast<int>(__lhs);
78
+ [[maybe_unused]] const auto __rhs1 = static_cast<int>(__rhs);
79
+ NV_IF_TARGET(NV_IS_DEVICE, (return ::__mulhi(__lhs1, __rhs1);));
80
+ }
81
+ else // is_unsigned_v<_Tp>
82
+ {
83
+ [[maybe_unused]] const auto __lhs1 = static_cast<unsigned>(__lhs);
84
+ [[maybe_unused]] const auto __rhs1 = static_cast<unsigned>(__rhs);
85
+ NV_IF_TARGET(NV_IS_DEVICE, (return ::__umulhi(__lhs1, __rhs1);));
86
+ }
87
+ }
88
+ else if constexpr (sizeof(_Tp) == sizeof(int64_t))
89
+ {
90
+ if constexpr (is_signed_v<_Tp>)
91
+ {
92
+ [[maybe_unused]] const auto __lhs1 = static_cast<long long>(__lhs);
93
+ [[maybe_unused]] const auto __rhs1 = static_cast<long long>(__rhs);
94
+ NV_IF_TARGET(NV_IS_DEVICE, (return ::__mul64hi(__lhs1, __rhs1);));
95
+ #if _CCCL_COMPILER(MSVC)
96
+ NV_IF_TARGET(NV_IS_HOST, (return ::__mulh(__lhs1, __rhs1);));
97
+ #endif // _CCCL_COMPILER(MSVC)
98
+ }
99
+ else // is_unsigned_v<_Tp>
100
+ {
101
+ [[maybe_unused]] const auto __lhs1 = static_cast<unsigned long long>(__lhs);
102
+ [[maybe_unused]] const auto __rhs1 = static_cast<unsigned long long>(__rhs);
103
+ NV_IF_TARGET(NV_IS_DEVICE, (return ::__umul64hi(__lhs1, __rhs1);));
104
+ #if _CCCL_COMPILER(MSVC)
105
+ NV_IF_TARGET(NV_IS_HOST, (return ::__umulh(__lhs1, __rhs1);));
106
+ #endif // _CCCL_COMPILER(MSVC)
107
+ }
108
+ }
109
+ }
110
+ if constexpr (sizeof(_Tp) < sizeof(int64_t) || (sizeof(_Tp) == sizeof(int64_t) && _CCCL_HAS_INT128()))
111
+ {
112
+ constexpr auto __bits = ::cuda::std::__num_bits_v<_Tp>;
113
+ using __larger_t = ::cuda::std::__make_nbit_int_t<__bits * 2, is_signed_v<_Tp>>;
114
+ const auto __ret = (static_cast<__larger_t>(__lhs) * __rhs) >> __bits;
115
+ return static_cast<_Tp>(__ret);
116
+ }
117
+ else // sizeof(_Tp) >= sizeof(int64_t) && !_CCCL_HAS_INT128()
118
+ {
119
+ if constexpr (is_signed_v<_Tp>)
120
+ {
121
+ using _Up = ::cuda::std::make_unsigned_t<_Tp>;
122
+ const auto __lhs1 = static_cast<_Up>(__lhs);
123
+ const auto __rhs1 = static_cast<_Up>(__rhs);
124
+ auto __hi = ::cuda::__mul_hi_fallback(__lhs1, __rhs1);
125
+ if (__lhs < 0)
126
+ {
127
+ __hi -= __rhs1;
128
+ }
129
+ if (__rhs < 0)
130
+ {
131
+ __hi -= __lhs1;
132
+ }
133
+ return static_cast<_Tp>(__hi);
134
+ }
135
+ else
136
+ {
137
+ return ::cuda::__mul_hi_fallback(__lhs, __rhs);
138
+ }
139
+ }
140
+ }
141
+
142
+ _CCCL_END_NAMESPACE_CUDA
143
+
144
+ #include <cuda/std/__cccl/epilogue.h>
145
+
146
+ #endif // _CUDA___CMATH_MUL_HI_H
@@ -24,10 +24,6 @@
24
24
  #include <cuda/__fwd/complex.h>
25
25
  #include <cuda/std/__fwd/complex.h>
26
26
 
27
- #if !_CCCL_COMPILER(NVRTC)
28
- # include <complex>
29
- #endif // !_CCCL_COMPILER(NVRTC)
30
-
31
27
  #include <cuda/std/__cccl/prologue.h>
32
28
 
33
29
  _CCCL_BEGIN_NAMESPACE_CUDA
@@ -11,7 +11,7 @@
11
11
  #ifndef _CUDA___DEVICE_ALL_DEVICES_H
12
12
  #define _CUDA___DEVICE_ALL_DEVICES_H
13
13
 
14
- #include <cuda/__cccl_config>
14
+ #include <cuda/std/detail/__config>
15
15
 
16
16
  #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17
17
  # pragma GCC system_header
@@ -22,10 +22,12 @@
22
22
  #endif // no system header
23
23
 
24
24
  #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
25
+
26
+ # include <cuda/__device/device_ref.h>
25
27
  # include <cuda/__device/physical_device.h>
26
28
  # include <cuda/__driver/driver_api.h>
27
- # include <cuda/std/cassert>
28
- # include <cuda/std/detail/libcxx/include/stdexcept>
29
+ # include <cuda/__fwd/devices.h>
30
+ # include <cuda/std/__cstddef/types.h>
29
31
  # include <cuda/std/span>
30
32
 
31
33
  # include <vector>
@@ -33,129 +35,62 @@
33
35
  # include <cuda/std/__cccl/prologue.h>
34
36
 
35
37
  _CCCL_BEGIN_NAMESPACE_CUDA
36
- namespace __detail
37
- {
38
- //! @brief A random-access range of all available CUDA devices
39
- class all_devices
40
- {
41
- public:
42
- using size_type = ::std::vector<physical_device>::size_type;
43
- using iterator = ::std::vector<physical_device>::const_iterator;
44
- using const_iterator = ::std::vector<physical_device>::const_iterator;
45
-
46
- all_devices() = default;
47
-
48
- [[nodiscard]] const physical_device& operator[](size_type __i) const;
49
-
50
- [[nodiscard]] size_type size() const;
51
38
 
52
- [[nodiscard]] iterator begin() const noexcept;
53
-
54
- [[nodiscard]] iterator end() const noexcept;
55
-
56
- operator ::cuda::std::span<const device_ref>() const;
57
-
58
- private:
59
- struct __initializer_iterator;
60
-
61
- static const ::std::vector<physical_device>& __devices();
62
- };
63
-
64
- //! @brief An iterator used to in-place construct `device` objects in a
65
- //! std::vector.
66
- //!
67
- //! Since `device` objects are not movable or copyable, we need to construct them
68
- //! in-place with a proxy object that can be implicitly converted to a `device`
69
- //! object.
70
- struct all_devices::__initializer_iterator
39
+ [[nodiscard]] _CCCL_HOST_API inline ::std::vector<device_ref> __make_devices()
71
40
  {
72
- using value_type = __emplace_device;
73
- using reference = __emplace_device;
74
- using iterator_category = ::std::forward_iterator_tag;
75
- using difference_type = int;
76
- using pointer = __emplace_device;
77
-
78
- int __id_;
79
-
80
- __emplace_device operator*() const noexcept
41
+ ::std::vector<device_ref> __ret{};
42
+ __ret.reserve(::cuda::__physical_devices().size());
43
+ for (::cuda::std::size_t __i = 0; __i < ::cuda::__physical_devices().size(); ++__i)
81
44
  {
82
- return __emplace_device{__id_};
45
+ __ret.emplace_back(static_cast<int>(__i));
83
46
  }
47
+ return __ret;
48
+ }
84
49
 
85
- __emplace_device operator->() const noexcept
86
- {
87
- return __emplace_device{__id_};
88
- }
50
+ [[nodiscard]] inline ::cuda::std::span<const device_ref> __devices()
51
+ {
52
+ static const auto __devices = ::cuda::__make_devices();
53
+ return ::cuda::std::span<const device_ref>{__devices.data(), __devices.size()};
54
+ }
89
55
 
90
- __initializer_iterator& operator++() noexcept
91
- {
92
- ++__id_;
93
- return *this;
94
- }
56
+ //! @brief A random-access range of all available CUDA devices
57
+ class __all_devices
58
+ {
59
+ public:
60
+ using value_type = ::cuda::std::span<const device_ref>::value_type;
61
+ using size_type = ::cuda::std::span<const device_ref>::size_type;
62
+ using iterator = ::cuda::std::span<const device_ref>::iterator;
63
+
64
+ _CCCL_HIDE_FROM_ABI __all_devices() = default;
65
+ __all_devices(const __all_devices&) = delete;
66
+ __all_devices(__all_devices&&) = delete;
67
+ __all_devices& operator=(const __all_devices&) = delete;
68
+ __all_devices& operator=(__all_devices&&) = delete;
95
69
 
96
- __initializer_iterator operator++(int) noexcept
70
+ [[nodiscard]] _CCCL_HOST_API device_ref operator[](size_type __i) const
97
71
  {
98
- auto __tmp = *this;
99
- ++__id_;
100
- return __tmp;
72
+ if (__i >= size())
73
+ {
74
+ ::cuda::std::__throw_out_of_range("device index out of range");
75
+ }
76
+ return ::cuda::__devices()[__i];
101
77
  }
102
78
 
103
- bool operator==(const __initializer_iterator& __other) const noexcept
79
+ [[nodiscard]] _CCCL_HOST_API size_type size() const
104
80
  {
105
- return __id_ == __other.__id_;
81
+ return ::cuda::__devices().size();
106
82
  }
107
83
 
108
- bool operator!=(const __initializer_iterator& __other) const noexcept
84
+ [[nodiscard]] _CCCL_HOST_API iterator begin() const
109
85
  {
110
- return __id_ != __other.__id_;
86
+ return ::cuda::__devices().begin();
111
87
  }
112
- };
113
88
 
114
- [[nodiscard]] inline const physical_device& all_devices::operator[](size_type __id_) const
115
- {
116
- if (__id_ >= size())
89
+ [[nodiscard]] _CCCL_HOST_API iterator end() const
117
90
  {
118
- if (size() == 0)
119
- {
120
- ::cuda::std::__throw_out_of_range("device was requested but no CUDA devices found");
121
- }
122
- else
123
- {
124
- ::cuda::std::__throw_out_of_range(
125
- (::std::string("device index out of range: ") + ::std::to_string(__id_)).c_str());
126
- }
91
+ return ::cuda::__devices().end();
127
92
  }
128
- return __devices()[__id_];
129
- }
130
-
131
- [[nodiscard]] inline all_devices::size_type all_devices::size() const
132
- {
133
- return __devices().size();
134
- }
135
-
136
- [[nodiscard]] inline all_devices::iterator all_devices::begin() const noexcept
137
- {
138
- return __devices().begin();
139
- }
140
-
141
- [[nodiscard]] inline all_devices::iterator all_devices::end() const noexcept
142
- {
143
- return __devices().end();
144
- }
145
-
146
- inline all_devices::operator ::cuda::std::span<const device_ref>() const
147
- {
148
- static const ::std::vector<device_ref> __refs(begin(), end());
149
- return ::cuda::std::span<const device_ref>(__refs);
150
- }
151
-
152
- inline const ::std::vector<physical_device>& all_devices::__devices()
153
- {
154
- static const ::std::vector<physical_device> __devices{
155
- __initializer_iterator{0}, __initializer_iterator{::cuda::__driver::__deviceGetCount()}};
156
- return __devices;
157
- }
158
- } // namespace __detail
93
+ };
159
94
 
160
95
  //! @brief A range of all available CUDA devices
161
96
  //!
@@ -171,7 +106,7 @@ inline const ::std::vector<physical_device>& all_devices::__devices()
171
106
  //! struct iterator;
172
107
  //! using const_iterator = iterator;
173
108
  //!
174
- //! [[nodiscard]] constexpr const physical_device& operator[](size_type i) const noexcept;
109
+ //! [[nodiscard]] device_ref operator[](size_type i) const; // throws if i >= size()
175
110
  //!
176
111
  //! [[nodiscard]] size_type size() const;
177
112
  //!
@@ -183,7 +118,7 @@ inline const ::std::vector<physical_device>& all_devices::__devices()
183
118
  //!
184
119
  //! @par
185
120
  //! `__all_devices::iterator` is a random access iterator with a `reference`
186
- //! type of `const physical_device&`.
121
+ //! type of `const device_ref&`.
187
122
  //!
188
123
  //! @par Example
189
124
  //! @code
@@ -194,39 +129,7 @@ inline const ::std::vector<physical_device>& all_devices::__devices()
194
129
  //! @sa
195
130
  //! * device
196
131
  //! * device_ref
197
- inline constexpr __detail::all_devices devices{};
198
-
199
- inline const arch::traits_t& device_ref::arch_traits() const
200
- {
201
- return devices[get()].arch_traits();
202
- }
203
-
204
- [[nodiscard]] inline ::std::vector<device_ref> device_ref::peer_devices() const
205
- {
206
- ::std::vector<device_ref> __result;
207
- __result.reserve(devices.size());
208
-
209
- for (const physical_device& __other_dev : devices)
210
- {
211
- // Exclude the device this API is called on. The main use case for this API
212
- // is enable/disable peer access. While enable peer access can be called on
213
- // device on which memory resides, disable peer access will error-out.
214
- // Usage of the peer access control is smoother when *this is excluded,
215
- // while it can be easily added with .push_back() on the vector if a full
216
- // group of peers is needed (for cases other than peer access control)
217
- if (__other_dev != *this)
218
- {
219
- // While in almost all practical applications peer access should be symmetrical,
220
- // it is possible to build a system with one directional peer access, check
221
- // both ways here just to be safe
222
- if (has_peer_access_to(__other_dev) && __other_dev.has_peer_access_to(*this))
223
- {
224
- __result.push_back(__other_dev);
225
- }
226
- }
227
- }
228
- return __result;
229
- }
132
+ inline constexpr __all_devices devices{};
230
133
 
231
134
  _CCCL_END_NAMESPACE_CUDA
232
135