cuda-cccl 0.3.0__cp311-cp311-manylinux_2_24_aarch64.whl → 0.3.2__cp311-cp311-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (294) hide show
  1. cuda/cccl/cooperative/__init__.py +7 -1
  2. cuda/cccl/cooperative/experimental/__init__.py +21 -5
  3. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
  4. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
  5. cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
  6. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
  7. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
  8. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
  9. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
  10. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
  11. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
  12. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
  13. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
  14. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
  15. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
  16. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
  17. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
  18. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
  19. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
  20. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
  21. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
  22. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
  23. cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
  24. cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
  25. cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
  26. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +4 -2
  27. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
  28. cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
  29. cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
  30. cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
  31. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
  32. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
  33. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
  34. cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
  35. cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
  36. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +170 -260
  37. cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
  38. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +6 -7
  39. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
  40. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +12 -29
  41. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +2 -7
  42. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
  43. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
  44. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
  45. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
  46. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
  47. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
  48. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
  49. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
  50. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
  51. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
  52. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
  53. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
  54. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
  55. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
  56. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
  57. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
  58. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
  59. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
  60. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
  61. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
  62. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
  63. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
  64. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
  65. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
  66. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
  67. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
  68. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
  69. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
  70. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
  71. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
  72. cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
  73. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
  74. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
  75. cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
  76. cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
  77. cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
  78. cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
  79. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
  80. cuda/cccl/headers/include/cuda/__cccl_config +1 -0
  81. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
  82. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
  83. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
  84. cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
  85. cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
  86. cuda/cccl/headers/include/cuda/__device/arch_traits.h +247 -323
  87. cuda/cccl/headers/include/cuda/__device/attributes.h +174 -123
  88. cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
  89. cuda/cccl/headers/include/cuda/__device/device_ref.h +27 -49
  90. cuda/cccl/headers/include/cuda/__device/physical_device.h +100 -96
  91. cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
  92. cuda/cccl/headers/include/cuda/__event/event.h +27 -26
  93. cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
  94. cuda/cccl/headers/include/cuda/__event/timed_event.h +10 -7
  95. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  96. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
  97. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
  98. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
  99. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
  100. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
  101. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
  102. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
  103. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
  104. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +21 -137
  105. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
  106. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
  107. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
  108. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
  109. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
  110. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
  111. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
  112. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
  113. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
  114. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
  115. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
  116. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +9 -7
  117. cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
  118. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -16
  119. cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
  120. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  121. cuda/cccl/headers/include/cuda/algorithm +1 -1
  122. cuda/cccl/headers/include/cuda/cmath +1 -0
  123. cuda/cccl/headers/include/cuda/devices +13 -0
  124. cuda/cccl/headers/include/cuda/iterator +1 -0
  125. cuda/cccl/headers/include/cuda/memory +1 -0
  126. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
  127. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
  128. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
  129. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
  130. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
  131. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
  132. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
  133. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
  134. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
  135. cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
  136. cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
  137. cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
  138. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  139. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
  140. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
  141. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
  142. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  143. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  144. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
  145. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
  146. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
  147. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
  148. cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
  149. cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
  150. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
  151. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
  152. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
  153. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
  154. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
  155. cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
  156. cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
  157. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
  158. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
  159. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
  160. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
  161. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
  162. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
  163. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
  164. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
  165. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
  166. cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
  167. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
  168. cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
  169. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
  170. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
  171. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
  172. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
  173. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
  174. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
  175. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
  176. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
  177. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
  178. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
  179. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
  180. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
  181. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
  182. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
  183. cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
  184. cuda/cccl/headers/include/cuda/std/bitset +1 -1
  185. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
  186. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
  187. cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
  188. cuda/cccl/headers/include/cuda/std/numbers +5 -0
  189. cuda/cccl/headers/include/cuda/std/string_view +155 -13
  190. cuda/cccl/headers/include/cuda/std/version +1 -4
  191. cuda/cccl/headers/include/cuda/stream_ref +5 -0
  192. cuda/cccl/headers/include/cuda/utility +1 -0
  193. cuda/cccl/headers/include/nv/target +7 -2
  194. cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
  195. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
  196. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
  197. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
  198. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
  199. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
  200. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
  201. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
  202. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
  203. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
  204. cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
  205. cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
  206. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
  207. cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
  208. cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
  209. cuda/cccl/headers/include/thrust/device_delete.h +18 -3
  210. cuda/cccl/headers/include/thrust/device_free.h +16 -3
  211. cuda/cccl/headers/include/thrust/device_new.h +29 -8
  212. cuda/cccl/headers/include/thrust/host_vector.h +1 -1
  213. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
  214. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
  215. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
  216. cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
  217. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
  218. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
  219. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
  220. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
  221. cuda/cccl/parallel/experimental/__init__.py +21 -74
  222. cuda/compute/__init__.py +79 -0
  223. cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +43 -1
  224. cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +157 -8
  225. cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
  226. cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
  227. cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
  228. cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
  229. cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +112 -40
  230. cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
  231. cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
  232. cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +36 -15
  233. cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
  234. cuda/compute/cu12/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
  235. cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
  236. cuda/compute/cu13/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
  237. cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
  238. cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +2 -0
  239. cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +36 -8
  240. cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +206 -1
  241. cuda/{cccl/parallel/experimental → compute}/numba_utils.py +2 -2
  242. cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
  243. cuda/{cccl/parallel/experimental → compute}/typing.py +2 -0
  244. cuda/coop/__init__.py +8 -0
  245. cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
  246. cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
  247. cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
  248. cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
  249. cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
  250. cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
  251. cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
  252. cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
  253. cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
  254. cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
  255. cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
  256. cuda/coop/warp/__init__.py +9 -0
  257. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
  258. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
  259. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
  260. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
  261. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +275 -276
  262. cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
  263. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
  264. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
  265. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
  266. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
  267. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
  268. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
  269. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
  270. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
  271. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
  272. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
  273. cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
  274. cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
  275. cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
  276. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
  277. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
  278. cuda/cccl/parallel/experimental/.gitignore +0 -4
  279. cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
  280. cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
  281. /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
  282. /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
  283. /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
  284. /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
  285. /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
  286. /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
  287. /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
  288. /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
  289. /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
  290. /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
  291. /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
  292. /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
  293. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
  294. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
@@ -22,6 +22,16 @@
22
22
  #endif // no system header
23
23
 
24
24
  #include <cuda/__mdspan/restrict_accessor.h>
25
+ #include <cuda/std/__concepts/concept_macros.h>
26
+ #include <cuda/std/__fwd/array.h>
27
+ #include <cuda/std/__fwd/span.h>
28
+ #include <cuda/std/__type_traits/extent.h>
29
+ #include <cuda/std/__type_traits/is_convertible.h>
30
+ #include <cuda/std/__type_traits/is_pointer.h>
31
+ #include <cuda/std/__type_traits/rank.h>
32
+ #include <cuda/std/__type_traits/remove_all_extents.h>
33
+ #include <cuda/std/__type_traits/remove_pointer.h>
34
+ #include <cuda/std/__type_traits/remove_reference.h>
25
35
  #include <cuda/std/mdspan>
26
36
 
27
37
  #include <cuda/std/__cccl/prologue.h>
@@ -32,7 +42,63 @@ template <typename _ElementType,
32
42
  typename _Extents,
33
43
  typename _LayoutPolicy = ::cuda::std::layout_right,
34
44
  typename _AccessorPolicy = ::cuda::std::default_accessor<_ElementType>>
35
- using restrict_mdspan = ::cuda::std::mdspan<_ElementType, _Extents, _LayoutPolicy, restrict_accessor<_AccessorPolicy>>;
45
+ class restrict_mdspan
46
+ : public ::cuda::std::mdspan<_ElementType, _Extents, _LayoutPolicy, restrict_accessor<_AccessorPolicy>>
47
+ {
48
+ public:
49
+ _LIBCUDACXX_DELEGATE_CONSTRUCTORS(
50
+ restrict_mdspan, ::cuda::std::mdspan, _ElementType, _Extents, _LayoutPolicy, restrict_accessor<_AccessorPolicy>);
51
+
52
+ _CCCL_API friend constexpr void swap(restrict_mdspan& __x, restrict_mdspan& __y) noexcept
53
+ {
54
+ swap(static_cast<__base&>(__x), static_cast<__base&>(__y));
55
+ }
56
+ };
57
+
58
+ _CCCL_TEMPLATE(class _ElementType, class... _OtherIndexTypes)
59
+ _CCCL_REQUIRES((sizeof...(_OtherIndexTypes) > 0)
60
+ _CCCL_AND(::cuda::std::is_convertible_v<_OtherIndexTypes, size_t>&&... && true))
61
+ _CCCL_HOST_DEVICE explicit restrict_mdspan(_ElementType*, _OtherIndexTypes...)
62
+ -> restrict_mdspan<_ElementType, ::cuda::std::extents<size_t, ::cuda::std::__maybe_static_ext<_OtherIndexTypes>...>>;
63
+
64
+ _CCCL_TEMPLATE(class _Pointer)
65
+ _CCCL_REQUIRES(::cuda::std::is_pointer_v<::cuda::std::remove_reference_t<_Pointer>>)
66
+ _CCCL_HOST_DEVICE restrict_mdspan(_Pointer&&)
67
+ -> restrict_mdspan<::cuda::std::remove_pointer_t<::cuda::std::remove_reference_t<_Pointer>>,
68
+ ::cuda::std::extents<size_t>>;
69
+
70
+ _CCCL_TEMPLATE(class _CArray)
71
+ _CCCL_REQUIRES(::cuda::std::is_array_v<_CArray> _CCCL_AND(::cuda::std::rank_v<_CArray> == 1))
72
+ _CCCL_HOST_DEVICE restrict_mdspan(_CArray&)
73
+ -> restrict_mdspan<::cuda::std::remove_all_extents_t<_CArray>,
74
+ ::cuda::std::extents<size_t, ::cuda::std::extent_v<_CArray, 0>>>;
75
+
76
+ template <class _ElementType, class _OtherIndexType, size_t _Size>
77
+ _CCCL_HOST_DEVICE restrict_mdspan(_ElementType*, const ::cuda::std::array<_OtherIndexType, _Size>&)
78
+ -> restrict_mdspan<_ElementType, ::cuda::std::dextents<size_t, _Size>>;
79
+
80
+ template <class _ElementType, class _OtherIndexType, size_t _Size>
81
+ _CCCL_HOST_DEVICE restrict_mdspan(_ElementType*, ::cuda::std::span<_OtherIndexType, _Size>)
82
+ -> restrict_mdspan<_ElementType, ::cuda::std::dextents<size_t, _Size>>;
83
+
84
+ // This one is necessary because all the constructors take `data_handle_type`s, not
85
+ // `_ElementType*`s, and `data_handle_type` is taken from `accessor_type::data_handle_type`, which
86
+ // seems to throw off automatic deduction guides.
87
+ template <class _ElementType, class _OtherIndexType, size_t... _ExtentsPack>
88
+ _CCCL_HOST_DEVICE restrict_mdspan(_ElementType*, const ::cuda::std::extents<_OtherIndexType, _ExtentsPack...>&)
89
+ -> restrict_mdspan<_ElementType, ::cuda::std::extents<_OtherIndexType, _ExtentsPack...>>;
90
+
91
+ template <class _ElementType, class _MappingType>
92
+ _CCCL_HOST_DEVICE restrict_mdspan(_ElementType*, const _MappingType&)
93
+ -> restrict_mdspan<_ElementType, typename _MappingType::extents_type, typename _MappingType::layout_type>;
94
+
95
+ template <class _MappingType, class _AccessorType>
96
+ _CCCL_HOST_DEVICE
97
+ restrict_mdspan(const typename _AccessorType::data_handle_type, const _MappingType&, const _AccessorType&)
98
+ -> restrict_mdspan<typename _AccessorType::element_type,
99
+ typename _MappingType::extents_type,
100
+ typename _MappingType::layout_type,
101
+ _AccessorType>;
36
102
 
37
103
  /***********************************************************************************************************************
38
104
  * Accessibility Traits
@@ -0,0 +1,93 @@
1
+ //===----------------------------------------------------------------------===//
2
+ //
3
+ // Part of libcu++, the C++ Standard Library for your entire system,
4
+ // under the Apache License v2.0 with LLVM Exceptions.
5
+ // See https://llvm.org/LICENSE.txt for license information.
6
+ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7
+ // SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
8
+ //
9
+ //===----------------------------------------------------------------------===//
10
+
11
+ #ifndef _CUDA___MEMORY_POINTER_IN_RANGE_H
12
+ #define _CUDA___MEMORY_POINTER_IN_RANGE_H
13
+
14
+ #include <cuda/std/detail/__config>
15
+
16
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17
+ # pragma GCC system_header
18
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
19
+ # pragma clang system_header
20
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
21
+ # pragma system_header
22
+ #endif // no system header
23
+
24
+ #include <cuda/std/__type_traits/is_constant_evaluated.h>
25
+ #include <cuda/std/cstdint>
26
+ #if _CCCL_HOST_COMPILATION()
27
+ # include <functional>
28
+ #endif // _CCCL_HOST_COMPILATION()
29
+
30
+ #include <cuda/std/__cccl/prologue.h>
31
+
32
+ _CCCL_BEGIN_NAMESPACE_CUDA
33
+
34
+ // Pointer comparison with <, <=, >=, > is undefined behavior in C++ (https://eel.is/c++draft/expr.rel#4) when pointers
35
+ // don't belong to the same object or array.
36
+ // - Even when a platform guarantees flat address space, the compiler can leverage UB for optimization purposes.
37
+ // - However, the compiler treats ::std::less<> and the other functional operators in a special way, ensuring a total ordering.
38
+ // - For device code, we can convert pointers to uintptr_t and compare them.
39
+ //
40
+ // References:
41
+ // - https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2024/p3234r0.html
42
+ // - https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2023/p2865r2.pdf
43
+ // - https://www.boost.org/doc/libs/develop/libs/core/doc/html/core/pointer_in_range.html
44
+ // - https://pvs-studio.com/en/blog/posts/cpp/1199/
45
+ // - https://releases.llvm.org/20.1.0/tools/clang/docs/ReleaseNotes.html#resolutions-to-c-defect-reports
46
+
47
+ #if _CCCL_HOST_COMPILATION()
48
+
49
+ template <typename _Tp>
50
+ [[nodiscard]] _CCCL_API bool __ptr_in_range_host(_Tp* __ptr, _Tp* __start, _Tp* __end) noexcept
51
+ {
52
+ _CCCL_ASSERT(::std::greater_equal<>{}(__end, __start), "__ptr_in_range_host: __end must be greater than __start");
53
+ return ::std::greater_equal<>{}(__ptr, __start) && ::std::less<>{}(__ptr, __end);
54
+ }
55
+
56
+ #endif // _CCCL_HOST_COMPILATION()
57
+
58
+ #if _CCCL_DEVICE_COMPILATION()
59
+
60
+ template <typename _Tp>
61
+ [[nodiscard]] _CCCL_API bool __ptr_in_range_device(_Tp* __ptr, _Tp* __start, _Tp* __end) noexcept
62
+ {
63
+ using uintptr_t = ::cuda::std::uintptr_t;
64
+ auto __end_ptr = reinterpret_cast<uintptr_t>(__end);
65
+ auto __start_ptr = reinterpret_cast<uintptr_t>(__start);
66
+ auto __ptr_ptr = reinterpret_cast<uintptr_t>(__ptr);
67
+ _CCCL_ASSERT(__end_ptr >= __start_ptr, "__ptr_in_range_device: __end must be greater than __start");
68
+ return __ptr_ptr >= __start_ptr && __ptr_ptr < __end_ptr;
69
+ }
70
+
71
+ #endif // _CCCL_DEVICE_COMPILATION()
72
+
73
+ template <typename _Tp>
74
+ [[nodiscard]] _CCCL_API constexpr bool ptr_in_range(_Tp* __ptr, _Tp* __start, _Tp* __end) noexcept
75
+ {
76
+ if (::cuda::std::__cccl_default_is_constant_evaluated())
77
+ {
78
+ _CCCL_ASSERT(__end >= __start, "ptr_in_range: __end must be greater than __start");
79
+ return __ptr >= __start && __ptr < __end; // UB is not possible in a constant expression
80
+ }
81
+ else
82
+ {
83
+ NV_IF_ELSE_TARGET(NV_IS_HOST,
84
+ (return ::cuda::__ptr_in_range_host(__ptr, __start, __end);),
85
+ (return ::cuda::__ptr_in_range_device(__ptr, __start, __end);));
86
+ }
87
+ }
88
+
89
+ _CCCL_END_NAMESPACE_CUDA
90
+
91
+ #include <cuda/std/__cccl/epilogue.h>
92
+
93
+ #endif // _CUDA___MEMORY_POINTER_IN_RANGE_H
@@ -8,8 +8,8 @@
8
8
  //
9
9
  //===----------------------------------------------------------------------===//
10
10
 
11
- #ifndef _CUDA___MEMORY_RESOURCE_GET_MEMORY_RESOURCE_CUH
12
- #define _CUDA___MEMORY_RESOURCE_GET_MEMORY_RESOURCE_CUH
11
+ #ifndef _CUDA___MEMORY_RESOURCE_GET_MEMORY_RESOURCE_H
12
+ #define _CUDA___MEMORY_RESOURCE_GET_MEMORY_RESOURCE_H
13
13
 
14
14
  #include <cuda/std/detail/__config>
15
15
 
@@ -23,11 +23,11 @@
23
23
 
24
24
  #include <cuda/__memory_resource/properties.h>
25
25
  #include <cuda/__memory_resource/resource.h>
26
+ #include <cuda/__stream/stream_ref.h>
26
27
  #include <cuda/std/__concepts/equality_comparable.h>
27
28
  #include <cuda/std/__execution/env.h>
28
29
  #include <cuda/std/__type_traits/is_same.h>
29
30
  #include <cuda/std/__type_traits/remove_cvref.h>
30
- #include <cuda/stream_ref>
31
31
 
32
32
  #include <cuda/std/__cccl/prologue.h>
33
33
 
@@ -79,4 +79,4 @@ _CCCL_END_NAMESPACE_CUDA_MR
79
79
 
80
80
  #include <cuda/std/__cccl/epilogue.h>
81
81
 
82
- #endif //_CUDAX__MEMORY_RESOURCE_GET_MEMORY_RESOURCE_CUH
82
+ #endif // _CUDA___MEMORY_RESOURCE_GET_MEMORY_RESOURCE_H
@@ -21,6 +21,7 @@
21
21
  # pragma system_header
22
22
  #endif // no system header
23
23
 
24
+ #include <cuda/std/__type_traits/decay.h>
24
25
  #include <cuda/std/__type_traits/type_set.h>
25
26
  #include <cuda/std/cstddef>
26
27
 
@@ -62,6 +63,49 @@ template <class... _Properties>
62
63
  inline constexpr bool __contains_execution_space_property =
63
64
  __is_host_accessible<_Properties...> || __is_device_accessible<_Properties...>;
64
65
 
66
+ //! @brief A type representing a list of memory resource properties
67
+ //! @tparam _Properties The properties to be included in the list
68
+ //! It has a member template `rebind` that instantiates a given template with caller-supplied
69
+ //! type arguments followed by the properties from this list, and a static member function
70
+ //! `has_property` that checks a queried property type against `_Properties`.
71
+ template <class... _Properties>
72
+ struct properties_list
73
+ {
74
+ //! @brief A type alias for a type template instantiated with the properties
75
+ //! from this list appended to the type arguments.
76
+ template <template <class...> class _Fn, class... _ExtraArgs>
77
+ using rebind = _Fn<_ExtraArgs..., _Properties...>; // extra arguments first, then this list's properties
78
+
79
+ template <class _QueryProperty>
80
+ _CCCL_HOST_API static constexpr bool has_property([[maybe_unused]] _QueryProperty) // the argument only carries the type; its value is never read
81
+ {
82
+ return ::cuda::std::__type_set_contains_v<::cuda::std::__make_type_set<_Properties...>, _QueryProperty>; // presumably true when _QueryProperty is one of _Properties - confirm against __type_set_contains_v
83
+ }
84
+ };
85
+
86
+ template <class _Tp>
87
+ inline constexpr bool __is_queries_list = false; // primary template: any type that is not a properties_list specialization
88
+
89
+ template <class... _Tp>
90
+ inline constexpr bool __is_queries_list<properties_list<_Tp...>> = true; // partial specialization: matches properties_list<...> only
91
+
92
+ template <typename _Tp>
93
+ _CCCL_CONCEPT __has_default_queries =
94
+ _CCCL_REQUIRES_EXPR((_Tp))(requires(__is_queries_list<typename ::cuda::std::decay_t<_Tp>::default_queries>)); // satisfied when decay_t<_Tp> has a nested default_queries type that is a properties_list
95
+
96
+ template <typename _Resource, bool _HasDefaultQueries = __has_default_queries<_Resource>>
97
+ struct __copy_default_queries; // declared only; the defaulted bool selects one of the two specializations in this group
98
+
99
+ template <typename _Resource>
100
+ struct __copy_default_queries<_Resource, true>
101
+ {
102
+ using default_queries = typename _Resource::default_queries; // re-exports the resource's own list. NOTE(review): the detection concept decays its argument but this alias does not - presumably _Resource is never a reference type here; confirm
103
+ };
104
+
105
+ template <typename _Resource>
106
+ struct __copy_default_queries<_Resource, false>
107
+ {}; // intentionally empty: no default_queries member when the resource does not provide one
108
+
65
109
  _CCCL_END_NAMESPACE_CUDA_MR
66
110
 
67
111
  #include <cuda/std/__cccl/epilogue.h>
@@ -22,6 +22,7 @@
22
22
  #endif // no system header
23
23
 
24
24
  #include <cuda/__memory_resource/get_property.h>
25
+ #include <cuda/__stream/stream_ref.h>
25
26
  #include <cuda/std/__concepts/concept_macros.h>
26
27
  #include <cuda/std/__concepts/convertible_to.h>
27
28
  #include <cuda/std/__concepts/equality_comparable.h>
@@ -29,7 +30,6 @@
29
30
  #include <cuda/std/__tuple_dir/sfinae_helpers.h>
30
31
  #include <cuda/std/__type_traits/decay.h>
31
32
  #include <cuda/std/__type_traits/fold.h>
32
- #include <cuda/stream_ref>
33
33
 
34
34
  #include <cuda/std/__cccl/prologue.h>
35
35
 
@@ -26,6 +26,7 @@
26
26
  # include <cuda/__memory_resource/get_property.h>
27
27
  # include <cuda/__memory_resource/properties.h>
28
28
  # include <cuda/__memory_resource/resource.h>
29
+ # include <cuda/__stream/stream_ref.h>
29
30
  # include <cuda/std/__concepts/concept_macros.h>
30
31
  # include <cuda/std/__memory/addressof.h>
31
32
  # include <cuda/std/__type_traits/is_base_of.h>
@@ -34,7 +35,6 @@
34
35
  # include <cuda/std/__utility/exchange.h>
35
36
  # include <cuda/std/__utility/move.h>
36
37
  # include <cuda/std/cstddef>
37
- # include <cuda/stream_ref>
38
38
 
39
39
  # include <cuda/std/__cccl/prologue.h>
40
40
 
@@ -161,10 +161,7 @@ struct _Resource_vtable_builder
161
161
  template <class _Resource>
162
162
  static void _Dealloc(void* __object, void* __ptr, size_t __bytes, size_t __alignment) noexcept
163
163
  {
164
- // TODO: this breaks RMM because their memory resources do not declare their
165
- // deallocate_sync functions to be noexcept. Comment out the check for now until
166
- // we can fix RMM.
167
- // static_assert(noexcept(static_cast<_Resource*>(__object)->deallocate(__ptr, __bytes, __alignment)));
164
+ static_assert(noexcept(static_cast<_Resource*>(__object)->deallocate_sync(__ptr, __bytes, __alignment))); // compile-time rejection of resources whose deallocate_sync is not noexcept; this function itself is declared noexcept
168
165
  return static_cast<_Resource*>(__object)->deallocate_sync(__ptr, __bytes, __alignment); // __object is cast back to the concrete _Resource and the call is forwarded
169
166
  }
170
167
 
@@ -176,8 +173,9 @@ struct _Resource_vtable_builder
176
173
 
177
174
  template <class _Resource>
178
175
  static void
179
- _Dealloc_async(void* __object, void* __ptr, size_t __bytes, size_t __alignment, ::cuda::stream_ref __stream)
176
+ _Dealloc_async(void* __object, void* __ptr, size_t __bytes, size_t __alignment, ::cuda::stream_ref __stream) noexcept
180
177
  {
178
+ static_assert(noexcept(static_cast<_Resource*>(__object)->deallocate(__stream, __ptr, __bytes, __alignment))); // the resource's stream-taking deallocate must be noexcept to match this function's noexcept
181
179
  return static_cast<_Resource*>(__object)->deallocate(__stream, __ptr, __bytes, __alignment); // note the argument order: the stream is passed first to the resource
182
180
  }
183
181
 
@@ -653,8 +653,9 @@
653
653
  #ifndef NVTX3_CPP_DEFINITIONS_V1_0
654
654
  # define NVTX3_CPP_DEFINITIONS_V1_0
655
655
 
656
+ # include <cuda/std/__cccl/memory_wrapper.h>
657
+
656
658
  # include <cstddef>
657
- # include <memory>
658
659
  # include <string>
659
660
  # include <type_traits>
660
661
  # include <utility>
@@ -11,7 +11,7 @@
11
11
  #ifndef _CUDA___RUNTIME_ENSURE_CURRENT_CONTEXT_H
12
12
  #define _CUDA___RUNTIME_ENSURE_CURRENT_CONTEXT_H
13
13
 
14
- #include <cuda/__cccl_config>
14
+ #include <cuda/std/detail/__config>
15
15
 
16
16
  #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17
17
  # pragma GCC system_header
@@ -23,7 +23,8 @@
23
23
 
24
24
  #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
25
25
 
26
- # include <cuda/__device/all_devices.h>
26
+ # include <cuda/__device/device_ref.h>
27
+ # include <cuda/__device/physical_device.h>
27
28
  # include <cuda/__driver/driver_api.h>
28
29
 
29
30
  # include <cuda/std/__cccl/prologue.h>
@@ -31,6 +32,7 @@
31
32
  # ifndef _CCCL_DOXYGEN_INVOKED // Do not document
32
33
 
33
34
  _CCCL_BEGIN_NAMESPACE_CUDA
35
+
34
36
  class stream_ref;
35
37
 
36
38
  //! @brief RAII helper which on construction sets the current context to the specified one.
@@ -44,9 +46,9 @@ struct [[maybe_unused]] __ensure_current_context
44
46
  //! @param new_device The device to switch the context to
45
47
  //!
46
48
  //! @throws cuda_error if the context switch fails
47
- explicit __ensure_current_context(device_ref __new_device)
49
+ _CCCL_HOST_API explicit __ensure_current_context(device_ref __new_device)
48
50
  {
49
- auto __ctx = devices[__new_device.get()].primary_context();
51
+ auto __ctx = ::cuda::__physical_devices()[__new_device.get()].__primary_context();
50
52
  ::cuda::__driver::__ctxPush(__ctx);
51
53
  }
52
54
 
@@ -56,7 +58,7 @@ struct [[maybe_unused]] __ensure_current_context
56
58
  //! @param ctx The context to switch to
57
59
  //!
58
60
  //! @throws cuda_error if the context switch fails
59
- explicit __ensure_current_context(::CUcontext __ctx)
61
+ _CCCL_HOST_API explicit __ensure_current_context(::CUcontext __ctx)
60
62
  {
61
63
  ::cuda::__driver::__ctxPush(__ctx);
62
64
  }
@@ -67,7 +69,7 @@ struct [[maybe_unused]] __ensure_current_context
67
69
  //! @param stream Stream indicating the context to switch to
68
70
  //!
69
71
  //! @throws cuda_error if the context switch fails
70
- explicit __ensure_current_context(stream_ref __stream);
72
+ _CCCL_HOST_API explicit __ensure_current_context(stream_ref __stream);
71
73
 
72
74
  __ensure_current_context(__ensure_current_context&&) = delete;
73
75
  __ensure_current_context(__ensure_current_context const&) = delete;
@@ -79,7 +81,7 @@ struct [[maybe_unused]] __ensure_current_context
79
81
  //!
80
82
  //! @throws cuda_error if the device switch fails. If the destructor is called
81
83
  //! during stack unwinding, the program is automatically terminated.
82
- ~__ensure_current_context() noexcept(false)
84
+ _CCCL_HOST_API ~__ensure_current_context() noexcept(false)
83
85
  {
84
86
  // TODO would it make sense to assert here that we pushed and popped the same thing?
85
87
  ::cuda::__driver::__ctxPop();
@@ -43,7 +43,7 @@ struct stream : stream_ref
43
43
  //! Priority is defaulted to stream::default_priority
44
44
  //!
45
45
  //! @throws cuda_error if stream creation fails
46
- explicit stream(device_ref __dev, int __priority = default_priority)
46
+ _CCCL_HOST_API explicit stream(device_ref __dev, int __priority = default_priority)
47
47
  : stream_ref(__detail::__invalid_stream)
48
48
  {
49
49
  [[maybe_unused]] __ensure_current_context __ctx_setter(__dev);
@@ -54,7 +54,7 @@ struct stream : stream_ref
54
54
  //!
55
55
  //! @post `stream()` returns an invalid stream handle
56
56
  // Can't be constexpr because __invalid_stream isn't
57
- explicit stream(no_init_t) noexcept
57
+ _CCCL_HOST_API explicit stream(no_init_t) noexcept
58
58
  : stream_ref(__detail::__invalid_stream)
59
59
  {}
60
60
 
@@ -63,7 +63,7 @@ struct stream : stream_ref
63
63
  //! @param __other
64
64
  //!
65
65
  //! @post `__other` is in moved-from state.
66
- stream(stream&& __other) noexcept
66
+ _CCCL_HOST_API stream(stream&& __other) noexcept
67
67
  : stream(::cuda::std::exchange(__other.__stream, __detail::__invalid_stream))
68
68
  {}
69
69
 
@@ -72,7 +72,7 @@ struct stream : stream_ref
72
72
  //! Destroy the `stream` object
73
73
  //!
74
74
  //! @note If the stream fails to be destroyed, the error is silently ignored.
75
- ~stream()
75
+ _CCCL_HOST_API ~stream()
76
76
  {
77
77
  if (__stream != __detail::__invalid_stream)
78
78
  {
@@ -87,7 +87,7 @@ struct stream : stream_ref
87
87
  //! @param __other
88
88
  //!
89
89
  //! @post `__other` is in a moved-from state.
90
- stream& operator=(stream&& __other) noexcept
90
+ _CCCL_HOST_API stream& operator=(stream&& __other) noexcept
91
91
  {
92
92
  stream __tmp(::cuda::std::move(__other));
93
93
  ::cuda::std::swap(__stream, __tmp.__stream);
@@ -103,7 +103,7 @@ struct stream : stream_ref
103
103
  //! @return stream The constructed `stream` object
104
104
  //!
105
105
  //! @note The constructed `stream` object takes ownership of the native handle.
106
- [[nodiscard]] static stream from_native_handle(::cudaStream_t __handle)
106
+ [[nodiscard]] static _CCCL_HOST_API stream from_native_handle(::cudaStream_t __handle)
107
107
  {
108
108
  return stream(__handle);
109
109
  }
@@ -119,7 +119,7 @@ struct stream : stream_ref
119
119
  //! @return cudaStream_t The native handle being held by the `stream` object.
120
120
  //!
121
121
  //! @post The stream object is in a moved-from state.
122
- [[nodiscard]] ::cudaStream_t release()
122
+ [[nodiscard]] _CCCL_HOST_API ::cudaStream_t release()
123
123
  {
124
124
  return ::cuda::std::exchange(__stream, __detail::__invalid_stream);
125
125
  }
@@ -127,7 +127,7 @@ struct stream : stream_ref
127
127
  private:
128
128
  // Use `stream::from_native_handle(s)` to construct an owning `stream`
129
129
  // object from a `cudaStream_t` handle.
130
- explicit stream(::cudaStream_t __handle)
130
+ _CCCL_HOST_API explicit stream(::cudaStream_t __handle)
131
131
  : stream_ref(__handle)
132
132
  {}
133
133
  };
@@ -23,12 +23,14 @@
23
23
 
24
24
  #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
25
25
 
26
+ # include <cuda/__device/device_ref.h>
26
27
  # include <cuda/__driver/driver_api.h>
27
28
  # include <cuda/__event/timed_event.h>
28
29
  # include <cuda/__fwd/get_stream.h>
29
30
  # include <cuda/__runtime/ensure_current_context.h>
30
31
  # include <cuda/__utility/no_init.h>
31
32
  # include <cuda/std/__exception/cuda_error.h>
33
+ # include <cuda/std/__utility/to_underlying.h>
32
34
  # include <cuda/std/cstddef>
33
35
 
34
36
  # include <cuda/std/__cccl/prologue.h>
@@ -60,9 +62,10 @@ public:
60
62
  //!
61
63
  //! For behavior of the default stream,
62
64
  //! @see https://docs.nvidia.com/cuda/cuda-runtime-api/stream-sync-behavior.html
63
- [[deprecated("Using the default/null stream is generally discouraged. If you need to use it, please construct a "
64
- "stream_ref from cudaStream_t{nullptr}")]]
65
- _CCCL_HIDE_FROM_ABI stream_ref() = default;
65
+ CCCL_DEPRECATED_BECAUSE("Using the default/null stream is generally discouraged. If you need to use it, please "
66
+ "construct a "
67
+ "stream_ref from cudaStream_t{nullptr}") _CCCL_HIDE_FROM_ABI
68
+ stream_ref() = default;
66
69
 
67
70
  //! @brief Constructs a `stream_ref` from a `cudaStream_t` handle.
68
71
  //!
@@ -123,8 +126,7 @@ public:
123
126
  //! @brief Deprecated. Use sync() instead.
124
127
  //!
125
128
  //! @deprecated Use sync() instead.
126
- [[deprecated("Use sync() instead.")]]
127
- void wait() const
129
+ CCCL_DEPRECATED_BECAUSE("Use sync() instead.") _CCCL_HOST_API void wait() const
128
130
  {
129
131
  sync();
130
132
  }
@@ -183,7 +185,7 @@ public:
183
185
  //! @throws cuda::cuda_error if the query fails.
184
186
  //!
185
187
  //! @return `true` if all operations have completed, or `false` if not.
186
- [[deprecated("Use is_done() instead.")]] [[nodiscard]] bool ready() const
188
+ [[nodiscard]] CCCL_DEPRECATED_BECAUSE("Use is_done() instead.") _CCCL_HOST_API bool ready() const
187
189
  {
188
190
  return is_done();
189
191
  }
@@ -215,7 +217,7 @@ public:
215
217
  //! @return A new event that was recorded into this stream
216
218
  //!
217
219
  //! @throws cuda_error if event creation or record failed
218
- [[nodiscard]] _CCCL_HOST_API event record_event(event::flags __flags = event::flags::none) const
220
+ [[nodiscard]] _CCCL_HOST_API event record_event(event_flags __flags = event_flags::none) const
219
221
  {
220
222
  return event(*this, __flags);
221
223
  }
@@ -225,7 +227,7 @@ public:
225
227
  //! @return A new timed event that was recorded into this stream
226
228
  //!
227
229
  //! @throws cuda_error if event creation or record failed
228
- [[nodiscard]] _CCCL_HOST_API timed_event record_timed_event(event::flags __flags = event::flags::none) const
230
+ [[nodiscard]] _CCCL_HOST_API timed_event record_timed_event(event_flags __flags = event_flags::none) const
229
231
  {
230
232
  return timed_event(*this, __flags);
231
233
  }
@@ -236,7 +238,7 @@ public:
236
238
  //! returned
237
239
  //!
238
240
  //! @throws cuda_error if device check fails
239
- _CCCL_HOST_API device_ref device() const
241
+ [[nodiscard]] _CCCL_HOST_API device_ref device() const
240
242
  {
241
243
  ::CUdevice __device{};
242
244
  # if _CCCL_CTK_AT_LEAST(13, 0)
@@ -259,7 +261,7 @@ public:
259
261
  }
260
262
  };
261
263
 
262
- inline void event_ref::record(stream_ref __stream) const
264
+ _CCCL_HOST_API inline void event_ref::record(stream_ref __stream) const
263
265
  {
264
266
  _CCCL_ASSERT(__event_ != nullptr, "cuda::event_ref::record no event set");
265
267
  _CCCL_ASSERT(__stream.get() != nullptr, "cuda::event_ref::record invalid stream passed");
@@ -267,26 +269,26 @@ inline void event_ref::record(stream_ref __stream) const
267
269
  ::cuda::__driver::__eventRecord(__event_, __stream.get());
268
270
  }
269
271
 
270
- inline event::event(stream_ref __stream, event::flags __flags)
271
- : event(__stream, static_cast<unsigned>(__flags) | cudaEventDisableTiming)
272
+ _CCCL_HOST_API inline event::event(stream_ref __stream, event_flags __flags)
273
+ : event(__stream, ::cuda::std::to_underlying(__flags) | cudaEventDisableTiming) // delegates to the (stream_ref, unsigned) constructor; cudaEventDisableTiming is always OR-ed into the caller's flags
272
274
  {
273
275
  record(__stream); // the new event is recorded into __stream before the constructor returns
274
276
  }
275
277
 
276
- inline event::event(stream_ref __stream, unsigned __flags)
278
+ _CCCL_HOST_API inline event::event(stream_ref __stream, unsigned __flags)
277
279
  : event_ref(::cudaEvent_t{})
278
280
  {
279
281
  [[maybe_unused]] __ensure_current_context __ctx_setter(__stream); // RAII guard: pushes the stream's context for the duration of this constructor body
280
282
  __event_ = ::cuda::__driver::__eventCreate(static_cast<unsigned>(__flags)); // creates the event only; unlike the event_flags overload, nothing is recorded here
281
283
  }
282
284
 
283
- inline timed_event::timed_event(stream_ref __stream, event::flags __flags)
284
- : event(__stream, static_cast<unsigned>(__flags))
285
+ _CCCL_HOST_API inline timed_event::timed_event(stream_ref __stream, event_flags __flags)
286
+ : event(__stream, ::cuda::std::to_underlying(__flags)) // flags are forwarded as-is; cudaEventDisableTiming is not added here, unlike event's event_flags constructor
285
287
  {
286
288
  record(__stream); // the new event is recorded into __stream before the constructor returns
287
289
  }
288
290
 
289
- inline __ensure_current_context::__ensure_current_context(stream_ref __stream)
291
+ _CCCL_HOST_API inline __ensure_current_context::__ensure_current_context(stream_ref __stream)
290
292
  {
291
293
  auto __ctx = __driver::__streamGetCtx(__stream.get());
292
294
  ::cuda::__driver::__ctxPush(__ctx);
@@ -11,7 +11,7 @@
11
11
  #ifndef _CUDA___UTILITY_BASIC_ANY_H
12
12
  #define _CUDA___UTILITY_BASIC_ANY_H
13
13
 
14
- #include <cuda/__cccl_config>
14
+ #include <cuda/std/detail/__config>
15
15
 
16
16
  #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17
17
  # pragma GCC system_header
@@ -0,0 +1,65 @@
1
+ //===----------------------------------------------------------------------===//
2
+ //
3
+ // Part of libcu++, the C++ Standard Library for your entire system,
4
+ // under the Apache License v2.0 with LLVM Exceptions.
5
+ // See https://llvm.org/LICENSE.txt for license information.
6
+ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7
+ // SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
8
+ //
9
+ //===----------------------------------------------------------------------===//
10
+
11
+ #ifndef _CUDA___UTILITY_IN_RANGE_H
12
+ #define _CUDA___UTILITY_IN_RANGE_H
13
+
14
+ #include <cuda/std/detail/__config>
15
+
16
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17
+ # pragma GCC system_header
18
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
19
+ # pragma clang system_header
20
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
21
+ # pragma system_header
22
+ #endif // no system header
23
+
24
+ #include <cuda/__type_traits/is_floating_point.h>
25
+ #include <cuda/std/__cmath/isnan.h>
26
+ #include <cuda/std/__concepts/concept_macros.h>
27
+ #include <cuda/std/__type_traits/conditional.h>
28
+ #include <cuda/std/__type_traits/is_extended_floating_point.h>
29
+ #include <cuda/std/__type_traits/is_integer.h>
30
+ #include <cuda/std/__type_traits/is_unsigned_integer.h>
31
+
32
+ #include <cuda/std/__cccl/prologue.h>
33
+
34
+ _CCCL_BEGIN_NAMESPACE_CUDA
35
+
36
+ _CCCL_TEMPLATE(typename _Tp)
37
+ _CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp> || ::cuda::std::is_floating_point_v<_Tp>
38
+ || ::cuda::std::__is_extended_floating_point_v<_Tp>)
39
+ [[nodiscard]] _CCCL_API constexpr bool in_range(_Tp __v, _Tp __start, _Tp __end) noexcept // returns whether __start <= __v <= __end; both bounds are inclusive
40
+ {
41
+ _CCCL_ASSERT(::cuda::std::isnan(__start) || ::cuda::std::isnan(__end) || __end >= __start,
42
+ "in_range: __end must be greater than or equal to __start"); // the ordering precondition is waived when either bound is NaN
43
+ if constexpr (::cuda::std::__cccl_is_unsigned_integer_v<_Tp>)
44
+ {
45
+ // The precondition gives __end >= __start, so __end - __start cannot wrap. If __v < __start, the unsigned subtraction
46
+ // __v - __start wraps around to a value larger than that width, so one comparison checks both bounds. This is useful
47
+ // when __start and __end are compile-time constants, or when in_range is used multiple times with the same range.
48
+ using _Up = ::cuda::std::conditional_t<(sizeof(_Tp) <= sizeof(unsigned)), unsigned, _Tp>; // at least as wide as unsigned, so the subtractions below are not promoted to signed int
49
+ const auto __start1 = static_cast<_Up>(__start);
50
+ const auto __end1 = static_cast<_Up>(__end);
51
+ const auto __v1 = static_cast<_Up>(__v);
52
+ const auto __range = __end1 - __start1;
53
+ return (__v1 - __start1) <= __range;
54
+ }
55
+ else
56
+ {
57
+ return __v >= __start && __v <= __end; // signed integers and floating-point types: plain two-sided comparison
58
+ }
59
+ }
60
+
61
+ _CCCL_END_NAMESPACE_CUDA
62
+
63
+ #include <cuda/std/__cccl/epilogue.h>
64
+
65
+ #endif // _CUDA___UTILITY_IN_RANGE_H
@@ -11,7 +11,7 @@
11
11
  #ifndef _CUDA_ALGORITHM
12
12
  #define _CUDA_ALGORITHM
13
13
 
14
- #include <cuda/__cccl_config>
14
+ #include <cuda/std/detail/__config>
15
15
 
16
16
  #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17
17
  # pragma GCC system_header