cuda-cccl 0.3.0__cp310-cp310-manylinux_2_24_aarch64.whl → 0.3.2__cp310-cp310-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (294) hide show
  1. cuda/cccl/cooperative/__init__.py +7 -1
  2. cuda/cccl/cooperative/experimental/__init__.py +21 -5
  3. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
  4. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
  5. cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
  6. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
  7. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
  8. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
  9. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
  10. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
  11. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
  12. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
  13. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
  14. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
  15. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
  16. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
  17. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
  18. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
  19. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
  20. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
  21. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
  22. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
  23. cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
  24. cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
  25. cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
  26. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +4 -2
  27. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
  28. cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
  29. cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
  30. cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
  31. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
  32. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
  33. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
  34. cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
  35. cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
  36. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +170 -260
  37. cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
  38. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +6 -7
  39. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
  40. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +12 -29
  41. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +2 -7
  42. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
  43. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
  44. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
  45. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
  46. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
  47. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
  48. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
  49. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
  50. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
  51. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
  52. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
  53. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
  54. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
  55. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
  56. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
  57. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
  58. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
  59. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
  60. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
  61. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
  62. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
  63. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
  64. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
  65. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
  66. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
  67. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
  68. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
  69. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
  70. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
  71. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
  72. cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
  73. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
  74. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
  75. cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
  76. cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
  77. cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
  78. cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
  79. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
  80. cuda/cccl/headers/include/cuda/__cccl_config +1 -0
  81. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
  82. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
  83. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
  84. cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
  85. cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
  86. cuda/cccl/headers/include/cuda/__device/arch_traits.h +247 -323
  87. cuda/cccl/headers/include/cuda/__device/attributes.h +174 -123
  88. cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
  89. cuda/cccl/headers/include/cuda/__device/device_ref.h +27 -49
  90. cuda/cccl/headers/include/cuda/__device/physical_device.h +100 -96
  91. cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
  92. cuda/cccl/headers/include/cuda/__event/event.h +27 -26
  93. cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
  94. cuda/cccl/headers/include/cuda/__event/timed_event.h +10 -7
  95. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  96. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
  97. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
  98. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
  99. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
  100. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
  101. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
  102. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
  103. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
  104. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +21 -137
  105. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
  106. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
  107. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
  108. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
  109. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
  110. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
  111. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
  112. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
  113. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
  114. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
  115. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
  116. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +9 -7
  117. cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
  118. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -16
  119. cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
  120. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  121. cuda/cccl/headers/include/cuda/algorithm +1 -1
  122. cuda/cccl/headers/include/cuda/cmath +1 -0
  123. cuda/cccl/headers/include/cuda/devices +13 -0
  124. cuda/cccl/headers/include/cuda/iterator +1 -0
  125. cuda/cccl/headers/include/cuda/memory +1 -0
  126. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
  127. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
  128. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
  129. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
  130. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
  131. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
  132. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
  133. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
  134. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
  135. cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
  136. cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
  137. cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
  138. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  139. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
  140. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
  141. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
  142. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  143. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  144. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
  145. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
  146. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
  147. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
  148. cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
  149. cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
  150. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
  151. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
  152. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
  153. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
  154. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
  155. cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
  156. cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
  157. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
  158. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
  159. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
  160. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
  161. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
  162. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
  163. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
  164. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
  165. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
  166. cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
  167. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
  168. cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
  169. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
  170. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
  171. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
  172. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
  173. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
  174. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
  175. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
  176. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
  177. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
  178. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
  179. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
  180. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
  181. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
  182. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
  183. cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
  184. cuda/cccl/headers/include/cuda/std/bitset +1 -1
  185. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
  186. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
  187. cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
  188. cuda/cccl/headers/include/cuda/std/numbers +5 -0
  189. cuda/cccl/headers/include/cuda/std/string_view +155 -13
  190. cuda/cccl/headers/include/cuda/std/version +1 -4
  191. cuda/cccl/headers/include/cuda/stream_ref +5 -0
  192. cuda/cccl/headers/include/cuda/utility +1 -0
  193. cuda/cccl/headers/include/nv/target +7 -2
  194. cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
  195. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
  196. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
  197. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
  198. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
  199. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
  200. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
  201. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
  202. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
  203. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
  204. cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
  205. cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
  206. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
  207. cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
  208. cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
  209. cuda/cccl/headers/include/thrust/device_delete.h +18 -3
  210. cuda/cccl/headers/include/thrust/device_free.h +16 -3
  211. cuda/cccl/headers/include/thrust/device_new.h +29 -8
  212. cuda/cccl/headers/include/thrust/host_vector.h +1 -1
  213. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
  214. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
  215. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
  216. cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
  217. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
  218. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
  219. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
  220. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
  221. cuda/cccl/parallel/experimental/__init__.py +21 -74
  222. cuda/compute/__init__.py +79 -0
  223. cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +43 -1
  224. cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +157 -8
  225. cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
  226. cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
  227. cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
  228. cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
  229. cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +112 -40
  230. cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
  231. cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
  232. cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +36 -15
  233. cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
  234. cuda/compute/cu12/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  235. cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
  236. cuda/compute/cu13/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  237. cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
  238. cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +2 -0
  239. cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +36 -8
  240. cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +206 -1
  241. cuda/{cccl/parallel/experimental → compute}/numba_utils.py +2 -2
  242. cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
  243. cuda/{cccl/parallel/experimental → compute}/typing.py +2 -0
  244. cuda/coop/__init__.py +8 -0
  245. cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
  246. cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
  247. cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
  248. cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
  249. cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
  250. cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
  251. cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
  252. cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
  253. cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
  254. cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
  255. cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
  256. cuda/coop/warp/__init__.py +9 -0
  257. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
  258. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
  259. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
  260. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
  261. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +275 -276
  262. cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
  263. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
  264. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
  265. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
  266. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
  267. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
  268. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
  269. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
  270. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
  271. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
  272. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
  273. cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
  274. cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
  275. cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
  276. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
  277. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
  278. cuda/cccl/parallel/experimental/.gitignore +0 -4
  279. cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  280. cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  281. /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
  282. /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
  283. /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
  284. /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
  285. /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
  286. /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
  287. /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
  288. /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
  289. /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
  290. /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
  291. /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
  292. /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
  293. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
  294. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
@@ -216,11 +216,10 @@ _CCCL_HOST_API inline void __deviceGetName(char* __name_out, int __len, int __or
216
216
  return __result;
217
217
  }
218
218
 
219
- _CCCL_HOST_API inline void __primaryCtxRelease(::CUdevice __dev)
219
+ [[nodiscard]] _CCCL_HOST_API inline ::cudaError_t __primaryCtxReleaseNoThrow(::CUdevice __dev)
220
220
  {
221
221
  static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuDevicePrimaryCtxRelease);
222
- // TODO we might need to ignore failure here
223
- ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to release context for a device", __dev);
222
+ return static_cast<::cudaError_t>(__driver_fn(__dev));
224
223
  }
225
224
 
226
225
  [[nodiscard]] _CCCL_HOST_API inline bool __isPrimaryCtxActive(::CUdevice __dev)
@@ -325,6 +324,109 @@ _CCCL_HOST_API void __memsetAsync(void* __dst, _Tp __value, size_t __count, ::CU
325
324
  }
326
325
  }
327
326
 
327
+ _CCCL_HOST_API inline ::cudaError_t __mempoolCreateNoThrow(::CUmemoryPool* __pool, ::CUmemPoolProps* __props)
328
+ {
329
+ static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemPoolCreate);
330
+ return static_cast<::cudaError_t>(__driver_fn(__pool, __props));
331
+ }
332
+
333
+ _CCCL_HOST_API inline void __mempoolSetAttribute(::CUmemoryPool __pool, ::CUmemPool_attribute __attr, void* __value)
334
+ {
335
+ static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemPoolSetAttribute);
336
+ ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to set attribute for a memory pool", __pool, __attr, __value);
337
+ }
338
+
339
+ _CCCL_HOST_API inline size_t __mempoolGetAttribute(::CUmemoryPool __pool, ::CUmemPool_attribute __attr)
340
+ {
341
+ size_t __value = 0;
342
+ static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemPoolGetAttribute);
343
+ ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to get attribute for a memory pool", __pool, __attr, &__value);
344
+ return __value;
345
+ }
346
+
347
+ _CCCL_HOST_API inline void __mempoolDestroy(::CUmemoryPool __pool)
348
+ {
349
+ static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemPoolDestroy);
350
+ ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to destroy a memory pool", __pool);
351
+ }
352
+
353
+ _CCCL_HOST_API inline ::CUdeviceptr
354
+ __mallocFromPoolAsync(::cuda::std::size_t __bytes, ::CUmemoryPool __pool, ::CUstream __stream)
355
+ {
356
+ static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemAllocFromPoolAsync);
357
+ ::CUdeviceptr __result = 0;
358
+ ::cuda::__driver::__call_driver_fn(
359
+ __driver_fn, "Failed to allocate memory from a memory pool", &__result, __bytes, __pool, __stream);
360
+ return __result;
361
+ }
362
+
363
+ _CCCL_HOST_API inline void __mempoolTrimTo(::CUmemoryPool __pool, ::cuda::std::size_t __min_bytes_to_keep)
364
+ {
365
+ static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemPoolTrimTo);
366
+ ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to trim a memory pool", __pool, __min_bytes_to_keep);
367
+ }
368
+
369
+ _CCCL_HOST_API inline ::cudaError_t __freeAsyncNoThrow(::CUdeviceptr __dptr, ::CUstream __stream)
370
+ {
371
+ static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemFreeAsync);
372
+ return static_cast<::cudaError_t>(__driver_fn(__dptr, __stream));
373
+ }
374
+
375
+ _CCCL_HOST_API inline void __mempoolSetAccess(::CUmemoryPool __pool, ::CUmemAccessDesc* __descs, ::size_t __count)
376
+ {
377
+ static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemPoolSetAccess);
378
+ ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to set access of a memory pool", __pool, __descs, __count);
379
+ }
380
+
381
+ _CCCL_HOST_API inline ::CUmemAccess_flags __mempoolGetAccess(::CUmemoryPool __pool, ::CUmemLocation* __location)
382
+ {
383
+ static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemPoolGetAccess);
384
+ ::CUmemAccess_flags __flags;
385
+ ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to get access of a memory pool", &__flags, __pool, __location);
386
+ return __flags;
387
+ }
388
+
389
+ # if _CCCL_CTK_AT_LEAST(13, 0)
390
+ _CCCL_HOST_API inline ::CUmemoryPool
391
+ __getDefaultMemPool(CUmemLocation __location, CUmemAllocationType_enum __allocation_type)
392
+ {
393
+ static auto __driver_fn =
394
+ _CCCLRT_GET_DRIVER_FUNCTION_VERSIONED(cuMemGetDefaultMemPool, cuMemGetDefaultMemPool, 13, 0);
395
+ ::CUmemoryPool __result = nullptr;
396
+ ::cuda::__driver::__call_driver_fn(
397
+ __driver_fn, "Failed to get default memory pool", &__result, &__location, __allocation_type);
398
+ return __result;
399
+ }
400
+ # endif // _CCCL_CTK_AT_LEAST(13, 0)
401
+
402
+ _CCCL_HOST_API inline ::CUdeviceptr __mallocManaged(::cuda::std::size_t __bytes, unsigned int __flags)
403
+ {
404
+ static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemAllocManaged);
405
+ ::CUdeviceptr __result = 0;
406
+ ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to allocate managed memory", &__result, __bytes, __flags);
407
+ return __result;
408
+ }
409
+
410
+ _CCCL_HOST_API inline void* __mallocHost(::cuda::std::size_t __bytes)
411
+ {
412
+ static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemAllocHost);
413
+ void* __result = nullptr;
414
+ ::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to allocate host memory", &__result, __bytes);
415
+ return __result;
416
+ }
417
+
418
+ _CCCL_HOST_API inline ::cudaError_t __freeNoThrow(::CUdeviceptr __dptr)
419
+ {
420
+ static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemFree);
421
+ return static_cast<::cudaError_t>(__driver_fn(__dptr));
422
+ }
423
+
424
+ _CCCL_HOST_API inline ::cudaError_t __freeHostNoThrow(void* __dptr)
425
+ {
426
+ static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemFreeHost);
427
+ return static_cast<::cudaError_t>(__driver_fn(__dptr));
428
+ }
429
+
328
430
  // Unified Addressing
329
431
 
330
432
  // TODO: we don't want to have these functions here, refactoring expected
@@ -23,12 +23,13 @@
23
23
 
24
24
  #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
25
25
 
26
+ # include <cuda/__device/device_ref.h>
26
27
  # include <cuda/__driver/driver_api.h>
27
28
  # include <cuda/__event/event_ref.h>
28
29
  # include <cuda/__runtime/ensure_current_context.h>
29
30
  # include <cuda/__utility/no_init.h>
31
+ # include <cuda/std/__utility/to_underlying.h>
30
32
  # include <cuda/std/cstddef>
31
- # include <cuda/std/utility>
32
33
 
33
34
  # include <cuda/std/__cccl/prologue.h>
34
35
 
@@ -36,38 +37,43 @@ _CCCL_BEGIN_NAMESPACE_CUDA
36
37
 
37
38
  class timed_event;
38
39
 
40
+ //! @brief Flags to use when creating the event.
41
+ enum class event_flags : unsigned
42
+ {
43
+ none = cudaEventDefault,
44
+ blocking_sync = cudaEventBlockingSync,
45
+ interprocess = cudaEventInterprocess,
46
+ };
47
+
48
+ [[nodiscard]] _CCCL_HOST_API constexpr event_flags operator|(event_flags __lhs, event_flags __rhs) noexcept
49
+ {
50
+ return static_cast<event_flags>(::cuda::std::to_underlying(__lhs) | ::cuda::std::to_underlying(__rhs));
51
+ }
52
+
39
53
  //! @brief An owning wrapper for an untimed `cudaEvent_t`.
40
54
  class event : public event_ref
41
55
  {
42
56
  friend class timed_event;
43
57
 
44
58
  public:
45
- //! @brief Flags to use when creating the event.
46
- enum class flags : unsigned
47
- {
48
- none = cudaEventDefault,
49
- blocking_sync = cudaEventBlockingSync,
50
- interprocess = cudaEventInterprocess,
51
- };
52
-
53
59
  //! @brief Construct a new `event` object with timing disabled, and record
54
60
  //! the event in the specified stream.
55
61
  //!
56
62
  //! @throws cuda_error if the event creation fails.
57
- explicit event(stream_ref __stream, flags __flags = flags::none);
63
+ _CCCL_HOST_API explicit event(stream_ref __stream, event_flags __flags = event_flags::none);
58
64
 
59
65
  //! @brief Construct a new `event` object with timing disabled. The event can only be recorded on streams from the
60
66
  //! specified device.
61
67
  //!
62
68
  //! @throws cuda_error if the event creation fails.
63
- explicit event(device_ref __device, flags __flags = flags::none)
64
- : event(__device, static_cast<unsigned int>(__flags) | cudaEventDisableTiming)
69
+ _CCCL_HOST_API explicit event(device_ref __device, event_flags __flags = event_flags::none)
70
+ : event(__device, ::cuda::std::to_underlying(__flags) | cudaEventDisableTiming)
65
71
  {}
66
72
 
67
73
  //! @brief Construct a new `event` object into the moved-from state.
68
74
  //!
69
75
  //! @post `get()` returns `cudaEvent_t()`.
70
- explicit constexpr event(no_init_t) noexcept
76
+ _CCCL_HOST_API explicit constexpr event(no_init_t) noexcept
71
77
  : event_ref(::cudaEvent_t{})
72
78
  {}
73
79
 
@@ -76,7 +82,7 @@ public:
76
82
  //! @param __other
77
83
  //!
78
84
  //! @post `__other` is in a moved-from state.
79
- constexpr event(event&& __other) noexcept
85
+ _CCCL_HOST_API constexpr event(event&& __other) noexcept
80
86
  : event_ref(::cuda::std::exchange(__other.__event_, {}))
81
87
  {}
82
88
 
@@ -86,7 +92,7 @@ public:
86
92
  //! @brief Destroy the `event` object
87
93
  //!
88
94
  //! @note If the event fails to be destroyed, the error is silently ignored.
89
- ~event()
95
+ _CCCL_HOST_API ~event()
90
96
  {
91
97
  if (__event_ != nullptr)
92
98
  {
@@ -101,7 +107,7 @@ public:
101
107
  //! @param __other
102
108
  //!
103
109
  //! @post `__other` is in a moved-from state.
104
- event& operator=(event&& __other) noexcept
110
+ _CCCL_HOST_API event& operator=(event&& __other) noexcept
105
111
  {
106
112
  event __tmp(::cuda::std::move(__other));
107
113
  ::cuda::std::swap(__event_, __tmp.__event_);
@@ -118,7 +124,7 @@ public:
118
124
  //! @return event The constructed `event` object
119
125
  //!
120
126
  //! @note The constructed `event` object takes ownership of the native handle.
121
- [[nodiscard]] static event from_native_handle(::cudaEvent_t __evnt) noexcept
127
+ [[nodiscard]] static _CCCL_HOST_API event from_native_handle(::cudaEvent_t __evnt) noexcept
122
128
  {
123
129
  return event(__evnt);
124
130
  }
@@ -134,26 +140,21 @@ public:
134
140
  //! @return cudaEvent_t The native handle being held by the `event` object.
135
141
  //!
136
142
  //! @post The event object is in a moved-from state.
137
- [[nodiscard]] constexpr ::cudaEvent_t release() noexcept
143
+ [[nodiscard]] _CCCL_HOST_API constexpr ::cudaEvent_t release() noexcept
138
144
  {
139
145
  return ::cuda::std::exchange(__event_, {});
140
146
  }
141
147
 
142
- [[nodiscard]] friend constexpr flags operator|(flags __lhs, flags __rhs) noexcept
143
- {
144
- return static_cast<flags>(static_cast<unsigned>(__lhs) | static_cast<unsigned>(__rhs));
145
- }
146
-
147
148
  private:
148
149
  // Use `event::from_native_handle(e)` to construct an owning `event`
149
150
  // object from a `cudaEvent_t` handle.
150
- explicit constexpr event(::cudaEvent_t __evnt) noexcept
151
+ _CCCL_HOST_API explicit constexpr event(::cudaEvent_t __evnt) noexcept
151
152
  : event_ref(__evnt)
152
153
  {}
153
154
 
154
- explicit event(stream_ref __stream, unsigned __flags);
155
+ _CCCL_HOST_API explicit event(stream_ref __stream, unsigned __flags);
155
156
 
156
- explicit event(device_ref __device, unsigned __flags)
157
+ _CCCL_HOST_API explicit event(device_ref __device, unsigned __flags)
157
158
  : event_ref(::cudaEvent_t{})
158
159
  {
159
160
  [[maybe_unused]] __ensure_current_context __ctx_setter(__device);
@@ -56,7 +56,7 @@ public:
56
56
  //!
57
57
  //! @note: It is the caller's responsibility to ensure the `event_ref` does not
58
58
  //! outlive the event denoted by the `cudaEvent_t` handle.
59
- constexpr event_ref(::cudaEvent_t __evnt) noexcept
59
+ _CCCL_HOST_API constexpr event_ref(::cudaEvent_t __evnt) noexcept
60
60
  : __event_(__evnt)
61
61
  {}
62
62
 
@@ -108,7 +108,7 @@ public:
108
108
  //! @brief Retrieve the native `cudaEvent_t` handle.
109
109
  //!
110
110
  //! @return cudaEvent_t The native handle being held by the event_ref object.
111
- [[nodiscard]] constexpr ::cudaEvent_t get() const noexcept
111
+ [[nodiscard]] _CCCL_HOST_API constexpr ::cudaEvent_t get() const noexcept
112
112
  {
113
113
  return __event_;
114
114
  }
@@ -116,7 +116,7 @@ public:
116
116
  //! @brief Checks if the `event_ref` is valid
117
117
  //!
118
118
  //! @return true if the `event_ref` is valid, false otherwise.
119
- [[nodiscard]] explicit constexpr operator bool() const noexcept
119
+ [[nodiscard]] _CCCL_HOST_API explicit constexpr operator bool() const noexcept
120
120
  {
121
121
  return __event_ != nullptr;
122
122
  }
@@ -129,7 +129,7 @@ public:
129
129
  //! @param __lhs The first `event_ref` to compare
130
130
  //! @param __rhs The second `event_ref` to compare
131
131
  //! @return true if `lhs` and `rhs` refer to the same `cudaEvent_t` object.
132
- [[nodiscard]] friend constexpr bool operator==(event_ref __lhs, event_ref __rhs) noexcept
132
+ [[nodiscard]] friend _CCCL_HOST_API constexpr bool operator==(event_ref __lhs, event_ref __rhs) noexcept
133
133
  {
134
134
  return __lhs.__event_ == __rhs.__event_;
135
135
  }
@@ -142,7 +142,7 @@ public:
142
142
  //! @param __lhs The first `event_ref` to compare
143
143
  //! @param __rhs The second `event_ref` to compare
144
144
  //! @return true if `lhs` and `rhs` refer to different `cudaEvent_t` objects.
145
- [[nodiscard]] friend constexpr bool operator!=(event_ref __lhs, event_ref __rhs) noexcept
145
+ [[nodiscard]] friend _CCCL_HOST_API constexpr bool operator!=(event_ref __lhs, event_ref __rhs) noexcept
146
146
  {
147
147
  return __lhs.__event_ != __rhs.__event_;
148
148
  }
@@ -26,10 +26,12 @@
26
26
 
27
27
  #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
28
28
 
29
+ # include <cuda/__device/device_ref.h>
29
30
  # include <cuda/__driver/driver_api.h>
30
31
  # include <cuda/__event/event.h>
31
32
  # include <cuda/__utility/no_init.h>
32
33
  # include <cuda/std/__chrono/duration.h>
34
+ # include <cuda/std/__utility/to_underlying.h>
33
35
  # include <cuda/std/cstddef>
34
36
 
35
37
  # include <cuda/std/__cccl/prologue.h>
@@ -44,20 +46,20 @@ public:
44
46
  //! and record the event on the specified stream.
45
47
  //!
46
48
  //! @throws cuda_error if the event creation fails.
47
- explicit timed_event(stream_ref __stream, flags __flags = flags::none);
49
+ _CCCL_HOST_API explicit timed_event(stream_ref __stream, event_flags __flags = event_flags::none);
48
50
 
49
51
  //! @brief Construct a new `timed_event` object with the specified flags. The event can only be recorded on streams
50
52
  //! from the specified device.
51
53
  //!
52
54
  //! @throws cuda_error if the event creation fails.
53
- explicit timed_event(device_ref __device, flags __flags = flags::none)
54
- : event(__device, static_cast<unsigned>(__flags))
55
+ _CCCL_HOST_API explicit timed_event(device_ref __device, event_flags __flags = event_flags::none)
56
+ : event(__device, ::cuda::std::to_underlying(__flags))
55
57
  {}
56
58
 
57
59
  //! @brief Construct a new `timed_event` object into the moved-from state.
58
60
  //!
59
61
  //! @post `get()` returns `cudaEvent_t()`.
60
- explicit constexpr timed_event(no_init_t) noexcept
62
+ _CCCL_HOST_API explicit constexpr timed_event(no_init_t) noexcept
61
63
  : event(no_init)
62
64
  {}
63
65
 
@@ -73,7 +75,7 @@ public:
73
75
  //! @return timed_event The constructed `timed_event` object
74
76
  //!
75
77
  //! @note The constructed `timed_event` object takes ownership of the native handle.
76
- [[nodiscard]] static timed_event from_native_handle(::cudaEvent_t __evnt) noexcept
78
+ [[nodiscard]] static _CCCL_HOST_API timed_event from_native_handle(::cudaEvent_t __evnt) noexcept
77
79
  {
78
80
  return timed_event(__evnt);
79
81
  }
@@ -94,7 +96,8 @@ public:
94
96
  //! @return cuda::std::chrono::nanoseconds The elapsed time in nanoseconds.
95
97
  //!
96
98
  //! @note The elapsed time has a resolution of approximately 0.5 microseconds.
97
- [[nodiscard]] friend ::cuda::std::chrono::nanoseconds operator-(const timed_event& __end, const timed_event& __start)
99
+ [[nodiscard]] friend _CCCL_HOST_API ::cuda::std::chrono::nanoseconds
100
+ operator-(const timed_event& __end, const timed_event& __start)
98
101
  {
99
102
  const auto __ms = ::cuda::__driver::__eventElapsedTime(__start.get(), __end.get());
100
103
  return ::cuda::std::chrono::nanoseconds(static_cast<::cuda::std::chrono::nanoseconds::rep>(__ms * 1'000'000.0));
@@ -103,7 +106,7 @@ public:
103
106
  private:
104
107
  // Use `timed_event::from_native_handle(e)` to construct an owning `timed_event`
105
108
  // object from a `cudaEvent_t` handle.
106
- explicit constexpr timed_event(::cudaEvent_t __evnt) noexcept
109
+ _CCCL_HOST_API explicit constexpr timed_event(::cudaEvent_t __evnt) noexcept
107
110
  : event(__evnt)
108
111
  {}
109
112
  };
@@ -0,0 +1,44 @@
1
+ //===----------------------------------------------------------------------===//
2
+ //
3
+ // Part of libcu++, the C++ Standard Library for your entire system,
4
+ // under the Apache License v2.0 with LLVM Exceptions.
5
+ // See https://llvm.org/LICENSE.txt for license information.
6
+ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7
+ // SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
8
+ //
9
+ //===----------------------------------------------------------------------===//
10
+
11
+ #ifndef _CUDA___FWD_DEVICES_H
12
+ #define _CUDA___FWD_DEVICES_H
13
+
14
+ #include <cuda/std/detail/__config>
15
+
16
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17
+ # pragma GCC system_header
18
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
19
+ # pragma clang system_header
20
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
21
+ # pragma system_header
22
+ #endif // no system header
23
+
24
+ #include <cuda/std/__fwd/span.h>
25
+
26
+ #include <cuda/std/__cccl/prologue.h>
27
+
28
+ _CCCL_BEGIN_NAMESPACE_CUDA
29
+
30
+ class __physical_device;
31
+ class device_ref;
32
+ template <::cudaDeviceAttr _Attr>
33
+ struct __dev_attr;
34
+ struct arch_traits_t;
35
+ class compute_capability;
36
+ enum class arch_id : int;
37
+
38
+ inline constexpr int __arch_specific_id_multiplier = 100000;
39
+
40
+ _CCCL_END_NAMESPACE_CUDA
41
+
42
+ #include <cuda/std/__cccl/epilogue.h>
43
+
44
+ #endif // _CUDA___FWD_DEVICES_H
@@ -42,6 +42,15 @@ inline constexpr bool __is_zip_function = false;
42
42
  template <class _Fn>
43
43
  inline constexpr bool __is_zip_function<zip_function<_Fn>> = true;
44
44
 
45
+ template <class _Fn, class... _Iterators>
46
+ class zip_transform_iterator;
47
+
48
+ template <class>
49
+ inline constexpr bool __is_zip_transform_iterator = false;
50
+
51
+ template <class _Fn, class... _Iterators>
52
+ inline constexpr bool __is_zip_transform_iterator<zip_transform_iterator<_Fn, _Iterators...>> = true;
53
+
45
54
  _CCCL_END_NAMESPACE_CUDA
46
55
 
47
56
  #include <cuda/std/__cccl/epilogue.h>
@@ -23,6 +23,7 @@
23
23
 
24
24
  #include <cuda/std/__iterator/concepts.h>
25
25
  #include <cuda/std/__iterator/iterator_traits.h>
26
+ #include <cuda/std/__ranges/compressed_movable_box.h>
26
27
  #include <cuda/std/__ranges/movable_box.h>
27
28
  #include <cuda/std/__type_traits/is_nothrow_copy_constructible.h>
28
29
  #include <cuda/std/__type_traits/is_nothrow_move_constructible.h>
@@ -64,8 +65,28 @@ class constant_iterator
64
65
  private:
65
66
  static_assert(::cuda::std::__integer_like<_Index>, "The index type of cuda::constant_iterator must be integer-like!");
66
67
 
67
- ::cuda::std::ranges::__movable_box<_Tp> __value_{::cuda::std::in_place};
68
- _Index __index_ = 0;
68
+ // Not a base because then the friend operators would be ambiguous
69
+ ::cuda::std::__compressed_movable_box<_Index, _Tp> __store_;
70
+
71
+ [[nodiscard]] _CCCL_API constexpr _Index& __index() noexcept
72
+ {
73
+ return __store_.template __get<0>();
74
+ }
75
+
76
+ [[nodiscard]] _CCCL_API constexpr const _Index& __index() const noexcept
77
+ {
78
+ return __store_.template __get<0>();
79
+ }
80
+
81
+ [[nodiscard]] _CCCL_API constexpr _Tp& __value() noexcept
82
+ {
83
+ return __store_.template __get<1>();
84
+ }
85
+
86
+ [[nodiscard]] _CCCL_API constexpr const _Tp& __value() const noexcept
87
+ {
88
+ return __store_.template __get<1>();
89
+ }
69
90
 
70
91
  public:
71
92
  using iterator_concept = ::cuda::std::random_access_iterator_tag;
@@ -78,22 +99,17 @@ public:
78
99
  using reference = _Tp;
79
100
  using pointer = void;
80
101
 
81
- #if _CCCL_HAS_CONCEPTS()
82
- _CCCL_HIDE_FROM_ABI constant_iterator()
83
- requires ::cuda::std::default_initializable<_Tp>
84
- = default;
85
- #else // ^^^ _CCCL_HAS_CONCEPTS() ^^^ / vvv !_CCCL_HAS_CONCEPTS() vvv
86
102
  _CCCL_TEMPLATE(class _Tp2 = _Tp)
87
103
  _CCCL_REQUIRES(::cuda::std::default_initializable<_Tp2>)
88
- _CCCL_API constexpr constant_iterator() noexcept(::cuda::std::is_nothrow_default_constructible_v<_Tp2>) {}
89
- #endif // !_CCCL_HAS_CONCEPTS()
104
+ _CCCL_API constexpr constant_iterator() noexcept(::cuda::std::is_nothrow_default_constructible_v<_Tp2>)
105
+ : __store_()
106
+ {}
90
107
 
91
108
  //! @brief Creates a @c constant_iterator from a value. The index is set to zero
92
109
  //! @param __value The value to store in the @c constant_iterator
93
110
  _CCCL_EXEC_CHECK_DISABLE
94
111
  _CCCL_API constexpr constant_iterator(_Tp __value) noexcept(::cuda::std::is_nothrow_move_constructible_v<_Tp>)
95
- : __value_(::cuda::std::in_place, ::cuda::std::move(__value))
96
- , __index_()
112
+ : __store_(0, ::cuda::std::move(__value))
97
113
  {}
98
114
 
99
115
  //! @brief Creates @c constant_iterator from a value and an index
@@ -104,32 +120,31 @@ public:
104
120
  _CCCL_REQUIRES(::cuda::std::__integer_like<_Index2>)
105
121
  _CCCL_API constexpr explicit constant_iterator(_Tp __value, _Index2 __index) noexcept(
106
122
  ::cuda::std::is_nothrow_move_constructible_v<_Tp>)
107
- : __value_(::cuda::std::in_place, ::cuda::std::move(__value))
108
- , __index_(static_cast<_Index>(__index))
123
+ : __store_(static_cast<_Index>(__index), ::cuda::std::move(__value))
109
124
  {}
110
125
 
111
126
  //! @brief Returns the current index
112
127
  [[nodiscard]] _CCCL_API constexpr difference_type index() const noexcept
113
128
  {
114
- return static_cast<difference_type>(__index_);
129
+ return static_cast<difference_type>(__index());
115
130
  }
116
131
 
117
132
  //! @brief Returns a const reference to the stored value
118
133
  [[nodiscard]] _CCCL_API constexpr const _Tp& operator*() const noexcept
119
134
  {
120
- return *__value_;
135
+ return __value();
121
136
  }
122
137
 
123
138
  //! @brief Returns a const reference to the stored value
124
139
  [[nodiscard]] _CCCL_API constexpr const _Tp& operator[](difference_type) const noexcept
125
140
  {
126
- return *__value_;
141
+ return __value();
127
142
  }
128
143
 
129
144
  //! @brief Increments the stored index
130
145
  _CCCL_API constexpr constant_iterator& operator++() noexcept
131
146
  {
132
- ++__index_;
147
+ ++__index();
133
148
  return *this;
134
149
  }
135
150
 
@@ -147,9 +162,9 @@ public:
147
162
  {
148
163
  if constexpr (::cuda::std::is_signed_v<_Index>)
149
164
  {
150
- _CCCL_ASSERT(__index_ > 0, "The index must be greater than or equal to 0");
165
+ _CCCL_ASSERT(__index() > 0, "The index must be greater than or equal to 0");
151
166
  }
152
- --__index_;
167
+ --__index();
153
168
  return *this;
154
169
  }
155
170
 
@@ -168,9 +183,9 @@ public:
168
183
  {
169
184
  if constexpr (::cuda::std::is_signed_v<_Index>)
170
185
  {
171
- _CCCL_ASSERT(__index_ + __n >= 0, "The index must be greater than or equal to 0");
186
+ _CCCL_ASSERT(__index() + __n >= 0, "The index must be greater than or equal to 0");
172
187
  }
173
- __index_ += static_cast<_Index>(__n);
188
+ __index() += static_cast<_Index>(__n);
174
189
  return *this;
175
190
  }
176
191
 
@@ -200,9 +215,9 @@ public:
200
215
  {
201
216
  if constexpr (::cuda::std::is_signed_v<_Index>)
202
217
  {
203
- _CCCL_ASSERT(__index_ - __n >= 0, "The index must be greater than or equal to 0");
218
+ _CCCL_ASSERT(__index() - __n >= 0, "The index must be greater than or equal to 0");
204
219
  }
205
- __index_ -= static_cast<_Index>(__n);
220
+ __index() -= static_cast<_Index>(__n);
206
221
  return *this;
207
222
  }
208
223
 
@@ -220,14 +235,14 @@ public:
220
235
  [[nodiscard]] _CCCL_API friend constexpr difference_type
221
236
  operator-(const constant_iterator& __lhs, const constant_iterator& __rhs) noexcept
222
237
  {
223
- return static_cast<difference_type>(__lhs.__index_) - static_cast<difference_type>(__rhs.__index_);
238
+ return static_cast<difference_type>(__lhs.__index()) - static_cast<difference_type>(__rhs.__index());
224
239
  }
225
240
 
226
241
  //! @brief Compares two @c constant_iterator for equality by comparing the index in the sequence
227
242
  [[nodiscard]] _CCCL_API friend constexpr bool
228
243
  operator==(const constant_iterator& __lhs, const constant_iterator& __rhs) noexcept
229
244
  {
230
- return __lhs.__index_ == __rhs.__index_;
245
+ return __lhs.__index() == __rhs.__index();
231
246
  }
232
247
 
233
248
  #if _CCCL_STD_VER <= 2017
@@ -235,7 +250,7 @@ public:
235
250
  [[nodiscard]] _CCCL_API friend constexpr bool
236
251
  operator!=(const constant_iterator& __lhs, const constant_iterator& __rhs) noexcept
237
252
  {
238
- return __lhs.__index_ != __rhs.__index_;
253
+ return __lhs.__index() != __rhs.__index();
239
254
  }
240
255
  #endif // _CCCL_STD_VER <= 2017
241
256
 
@@ -244,32 +259,32 @@ public:
244
259
  [[nodiscard]] _CCCL_API friend constexpr auto
245
260
  operator<=>(const constant_iterator& __lhs, const constant_iterator& __rhs) noexcept
246
261
  {
247
- return __lhs.__index_ <=> __rhs.__index_;
262
+ return __lhs.__index() <=> __rhs.__index();
248
263
  }
249
264
  #else // ^^^ _LIBCUDACXX_HAS_SPACESHIP_OPERATOR() ^^^ / vvv !_LIBCUDACXX_HAS_SPACESHIP_OPERATOR() vvv
250
265
  //! @brief Compares two @c constant_iterator for less than by comparing the index in the sequence
251
266
  [[nodiscard]] _CCCL_API friend constexpr bool
252
267
  operator<(const constant_iterator& __lhs, const constant_iterator& __rhs) noexcept
253
268
  {
254
- return __lhs.__index_ < __rhs.__index_;
269
+ return __lhs.__index() < __rhs.__index();
255
270
  }
256
271
  //! @brief Compares two @c constant_iterator for less equal by comparing the index in the sequence
257
272
  [[nodiscard]] _CCCL_API friend constexpr bool
258
273
  operator<=(const constant_iterator& __lhs, const constant_iterator& __rhs) noexcept
259
274
  {
260
- return __lhs.__index_ <= __rhs.__index_;
275
+ return __lhs.__index() <= __rhs.__index();
261
276
  }
262
277
  //! @brief Compares two @c constant_iterator for greater than by comparing the index in the sequence
263
278
  [[nodiscard]] _CCCL_API friend constexpr bool
264
279
  operator>(const constant_iterator& __lhs, const constant_iterator& __rhs) noexcept
265
280
  {
266
- return __lhs.__index_ > __rhs.__index_;
281
+ return __lhs.__index() > __rhs.__index();
267
282
  }
268
283
  //! @brief Compares two @c constant_iterator for greater equal by comparing the index in the sequence
269
284
  [[nodiscard]] _CCCL_API friend constexpr bool
270
285
  operator>=(const constant_iterator& __lhs, const constant_iterator& __rhs) noexcept
271
286
  {
272
- return __lhs.__index_ >= __rhs.__index_;
287
+ return __lhs.__index() >= __rhs.__index();
273
288
  }
274
289
  #endif // !_LIBCUDACXX_HAS_SPACESHIP_OPERATOR()
275
290
  };