cuda-cccl 0.3.0__cp313-cp313-manylinux_2_24_aarch64.whl → 0.3.2__cp313-cp313-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (294)
  1. cuda/cccl/cooperative/__init__.py +7 -1
  2. cuda/cccl/cooperative/experimental/__init__.py +21 -5
  3. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
  4. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
  5. cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
  6. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
  7. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
  8. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
  9. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
  10. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
  11. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
  12. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
  13. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
  14. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
  15. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
  16. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
  17. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
  18. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
  19. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
  20. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
  21. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
  22. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
  23. cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
  24. cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
  25. cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
  26. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +4 -2
  27. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
  28. cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
  29. cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
  30. cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
  31. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
  32. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
  33. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
  34. cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
  35. cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
  36. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +170 -260
  37. cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
  38. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +6 -7
  39. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
  40. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +12 -29
  41. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +2 -7
  42. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
  43. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
  44. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
  45. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
  46. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
  47. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
  48. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
  49. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
  50. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
  51. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
  52. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
  53. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
  54. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
  55. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
  56. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
  57. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
  58. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
  59. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
  60. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
  61. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
  62. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
  63. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
  64. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
  65. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
  66. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
  67. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
  68. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
  69. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
  70. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
  71. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
  72. cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
  73. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
  74. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
  75. cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
  76. cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
  77. cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
  78. cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
  79. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
  80. cuda/cccl/headers/include/cuda/__cccl_config +1 -0
  81. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
  82. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
  83. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
  84. cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
  85. cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
  86. cuda/cccl/headers/include/cuda/__device/arch_traits.h +247 -323
  87. cuda/cccl/headers/include/cuda/__device/attributes.h +174 -123
  88. cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
  89. cuda/cccl/headers/include/cuda/__device/device_ref.h +27 -49
  90. cuda/cccl/headers/include/cuda/__device/physical_device.h +100 -96
  91. cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
  92. cuda/cccl/headers/include/cuda/__event/event.h +27 -26
  93. cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
  94. cuda/cccl/headers/include/cuda/__event/timed_event.h +10 -7
  95. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  96. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
  97. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
  98. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
  99. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
  100. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
  101. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
  102. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
  103. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
  104. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +21 -137
  105. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
  106. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
  107. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
  108. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
  109. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
  110. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
  111. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
  112. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
  113. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
  114. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
  115. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
  116. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +9 -7
  117. cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
  118. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -16
  119. cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
  120. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  121. cuda/cccl/headers/include/cuda/algorithm +1 -1
  122. cuda/cccl/headers/include/cuda/cmath +1 -0
  123. cuda/cccl/headers/include/cuda/devices +13 -0
  124. cuda/cccl/headers/include/cuda/iterator +1 -0
  125. cuda/cccl/headers/include/cuda/memory +1 -0
  126. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
  127. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
  128. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
  129. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
  130. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
  131. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
  132. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
  133. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
  134. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
  135. cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
  136. cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
  137. cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
  138. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  139. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
  140. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
  141. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
  142. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  143. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  144. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
  145. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
  146. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
  147. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
  148. cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
  149. cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
  150. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
  151. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
  152. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
  153. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
  154. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
  155. cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
  156. cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
  157. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
  158. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
  159. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
  160. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
  161. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
  162. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
  163. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
  164. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
  165. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
  166. cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
  167. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
  168. cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
  169. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
  170. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
  171. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
  172. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
  173. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
  174. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
  175. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
  176. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
  177. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
  178. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
  179. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
  180. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
  181. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
  182. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
  183. cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
  184. cuda/cccl/headers/include/cuda/std/bitset +1 -1
  185. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
  186. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
  187. cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
  188. cuda/cccl/headers/include/cuda/std/numbers +5 -0
  189. cuda/cccl/headers/include/cuda/std/string_view +155 -13
  190. cuda/cccl/headers/include/cuda/std/version +1 -4
  191. cuda/cccl/headers/include/cuda/stream_ref +5 -0
  192. cuda/cccl/headers/include/cuda/utility +1 -0
  193. cuda/cccl/headers/include/nv/target +7 -2
  194. cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
  195. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
  196. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
  197. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
  198. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
  199. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
  200. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
  201. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
  202. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
  203. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
  204. cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
  205. cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
  206. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
  207. cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
  208. cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
  209. cuda/cccl/headers/include/thrust/device_delete.h +18 -3
  210. cuda/cccl/headers/include/thrust/device_free.h +16 -3
  211. cuda/cccl/headers/include/thrust/device_new.h +29 -8
  212. cuda/cccl/headers/include/thrust/host_vector.h +1 -1
  213. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
  214. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
  215. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
  216. cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
  217. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
  218. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
  219. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
  220. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
  221. cuda/cccl/parallel/experimental/__init__.py +21 -74
  222. cuda/compute/__init__.py +79 -0
  223. cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +43 -1
  224. cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +157 -8
  225. cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
  226. cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
  227. cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
  228. cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
  229. cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +112 -40
  230. cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
  231. cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
  232. cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +36 -15
  233. cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
  234. cuda/compute/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  235. cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
  236. cuda/compute/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  237. cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
  238. cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +2 -0
  239. cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +36 -8
  240. cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +206 -1
  241. cuda/{cccl/parallel/experimental → compute}/numba_utils.py +2 -2
  242. cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
  243. cuda/{cccl/parallel/experimental → compute}/typing.py +2 -0
  244. cuda/coop/__init__.py +8 -0
  245. cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
  246. cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
  247. cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
  248. cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
  249. cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
  250. cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
  251. cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
  252. cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
  253. cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
  254. cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
  255. cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
  256. cuda/coop/warp/__init__.py +9 -0
  257. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
  258. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
  259. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
  260. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
  261. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +275 -276
  262. cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
  263. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
  264. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
  265. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
  266. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
  267. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
  268. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
  269. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
  270. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
  271. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
  272. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
  273. cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
  274. cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
  275. cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
  276. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
  277. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
  278. cuda/cccl/parallel/experimental/.gitignore +0 -4
  279. cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  280. cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  281. /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
  282. /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
  283. /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
  284. /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
  285. /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
  286. /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
  287. /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
  288. /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
  289. /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
  290. /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
  291. /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
  292. /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
  293. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
  294. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
@@ -26,6 +26,7 @@
26
26
  #include <cuda/__cmath/ilog.h>
27
27
  #include <cuda/__cmath/ipow.h>
28
28
  #include <cuda/__cmath/isqrt.h>
29
+ #include <cuda/__cmath/mul_hi.h>
29
30
  #include <cuda/__cmath/neg.h>
30
31
  #include <cuda/__cmath/pow2.h>
31
32
  #include <cuda/__cmath/round_down.h>
@@ -11,10 +11,23 @@
11
11
  #ifndef _CUDA_DEVICES
12
12
  #define _CUDA_DEVICES
13
13
 
14
+ #include <cuda/std/detail/__config>
15
+
16
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17
+ # pragma GCC system_header
18
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
19
+ # pragma clang system_header
20
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
21
+ # pragma system_header
22
+ #endif // no system header
23
+
14
24
  #include <cuda/__device/all_devices.h>
25
+ #include <cuda/__device/arch_id.h>
15
26
  #include <cuda/__device/arch_traits.h>
16
27
  #include <cuda/__device/attributes.h>
28
+ #include <cuda/__device/compute_capability.h>
17
29
  #include <cuda/__device/device_ref.h>
18
30
  #include <cuda/__device/physical_device.h>
31
+ #include <cuda/version>
19
32
 
20
33
  #endif // _CUDA_DEVICES
@@ -33,6 +33,7 @@
33
33
  #include <cuda/__iterator/transform_output_iterator.h>
34
34
  #include <cuda/__iterator/zip_function.h>
35
35
  #include <cuda/__iterator/zip_iterator.h>
36
+ #include <cuda/__iterator/zip_transform_iterator.h>
36
37
  #include <cuda/std/iterator>
37
38
 
38
39
  #endif // _CUDA_ITERATOR
@@ -28,6 +28,7 @@
28
28
  #include <cuda/__memory/discard_memory.h>
29
29
  #include <cuda/__memory/get_device_address.h>
30
30
  #include <cuda/__memory/is_aligned.h>
31
+ #include <cuda/__memory/ptr_in_range.h>
31
32
  #include <cuda/__memory/ptr_rebind.h>
32
33
  #include <cuda/std/memory>
33
34
 
@@ -52,12 +52,12 @@ __equal_range(_Iter __first, _Sent __last, const _Tp& __value, _Compare&& __comp
52
52
  {
53
53
  auto __half_len = ::cuda::std::__half_positive(__len);
54
54
  _Iter __mid = _IterOps<_AlgPolicy>::next(__first, __half_len);
55
- if (::cuda::std::__invoke(__comp, ::cuda::std::__invoke(__proj, *__mid), __value))
55
+ if (::cuda::std::invoke(__comp, ::cuda::std::invoke(__proj, *__mid), __value))
56
56
  {
57
57
  __first = ++__mid;
58
58
  __len -= __half_len + 1;
59
59
  }
60
- else if (::cuda::std::__invoke(__comp, __value, ::cuda::std::__invoke(__proj, *__mid)))
60
+ else if (::cuda::std::invoke(__comp, __value, ::cuda::std::invoke(__proj, *__mid)))
61
61
  {
62
62
  __end = __mid;
63
63
  __len = __half_len;
@@ -33,7 +33,7 @@ template <class _Iter, class _Sent, class _Tp, class _Proj>
33
33
  {
34
34
  for (; __first != __last; ++__first)
35
35
  {
36
- if (::cuda::std::__invoke(__proj, *__first) == __value)
36
+ if (::cuda::std::invoke(__proj, *__first) == __value)
37
37
  {
38
38
  break;
39
39
  }
@@ -40,13 +40,11 @@ _CCCL_API constexpr bool __includes(
40
40
  for (; __first2 != __last2; ++__first1)
41
41
  {
42
42
  if (__first1 == __last1
43
- || ::cuda::std::__invoke(
44
- __comp, ::cuda::std::__invoke(__proj2, *__first2), ::cuda::std::__invoke(__proj1, *__first1)))
43
+ || ::cuda::std::invoke(__comp, ::cuda::std::invoke(__proj2, *__first2), ::cuda::std::invoke(__proj1, *__first1)))
45
44
  {
46
45
  return false;
47
46
  }
48
- if (!::cuda::std::__invoke(
49
- __comp, ::cuda::std::__invoke(__proj1, *__first1), ::cuda::std::__invoke(__proj2, *__first2)))
47
+ if (!::cuda::std::invoke(__comp, ::cuda::std::invoke(__proj1, *__first1), ::cuda::std::invoke(__proj2, *__first2)))
50
48
  {
51
49
  ++__first2;
52
50
  }
@@ -46,7 +46,7 @@ _CCCL_API constexpr _Iter __lower_bound(_Iter __first, _Sent __last, const _Type
46
46
  auto __l2 = ::cuda::std::__half_positive(__len);
47
47
  _Iter __m = __first;
48
48
  _IterOps<_AlgPolicy>::advance(__m, __l2);
49
- if (::cuda::std::__invoke(__comp, ::cuda::std::__invoke(__proj, *__m), __value))
49
+ if (::cuda::std::invoke(__comp, ::cuda::std::invoke(__proj, *__m), __value))
50
50
  {
51
51
  __first = ++__m;
52
52
  __len -= __l2 + 1;
@@ -47,26 +47,18 @@ struct _ProjectedPred
47
47
  {}
48
48
 
49
49
  template <class _Tp>
50
- typename __invoke_of<
51
- _Pred&,
52
- decltype(::cuda::std::__invoke(::cuda::std::declval<_Proj&>(), ::cuda::std::declval<_Tp>()))>::type constexpr
53
- _CCCL_API inline
54
- operator()(_Tp&& __v) const
50
+ invoke_result_t<_Pred&, invoke_result_t<_Proj&, _Tp>> constexpr _CCCL_API inline operator()(_Tp&& __v) const
55
51
  {
56
- return ::cuda::std::__invoke(__pred, ::cuda::std::__invoke(__proj, ::cuda::std::forward<_Tp>(__v)));
52
+ return ::cuda::std::invoke(__pred, ::cuda::std::invoke(__proj, ::cuda::std::forward<_Tp>(__v)));
57
53
  }
58
54
 
59
55
  template <class _T1, class _T2>
60
- typename __invoke_of<
61
- _Pred&,
62
- decltype(::cuda::std::__invoke(::cuda::std::declval<_Proj&>(), ::cuda::std::declval<_T1>())),
63
- decltype(::cuda::std::__invoke(::cuda::std::declval<_Proj&>(), ::cuda::std::declval<_T2>()))>::type constexpr
64
- _CCCL_API inline
65
- operator()(_T1&& __lhs, _T2&& __rhs) const
56
+ invoke_result_t<_Pred&, invoke_result_t<_Proj&, _T1>, invoke_result_t<_Proj&, _T2>> _CCCL_API inline
57
+ operator()(_T1&& __lhs, _T2&& __rhs) const
66
58
  {
67
- return ::cuda::std::__invoke(__pred,
68
- ::cuda::std::__invoke(__proj, ::cuda::std::forward<_T1>(__lhs)),
69
- ::cuda::std::__invoke(__proj, ::cuda::std::forward<_T2>(__rhs)));
59
+ return ::cuda::std::invoke(__pred,
60
+ ::cuda::std::invoke(__proj, ::cuda::std::forward<_T1>(__lhs)),
61
+ ::cuda::std::invoke(__proj, ::cuda::std::forward<_T2>(__rhs)));
70
62
  }
71
63
  };
72
64
 
@@ -44,7 +44,7 @@ _CCCL_API constexpr _Iter __min_element(_Iter __first, _Sent __last, _Comp __com
44
44
  _Iter __i = __first;
45
45
  while (++__i != __last)
46
46
  {
47
- if (::cuda::std::__invoke(__comp, ::cuda::std::__invoke(__proj, *__i), ::cuda::std::__invoke(__proj, *__first)))
47
+ if (::cuda::std::invoke(__comp, ::cuda::std::invoke(__proj, *__i), ::cuda::std::invoke(__proj, *__first)))
48
48
  {
49
49
  __first = __i;
50
50
  }
@@ -46,8 +46,7 @@ public:
46
46
  template <class _Iter>
47
47
  _CCCL_API constexpr bool operator()(_Iter& __it1, _Iter& __it2)
48
48
  {
49
- return ::cuda::std::__invoke(
50
- __comp_, ::cuda::std::__invoke(__proj_, *__it1), ::cuda::std::__invoke(__proj_, *__it2));
49
+ return ::cuda::std::invoke(__comp_, ::cuda::std::invoke(__proj_, *__it1), ::cuda::std::invoke(__proj_, *__it2));
51
50
  }
52
51
  };
53
52
 
@@ -69,8 +69,8 @@ _CCCL_API constexpr pair<_InputIterator, _RandomAccessIterator> __partial_sort_c
69
69
  typename iterator_traits<_RandomAccessIterator>::difference_type __len = __r - __result_first;
70
70
  for (; __first != __last; ++__first)
71
71
  {
72
- if (::cuda::std::__invoke(
73
- __comp, ::cuda::std::__invoke(__proj1, *__first), ::cuda::std::__invoke(__proj2, *__result_first)))
72
+ if (::cuda::std::invoke(
73
+ __comp, ::cuda::std::invoke(__proj1, *__first), ::cuda::std::invoke(__proj2, *__result_first)))
74
74
  {
75
75
  *__result_first = *__first;
76
76
  ::cuda::std::__sift_down<_AlgPolicy>(__result_first, __projected_comp, __len, __result_first);
@@ -45,7 +45,7 @@ __upper_bound(_Iter __first, _Sent __last, const _Tp& __value, _Compare&& __comp
45
45
  {
46
46
  auto __half_len = ::cuda::std::__half_positive(__len);
47
47
  auto __mid = _IterOps<_AlgPolicy>::next(__first, __half_len);
48
- if (::cuda::std::__invoke(__comp, __value, ::cuda::std::__invoke(__proj, *__mid)))
48
+ if (::cuda::std::invoke(__comp, __value, ::cuda::std::invoke(__proj, *__mid)))
49
49
  {
50
50
  __len = __half_len;
51
51
  }
@@ -100,7 +100,14 @@ template <typename _Tp>
100
100
  template <typename _Tp>
101
101
  [[nodiscard]] _CCCL_HIDE_FROM_ABI _CCCL_DEVICE int __cccl_countl_zero_impl_device(_Tp __v) noexcept
102
102
  {
103
- return (sizeof(_Tp) == sizeof(uint32_t)) ? ::__clz(static_cast<int>(__v)) : ::__clzll(static_cast<long long>(__v));
103
+ if constexpr (sizeof(_Tp) == sizeof(uint32_t))
104
+ {
105
+ return static_cast<int>(::__clz(static_cast<int>(__v)));
106
+ }
107
+ else
108
+ {
109
+ return static_cast<int>(::__clzll(static_cast<long long>(__v)));
110
+ }
104
111
  }
105
112
  #endif // _CCCL_CUDA_COMPILATION()
106
113
 
@@ -114,11 +114,11 @@ template <typename _Tp>
114
114
  {
115
115
  if constexpr (sizeof(_Tp) == sizeof(uint32_t))
116
116
  {
117
- return ::__clz(static_cast<int>(::__brev(__v)));
117
+ return static_cast<int>(::__clz(static_cast<int>(::__brev(__v))));
118
118
  }
119
119
  else
120
120
  {
121
- return ::__clzll(static_cast<long long>(::__brevll(__v)));
121
+ return static_cast<int>(::__clzll(static_cast<long long>(::__brevll(__v))));
122
122
  }
123
123
  }
124
124
  #endif // _CCCL_CUDA_COMPILATION()
@@ -275,10 +275,10 @@ _CCCL_API constexpr __bit_iterator<_Cp, false> __copy_aligned(
275
275
  // do first word
276
276
  if (__first.__ctz_ != 0)
277
277
  {
278
- unsigned __clz = __bits_per_word - __first.__ctz_;
279
- difference_type __dn = ::cuda::std::min(static_cast<difference_type>(__clz), __n);
278
+ unsigned __clz_f = __bits_per_word - __first.__ctz_;
279
+ difference_type __dn = ::cuda::std::min(static_cast<difference_type>(__clz_f), __n);
280
280
  __n -= __dn;
281
- __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz - __dn));
281
+ __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
282
282
  __storage_type __b = *__first.__seg_ & __m;
283
283
  *__result.__seg_ &= ~__m;
284
284
  *__result.__seg_ |= __b;
@@ -420,8 +420,8 @@ _CCCL_API constexpr __bit_iterator<_Cp, false> __copy_backward_aligned(
420
420
  {
421
421
  difference_type __dn = ::cuda::std::min(static_cast<difference_type>(__last.__ctz_), __n);
422
422
  __n -= __dn;
423
- unsigned __clz = __bits_per_word - __last.__ctz_;
424
- __storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz);
423
+ unsigned __clz_f = __bits_per_word - __last.__ctz_;
424
+ __storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz_f);
425
425
  __storage_type __b = *__last.__seg_ & __m;
426
426
  *__result.__seg_ &= ~__m;
427
427
  *__result.__seg_ |= __b;
@@ -635,10 +635,10 @@ _CCCL_API inline __bit_iterator<_Cr, false> __swap_ranges_aligned(
635
635
  // do first word
636
636
  if (__first.__ctz_ != 0)
637
637
  {
638
- unsigned __clz = __bits_per_word - __first.__ctz_;
639
- difference_type __dn = ::cuda::std::min(static_cast<difference_type>(__clz), __n);
638
+ unsigned __clz_f = __bits_per_word - __first.__ctz_;
639
+ difference_type __dn = ::cuda::std::min(static_cast<difference_type>(__clz_f), __n);
640
640
  __n -= __dn;
641
- __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz - __dn));
641
+ __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
642
642
  __storage_type __b1 = *__first.__seg_ & __m;
643
643
  *__first.__seg_ &= ~__m;
644
644
  __storage_type __b2 = *__result.__seg_ & __m;
@@ -988,10 +988,10 @@ _CCCL_API constexpr bool __equal_aligned(
988
988
  // do first word
989
989
  if (__first1.__ctz_ != 0)
990
990
  {
991
- unsigned __clz = __bits_per_word - __first1.__ctz_;
992
- difference_type __dn = ::cuda::std::min(static_cast<difference_type>(__clz), __n);
991
+ unsigned __clz_f = __bits_per_word - __first1.__ctz_;
992
+ difference_type __dn = ::cuda::std::min(static_cast<difference_type>(__clz_f), __n);
993
993
  __n -= __dn;
994
- __storage_type __m = (~__storage_type(0) << __first1.__ctz_) & (~__storage_type(0) >> (__clz - __dn));
994
+ __storage_type __m = (~__storage_type(0) << __first1.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
995
995
  if ((*__first2.__seg_ & __m) != (*__first1.__seg_ & __m))
996
996
  {
997
997
  return false;
@@ -0,0 +1,36 @@
1
+ //===----------------------------------------------------------------------===//
2
+ //
3
+ // Part of libcu++, the C++ Standard Library for your entire system,
4
+ // under the Apache License v2.0 with LLVM Exceptions.
5
+ // See https://llvm.org/LICENSE.txt for license information.
6
+ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7
+ // SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
8
+ //
9
+ //===----------------------------------------------------------------------===//
10
+
11
+ #ifndef _CUDA_STD__CCCL_ALGORITHM_WRAPPER_H
12
+ #define _CUDA_STD__CCCL_ALGORITHM_WRAPPER_H
13
+
14
+ #include <cuda/std/detail/__config>
15
+
16
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17
+ # pragma GCC system_header
18
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
19
+ # pragma clang system_header
20
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
21
+ # pragma system_header
22
+ #endif // no system header
23
+
24
+ // When nvc++ uses CCCL components as part of its implementation of
25
+ // Standard C++ algorithms, a cycle of included files may result when CCCL code
26
+ // tries to use a standard algorithm. The THRUST_INCLUDING_ALGORITHMS_HEADER macro
27
+ // is defined only when CCCL is including an algorithms-related header, giving
28
+ // the compiler a chance to detect and break the cycle of includes.
29
+
30
+ #if !_CCCL_COMPILER(NVRTC)
31
+ # define THRUST_INCLUDING_ALGORITHMS_HEADER
32
+ # include <algorithm>
33
+ # undef THRUST_INCLUDING_ALGORITHMS_HEADER
34
+ #endif // !_CCCL_COMPILER(NVRTC)
35
+
36
+ #endif // _CUDA_STD__CCCL_ALGORITHM_WRAPPER_H
@@ -25,6 +25,7 @@
25
25
 
26
26
  #include <cuda/std/__cccl/attributes.h>
27
27
  #include <cuda/std/__cccl/extended_data_types.h>
28
+ #include <cuda/std/__cccl/host_std_lib.h>
28
29
 
29
30
  //! This file consolidates all compiler builtin detection for CCCL.
30
31
  //!
@@ -607,55 +608,51 @@
607
608
  # define _CCCL_BUILTIN_STRLEN(...) __builtin_strlen(__VA_ARGS__)
608
609
  #endif
609
610
 
610
- // Some compilers provide std::move/std::forward/etc as builtins
611
- #if defined(__cplusplus)
612
- // Bring in the feature test macros (needed for std::forward_like)
613
- # if _CCCL_HAS_INCLUDE(<version>) // <version> should be the smallest include possible
614
- # include <version>
615
- # elif !_CCCL_COMPILER(NVRTC)
616
- # include <ciso646> // otherwise go for the smallest possible header
617
- # endif // !_CCCL_COMPILER(NVRTC)
618
-
619
- // Bring in the bits of the STL we need
620
- # if defined(_GLIBCXX_VERSION)
621
- # include <bits/move.h> // for move, forward, forward_like, and addressof
622
- # elif defined(_LIBCPP_VERSION)
623
- # include <__memory/addressof.h>
624
- # include <__utility/as_const.h>
625
- # include <__utility/forward.h>
626
- # include <__utility/forward_like.h>
627
- # include <__utility/move.h>
628
- # endif
629
-
630
- # if defined(_GLIBCXX_VERSION) || defined(_LIBCPP_VERSION)
631
- // std::move builtin
632
- # if _CCCL_COMPILER(CLANG, >=, 15) || _CCCL_COMPILER(GCC, >=, 15)
633
- # define _CCCL_HAS_BUILTIN_STD_MOVE() 1
634
- # endif
635
-
636
- // std::forward builtin
637
- # if _CCCL_COMPILER(CLANG, >=, 15) || _CCCL_COMPILER(GCC, >=, 15)
638
- # define _CCCL_HAS_BUILTIN_STD_FORWARD() 1
639
- # endif
640
-
641
- // std::addressof builtin
642
- # if _CCCL_COMPILER(CLANG, >=, 15) || _CCCL_COMPILER(GCC, >=, 15)
643
- # define _CCCL_HAS_BUILTIN_STD_ADDRESSOF() 1
644
- # endif
645
-
646
- // std::as_const builtin
647
- # if _CCCL_COMPILER(CLANG, >=, 15)
648
- # define _CCCL_HAS_BUILTIN_STD_AS_CONST() 1
649
- # endif
650
-
651
- // std::forward_like builtin
652
- // Leaving out MSVC for now because it is hard for forward-declare std::forward_like.
653
- # if (_CCCL_COMPILER(CLANG, >=, 17) || _CCCL_COMPILER(GCC, >=, 15)) && defined(__cpp_lib_forward_like) \
654
- && (__cpp_lib_forward_like >= 202217L)
655
- # define _CCCL_HAS_BUILTIN_STD_FORWARD_LIKE() 1
656
- # endif
657
- # endif // defined(_GLIBCXX_VERSION) || defined(_LIBCPP_VERSION) || defined(_MSVC_STL_VERSION)
658
- #endif // defined(__cplusplus)
611
+ // todo: re-enable std builtins
612
+
613
+ // // Some compilers provide std::move/std::forward/etc as builtins
614
+ // #if defined(__cplusplus)
615
+ // // Bring in the bits of the STL we need
616
+ // # if _CCCL_HOST_STD_LIB(LIBSTDCXX)
617
+ // # include <bits/move.h> // for move, forward, forward_like, and addressof
618
+ // # elif _CCCL_HOST_STD_LIB(LIBCXX)
619
+ // # include <__memory/addressof.h>
620
+ // # include <__utility/as_const.h>
621
+ // # include <__utility/forward.h>
622
+ // # if __cpp_lib_forward_like >= 202217L
623
+ // # include <__utility/forward_like.h>
624
+ // # endif // __cpp_lib_forward_like >= 202217L
625
+ // # include <__utility/move.h>
626
+ // # endif
627
+
628
+ // # if _CCCL_HOST_STD_LIB(LIBSTDCXX) || _CCCL_HOST_STD_LIB(LIBCXX)
629
+ // // std::move builtin
630
+ // # if _CCCL_COMPILER(CLANG, >=, 15) || _CCCL_COMPILER(GCC, >=, 15)
631
+ // # define _CCCL_HAS_BUILTIN_STD_MOVE() 1
632
+ // # endif
633
+
634
+ // // std::forward builtin
635
+ // # if _CCCL_COMPILER(CLANG, >=, 15) || _CCCL_COMPILER(GCC, >=, 15)
636
+ // # define _CCCL_HAS_BUILTIN_STD_FORWARD() 1
637
+ // # endif
638
+
639
+ // // std::addressof builtin
640
+ // # if _CCCL_COMPILER(CLANG, >=, 15) || _CCCL_COMPILER(GCC, >=, 15)
641
+ // # define _CCCL_HAS_BUILTIN_STD_ADDRESSOF() 1
642
+ // # endif
643
+
644
+ // // std::as_const builtin
645
+ // # if _CCCL_COMPILER(CLANG, >=, 15)
646
+ // # define _CCCL_HAS_BUILTIN_STD_AS_CONST() 1
647
+ // # endif
648
+
649
+ // // std::forward_like builtin
650
+ // // Leaving out MSVC for now because it is hard to forward-declare std::forward_like.
651
+ // # if (_CCCL_COMPILER(CLANG, >=, 17) || _CCCL_COMPILER(GCC, >=, 15)) && __cpp_lib_forward_like >= 202217L
652
+ // # define _CCCL_HAS_BUILTIN_STD_FORWARD_LIKE() 1
653
+ // # endif
654
+ // # endif // _CCCL_HOST_STD_LIB(LIBSTDCXX) || _CCCL_HOST_STD_LIB(LIBCXX)
655
+ // #endif // defined(__cplusplus)
659
656
 
660
657
  #ifndef _CCCL_HAS_BUILTIN_STD_MOVE
661
658
  # define _CCCL_HAS_BUILTIN_STD_MOVE() 0
@@ -65,4 +65,10 @@
65
65
  # endif // _CCCL_CUDA_COMPILER(NVCC)
66
66
  #endif // !_CCCL_EXEC_CHECK_DISABLE
67
67
 
68
+ #if _CCCL_CUDA_COMPILER(NVHPC)
69
+ # define _CCCL_TARGET_CONSTEXPR
70
+ #else // ^^^ _CCCL_CUDA_COMPILER(NVHPC) ^^^ / vvv !_CCCL_CUDA_COMPILER(NVHPC) vvv
71
+ # define _CCCL_TARGET_CONSTEXPR constexpr
72
+ #endif // ^^^ !_CCCL_CUDA_COMPILER(NVHPC) ^^^
73
+
68
74
  #endif // __CCCL_EXECUTION_SPACE_H
@@ -0,0 +1,52 @@
1
+ //===----------------------------------------------------------------------===//
2
+ //
3
+ // Part of libcu++, the C++ Standard Library for your entire system,
4
+ // under the Apache License v2.0 with LLVM Exceptions.
5
+ // See https://llvm.org/LICENSE.txt for license information.
6
+ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7
+ // SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
8
+ //
9
+ //===----------------------------------------------------------------------===//
10
+
11
+ #ifndef __CCCL_HOST_STD_LIB_H
12
+ #define __CCCL_HOST_STD_LIB_H
13
+
14
+ #include <cuda/std/__cccl/compiler.h>
15
+ #include <cuda/std/__cccl/preprocessor.h>
16
+ #include <cuda/std/__cccl/system_header.h>
17
+
18
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
19
+ # pragma GCC system_header
20
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
21
+ # pragma clang system_header
22
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
23
+ # pragma system_header
24
+ #endif // no system header
25
+
26
+ #define _CCCL_HOST_STD_LIB_LIBSTDCXX() 0
27
+ #define _CCCL_HOST_STD_LIB_LIBCXX() 0
28
+ #define _CCCL_HOST_STD_LIB_STL() 0
29
+
30
+ // include a minimal header
31
+ #if _CCCL_HAS_INCLUDE(<version>)
32
+ # include <version>
33
+ #elif _CCCL_HAS_INCLUDE(<ciso646>)
34
+ # include <ciso646>
35
+ #endif // ^^^ _CCCL_HAS_INCLUDE(<ciso646>) ^^^
36
+
37
+ #if defined(_MSVC_STL_VERSION)
38
+ # undef _CCCL_HOST_STD_LIB_STL
39
+ # define _CCCL_HOST_STD_LIB_STL() 1
40
+ #elif defined(__GLIBCXX__)
41
+ # undef _CCCL_HOST_STD_LIB_LIBSTDCXX
42
+ # define _CCCL_HOST_STD_LIB_LIBSTDCXX() 1
43
+ #elif defined(_LIBCPP_VERSION)
44
+ # undef _CCCL_HOST_STD_LIB_LIBCXX
45
+ # define _CCCL_HOST_STD_LIB_LIBCXX() 1
46
+ #endif // ^^^ _LIBCPP_VERSION ^^^
47
+
48
+ #define _CCCL_HOST_STD_LIB(_X) _CCCL_HOST_STD_LIB_##_X()
49
+ #define _CCCL_HAS_HOST_STD_LIB() \
50
+ (_CCCL_HOST_STD_LIB_LIBSTDCXX() || _CCCL_HOST_STD_LIB_LIBCXX() || _CCCL_HOST_STD_LIB_STL())
51
+
52
+ #endif // __CCCL_HOST_STD_LIB_H
@@ -0,0 +1,36 @@
1
+ //===----------------------------------------------------------------------===//
2
+ //
3
+ // Part of libcu++, the C++ Standard Library for your entire system,
4
+ // under the Apache License v2.0 with LLVM Exceptions.
5
+ // See https://llvm.org/LICENSE.txt for license information.
6
+ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7
+ // SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
8
+ //
9
+ //===----------------------------------------------------------------------===//
10
+
11
+ #ifndef _CUDA_STD__CCCL_MEMORY_WRAPPER_H
12
+ #define _CUDA_STD__CCCL_MEMORY_WRAPPER_H
13
+
14
+ #include <cuda/std/detail/__config>
15
+
16
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17
+ # pragma GCC system_header
18
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
19
+ # pragma clang system_header
20
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
21
+ # pragma system_header
22
+ #endif // no system header
23
+
24
+ // When nvc++ uses CCCL components as part of its implementation of
25
+ // Standard C++ algorithms, a cycle of included files may result when CCCL code
26
+ // tries to use a standard algorithm. The THRUST_INCLUDING_ALGORITHMS_HEADER macro
27
+ // is defined only when CCCL is including an algorithms-related header, giving
28
+ // the compiler a chance to detect and break the cycle of includes.
29
+
30
+ #if !_CCCL_COMPILER(NVRTC)
31
+ # define THRUST_INCLUDING_ALGORITHMS_HEADER
32
+ # include <memory>
33
+ # undef THRUST_INCLUDING_ALGORITHMS_HEADER
34
+ #endif // !_CCCL_COMPILER(NVRTC)
35
+
36
+ #endif // _CUDA_STD__CCCL_MEMORY_WRAPPER_H
@@ -0,0 +1,36 @@
1
+ //===----------------------------------------------------------------------===//
2
+ //
3
+ // Part of libcu++, the C++ Standard Library for your entire system,
4
+ // under the Apache License v2.0 with LLVM Exceptions.
5
+ // See https://llvm.org/LICENSE.txt for license information.
6
+ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7
+ // SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
8
+ //
9
+ //===----------------------------------------------------------------------===//
10
+
11
+ #ifndef _CUDA_STD__CCCL_NUMERIC_WRAPPER_H
12
+ #define _CUDA_STD__CCCL_NUMERIC_WRAPPER_H
13
+
14
+ #include <cuda/std/detail/__config>
15
+
16
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17
+ # pragma GCC system_header
18
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
19
+ # pragma clang system_header
20
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
21
+ # pragma system_header
22
+ #endif // no system header
23
+
24
+ // When a compiler uses CCCL components as part of its implementation of
25
+ // Standard C++ algorithms, a cycle of included files may result when CCCL code
26
+ // tries to use a standard algorithm. The THRUST_INCLUDING_ALGORITHMS_HEADER macro
27
+ // is defined only when CCCL is including an algorithms-related header, giving
28
+ // the compiler a chance to detect and break the cycle of includes.
29
+
30
+ #if !_CCCL_COMPILER(NVRTC)
31
+ # define THRUST_INCLUDING_ALGORITHMS_HEADER
32
+ # include <numeric>
33
+ # undef THRUST_INCLUDING_ALGORITHMS_HEADER
34
+ #endif // !_CCCL_COMPILER(NVRTC)
35
+
36
+ #endif // _CUDA_STD__CCCL_NUMERIC_WRAPPER_H
@@ -43,19 +43,19 @@ template <class _Rep, class _Period = ratio<1>>
43
43
  class _CCCL_TYPE_VISIBILITY_DEFAULT duration;
44
44
 
45
45
  template <class _Tp>
46
- inline const bool __is_duration_v = false;
46
+ inline constexpr bool __is_duration_v = false;
47
47
 
48
48
  template <class _Rep, class _Period>
49
- inline const bool __is_duration_v<duration<_Rep, _Period>> = true;
49
+ inline constexpr bool __is_duration_v<duration<_Rep, _Period>> = true;
50
50
 
51
51
  template <class _Rep, class _Period>
52
- inline const bool __is_duration_v<const duration<_Rep, _Period>> = true;
52
+ inline constexpr bool __is_duration_v<const duration<_Rep, _Period>> = true;
53
53
 
54
54
  template <class _Rep, class _Period>
55
- inline const bool __is_duration_v<volatile duration<_Rep, _Period>> = true;
55
+ inline constexpr bool __is_duration_v<volatile duration<_Rep, _Period>> = true;
56
56
 
57
57
  template <class _Rep, class _Period>
58
- inline const bool __is_duration_v<const volatile duration<_Rep, _Period>> = true;
58
+ inline constexpr bool __is_duration_v<const volatile duration<_Rep, _Period>> = true;
59
59
 
60
60
  } // namespace chrono
61
61
 
@@ -190,29 +190,29 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT duration
190
190
  struct __no_overflow
191
191
  {
192
192
  private:
193
- static const intmax_t __gcd_n1_n2 = __static_gcd<_R1::num, _R2::num>::value;
194
- static const intmax_t __gcd_d1_d2 = __static_gcd<_R1::den, _R2::den>::value;
195
- static const intmax_t __n1 = _R1::num / __gcd_n1_n2;
196
- static const intmax_t __d1 = _R1::den / __gcd_d1_d2;
197
- static const intmax_t __n2 = _R2::num / __gcd_n1_n2;
198
- static const intmax_t __d2 = _R2::den / __gcd_d1_d2;
199
- static const intmax_t max = -((intmax_t(1) << (sizeof(intmax_t) * CHAR_BIT - 1)) + 1);
193
+ static constexpr intmax_t __gcd_n1_n2 = __static_gcd<_R1::num, _R2::num>::value;
194
+ static constexpr intmax_t __gcd_d1_d2 = __static_gcd<_R1::den, _R2::den>::value;
195
+ static constexpr intmax_t __n1 = _R1::num / __gcd_n1_n2;
196
+ static constexpr intmax_t __d1 = _R1::den / __gcd_d1_d2;
197
+ static constexpr intmax_t __n2 = _R2::num / __gcd_n1_n2;
198
+ static constexpr intmax_t __d2 = _R2::den / __gcd_d1_d2;
199
+ static constexpr intmax_t max = -((intmax_t(1) << (sizeof(intmax_t) * CHAR_BIT - 1)) + 1);
200
200
 
201
201
  template <intmax_t _Xp, intmax_t _Yp, bool __overflow>
202
202
  struct __mul // __overflow == false
203
203
  {
204
- static const intmax_t value = _Xp * _Yp;
204
+ static constexpr intmax_t value = _Xp * _Yp;
205
205
  };
206
206
 
207
207
  template <intmax_t _Xp, intmax_t _Yp>
208
208
  struct __mul<_Xp, _Yp, true>
209
209
  {
210
- static const intmax_t value = 1;
210
+ static constexpr intmax_t value = 1;
211
211
  };
212
212
 
213
213
  public:
214
- static const bool value = (__n1 <= max / __d2) && (__n2 <= max / __d1);
215
- using type = ratio<__mul<__n1, __d2, !value>::value, __mul<__n2, __d1, !value>::value>;
214
+ static constexpr bool value = (__n1 <= max / __d2) && (__n2 <= max / __d1);
215
+ using type = ratio<__mul<__n1, __d2, !value>::value, __mul<__n2, __d1, !value>::value>;
216
216
  };
217
217
 
218
218
  public:
@@ -40,11 +40,11 @@ namespace chrono
40
40
  class _CCCL_TYPE_VISIBILITY_DEFAULT steady_clock
41
41
  {
42
42
  public:
43
- using duration = nanoseconds;
44
- using rep = duration::rep;
45
- using period = duration::period;
46
- using time_point = ::cuda::std::chrono::time_point<steady_clock, duration>;
47
- static constexpr const bool is_steady = true;
43
+ using duration = nanoseconds;
44
+ using rep = duration::rep;
45
+ using period = duration::period;
46
+ using time_point = ::cuda::std::chrono::time_point<steady_clock, duration>;
47
+ static constexpr bool is_steady = true;
48
48
 
49
49
  [[nodiscard]] _CCCL_API static time_point now() noexcept;
50
50
  };