cuda-cccl 0.3.0__cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.2__cp312-cp312-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (294) hide show
  1. cuda/cccl/cooperative/__init__.py +7 -1
  2. cuda/cccl/cooperative/experimental/__init__.py +21 -5
  3. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
  4. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
  5. cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
  6. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
  7. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
  8. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
  9. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
  10. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
  11. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
  12. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
  13. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
  14. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
  15. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
  16. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
  17. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
  18. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
  19. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
  20. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
  21. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
  22. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
  23. cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
  24. cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
  25. cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
  26. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +4 -2
  27. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
  28. cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
  29. cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
  30. cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
  31. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
  32. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
  33. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
  34. cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
  35. cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
  36. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +170 -260
  37. cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
  38. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +6 -7
  39. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
  40. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +12 -29
  41. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +2 -7
  42. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
  43. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
  44. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
  45. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
  46. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
  47. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
  48. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
  49. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
  50. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
  51. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
  52. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
  53. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
  54. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
  55. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
  56. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
  57. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
  58. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
  59. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
  60. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
  61. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
  62. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
  63. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
  64. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
  65. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
  66. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
  67. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
  68. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
  69. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
  70. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
  71. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
  72. cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
  73. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
  74. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
  75. cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
  76. cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
  77. cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
  78. cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
  79. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
  80. cuda/cccl/headers/include/cuda/__cccl_config +1 -0
  81. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
  82. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
  83. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
  84. cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
  85. cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
  86. cuda/cccl/headers/include/cuda/__device/arch_traits.h +247 -323
  87. cuda/cccl/headers/include/cuda/__device/attributes.h +174 -123
  88. cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
  89. cuda/cccl/headers/include/cuda/__device/device_ref.h +27 -49
  90. cuda/cccl/headers/include/cuda/__device/physical_device.h +100 -96
  91. cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
  92. cuda/cccl/headers/include/cuda/__event/event.h +27 -26
  93. cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
  94. cuda/cccl/headers/include/cuda/__event/timed_event.h +10 -7
  95. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  96. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
  97. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
  98. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
  99. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
  100. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
  101. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
  102. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
  103. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
  104. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +21 -137
  105. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
  106. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
  107. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
  108. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
  109. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
  110. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
  111. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
  112. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
  113. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
  114. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
  115. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
  116. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +9 -7
  117. cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
  118. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -16
  119. cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
  120. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  121. cuda/cccl/headers/include/cuda/algorithm +1 -1
  122. cuda/cccl/headers/include/cuda/cmath +1 -0
  123. cuda/cccl/headers/include/cuda/devices +13 -0
  124. cuda/cccl/headers/include/cuda/iterator +1 -0
  125. cuda/cccl/headers/include/cuda/memory +1 -0
  126. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
  127. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
  128. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
  129. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
  130. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
  131. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
  132. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
  133. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
  134. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
  135. cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
  136. cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
  137. cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
  138. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  139. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
  140. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
  141. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
  142. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  143. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  144. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
  145. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
  146. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
  147. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
  148. cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
  149. cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
  150. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
  151. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
  152. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
  153. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
  154. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
  155. cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
  156. cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
  157. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
  158. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
  159. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
  160. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
  161. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
  162. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
  163. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
  164. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
  165. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
  166. cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
  167. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
  168. cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
  169. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
  170. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
  171. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
  172. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
  173. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
  174. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
  175. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
  176. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
  177. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
  178. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
  179. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
  180. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
  181. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
  182. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
  183. cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
  184. cuda/cccl/headers/include/cuda/std/bitset +1 -1
  185. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
  186. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
  187. cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
  188. cuda/cccl/headers/include/cuda/std/numbers +5 -0
  189. cuda/cccl/headers/include/cuda/std/string_view +155 -13
  190. cuda/cccl/headers/include/cuda/std/version +1 -4
  191. cuda/cccl/headers/include/cuda/stream_ref +5 -0
  192. cuda/cccl/headers/include/cuda/utility +1 -0
  193. cuda/cccl/headers/include/nv/target +7 -2
  194. cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
  195. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
  196. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
  197. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
  198. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
  199. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
  200. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
  201. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
  202. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
  203. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
  204. cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
  205. cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
  206. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
  207. cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
  208. cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
  209. cuda/cccl/headers/include/thrust/device_delete.h +18 -3
  210. cuda/cccl/headers/include/thrust/device_free.h +16 -3
  211. cuda/cccl/headers/include/thrust/device_new.h +29 -8
  212. cuda/cccl/headers/include/thrust/host_vector.h +1 -1
  213. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
  214. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
  215. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
  216. cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
  217. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
  218. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
  219. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
  220. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
  221. cuda/cccl/parallel/experimental/__init__.py +21 -74
  222. cuda/compute/__init__.py +79 -0
  223. cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +43 -1
  224. cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +157 -8
  225. cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
  226. cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
  227. cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
  228. cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
  229. cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +112 -40
  230. cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
  231. cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
  232. cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +36 -15
  233. cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
  234. cuda/compute/cu12/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
  235. cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
  236. cuda/compute/cu13/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
  237. cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
  238. cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +2 -0
  239. cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +36 -8
  240. cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +206 -1
  241. cuda/{cccl/parallel/experimental → compute}/numba_utils.py +2 -2
  242. cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
  243. cuda/{cccl/parallel/experimental → compute}/typing.py +2 -0
  244. cuda/coop/__init__.py +8 -0
  245. cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
  246. cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
  247. cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
  248. cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
  249. cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
  250. cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
  251. cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
  252. cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
  253. cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
  254. cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
  255. cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
  256. cuda/coop/warp/__init__.py +9 -0
  257. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
  258. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
  259. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
  260. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
  261. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +275 -276
  262. cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
  263. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
  264. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
  265. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
  266. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
  267. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
  268. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
  269. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
  270. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
  271. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
  272. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
  273. cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
  274. cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
  275. cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
  276. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
  277. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
  278. cuda/cccl/parallel/experimental/.gitignore +0 -4
  279. cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
  280. cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
  281. /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
  282. /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
  283. /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
  284. /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
  285. /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
  286. /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
  287. /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
  288. /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
  289. /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
  290. /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
  291. /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
  292. /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
  293. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
  294. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
@@ -39,11 +39,11 @@ namespace chrono
39
39
  class _CCCL_TYPE_VISIBILITY_DEFAULT system_clock
40
40
  {
41
41
  public:
42
- using duration = ::cuda::std::chrono::nanoseconds;
43
- using rep = duration::rep;
44
- using period = duration::period;
45
- using time_point = ::cuda::std::chrono::time_point<system_clock>;
46
- static constexpr const bool is_steady = false;
42
+ using duration = ::cuda::std::chrono::nanoseconds;
43
+ using rep = duration::rep;
44
+ using period = duration::period;
45
+ using time_point = ::cuda::std::chrono::time_point<system_clock>;
46
+ static constexpr bool is_steady = false;
47
47
 
48
48
  [[nodiscard]] _CCCL_API inline static time_point now() noexcept
49
49
  {
@@ -27,9 +27,10 @@
27
27
  #include <cuda/std/__type_traits/is_floating_point.h>
28
28
  #include <cuda/std/__type_traits/is_integral.h>
29
29
 
30
- #if _CCCL_COMPILER(MSVC) || _CCCL_CUDA_COMPILER(CLANG) || !_CCCL_CUDA_COMPILATION()
30
+ // MSVC and clang cuda need the host side functions included
31
+ #if _CCCL_HOST_COMPILATION() || _CCCL_CUDA_COMPILER(CLANG)
31
32
  # include <math.h>
32
- #endif // _CCCL_COMPILER(MSVC) || _CCCL_CUDA_COMPILER(CLANG) || !_CCCL_CUDA_COMPILATION()
33
+ #endif // _CCCL_HOST_COMPILATION() || _CCCL_CUDA_COMPILER(CLANG)
33
34
 
34
35
  #include <cuda/std/__cccl/prologue.h>
35
36
 
@@ -23,6 +23,7 @@
23
23
 
24
24
  #include <cuda/std/__complex/vector_support.h>
25
25
  #include <cuda/std/__concepts/concept_macros.h>
26
+ #include <cuda/std/__fwd/complex.h>
26
27
  #include <cuda/std/__fwd/get.h>
27
28
  #include <cuda/std/__tuple_dir/tuple_element.h>
28
29
  #include <cuda/std/__tuple_dir/tuple_size.h>
@@ -36,9 +37,9 @@
36
37
  #include <cuda/std/limits>
37
38
 
38
39
  // Compatibility helpers for thrust to convert between `std::complex` and `cuda::std::complex`
40
+ // todo: find a way to get rid of this include
39
41
  #if !_CCCL_COMPILER(NVRTC)
40
- # include <complex>
41
- # include <sstream> // for std::basic_ostringstream
42
+ # include <complex> // for std::complex stream operators
42
43
 
43
44
  # define _LIBCUDACXX_ACCESS_STD_COMPLEX_REAL(__c) reinterpret_cast<const _Up(&)[2]>(__c)[0]
44
45
  # define _LIBCUDACXX_ACCESS_STD_COMPLEX_IMAG(__c) reinterpret_cast<const _Up(&)[2]>(__c)[1]
@@ -21,28 +21,28 @@
21
21
  # pragma system_header
22
22
  #endif // no system header
23
23
 
24
- #include <cuda/std/__complex/complex.h>
24
+ // gcc < 8 warns about it's extended literals being shadowed by the implementation, so let's just disable the complex
25
+ // literals
26
+ #if !_CCCL_COMPILER(GCC, <, 8)
25
27
 
26
- #include <cuda/std/__cccl/prologue.h>
28
+ # include <cuda/std/__complex/complex.h>
27
29
 
28
- _CCCL_BEGIN_NAMESPACE_CUDA_STD
30
+ # include <cuda/std/__cccl/prologue.h>
29
31
 
30
- #ifdef _LIBCUDACXX_HAS_STL_LITERALS
31
- // Literal suffix for complex number literals [complex.literals]
32
+ _CCCL_BEGIN_NAMESPACE_CUDA_STD
32
33
 
33
34
  _CCCL_DIAG_PUSH
34
35
  _CCCL_DIAG_SUPPRESS_GCC("-Wliteral-suffix")
35
36
  _CCCL_DIAG_SUPPRESS_CLANG("-Wuser-defined-literals")
36
- _CCCL_DIAG_SUPPRESS_MSVC(4455)
37
+ _CCCL_DIAG_SUPPRESS_NVHPC(lit_suffix_no_underscore)
38
+ _CCCL_DIAG_SUPPRESS_MSVC(4455) // literal suffix identifiers that do not start with an underscore are reserved
39
+ _CCCL_BEGIN_NV_DIAG_SUPPRESS(2506, 20208) // a user-provided literal suffix must begin with "_",
40
+ // long double treated as double
37
41
 
38
42
  inline namespace literals
39
43
  {
40
44
  inline namespace complex_literals
41
45
  {
42
- # if !_CCCL_CUDA_COMPILER(NVCC) && !_CCCL_COMPILER(NVRTC)
43
- // NOTE: if you get a warning from GCC <7 here that "literal operator suffixes not preceded by ‘_’ are reserved for
44
- // future standardization" then we are sorry. The warning was implemented before GCC 7, but can only be disabled since
45
- // GCC 7. See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69523
46
46
  _CCCL_API constexpr complex<long double> operator""il(long double __im)
47
47
  {
48
48
  return {0.0l, __im};
@@ -71,36 +71,16 @@ _CCCL_API constexpr complex<float> operator""if(unsigned long long __im)
71
71
  {
72
72
  return {0.0f, static_cast<float>(__im)};
73
73
  }
74
- # else // ^^^ !_CCCL_CUDA_COMPILER(NVCC) && !_CCCL_COMPILER(NVRTC) ^^^ / vvv other compilers vvv
75
- _CCCL_API constexpr complex<double> operator""i(double __im)
76
- {
77
- return {0.0, static_cast<double>(__im)};
78
- }
79
-
80
- _CCCL_API constexpr complex<double> operator""i(unsigned long long __im)
81
- {
82
- return {0.0, static_cast<double>(__im)};
83
- }
84
-
85
- _CCCL_API constexpr complex<float> operator""if(double __im)
86
- {
87
- return {0.0f, static_cast<float>(__im)};
88
- }
89
-
90
- _CCCL_API constexpr complex<float> operator""if(unsigned long long __im)
91
- {
92
- return {0.0f, static_cast<float>(__im)};
93
- }
94
- # endif // other compilers
95
74
  } // namespace complex_literals
96
75
  } // namespace literals
97
76
 
77
+ _CCCL_END_NV_DIAG_SUPPRESS()
98
78
  _CCCL_DIAG_POP
99
79
 
100
- #endif // _LIBCUDACXX_HAS_STL_LITERALS
101
-
102
80
  _CCCL_END_NAMESPACE_CUDA_STD
103
81
 
104
- #include <cuda/std/__cccl/epilogue.h>
82
+ # include <cuda/std/__cccl/epilogue.h>
83
+
84
+ #endif // !_CCCL_COMPILER(GCC, <, 8)
105
85
 
106
86
  #endif // _CUDA_STD___COMPLEX_LITERALS_H
@@ -31,8 +31,9 @@
31
31
  # include <cuda/std/__type_traits/enable_if.h>
32
32
  # include <cuda/std/__type_traits/is_constructible.h>
33
33
 
34
+ // todo: find a way to get rid of this include
34
35
  # if !_CCCL_COMPILER(NVRTC)
35
- # include <sstream> // for std::basic_ostringstream
36
+ # include <complex> // for std::complex stream operators
36
37
  # endif // !_CCCL_COMPILER(NVRTC)
37
38
 
38
39
  # include <cuda/std/__cccl/prologue.h>
@@ -31,8 +31,9 @@
31
31
  # include <cuda/std/__type_traits/enable_if.h>
32
32
  # include <cuda/std/__type_traits/is_constructible.h>
33
33
 
34
+ // todo: find a way to get rid of this include
34
35
  # if !_CCCL_COMPILER(NVRTC)
35
- # include <sstream> // for std::basic_ostringstream
36
+ # include <complex> // for std::complex stream operators
36
37
  # endif // !_CCCL_COMPILER(NVRTC)
37
38
 
38
39
  # include <cuda/std/__cccl/prologue.h>
@@ -294,7 +295,7 @@ struct __get_complex_impl<__half>
294
295
  }
295
296
  };
296
297
 
297
- # if !defined(_LIBCUDACXX_HAS_NO_LOCALIZATION) && !_CCCL_COMPILER(NVRTC)
298
+ # if !_CCCL_COMPILER(NVRTC)
298
299
  template <class _CharT, class _Traits>
299
300
  ::std::basic_istream<_CharT, _Traits>& operator>>(::std::basic_istream<_CharT, _Traits>& __is, complex<__half>& __x)
300
301
  {
@@ -310,7 +311,7 @@ operator<<(::std::basic_ostream<_CharT, _Traits>& __os, const complex<__half>& _
310
311
  {
311
312
  return __os << complex<float>{__x};
312
313
  }
313
- # endif // !_LIBCUDACXX_HAS_NO_LOCALIZATION && !_CCCL_COMPILER(NVRTC)
314
+ # endif // !_CCCL_COMPILER(NVRTC)
314
315
 
315
316
  _CCCL_END_NAMESPACE_CUDA_STD
316
317
 
@@ -35,8 +35,8 @@ _CCCL_BEGIN_NAMESPACE_CUDA_STD
35
35
 
36
36
  template <class _Fn, class... _Args>
37
37
  concept invocable = requires(_Fn&& __fn, _Args&&... __args) {
38
- ::cuda::std::__invoke(::cuda::std::forward<_Fn>(__fn), ::cuda::std::forward<_Args>(__args)...); // not required to be
39
- // equality preserving
38
+ ::cuda::std::invoke(::cuda::std::forward<_Fn>(__fn), ::cuda::std::forward<_Args>(__args)...); // not required to be
39
+ // equality preserving
40
40
  };
41
41
 
42
42
  // [concept.regular.invocable]
@@ -21,8 +21,9 @@
21
21
  # pragma system_header
22
22
  #endif // no system header
23
23
 
24
+ #include <cuda/__cmath/mul_hi.h>
24
25
  #include <cuda/std/__cstddef/types.h>
25
- #include <cuda/std/cstring>
26
+ #include <cuda/std/__cstring/memset.h>
26
27
 
27
28
  #if !_CCCL_COMPILER(NVRTC)
28
29
  # include <cstdlib>
@@ -44,7 +45,7 @@ using ::malloc;
44
45
 
45
46
  const size_t __nbytes = __n * __size;
46
47
 
47
- if (::__umul64hi(__n, __size) == 0)
48
+ if (::cuda::mul_hi(__n, __size) == 0)
48
49
  {
49
50
  __ptr = ::cuda::std::malloc(__nbytes);
50
51
  if (__ptr != nullptr)
@@ -11,7 +11,7 @@
11
11
  #ifndef _CUDA_STD___FLOATING_POINT_FP_H
12
12
  #define _CUDA_STD___FLOATING_POINT_FP_H
13
13
 
14
- #include <cuda/__cccl_config>
14
+ #include <cuda/std/detail/__config>
15
15
 
16
16
  #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17
17
  # pragma GCC system_header
@@ -97,14 +97,13 @@ _CCCL_API inline _Tp& __mu(reference_wrapper<_Tp> __t, _Uj&)
97
97
  }
98
98
 
99
99
  template <class _Ti, class... _Uj, size_t... _Indx>
100
- _CCCL_API inline typename __invoke_of<_Ti&, _Uj...>::type
101
- __mu_expand(_Ti& __ti, tuple<_Uj...>& __uj, __tuple_indices<_Indx...>)
100
+ _CCCL_API inline invoke_result_t<_Ti&, _Uj...> __mu_expand(_Ti& __ti, tuple<_Uj...>& __uj, __tuple_indices<_Indx...>)
102
101
  {
103
102
  return __ti(::cuda::std::forward<_Uj>(::cuda::std::get<_Indx>(__uj))...);
104
103
  }
105
104
 
106
105
  template <class _Ti, class... _Uj>
107
- _CCCL_API inline enable_if_t<is_bind_expression<_Ti>::value, __invoke_of<_Ti&, _Uj...>>
106
+ _CCCL_API inline enable_if_t<is_bind_expression<_Ti>::value, invoke_result<_Ti&, _Uj...>>
108
107
  __mu(_Ti& __ti, tuple<_Uj...>& __uj)
109
108
  {
110
109
  using __indices = __make_tuple_indices_t<sizeof...(_Uj)>;
@@ -151,12 +150,12 @@ struct __mu_return_invocable // false
151
150
  template <class _Ti, class... _Uj>
152
151
  struct __mu_return_invocable<true, _Ti, _Uj...>
153
152
  {
154
- using type = typename __invoke_of<_Ti&, _Uj...>::type;
153
+ using type = invoke_result_t<_Ti&, _Uj...>;
155
154
  };
156
155
 
157
156
  template <class _Ti, class... _Uj>
158
157
  struct __mu_return_impl<_Ti, false, true, false, tuple<_Uj...>>
159
- : public __mu_return_invocable<__invocable<_Ti&, _Uj...>::value, _Ti, _Uj...>
158
+ : public __mu_return_invocable<is_invocable_v<_Ti&, _Uj...>, _Ti, _Uj...>
160
159
  {};
161
160
 
162
161
  template <class _Ti, class _TupleUj>
@@ -196,13 +195,13 @@ struct __is_valid_bind_return
196
195
  template <class _Fp, class... _BoundArgs, class _TupleUj>
197
196
  struct __is_valid_bind_return<_Fp, tuple<_BoundArgs...>, _TupleUj>
198
197
  {
199
- static const bool value = __invocable<_Fp, typename __mu_return<_BoundArgs, _TupleUj>::type...>::value;
198
+ static const bool value = is_invocable_v<_Fp, typename __mu_return<_BoundArgs, _TupleUj>::type...>;
200
199
  };
201
200
 
202
201
  template <class _Fp, class... _BoundArgs, class _TupleUj>
203
202
  struct __is_valid_bind_return<_Fp, const tuple<_BoundArgs...>, _TupleUj>
204
203
  {
205
- static const bool value = __invocable<_Fp, typename __mu_return<const _BoundArgs, _TupleUj>::type...>::value;
204
+ static const bool value = is_invocable_v<_Fp, typename __mu_return<const _BoundArgs, _TupleUj>::type...>;
206
205
  };
207
206
 
208
207
  template <class _Fp, class _BoundArgs, class _TupleUj, bool = __is_valid_bind_return<_Fp, _BoundArgs, _TupleUj>::value>
@@ -211,13 +210,13 @@ struct __bind_return;
211
210
  template <class _Fp, class... _BoundArgs, class _TupleUj>
212
211
  struct __bind_return<_Fp, tuple<_BoundArgs...>, _TupleUj, true>
213
212
  {
214
- using type = typename __invoke_of<_Fp&, typename __mu_return<_BoundArgs, _TupleUj>::type...>::type;
213
+ using type = invoke_result_t<_Fp&, typename __mu_return<_BoundArgs, _TupleUj>::type...>;
215
214
  };
216
215
 
217
216
  template <class _Fp, class... _BoundArgs, class _TupleUj>
218
217
  struct __bind_return<_Fp, const tuple<_BoundArgs...>, _TupleUj, true>
219
218
  {
220
- using type = typename __invoke_of<_Fp&, typename __mu_return<const _BoundArgs, _TupleUj>::type...>::type;
219
+ using type = invoke_result_t<_Fp&, typename __mu_return<const _BoundArgs, _TupleUj>::type...>;
221
220
  };
222
221
 
223
222
  template <class _Fp, class _BoundArgs, class _TupleUj>
@@ -295,8 +294,7 @@ public:
295
294
  result_type>
296
295
  operator()(_Args&&... __args)
297
296
  {
298
- using _Invoker = __invoke_void_return_wrapper<_Rp>;
299
- return _Invoker::__call(static_cast<base&>(*this), ::cuda::std::forward<_Args>(__args)...);
297
+ return ::cuda::std::invoke_r<_Rp>(static_cast<base&>(*this), ::cuda::std::forward<_Args>(__args)...);
300
298
  }
301
299
 
302
300
  template <class... _Args>
@@ -305,8 +303,7 @@ public:
305
303
  result_type>
306
304
  operator()(_Args&&... __args) const
307
305
  {
308
- using _Invoker = __invoke_void_return_wrapper<_Rp>;
309
- return _Invoker::__call(static_cast<base const&>(*this), ::cuda::std::forward<_Args>(__args)...);
306
+ return ::cuda::std::invoke_r<_Rp>(static_cast<base const&>(*this), ::cuda::std::forward<_Args>(__args)...);
310
307
  }
311
308
  };
312
309
 
@@ -174,8 +174,7 @@ public:
174
174
 
175
175
  _CCCL_API inline _Rp operator()(_ArgTypes&&... __arg)
176
176
  {
177
- using _Invoker = __invoke_void_return_wrapper<_Rp>;
178
- return _Invoker::__call(__f_.first(), ::cuda::std::forward<_ArgTypes>(__arg)...);
177
+ return ::cuda::std::invoke_r<_Rp>(__f_.first(), ::cuda::std::forward<_ArgTypes>(__arg)...);
179
178
  }
180
179
 
181
180
  _CCCL_API inline __alloc_func* __clone() const
@@ -227,8 +226,7 @@ public:
227
226
 
228
227
  _CCCL_API inline _Rp operator()(_ArgTypes&&... __arg)
229
228
  {
230
- using _Invoker = __invoke_void_return_wrapper<_Rp>;
231
- return _Invoker::__call(__f_, ::cuda::std::forward<_ArgTypes>(__arg)...);
229
+ return ::cuda::std::invoke_r<_Rp>(__f_, ::cuda::std::forward<_ArgTypes>(__arg)...);
232
230
  }
233
231
 
234
232
  _CCCL_API inline __default_alloc_func* __clone() const
@@ -963,7 +961,7 @@ public:
963
961
 
964
962
  virtual _Rp operator()(_ArgTypes&&... __arg)
965
963
  {
966
- return ::cuda::std::__invoke(__f_, ::cuda::std::forward<_ArgTypes>(__arg)...);
964
+ return ::cuda::std::invoke(__f_, ::cuda::std::forward<_ArgTypes>(__arg)...);
967
965
  }
968
966
 
969
967
  # ifndef _CCCL_NO_RTTI
@@ -996,13 +994,12 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT function<_Rp(_ArgTypes...)>
996
994
 
997
995
  __func __f_;
998
996
 
999
- template <class _Fp, bool = !is_same_v<remove_cvref_t<_Fp>, function> && __invocable<_Fp, _ArgTypes...>::value>
997
+ template <class _Fp, bool = !is_same_v<remove_cvref_t<_Fp>, function> && is_invocable_v<_Fp, _ArgTypes...>>
1000
998
  struct __callable;
1001
999
  template <class _Fp>
1002
1000
  struct __callable<_Fp, true>
1003
1001
  {
1004
- static const bool value =
1005
- is_void_v<_Rp> || __is_core_convertible<typename __invoke_of<_Fp, _ArgTypes...>::type, _Rp>::value;
1002
+ static const bool value = is_void_v<_Rp> || __is_core_convertible<invoke_result_t<_Fp, _ArgTypes...>, _Rp>::value;
1006
1003
  };
1007
1004
  template <class _Fp>
1008
1005
  struct __callable<_Fp, false>