cuda-cccl 0.3.0__cp313-cp313-manylinux_2_24_aarch64.whl → 0.3.2__cp313-cp313-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (294)
  1. cuda/cccl/cooperative/__init__.py +7 -1
  2. cuda/cccl/cooperative/experimental/__init__.py +21 -5
  3. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
  4. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
  5. cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
  6. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
  7. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
  8. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
  9. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
  10. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
  11. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
  12. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
  13. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
  14. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
  15. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
  16. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
  17. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
  18. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
  19. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
  20. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
  21. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
  22. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
  23. cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
  24. cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
  25. cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
  26. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +4 -2
  27. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
  28. cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
  29. cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
  30. cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
  31. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
  32. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
  33. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
  34. cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
  35. cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
  36. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +170 -260
  37. cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
  38. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +6 -7
  39. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
  40. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +12 -29
  41. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +2 -7
  42. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
  43. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
  44. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
  45. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
  46. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
  47. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
  48. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
  49. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
  50. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
  51. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
  52. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
  53. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
  54. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
  55. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
  56. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
  57. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
  58. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
  59. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
  60. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
  61. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
  62. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
  63. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
  64. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
  65. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
  66. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
  67. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
  68. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
  69. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
  70. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
  71. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
  72. cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
  73. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
  74. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
  75. cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
  76. cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
  77. cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
  78. cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
  79. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
  80. cuda/cccl/headers/include/cuda/__cccl_config +1 -0
  81. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
  82. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
  83. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
  84. cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
  85. cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
  86. cuda/cccl/headers/include/cuda/__device/arch_traits.h +247 -323
  87. cuda/cccl/headers/include/cuda/__device/attributes.h +174 -123
  88. cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
  89. cuda/cccl/headers/include/cuda/__device/device_ref.h +27 -49
  90. cuda/cccl/headers/include/cuda/__device/physical_device.h +100 -96
  91. cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
  92. cuda/cccl/headers/include/cuda/__event/event.h +27 -26
  93. cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
  94. cuda/cccl/headers/include/cuda/__event/timed_event.h +10 -7
  95. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  96. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
  97. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
  98. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
  99. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
  100. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
  101. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
  102. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
  103. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
  104. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +21 -137
  105. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
  106. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
  107. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
  108. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
  109. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
  110. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
  111. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
  112. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
  113. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
  114. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
  115. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
  116. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +9 -7
  117. cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
  118. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -16
  119. cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
  120. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  121. cuda/cccl/headers/include/cuda/algorithm +1 -1
  122. cuda/cccl/headers/include/cuda/cmath +1 -0
  123. cuda/cccl/headers/include/cuda/devices +13 -0
  124. cuda/cccl/headers/include/cuda/iterator +1 -0
  125. cuda/cccl/headers/include/cuda/memory +1 -0
  126. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
  127. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
  128. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
  129. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
  130. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
  131. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
  132. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
  133. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
  134. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
  135. cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
  136. cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
  137. cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
  138. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  139. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
  140. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
  141. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
  142. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  143. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  144. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
  145. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
  146. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
  147. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
  148. cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
  149. cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
  150. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
  151. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
  152. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
  153. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
  154. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
  155. cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
  156. cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
  157. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
  158. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
  159. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
  160. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
  161. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
  162. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
  163. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
  164. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
  165. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
  166. cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
  167. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
  168. cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
  169. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
  170. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
  171. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
  172. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
  173. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
  174. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
  175. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
  176. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
  177. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
  178. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
  179. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
  180. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
  181. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
  182. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
  183. cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
  184. cuda/cccl/headers/include/cuda/std/bitset +1 -1
  185. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
  186. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
  187. cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
  188. cuda/cccl/headers/include/cuda/std/numbers +5 -0
  189. cuda/cccl/headers/include/cuda/std/string_view +155 -13
  190. cuda/cccl/headers/include/cuda/std/version +1 -4
  191. cuda/cccl/headers/include/cuda/stream_ref +5 -0
  192. cuda/cccl/headers/include/cuda/utility +1 -0
  193. cuda/cccl/headers/include/nv/target +7 -2
  194. cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
  195. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
  196. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
  197. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
  198. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
  199. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
  200. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
  201. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
  202. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
  203. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
  204. cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
  205. cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
  206. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
  207. cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
  208. cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
  209. cuda/cccl/headers/include/thrust/device_delete.h +18 -3
  210. cuda/cccl/headers/include/thrust/device_free.h +16 -3
  211. cuda/cccl/headers/include/thrust/device_new.h +29 -8
  212. cuda/cccl/headers/include/thrust/host_vector.h +1 -1
  213. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
  214. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
  215. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
  216. cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
  217. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
  218. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
  219. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
  220. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
  221. cuda/cccl/parallel/experimental/__init__.py +21 -74
  222. cuda/compute/__init__.py +79 -0
  223. cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +43 -1
  224. cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +157 -8
  225. cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
  226. cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
  227. cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
  228. cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
  229. cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +112 -40
  230. cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
  231. cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
  232. cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +36 -15
  233. cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
  234. cuda/compute/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  235. cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
  236. cuda/compute/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  237. cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
  238. cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +2 -0
  239. cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +36 -8
  240. cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +206 -1
  241. cuda/{cccl/parallel/experimental → compute}/numba_utils.py +2 -2
  242. cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
  243. cuda/{cccl/parallel/experimental → compute}/typing.py +2 -0
  244. cuda/coop/__init__.py +8 -0
  245. cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
  246. cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
  247. cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
  248. cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
  249. cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
  250. cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
  251. cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
  252. cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
  253. cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
  254. cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
  255. cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
  256. cuda/coop/warp/__init__.py +9 -0
  257. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
  258. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
  259. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
  260. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
  261. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +275 -276
  262. cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
  263. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
  264. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
  265. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
  266. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
  267. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
  268. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
  269. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
  270. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
  271. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
  272. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
  273. cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
  274. cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
  275. cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
  276. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
  277. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
  278. cuda/cccl/parallel/experimental/.gitignore +0 -4
  279. cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  280. cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  281. /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
  282. /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
  283. /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
  284. /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
  285. /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
  286. /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
  287. /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
  288. /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
  289. /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
  290. /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
  291. /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
  292. /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
  293. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
  294. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
@@ -30,14 +30,20 @@
30
30
  # pragma system_header
31
31
  #endif // no system header
32
32
 
33
- #include <thrust/detail/memory_wrapper.h>
33
+ #include <thrust/detail/allocator/allocator_traits.h>
34
+ #include <thrust/detail/type_deduction.h>
34
35
  #include <thrust/detail/type_traits.h>
35
36
  #include <thrust/detail/type_traits/has_member_function.h>
36
37
  #include <thrust/detail/type_traits/has_nested_type.h>
38
+ #include <thrust/detail/type_traits/is_call_possible.h>
37
39
  #include <thrust/detail/type_traits/pointer_traits.h>
38
40
 
41
+ #include <cuda/std/__cccl/memory_wrapper.h>
42
+ #include <cuda/std/limits>
39
43
  #include <cuda/std/type_traits>
40
44
 
45
+ #include <new>
46
+
41
47
  THRUST_NAMESPACE_BEGIN
42
48
  namespace detail
43
49
  {
@@ -46,9 +52,11 @@ namespace detail
46
52
  template <typename Alloc>
47
53
  struct allocator_system;
48
54
 
55
+ template <typename Alloc>
56
+ struct allocator_traits;
57
+
49
58
  namespace allocator_traits_detail
50
59
  {
51
-
52
60
  __THRUST_DEFINE_HAS_NESTED_TYPE(has_value_type, value_type)
53
61
  __THRUST_DEFINE_HAS_NESTED_TYPE(has_pointer, pointer)
54
62
  __THRUST_DEFINE_HAS_NESTED_TYPE(has_const_pointer, const_pointer)
@@ -65,21 +73,11 @@ __THRUST_DEFINE_HAS_NESTED_TYPE(has_system_type, system_type)
65
73
  __THRUST_DEFINE_HAS_NESTED_TYPE(has_is_always_equal, is_always_equal)
66
74
  __THRUST_DEFINE_HAS_MEMBER_FUNCTION(has_member_system_impl, system)
67
75
 
68
- template <typename Alloc, typename U>
69
- struct has_rebind
70
- {
71
- using yes_type = char;
72
- using no_type = int;
73
-
74
- template <typename S>
75
- static yes_type test(typename S::template rebind<U>::other*);
76
- template <typename S>
77
- static no_type test(...);
76
+ template <typename Alloc, typename U, typename = void>
77
+ inline constexpr bool has_rebind = false;
78
78
 
79
- static bool const value = sizeof(test<U>(0)) == sizeof(yes_type);
80
-
81
- using type = thrust::detail::integral_constant<bool, value>;
82
- };
79
+ template <typename Alloc, typename U>
80
+ inline constexpr bool has_rebind<Alloc, U, ::cuda::std::void_t<decltype(U::template rebind<U>::other)>> = true;
83
81
 
84
82
  _CCCL_SUPPRESS_DEPRECATED_PUSH
85
83
 
@@ -99,8 +97,7 @@ THRUST_SPECIALIZE_DEPRECATED(has_const_reference)
99
97
  #undef THRUST_SPECIALIZE_DEPRECATED
100
98
 
101
99
  template <typename T, typename U>
102
- struct has_rebind<std::allocator<T>, U> : false_type
103
- {};
100
+ inline constexpr bool has_rebind<std::allocator<T>, U, void> = false;
104
101
 
105
102
  template <typename T>
106
103
  struct nested_pointer
@@ -191,7 +188,7 @@ struct has_member_system
191
188
 
192
189
  _CCCL_SUPPRESS_DEPRECATED_POP
193
190
 
194
- template <class Alloc, class U, bool = has_rebind<Alloc, U>::value>
191
+ template <class Alloc, class U, bool = has_rebind<Alloc, U>>
195
192
  struct rebind_alloc
196
193
  {
197
194
  using type = typename Alloc::template rebind<U>::other;
@@ -209,6 +206,159 @@ struct rebind_alloc<Alloc<T, Args...>, U, false>
209
206
  using type = Alloc<U, Args...>;
210
207
  };
211
208
 
209
+ __THRUST_DEFINE_IS_CALL_POSSIBLE(has_member_allocate_with_hint_impl, allocate)
210
+
211
+ template <typename Alloc>
212
+ class has_member_allocate_with_hint
213
+ {
214
+ using pointer = typename allocator_traits<Alloc>::pointer;
215
+ using size_type = typename allocator_traits<Alloc>::size_type;
216
+ using const_void_pointer = typename allocator_traits<Alloc>::const_void_pointer;
217
+
218
+ public:
219
+ using type = typename has_member_allocate_with_hint_impl<Alloc, pointer(size_type, const_void_pointer)>::type;
220
+ static const bool value = type::value;
221
+ };
222
+
223
+ template <typename Alloc>
224
+ _CCCL_HOST_DEVICE typename allocator_traits<Alloc>::pointer
225
+ allocate(Alloc& a,
226
+ typename allocator_traits<Alloc>::size_type n,
227
+ [[maybe_unused]] typename allocator_traits<Alloc>::const_void_pointer hint)
228
+ {
229
+ if constexpr (has_member_allocate_with_hint<Alloc>::value)
230
+ {
231
+ return a.allocate(n, hint);
232
+ }
233
+ else
234
+ {
235
+ return a.allocate(n);
236
+ }
237
+ }
238
+
239
+ __THRUST_DEFINE_IS_CALL_POSSIBLE(has_member_construct1_impl, construct)
240
+
241
+ template <typename Alloc, typename T>
242
+ struct has_member_construct1 : has_member_construct1_impl<Alloc, void(T*)>
243
+ {};
244
+
245
+ _CCCL_EXEC_CHECK_DISABLE
246
+ template <typename Alloc, typename T>
247
+ _CCCL_HOST_DEVICE void construct(Alloc& a, T* p)
248
+ {
249
+ if constexpr (has_member_construct1<Alloc, T>::value)
250
+ {
251
+ a.construct(p);
252
+ }
253
+ else
254
+ {
255
+ ::new (static_cast<void*>(p)) T();
256
+ }
257
+ }
258
+
259
+ __THRUST_DEFINE_IS_CALL_POSSIBLE(has_member_construct2_impl, construct)
260
+
261
+ template <typename Alloc, typename T, typename Arg1>
262
+ struct has_member_construct2 : has_member_construct2_impl<Alloc, void(T*, const Arg1&)>
263
+ {};
264
+
265
+ _CCCL_EXEC_CHECK_DISABLE
266
+ template <typename Alloc, typename T, typename Arg1>
267
+ _CCCL_HOST_DEVICE void construct(Alloc& a, T* p, const Arg1& arg1)
268
+ {
269
+ if constexpr (has_member_construct2<Alloc, T, Arg1>::value)
270
+ {
271
+ a.construct(p, arg1);
272
+ }
273
+ else
274
+ {
275
+ ::new (static_cast<void*>(p)) T(arg1);
276
+ }
277
+ }
278
+
279
+ __THRUST_DEFINE_IS_CALL_POSSIBLE(has_member_constructN_impl, construct)
280
+
281
+ template <typename Alloc, typename T, typename... Args>
282
+ struct has_member_constructN : has_member_constructN_impl<Alloc, void(T*, Args...)>
283
+ {};
284
+
285
+ _CCCL_EXEC_CHECK_DISABLE
286
+ template <typename Alloc, typename T, typename... Args>
287
+ inline _CCCL_HOST_DEVICE void construct([[maybe_unused]] Alloc& a, T* p, Args&&... args)
288
+ {
289
+ if constexpr (has_member_constructN<Alloc, T, Args...>::value)
290
+ {
291
+ a.construct(p, THRUST_FWD(args)...);
292
+ }
293
+ else
294
+ {
295
+ ::new (static_cast<void*>(p)) T(THRUST_FWD(args)...);
296
+ }
297
+ }
298
+
299
+ __THRUST_DEFINE_IS_CALL_POSSIBLE(has_member_destroy_impl, destroy)
300
+
301
+ template <typename Alloc, typename T>
302
+ struct has_member_destroy : has_member_destroy_impl<Alloc, void(T*)>
303
+ {};
304
+
305
+ _CCCL_EXEC_CHECK_DISABLE
306
+ template <typename Alloc, typename T>
307
+ _CCCL_HOST_DEVICE void destroy([[maybe_unused]] Alloc& a, T* p)
308
+ {
309
+ if constexpr (has_member_destroy<Alloc, T>::value)
310
+ {
311
+ a.destroy(p);
312
+ }
313
+ else
314
+ {
315
+ p->~T();
316
+ }
317
+ }
318
+
319
+ __THRUST_DEFINE_IS_CALL_POSSIBLE(has_member_max_size_impl, max_size)
320
+
321
+ template <typename Alloc>
322
+ class has_member_max_size
323
+ {
324
+ using size_type = typename allocator_traits<Alloc>::size_type;
325
+
326
+ public:
327
+ using type = typename has_member_max_size_impl<Alloc, size_type()>::type;
328
+ static const bool value = type::value;
329
+ };
330
+
331
+ template <typename Alloc>
332
+ _CCCL_HOST_DEVICE typename allocator_traits<Alloc>::size_type max_size([[maybe_unused]] const Alloc& a)
333
+ {
334
+ if constexpr (has_member_max_size<Alloc>::value)
335
+ {
336
+ return a.max_size();
337
+ }
338
+ else
339
+ {
340
+ using size_type = typename allocator_traits<Alloc>::size_type;
341
+ return ::cuda::std::numeric_limits<size_type>::max();
342
+ }
343
+ }
344
+
345
+ // TODO(bgruber): can be return decltype(auto) here?
346
+ template <typename Alloc>
347
+ _CCCL_HOST_DEVICE ::cuda::std::
348
+ _If<has_member_system<Alloc>::value, typename allocator_system<Alloc>::type&, typename allocator_system<Alloc>::type>
349
+ system(Alloc& a)
350
+ {
351
+ if constexpr (has_member_system<Alloc>::value)
352
+ {
353
+ // return the allocator's system
354
+ return a.system();
355
+ }
356
+ else
357
+ {
358
+ // return a copy of a value-initialized system
359
+ return typename allocator_system<Alloc>::type{};
360
+ }
361
+ }
212
362
  } // namespace allocator_traits_detail
213
363
 
214
364
  template <typename Alloc>
@@ -293,32 +443,157 @@ public:
293
443
  using other = allocator_traits;
294
444
 
295
445
  // Deprecated std::allocator aliases that we need:
296
- using reference = typename thrust::detail::pointer_traits<pointer>::reference;
297
- using const_reference = typename thrust::detail::pointer_traits<const_pointer>::reference;
298
-
299
- inline _CCCL_HOST_DEVICE static pointer allocate(allocator_type& a, size_type n);
446
+ using reference = typename pointer_traits<pointer>::reference;
447
+ using const_reference = typename pointer_traits<const_pointer>::reference;
300
448
 
301
- inline _CCCL_HOST_DEVICE static pointer allocate(allocator_type& a, size_type n, const_void_pointer hint);
449
+ inline _CCCL_HOST_DEVICE static pointer allocate(allocator_type& a, size_type n)
450
+ {
451
+ struct workaround_warnings
452
+ {
453
+ _CCCL_EXEC_CHECK_DISABLE
454
+ static _CCCL_HOST_DEVICE pointer allocate(Alloc& a, size_type n)
455
+ {
456
+ return a.allocate(n);
457
+ }
458
+ };
459
+
460
+ return workaround_warnings::allocate(a, n);
461
+ }
462
+
463
+ inline _CCCL_HOST_DEVICE static pointer allocate(allocator_type& a, size_type n, const_void_pointer hint)
464
+ {
465
+ return allocator_traits_detail::allocate(a, n, hint);
466
+ }
302
467
 
303
- inline _CCCL_HOST_DEVICE static void deallocate(allocator_type& a, pointer p, size_type n) noexcept;
468
+ inline _CCCL_HOST_DEVICE static void deallocate(allocator_type& a, pointer p, size_type n) noexcept
469
+ {
470
+ struct workaround_warnings
471
+ {
472
+ _CCCL_EXEC_CHECK_DISABLE
473
+ static _CCCL_HOST_DEVICE void deallocate(Alloc& a, pointer p, size_type n) noexcept
474
+ {
475
+ return a.deallocate(p, n);
476
+ }
477
+ };
478
+
479
+ return workaround_warnings::deallocate(a, p, n);
480
+ }
304
481
 
305
482
  // XXX should probably change T* to pointer below and then relax later
306
483
 
307
484
  template <typename T>
308
- inline _CCCL_HOST_DEVICE static void construct(allocator_type& a, T* p);
485
+ _CCCL_HOST_DEVICE static void construct(allocator_type& a, T* p)
486
+ {
487
+ return allocator_traits_detail::construct(a, p);
488
+ }
309
489
 
310
490
  template <typename T, typename Arg1>
311
- inline _CCCL_HOST_DEVICE static void construct(allocator_type& a, T* p, const Arg1& arg1);
491
+ _CCCL_HOST_DEVICE static void construct(allocator_type& a, T* p, const Arg1& arg1)
492
+ {
493
+ return allocator_traits_detail::construct(a, p, arg1);
494
+ }
312
495
 
313
496
  template <typename T, typename... Args>
314
- inline _CCCL_HOST_DEVICE static void construct(allocator_type& a, T* p, Args&&... args);
497
+ _CCCL_HOST_DEVICE static void construct(allocator_type& a, T* p, Args&&... args)
498
+ {
499
+ return allocator_traits_detail::construct(a, p, THRUST_FWD(args)...);
500
+ }
315
501
 
316
502
  template <typename T>
317
- inline _CCCL_HOST_DEVICE static void destroy(allocator_type& a, T* p) noexcept;
503
+ _CCCL_HOST_DEVICE static void destroy(allocator_type& a, T* p) noexcept
504
+ {
505
+ return allocator_traits_detail::destroy(a, p);
506
+ }
318
507
 
319
- inline _CCCL_HOST_DEVICE static size_type max_size(const allocator_type& a);
508
+ _CCCL_HOST_DEVICE static size_type max_size(const allocator_type& a)
509
+ {
510
+ return allocator_traits_detail::max_size(a);
511
+ }
320
512
  }; // end allocator_traits
321
513
 
514
+ // std::allocator's member functions are deprecated in C++17 and removed in
515
+ // C++20, so we can't just use the generic implementation for allocator_traits
516
+ // that calls the allocator's member functions.
517
+ // Instead, specialize allocator_traits for std::allocator and defer to
518
+ // std::allocator_traits<std::allocator> and let the STL do whatever it needs
519
+ // to for the current c++ version. Manually forward the calls to suppress
520
+ // host/device warnings.
521
+ template <typename T>
522
+ struct allocator_traits<std::allocator<T>> : public std::allocator_traits<std::allocator<T>>
523
+ {
524
+ private:
525
+ using superclass = std::allocator_traits<std::allocator<T>>;
526
+
527
+ public:
528
+ using allocator_type = typename superclass::allocator_type;
529
+ using value_type = typename superclass::value_type;
530
+ using pointer = typename superclass::pointer;
531
+ using const_pointer = typename superclass::const_pointer;
532
+ using void_pointer = typename superclass::void_pointer;
533
+ using const_void_pointer = typename superclass::const_void_pointer;
534
+ using difference_type = typename superclass::difference_type;
535
+ using size_type = typename superclass::size_type;
536
+ using propagate_on_container_swap = typename superclass::propagate_on_container_swap;
537
+ using propagate_on_container_copy_assignment = typename superclass::propagate_on_container_copy_assignment;
538
+ using propagate_on_container_move_assignment = typename superclass::propagate_on_container_move_assignment;
539
+
540
+ // std::allocator_traits added this in C++17, but thrust::allocator_traits defines
541
+ // it unconditionally.
542
+ using is_always_equal =
543
+ typename eval_if<allocator_traits_detail::has_is_always_equal<allocator_type>::value,
544
+ allocator_traits_detail::nested_is_always_equal<allocator_type>,
545
+ ::cuda::std::is_empty<allocator_type>>::type;
546
+
547
+ // std::allocator_traits doesn't provide these, but
548
+ // thrust::detail::allocator_traits does. These used to be part of the
549
+ // std::allocator API but were deprecated in C++17.
550
+ using reference = typename pointer_traits<pointer>::reference;
551
+ using const_reference = typename pointer_traits<const_pointer>::reference;
552
+
553
+ template <typename U>
554
+ using rebind_alloc = std::allocator<U>;
555
+ template <typename U>
556
+ using rebind_traits = allocator_traits<std::allocator<U>>;
557
+
558
+ _CCCL_EXEC_CHECK_DISABLE
559
+ _CCCL_HOST_DEVICE static pointer allocate(allocator_type& a, size_type n)
560
+ {
561
+ return superclass::allocate(a, n);
562
+ }
563
+
564
+ _CCCL_EXEC_CHECK_DISABLE
565
+ _CCCL_HOST_DEVICE static pointer allocate(allocator_type& a, size_type n, const_void_pointer hint)
566
+ {
567
+ return superclass::allocate(a, n, hint);
568
+ }
569
+
570
+ _CCCL_EXEC_CHECK_DISABLE
571
+ _CCCL_HOST_DEVICE static void deallocate(allocator_type& a, pointer p, size_type n) noexcept
572
+ {
573
+ superclass::deallocate(a, p, n);
574
+ }
575
+
576
+ _CCCL_EXEC_CHECK_DISABLE
577
+ template <typename U, typename... Args>
578
+ _CCCL_HOST_DEVICE static void construct(allocator_type& a, U* p, Args&&... args)
579
+ {
580
+ superclass::construct(a, p, THRUST_FWD(args)...);
581
+ }
582
+
583
+ _CCCL_EXEC_CHECK_DISABLE
584
+ template <typename U>
585
+ _CCCL_HOST_DEVICE static void destroy(allocator_type& a, U* p) noexcept
586
+ {
587
+ superclass::destroy(a, p);
588
+ }
589
+
590
+ _CCCL_EXEC_CHECK_DISABLE
591
+ _CCCL_HOST_DEVICE static size_type max_size(const allocator_type& a)
592
+ {
593
+ return superclass::max_size(a);
594
+ }
595
+ };
596
+
322
597
  // we consider a type an allocator if T::value_type exists
323
598
  // it doesn't make much sense (containers, which are not allocators, will fulfill this requirement),
324
599
  // but allocator_traits is specified to work for any type with that nested alias
@@ -341,10 +616,11 @@ struct allocator_system
341
616
  ::cuda::std::add_lvalue_reference<type>,
342
617
  ::cuda::std::type_identity<type>>::type;
343
618
 
344
- _CCCL_HOST_DEVICE inline static get_result_type get(Alloc& a);
619
+ _CCCL_HOST_DEVICE inline static get_result_type get(Alloc& a)
620
+ {
621
+ return allocator_traits_detail::system(a);
622
+ }
345
623
  };
346
624
 
347
625
  } // namespace detail
348
626
  THRUST_NAMESPACE_END
349
-
350
- #include <thrust/detail/allocator/allocator_traits.inl>
@@ -25,21 +25,168 @@
25
25
  #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
26
26
  # pragma system_header
27
27
  #endif // no system header
28
+
29
+ #include <thrust/advance.h>
30
+ #include <thrust/detail/allocator/allocator_traits.h>
31
+ #include <thrust/detail/copy.h>
28
32
  #include <thrust/detail/execution_policy.h>
33
+ #include <thrust/detail/type_traits/pointer_traits.h>
34
+ #include <thrust/distance.h>
35
+ #include <thrust/for_each.h>
36
+ #include <thrust/iterator/iterator_traits.h>
37
+ #include <thrust/iterator/zip_iterator.h>
38
+ #include <thrust/tuple.h>
39
+
40
+ #include <cuda/std/__cccl/memory_wrapper.h>
29
41
 
30
42
  THRUST_NAMESPACE_BEGIN
31
43
  namespace detail
32
44
  {
45
+ template <typename Allocator, typename InputType, typename OutputType>
46
+ struct copy_construct_with_allocator
47
+ {
48
+ Allocator& a;
49
+
50
+ template <typename Tuple>
51
+ inline _CCCL_HOST_DEVICE void operator()(Tuple t)
52
+ {
53
+ const InputType& in = thrust::get<0>(t);
54
+ OutputType& out = thrust::get<1>(t);
55
+
56
+ allocator_traits<Allocator>::construct(a, &out, in);
57
+ }
58
+ };
59
+
60
+ // we need to use allocator_traits<Allocator>::construct() to
61
+ // copy construct a T if either:
62
+ // 1. Allocator has a 2-argument construct() member or
63
+ // 2. T has a non-trivial copy constructor
64
+ template <typename Allocator, typename T>
65
+ inline constexpr bool needs_copy_construct_via_allocator =
66
+ allocator_traits_detail::has_member_construct2<Allocator, T, T>::value
67
+ || !::cuda::std::is_trivially_copy_constructible<T>::value;
68
+
69
+ // we know that std::allocator::construct's only effect is to call T's
70
+ // copy constructor, so we needn't consider or use its construct() member for copy construction
71
+ template <typename U, typename T>
72
+ inline constexpr bool needs_copy_construct_via_allocator<std::allocator<U>, T> =
73
+ !::cuda::std::is_trivially_copy_constructible_v<T>;
74
+
75
+ // XXX it's regrettable that this implementation is copied almost
76
+ // exactly from system::detail::generic::uninitialized_copy
77
+ // perhaps generic::uninitialized_copy could call this routine
78
+ // with a default allocator
79
+ template <typename Allocator, typename FromSystem, typename ToSystem, typename InputIterator, typename Pointer>
80
+ _CCCL_HOST_DEVICE Pointer uninitialized_copy_with_allocator(
81
+ Allocator& a,
82
+ const thrust::execution_policy<FromSystem>& from_system,
83
+ const thrust::execution_policy<ToSystem>& to_system,
84
+ InputIterator first,
85
+ InputIterator last,
86
+ Pointer result)
87
+ {
88
+ if constexpr (::cuda::std::is_convertible_v<FromSystem, ToSystem>)
89
+ {
90
+ // zip up the iterators
91
+ using IteratorTuple = thrust::tuple<InputIterator, Pointer>;
92
+ using ZipIterator = thrust::zip_iterator<IteratorTuple>;
93
+
94
+ ZipIterator begin = thrust::make_zip_iterator(first, result);
95
+ ZipIterator end = begin;
96
+
97
+ // get a zip_iterator pointing to the end
98
+ const thrust::detail::it_difference_t<InputIterator> n = ::cuda::std::distance(first, last);
99
+ ::cuda::std::advance(end, n);
100
+
101
+ // create a functor
102
+ using InputType = it_value_t<InputIterator>;
103
+ using OutputType = it_value_t<Pointer>;
104
+
105
+ // do the for_each
106
+ // note we use to_system to dispatch the for_each
107
+ thrust::for_each(to_system, begin, end, copy_construct_with_allocator<Allocator, InputType, OutputType>{a});
108
+
109
+ // return the end of the output range
110
+ return thrust::get<1>(end.get_iterator_tuple());
111
+ }
112
+ else
113
+ {
114
+ // the systems aren't trivially interoperable
115
+ // just call two_system_copy and hope for the best
116
+ return thrust::detail::two_system_copy(from_system, to_system, first, last, result);
117
+ }
118
+ }
119
+
120
+ // XXX it's regrettable that this implementation is copied almost
121
+ // exactly from system::detail::generic::uninitialized_copy_n
122
+ // perhaps generic::uninitialized_copy_n could call this routine
123
+ // with a default allocator
124
+ template <typename Allocator, typename FromSystem, typename ToSystem, typename InputIterator, typename Size, typename Pointer>
125
+ _CCCL_HOST_DEVICE Pointer uninitialized_copy_with_allocator_n(
126
+ Allocator& a,
127
+ const thrust::execution_policy<FromSystem>& from_system,
128
+ const thrust::execution_policy<ToSystem>& to_system,
129
+ InputIterator first,
130
+ Size n,
131
+ Pointer result)
132
+ {
133
+ if constexpr (::cuda::std::is_convertible_v<FromSystem, ToSystem>)
134
+ {
135
+ // zip up the iterators
136
+ using IteratorTuple = thrust::tuple<InputIterator, Pointer>;
137
+ using ZipIterator = thrust::zip_iterator<IteratorTuple>;
138
+
139
+ ZipIterator begin = thrust::make_zip_iterator(first, result);
140
+
141
+ // create a functor
142
+ using InputType = it_value_t<InputIterator>;
143
+ using OutputType = it_value_t<Pointer>;
144
+
145
+ // do the for_each_n
146
+ // note we use to_system to dispatch the for_each_n
147
+ ZipIterator end =
148
+ thrust::for_each_n(to_system, begin, n, copy_construct_with_allocator<Allocator, InputType, OutputType>(a));
149
+
150
+ // return the end of the output range
151
+ return thrust::get<1>(end.get_iterator_tuple());
152
+ }
153
+ else
154
+ {
155
+ // the systems aren't trivially interoperable
156
+ // just call two_system_copy_n and hope for the best
157
+ return thrust::detail::two_system_copy_n(from_system, to_system, first, n, result);
158
+ }
159
+ }
33
160
 
34
161
  template <typename System, typename Allocator, typename InputIterator, typename Pointer>
35
162
  _CCCL_HOST_DEVICE Pointer copy_construct_range(
36
- thrust::execution_policy<System>& from_system, Allocator& a, InputIterator first, InputIterator last, Pointer result);
163
+ thrust::execution_policy<System>& from_system, Allocator& a, InputIterator first, InputIterator last, Pointer result)
164
+ {
165
+ if constexpr (needs_copy_construct_via_allocator<Allocator, typename pointer_element<Pointer>::type>)
166
+ {
167
+ return uninitialized_copy_with_allocator(a, from_system, allocator_system<Allocator>::get(a), first, last, result);
168
+ }
169
+ else
170
+ {
171
+ // just call two_system_copy
172
+ return thrust::detail::two_system_copy(from_system, allocator_system<Allocator>::get(a), first, last, result);
173
+ }
174
+ }
37
175
 
38
176
  template <typename System, typename Allocator, typename InputIterator, typename Size, typename Pointer>
39
177
  _CCCL_HOST_DEVICE Pointer copy_construct_range_n(
40
- thrust::execution_policy<System>& from_system, Allocator& a, InputIterator first, Size n, Pointer result);
178
+ thrust::execution_policy<System>& from_system, Allocator& a, InputIterator first, Size n, Pointer result)
179
+ {
180
+ if constexpr (needs_copy_construct_via_allocator<Allocator, typename pointer_element<Pointer>::type>)
181
+ {
182
+ return uninitialized_copy_with_allocator_n(a, from_system, allocator_system<Allocator>::get(a), first, n, result);
183
+ }
184
+ else
185
+ {
186
+ // just call two_system_copy_n
187
+ return thrust::detail::two_system_copy_n(from_system, allocator_system<Allocator>::get(a), first, n, result);
188
+ }
189
+ }
41
190
 
42
191
  } // namespace detail
43
192
  THRUST_NAMESPACE_END
44
-
45
- #include <thrust/detail/allocator/copy_construct_range.inl>
@@ -26,14 +26,71 @@
26
26
  # pragma system_header
27
27
  #endif // no system header
28
28
 
29
+ #include <thrust/detail/allocator/allocator_traits.h>
30
+ #include <thrust/detail/allocator/destroy_range.h>
31
+ #include <thrust/detail/type_traits/pointer_traits.h>
32
+ #include <thrust/for_each.h>
33
+
34
+ #include <cuda/std/__cccl/memory_wrapper.h>
35
+
29
36
  THRUST_NAMESPACE_BEGIN
30
37
  namespace detail
31
38
  {
39
+ // destroy_range has three cases:
40
+ // if Allocator has an effectful member function destroy:
41
+ // 1. destroy via the allocator
42
+ // else
43
+ // 2. if T has a non-trivial destructor, destroy the range without using the allocator
44
+ // 3. if T has a trivial destructor, do a no-op
45
+
46
+ template <typename Allocator, typename T>
47
+ inline constexpr bool has_effectful_member_destroy = allocator_traits_detail::has_member_destroy<Allocator, T>::value;
48
+
49
+ // std::allocator::destroy's only effect is to invoke its argument's destructor
50
+ template <typename U, typename T>
51
+ inline constexpr bool has_effectful_member_destroy<std::allocator<U>, T> = false;
52
+
53
+ template <typename Allocator>
54
+ struct destroy_via_allocator
55
+ {
56
+ Allocator& a;
57
+
58
+ template <typename T>
59
+ _CCCL_HOST_DEVICE void operator()(T& x) noexcept
60
+ {
61
+ allocator_traits<Allocator>::destroy(a, &x);
62
+ }
63
+ };
64
+
65
+ // we must prepare for His coming
66
+ struct gozer
67
+ {
68
+ _CCCL_EXEC_CHECK_DISABLE
69
+ template <typename T>
70
+ inline _CCCL_HOST_DEVICE void operator()(T& x) noexcept
71
+ {
72
+ x.~T();
73
+ }
74
+ };
32
75
 
33
76
  template <typename Allocator, typename Pointer, typename Size>
34
- _CCCL_HOST_DEVICE inline void destroy_range(Allocator& a, Pointer p, Size n) noexcept;
77
+ _CCCL_HOST_DEVICE void
78
+ destroy_range([[maybe_unused]] Allocator& a, [[maybe_unused]] Pointer p, [[maybe_unused]] Size n) noexcept
79
+ {
80
+ using pe_t = typename pointer_element<Pointer>::type;
81
+
82
+ // case 1: destroy via allocator
83
+ if constexpr (has_effectful_member_destroy<Allocator, pe_t>)
84
+ {
85
+ thrust::for_each_n(allocator_system<Allocator>::get(a), p, n, destroy_via_allocator<Allocator>{a});
86
+ }
87
+ // case 2: destroy without the allocator
88
+ else if constexpr (!::cuda::std::is_trivially_destructible_v<pe_t>)
89
+ {
90
+ thrust::for_each_n(allocator_system<Allocator>::get(a), p, n, gozer());
91
+ }
92
+ // case 3: Allocator has no member function "destroy", and T has a trivial destructor, nothing to be done
93
+ }
35
94
 
36
95
  } // namespace detail
37
96
  THRUST_NAMESPACE_END
38
-
39
- #include <thrust/detail/allocator/destroy_range.inl>