cuda-cccl 0.3.0__cp313-cp313-manylinux_2_24_aarch64.whl → 0.3.2__cp313-cp313-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (294)
  1. cuda/cccl/cooperative/__init__.py +7 -1
  2. cuda/cccl/cooperative/experimental/__init__.py +21 -5
  3. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
  4. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
  5. cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
  6. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
  7. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
  8. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
  9. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
  10. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
  11. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
  12. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
  13. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
  14. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
  15. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
  16. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
  17. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
  18. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
  19. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
  20. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
  21. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
  22. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
  23. cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
  24. cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
  25. cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
  26. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +4 -2
  27. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
  28. cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
  29. cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
  30. cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
  31. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
  32. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
  33. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
  34. cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
  35. cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
  36. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +170 -260
  37. cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
  38. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +6 -7
  39. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
  40. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +12 -29
  41. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +2 -7
  42. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
  43. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
  44. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
  45. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
  46. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
  47. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
  48. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
  49. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
  50. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
  51. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
  52. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
  53. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
  54. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
  55. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
  56. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
  57. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
  58. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
  59. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
  60. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
  61. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
  62. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
  63. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
  64. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
  65. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
  66. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
  67. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
  68. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
  69. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
  70. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
  71. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
  72. cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
  73. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
  74. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
  75. cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
  76. cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
  77. cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
  78. cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
  79. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
  80. cuda/cccl/headers/include/cuda/__cccl_config +1 -0
  81. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
  82. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
  83. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
  84. cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
  85. cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
  86. cuda/cccl/headers/include/cuda/__device/arch_traits.h +247 -323
  87. cuda/cccl/headers/include/cuda/__device/attributes.h +174 -123
  88. cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
  89. cuda/cccl/headers/include/cuda/__device/device_ref.h +27 -49
  90. cuda/cccl/headers/include/cuda/__device/physical_device.h +100 -96
  91. cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
  92. cuda/cccl/headers/include/cuda/__event/event.h +27 -26
  93. cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
  94. cuda/cccl/headers/include/cuda/__event/timed_event.h +10 -7
  95. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  96. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
  97. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
  98. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
  99. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
  100. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
  101. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
  102. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
  103. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
  104. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +21 -137
  105. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
  106. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
  107. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
  108. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
  109. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
  110. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
  111. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
  112. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
  113. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
  114. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
  115. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
  116. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +9 -7
  117. cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
  118. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -16
  119. cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
  120. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  121. cuda/cccl/headers/include/cuda/algorithm +1 -1
  122. cuda/cccl/headers/include/cuda/cmath +1 -0
  123. cuda/cccl/headers/include/cuda/devices +13 -0
  124. cuda/cccl/headers/include/cuda/iterator +1 -0
  125. cuda/cccl/headers/include/cuda/memory +1 -0
  126. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
  127. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
  128. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
  129. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
  130. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
  131. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
  132. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
  133. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
  134. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
  135. cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
  136. cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
  137. cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
  138. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  139. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
  140. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
  141. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
  142. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  143. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  144. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
  145. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
  146. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
  147. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
  148. cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
  149. cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
  150. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
  151. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
  152. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
  153. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
  154. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
  155. cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
  156. cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
  157. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
  158. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
  159. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
  160. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
  161. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
  162. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
  163. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
  164. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
  165. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
  166. cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
  167. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
  168. cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
  169. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
  170. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
  171. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
  172. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
  173. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
  174. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
  175. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
  176. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
  177. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
  178. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
  179. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
  180. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
  181. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
  182. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
  183. cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
  184. cuda/cccl/headers/include/cuda/std/bitset +1 -1
  185. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
  186. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
  187. cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
  188. cuda/cccl/headers/include/cuda/std/numbers +5 -0
  189. cuda/cccl/headers/include/cuda/std/string_view +155 -13
  190. cuda/cccl/headers/include/cuda/std/version +1 -4
  191. cuda/cccl/headers/include/cuda/stream_ref +5 -0
  192. cuda/cccl/headers/include/cuda/utility +1 -0
  193. cuda/cccl/headers/include/nv/target +7 -2
  194. cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
  195. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
  196. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
  197. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
  198. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
  199. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
  200. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
  201. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
  202. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
  203. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
  204. cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
  205. cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
  206. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
  207. cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
  208. cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
  209. cuda/cccl/headers/include/thrust/device_delete.h +18 -3
  210. cuda/cccl/headers/include/thrust/device_free.h +16 -3
  211. cuda/cccl/headers/include/thrust/device_new.h +29 -8
  212. cuda/cccl/headers/include/thrust/host_vector.h +1 -1
  213. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
  214. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
  215. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
  216. cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
  217. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
  218. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
  219. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
  220. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
  221. cuda/cccl/parallel/experimental/__init__.py +21 -74
  222. cuda/compute/__init__.py +79 -0
  223. cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +43 -1
  224. cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +157 -8
  225. cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
  226. cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
  227. cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
  228. cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
  229. cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +112 -40
  230. cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
  231. cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
  232. cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +36 -15
  233. cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
  234. cuda/compute/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  235. cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
  236. cuda/compute/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  237. cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
  238. cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +2 -0
  239. cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +36 -8
  240. cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +206 -1
  241. cuda/{cccl/parallel/experimental → compute}/numba_utils.py +2 -2
  242. cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
  243. cuda/{cccl/parallel/experimental → compute}/typing.py +2 -0
  244. cuda/coop/__init__.py +8 -0
  245. cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
  246. cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
  247. cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
  248. cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
  249. cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
  250. cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
  251. cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
  252. cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
  253. cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
  254. cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
  255. cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
  256. cuda/coop/warp/__init__.py +9 -0
  257. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
  258. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
  259. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
  260. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
  261. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +275 -276
  262. cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
  263. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
  264. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
  265. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
  266. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
  267. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
  268. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
  269. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
  270. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
  271. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
  272. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
  273. cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
  274. cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
  275. cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
  276. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
  277. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
  278. cuda/cccl/parallel/experimental/.gitignore +0 -4
  279. cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  280. cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  281. /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
  282. /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
  283. /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
  284. /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
  285. /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
  286. /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
  287. /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
  288. /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
  289. /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
  290. /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
  291. /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
  292. /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
  293. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
  294. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
@@ -4,7 +4,7 @@
4
4
 
5
5
  # Python signatures are declared in the companion Python stub file _bindings.pyi
6
6
  # Make sure to update PYI with change to Python API to ensure that Python
7
- # static type checker tools like mypy green-lights cuda.cccl.parallel
7
+ # static type checker tools like mypy green-lights cuda.compute
8
8
 
9
9
  from libc.string cimport memset, memcpy
10
10
  from libc.stdint cimport uint8_t, uint32_t, uint64_t, int64_t, uintptr_t
@@ -120,6 +120,10 @@ cdef extern from "cccl/c/types.h":
120
120
  ASCENDING "CCCL_ASCENDING"
121
121
  DESCENDING "CCCL_DESCENDING"
122
122
 
123
+ cpdef enum cccl_init_kind_t:
124
+ VALUE_INIT "CCCL_VALUE_INIT"
125
+ FUTURE_VALUE_INIT "CCCL_FUTURE_VALUE_INIT"
126
+ NO_INIT "CCCL_NO_INIT"
123
127
 
124
128
  cdef void arg_type_check(
125
129
  str arg_name,
@@ -136,6 +140,7 @@ OpKind = cccl_op_kind_t
136
140
  TypeEnum = cccl_type_enum
137
141
  IteratorKind = cccl_iterator_kind_t
138
142
  SortOrder = cccl_sort_order_t
143
+ InitKind = cccl_init_kind_t
139
144
 
140
145
  cdef void _validate_alignment(int alignment) except *:
141
146
  """
@@ -724,6 +729,11 @@ cdef class Iterator:
724
729
  else:
725
730
  return IteratorKind.ITERATOR
726
731
 
732
+ @property
733
+ def value_type(self):
734
+ cdef cccl_type_info type_info = self.iter_data.value_type
735
+ return TypeInfo(type_info.size, type_info.alignment, type_info.type)
736
+
727
737
  def is_kind_pointer(self):
728
738
  cdef cccl_iterator_kind_t it_kind = self.iter_data.type
729
739
  return (it_kind == cccl_iterator_kind_t.POINTER)
@@ -947,8 +957,9 @@ cdef extern from "cccl/c/scan.h":
947
957
  cccl_iterator_t,
948
958
  cccl_iterator_t,
949
959
  cccl_op_t,
950
- cccl_value_t,
960
+ cccl_type_info,
951
961
  _Bool,
962
+ cccl_init_kind_t,
952
963
  int, int, const char*, const char*, const char*, const char*
953
964
  ) nogil
954
965
 
@@ -976,6 +987,41 @@ cdef extern from "cccl/c/scan.h":
976
987
  CUstream
977
988
  ) nogil
978
989
 
990
+ cdef CUresult cccl_device_exclusive_scan_future_value(
991
+ cccl_device_scan_build_result_t,
992
+ void *,
993
+ size_t *,
994
+ cccl_iterator_t,
995
+ cccl_iterator_t,
996
+ uint64_t,
997
+ cccl_op_t,
998
+ cccl_iterator_t,
999
+ CUstream
1000
+ ) nogil
1001
+
1002
+ cdef CUresult cccl_device_inclusive_scan_future_value(
1003
+ cccl_device_scan_build_result_t,
1004
+ void *,
1005
+ size_t *,
1006
+ cccl_iterator_t,
1007
+ cccl_iterator_t,
1008
+ uint64_t,
1009
+ cccl_op_t,
1010
+ cccl_iterator_t,
1011
+ CUstream
1012
+ ) nogil
1013
+
1014
+ cdef CUresult cccl_device_inclusive_scan_no_init(
1015
+ cccl_device_scan_build_result_t,
1016
+ void *,
1017
+ size_t *,
1018
+ cccl_iterator_t,
1019
+ cccl_iterator_t,
1020
+ uint64_t,
1021
+ cccl_op_t,
1022
+ CUstream
1023
+ ) nogil
1024
+
979
1025
  cdef CUresult cccl_device_scan_cleanup(
980
1026
  cccl_device_scan_build_result_t*
981
1027
  ) nogil
@@ -989,8 +1035,9 @@ cdef class DeviceScanBuildResult:
989
1035
  Iterator d_in,
990
1036
  Iterator d_out,
991
1037
  Op op,
992
- Value h_init,
1038
+ TypeInfo init_type,
993
1039
  bint force_inclusive,
1040
+ cccl_init_kind_t init_kind,
994
1041
  CommonData common_data
995
1042
  ):
996
1043
  cdef CUresult status = -1
@@ -1008,8 +1055,9 @@ cdef class DeviceScanBuildResult:
1008
1055
  d_in.iter_data,
1009
1056
  d_out.iter_data,
1010
1057
  op.op_data,
1011
- h_init.value_data,
1058
+ init_type.type_info,
1012
1059
  force_inclusive,
1060
+ init_kind,
1013
1061
  cc_major,
1014
1062
  cc_minor,
1015
1063
  cub_path,
@@ -1035,7 +1083,7 @@ cdef class DeviceScanBuildResult:
1035
1083
  Iterator d_out,
1036
1084
  size_t num_items,
1037
1085
  Op op,
1038
- Value h_init,
1086
+ Value init_value,
1039
1087
  stream
1040
1088
  ):
1041
1089
  cdef CUresult status = -1
@@ -1052,7 +1100,7 @@ cdef class DeviceScanBuildResult:
1052
1100
  d_out.iter_data,
1053
1101
  <uint64_t>num_items,
1054
1102
  op.op_data,
1055
- h_init.value_data,
1103
+ init_value.value_data,
1056
1104
  c_stream
1057
1105
  )
1058
1106
  if status != 0:
@@ -1069,7 +1117,7 @@ cdef class DeviceScanBuildResult:
1069
1117
  Iterator d_out,
1070
1118
  size_t num_items,
1071
1119
  Op op,
1072
- Value h_init,
1120
+ Value init_value,
1073
1121
  stream
1074
1122
  ):
1075
1123
  cdef CUresult status = -1
@@ -1086,7 +1134,7 @@ cdef class DeviceScanBuildResult:
1086
1134
  d_out.iter_data,
1087
1135
  <uint64_t>num_items,
1088
1136
  op.op_data,
1089
- h_init.value_data,
1137
+ init_value.value_data,
1090
1138
  c_stream
1091
1139
  )
1092
1140
  if status != 0:
@@ -1095,6 +1143,107 @@ cdef class DeviceScanBuildResult:
1095
1143
  )
1096
1144
  return storage_sz
1097
1145
 
1146
+ cpdef int compute_inclusive_future_value(
1147
+ DeviceScanBuildResult self,
1148
+ temp_storage_ptr,
1149
+ temp_storage_bytes,
1150
+ Iterator d_in,
1151
+ Iterator d_out,
1152
+ size_t num_items,
1153
+ Op op,
1154
+ Iterator init_value,
1155
+ stream
1156
+ ):
1157
+ cdef CUresult status = -1
1158
+ cdef void *storage_ptr = (<void *><uintptr_t>temp_storage_ptr) if temp_storage_ptr else NULL
1159
+ cdef size_t storage_sz = <size_t>temp_storage_bytes
1160
+ cdef CUstream c_stream = <CUstream><uintptr_t>(stream) if stream else NULL
1161
+
1162
+ with nogil:
1163
+ status = cccl_device_inclusive_scan_future_value(
1164
+ self.build_data,
1165
+ storage_ptr,
1166
+ &storage_sz,
1167
+ d_in.iter_data,
1168
+ d_out.iter_data,
1169
+ <uint64_t>num_items,
1170
+ op.op_data,
1171
+ init_value.iter_data,
1172
+ c_stream
1173
+ )
1174
+ if status != 0:
1175
+ raise RuntimeError(
1176
+ f"Failed executing inclusive scan, error code: {status}"
1177
+ )
1178
+ return storage_sz
1179
+
1180
+ cpdef int compute_exclusive_future_value(
1181
+ DeviceScanBuildResult self,
1182
+ temp_storage_ptr,
1183
+ temp_storage_bytes,
1184
+ Iterator d_in,
1185
+ Iterator d_out,
1186
+ size_t num_items,
1187
+ Op op,
1188
+ Iterator init_value,
1189
+ stream
1190
+ ):
1191
+ cdef CUresult status = -1
1192
+ cdef void *storage_ptr = (<void *><uintptr_t>temp_storage_ptr) if temp_storage_ptr else NULL
1193
+ cdef size_t storage_sz = <size_t>temp_storage_bytes
1194
+ cdef CUstream c_stream = <CUstream><uintptr_t>(stream) if stream else NULL
1195
+
1196
+ with nogil:
1197
+ status = cccl_device_exclusive_scan_future_value(
1198
+ self.build_data,
1199
+ storage_ptr,
1200
+ &storage_sz,
1201
+ d_in.iter_data,
1202
+ d_out.iter_data,
1203
+ <uint64_t>num_items,
1204
+ op.op_data,
1205
+ init_value.iter_data,
1206
+ c_stream
1207
+ )
1208
+ if status != 0:
1209
+ raise RuntimeError(
1210
+ f"Failed executing exclusive scan, error code: {status}"
1211
+ )
1212
+ return storage_sz
1213
+
1214
+ cpdef int compute_inclusive_no_init(
1215
+ DeviceScanBuildResult self,
1216
+ temp_storage_ptr,
1217
+ temp_storage_bytes,
1218
+ Iterator d_in,
1219
+ Iterator d_out,
1220
+ size_t num_items,
1221
+ Op op,
1222
+ object init_value,
1223
+ stream
1224
+ ):
1225
+ cdef CUresult status = -1
1226
+ cdef void *storage_ptr = (<void *><uintptr_t>temp_storage_ptr) if temp_storage_ptr else NULL
1227
+ cdef size_t storage_sz = <size_t>temp_storage_bytes
1228
+ cdef CUstream c_stream = <CUstream><uintptr_t>(stream) if stream else NULL
1229
+
1230
+ with nogil:
1231
+ status = cccl_device_inclusive_scan_no_init(
1232
+ self.build_data,
1233
+ storage_ptr,
1234
+ &storage_sz,
1235
+ d_in.iter_data,
1236
+ d_out.iter_data,
1237
+ <uint64_t>num_items,
1238
+ op.op_data,
1239
+ c_stream
1240
+ )
1241
+ if status != 0:
1242
+ raise RuntimeError(
1243
+ f"Failed executing inclusive scan, error code: {status}"
1244
+ )
1245
+ return storage_sz
1246
+
1098
1247
  def _get_cubin(self):
1099
1248
  return PyBytes_FromStringAndSize(
1100
1249
  <const char*>self.build_data.cubin,
@@ -148,7 +148,7 @@ def make_histogram_even(
148
148
  Example:
149
149
  Below, ``make_histogram_even`` is used to create a histogram object that can be reused.
150
150
 
151
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/histogram/histogram_object.py
151
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/histogram/histogram_object.py
152
152
  :language: python
153
153
  :start-after: # example-begin
154
154
 
@@ -190,7 +190,7 @@ def histogram_even(
190
190
  Example:
191
191
  Below, ``histogram_even`` is used to compute a histogram with evenly-spaced bins.
192
192
 
193
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/histogram/histogram_even_basic.py
193
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/histogram/histogram_even_basic.py
194
194
  :language: python
195
195
  :start-after: # example-begin
196
196
  :caption: Basic histogram example.
@@ -166,7 +166,7 @@ def make_merge_sort(
166
166
  Example:
167
167
  Below, ``make_merge_sort`` is used to create a merge sort object that can be reused.
168
168
 
169
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/sort/merge_sort_object.py
169
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/sort/merge_sort_object.py
170
170
  :language: python
171
171
  :start-after: # example-begin
172
172
 
@@ -201,7 +201,7 @@ def merge_sort(
201
201
  Example:
202
202
  Below, ``merge_sort`` is used to sort a sequence of keys inplace. It also rearranges the items according to the keys' order.
203
203
 
204
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/sort/merge_sort_basic.py
204
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/sort/merge_sort_basic.py
205
205
  :language: python
206
206
  :start-after: # example-begin
207
207
 
@@ -222,7 +222,7 @@ def make_radix_sort(
222
222
  Example:
223
223
  Below, ``make_radix_sort`` is used to create a radix sort object that can be reused.
224
224
 
225
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/sort/radix_sort_object.py
225
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/sort/radix_sort_object.py
226
226
  :language: python
227
227
  :start-after: # example-begin
228
228
 
@@ -259,14 +259,14 @@ def radix_sort(
259
259
  Example:
260
260
  Below, ``radix_sort`` is used to sort a sequence of keys. It also rearranges the values according to the keys' order.
261
261
 
262
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/sort/radix_sort_basic.py
262
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/sort/radix_sort_basic.py
263
263
  :language: python
264
264
  :start-after: # example-begin
265
265
 
266
266
 
267
267
  In the following example, ``radix_sort`` is used to sort a sequence of keys with a ``DoubleBuffer`` for reduced temporary storage.
268
268
 
269
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/sort/radix_sort_buffer.py
269
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/sort/radix_sort_buffer.py
270
270
  :language: python
271
271
  :start-after: # example-begin
272
272
 
@@ -130,7 +130,7 @@ def make_reduce_into(
130
130
  Example:
131
131
  Below, ``make_reduce_into`` is used to create a reduction object that can be reused.
132
132
 
133
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/reduction/reduce_object.py
133
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/reduction/reduce_object.py
134
134
  :language: python
135
135
  :start-after: # example-begin
136
136
 
@@ -163,7 +163,7 @@ def reduce_into(
163
163
  Example:
164
164
  Below, ``reduce_into`` is used to compute the sum of a sequence of integers.
165
165
 
166
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/reduction/sum_reduction.py
166
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/reduction/sum_reduction.py
167
167
  :language: python
168
168
  :start-after: # example-begin
169
169
 
@@ -3,7 +3,7 @@
3
3
  #
4
4
  # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
5
5
 
6
- from typing import Callable, Union
6
+ from typing import Callable, Union, cast
7
7
 
8
8
  import numba
9
9
  import numpy as np
@@ -20,14 +20,27 @@ from ..op import OpKind
20
20
  from ..typing import DeviceArrayLike, GpuStruct
21
21
 
22
22
 
23
+ def get_init_kind(
24
+ init_value: np.ndarray | DeviceArrayLike | GpuStruct | None,
25
+ ) -> _bindings.InitKind:
26
+ match init_value:
27
+ case None:
28
+ return _bindings.InitKind.NO_INIT
29
+ case _ if isinstance(init_value, DeviceArrayLike):
30
+ return _bindings.InitKind.FUTURE_VALUE_INIT
31
+ case _:
32
+ return _bindings.InitKind.VALUE_INIT
33
+
34
+
23
35
  class _Scan:
24
36
  __slots__ = [
25
37
  "build_result",
26
38
  "d_in_cccl",
27
39
  "d_out_cccl",
28
- "h_init_cccl",
40
+ "init_value_cccl",
29
41
  "op_wrapper",
30
42
  "device_scan_fn",
43
+ "init_kind",
31
44
  ]
32
45
 
33
46
  # TODO: constructor shouldn't require concrete `d_in`, `d_out`:
@@ -36,36 +49,74 @@ class _Scan:
36
49
  d_in: DeviceArrayLike | IteratorBase,
37
50
  d_out: DeviceArrayLike | IteratorBase,
38
51
  op: Callable | OpKind,
39
- h_init: np.ndarray | GpuStruct,
52
+ init_value: np.ndarray | DeviceArrayLike | GpuStruct | None,
40
53
  force_inclusive: bool,
41
54
  ):
42
55
  self.d_in_cccl = cccl.to_cccl_input_iter(d_in)
43
56
  self.d_out_cccl = cccl.to_cccl_output_iter(d_out)
44
- self.h_init_cccl = cccl.to_cccl_value(h_init)
45
- if isinstance(h_init, np.ndarray):
46
- value_type = numba.from_dtype(h_init.dtype)
47
- else:
48
- value_type = numba.typeof(h_init)
57
+
58
+ self.init_kind = get_init_kind(init_value)
59
+
60
+ self.init_value_cccl: _bindings.Iterator | _bindings.Value | None
61
+
62
+ match self.init_kind:
63
+ case _bindings.InitKind.NO_INIT:
64
+ # TODO: we just need to extract the dtype from the input iterator
65
+ if not isinstance(d_in, DeviceArrayLike):
66
+ raise ValueError(
67
+ "No init value not supported for non-DeviceArrayLike input"
68
+ )
69
+
70
+ self.init_value_cccl = None
71
+ value_type = numba.from_dtype(protocols.get_dtype(d_in))
72
+ init_value_type_info = self.d_in_cccl.value_type
73
+
74
+ case _bindings.InitKind.FUTURE_VALUE_INIT:
75
+ self.init_value_cccl = cccl.to_cccl_input_iter(init_value)
76
+ value_type = numba.from_dtype(
77
+ protocols.get_dtype(cast(DeviceArrayLike, init_value))
78
+ )
79
+ init_value_type_info = self.init_value_cccl.value_type
80
+
81
+ case _bindings.InitKind.VALUE_INIT:
82
+ self.init_value_cccl = cccl.to_cccl_value(init_value)
83
+ value_type = (
84
+ numba.from_dtype(init_value.dtype)
85
+ if isinstance(init_value, np.ndarray)
86
+ else numba.typeof(init_value)
87
+ )
88
+ init_value_type_info = self.init_value_cccl.type
49
89
 
50
90
  # For well-known operations, we don't need a signature
51
91
  if isinstance(op, OpKind):
52
92
  self.op_wrapper = cccl.to_cccl_op(op, None)
53
93
  else:
54
94
  self.op_wrapper = cccl.to_cccl_op(op, value_type(value_type, value_type))
95
+
55
96
  self.build_result = call_build(
56
97
  _bindings.DeviceScanBuildResult,
57
98
  self.d_in_cccl,
58
99
  self.d_out_cccl,
59
100
  self.op_wrapper,
60
- self.h_init_cccl,
101
+ init_value_type_info,
61
102
  force_inclusive,
103
+ self.init_kind,
62
104
  )
63
105
 
64
- self.device_scan_fn = (
65
- self.build_result.compute_inclusive
66
- if force_inclusive
67
- else self.build_result.compute_exclusive
68
- )
106
+ match (force_inclusive, self.init_kind):
107
+ case (True, _bindings.InitKind.FUTURE_VALUE_INIT):
108
+ self.device_scan_fn = self.build_result.compute_inclusive_future_value
109
+ case (True, _bindings.InitKind.VALUE_INIT):
110
+ self.device_scan_fn = self.build_result.compute_inclusive
111
+ case (True, _bindings.InitKind.NO_INIT):
112
+ self.device_scan_fn = self.build_result.compute_inclusive_no_init
113
+
114
+ case (False, _bindings.InitKind.FUTURE_VALUE_INIT):
115
+ self.device_scan_fn = self.build_result.compute_exclusive_future_value
116
+ case (False, _bindings.InitKind.VALUE_INIT):
117
+ self.device_scan_fn = self.build_result.compute_exclusive
118
+ case (False, _bindings.InitKind.NO_INIT):
119
+ raise ValueError("Exclusive scan with No init value is not supported")
69
120
 
70
121
  def __call__(
71
122
  self,
@@ -73,13 +124,25 @@ class _Scan:
73
124
  d_in,
74
125
  d_out,
75
126
  num_items: int,
76
- h_init: np.ndarray | GpuStruct,
127
+ init_value: np.ndarray | DeviceArrayLike | GpuStruct | None,
77
128
  stream=None,
78
129
  ):
79
130
  set_cccl_iterator_state(self.d_in_cccl, d_in)
80
131
  set_cccl_iterator_state(self.d_out_cccl, d_out)
81
132
 
82
- self.h_init_cccl.state = to_cccl_value_state(h_init)
133
+ match self.init_kind:
134
+ case _bindings.InitKind.FUTURE_VALUE_INIT:
135
+ # We know that the init_value_cccl is an Iterator, so this cast
136
+ # tells MyPy what the actual type is. cast() is a no-op at runtime,
137
+ # which makes it better than isinstance() since this is a hot path
138
+ # and we have to minimize the work we do prior to calling the
139
+ # kernel.
140
+ self.init_value_cccl = cast(_bindings.Iterator, self.init_value_cccl)
141
+ set_cccl_iterator_state(self.init_value_cccl, init_value)
142
+
143
+ case _bindings.InitKind.VALUE_INIT:
144
+ self.init_value_cccl = cast(_bindings.Value, self.init_value_cccl)
145
+ self.init_value_cccl.state = to_cccl_value_state(init_value)
83
146
 
84
147
  stream_handle = validate_and_get_stream(stream)
85
148
 
@@ -97,7 +160,7 @@ class _Scan:
97
160
  self.d_out_cccl,
98
161
  num_items,
99
162
  self.op_wrapper,
100
- self.h_init_cccl,
163
+ self.init_value_cccl,
101
164
  stream_handle,
102
165
  )
103
166
  return temp_storage_bytes
@@ -107,7 +170,7 @@ def make_cache_key(
107
170
  d_in: DeviceArrayLike | IteratorBase,
108
171
  d_out: DeviceArrayLike | IteratorBase,
109
172
  op: Callable | OpKind,
110
- h_init: np.ndarray,
173
+ init_value: np.ndarray | DeviceArrayLike | GpuStruct | None,
111
174
  ):
112
175
  d_in_key = (
113
176
  d_in.kind if isinstance(d_in, IteratorBase) else protocols.get_dtype(d_in)
@@ -123,8 +186,17 @@ def make_cache_key(
123
186
  else:
124
187
  op_key = CachableFunction(op)
125
188
 
126
- h_init_key = h_init.dtype
127
- return (d_in_key, d_out_key, op_key, h_init_key)
189
+ init_kind_key = get_init_kind(init_value)
190
+ match init_kind_key:
191
+ case _bindings.InitKind.NO_INIT:
192
+ init_value_key = None
193
+ case _bindings.InitKind.FUTURE_VALUE_INIT:
194
+ init_value_key = protocols.get_dtype(cast(DeviceArrayLike, init_value))
195
+ case _bindings.InitKind.VALUE_INIT:
196
+ init_value = cast(np.ndarray | GpuStruct, init_value)
197
+ init_value_key = init_value.dtype
198
+
199
+ return (d_in_key, d_out_key, op_key, init_value_key, init_kind_key)
128
200
 
129
201
 
130
202
  # TODO Figure out `sum` without operator and initial value
@@ -134,14 +206,14 @@ def make_exclusive_scan(
134
206
  d_in: DeviceArrayLike | IteratorBase,
135
207
  d_out: DeviceArrayLike | IteratorBase,
136
208
  op: Callable | OpKind,
137
- h_init: np.ndarray,
209
+ init_value: np.ndarray | DeviceArrayLike | GpuStruct | None,
138
210
  ):
139
211
  """Computes a device-wide scan using the specified binary ``op`` and initial value ``init_value``.
140
212
 
141
213
  Example:
142
214
  Below, ``make_exclusive_scan`` is used to create an exclusive scan object that can be reused.
143
215
 
144
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/scan/exclusive_scan_object.py
216
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/scan/exclusive_scan_object.py
145
217
  :language: python
146
218
  :start-after: # example-begin
147
219
 
@@ -150,19 +222,19 @@ def make_exclusive_scan(
150
222
  d_in: Device array or iterator containing the input sequence of data items
151
223
  d_out: Device array that will store the result of the scan
152
224
  op: Callable or OpKind representing the binary operator to apply
153
- init: Numpy array storing initial value of the scan
225
+ init_value: Numpy array, device array, or GPU struct storing initial value of the scan, or None for no initial value
154
226
 
155
227
  Returns:
156
228
  A callable object that can be used to perform the scan
157
229
  """
158
- return _Scan(d_in, d_out, op, h_init, False)
230
+ return _Scan(d_in, d_out, op, init_value, False)
159
231
 
160
232
 
161
233
  def exclusive_scan(
162
234
  d_in: DeviceArrayLike | IteratorBase,
163
235
  d_out: DeviceArrayLike | IteratorBase,
164
236
  op: Callable | OpKind,
165
- h_init: np.ndarray | GpuStruct,
237
+ init_value: np.ndarray | DeviceArrayLike | GpuStruct | None,
166
238
  num_items: int,
167
239
  stream=None,
168
240
  ):
@@ -174,7 +246,7 @@ def exclusive_scan(
174
246
  Example:
175
247
  Below, ``exclusive_scan`` is used to compute an exclusive scan with max operation.
176
248
 
177
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/scan/exclusive_scan_max.py
249
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/scan/exclusive_scan_max.py
178
250
  :language: python
179
251
  :start-after: # example-begin
180
252
 
@@ -183,14 +255,14 @@ def exclusive_scan(
183
255
  d_in: Device array or iterator containing the input sequence of data items
184
256
  d_out: Device array or iterator to store the result of the scan
185
257
  op: Binary scan operator
186
- h_init: Initial value for the scan
258
+ init_value: Initial value for the scan
187
259
  num_items: Number of items to scan
188
260
  stream: CUDA stream for the operation (optional)
189
261
  """
190
- scanner = make_exclusive_scan(d_in, d_out, op, h_init)
191
- tmp_storage_bytes = scanner(None, d_in, d_out, num_items, h_init, stream)
262
+ scanner = make_exclusive_scan(d_in, d_out, op, init_value)
263
+ tmp_storage_bytes = scanner(None, d_in, d_out, num_items, init_value, stream)
192
264
  tmp_storage = TempStorageBuffer(tmp_storage_bytes, stream)
193
- scanner(tmp_storage, d_in, d_out, num_items, h_init, stream)
265
+ scanner(tmp_storage, d_in, d_out, num_items, init_value, stream)
194
266
 
195
267
 
196
268
  # TODO Figure out `sum` without operator and initial value
@@ -200,14 +272,14 @@ def make_inclusive_scan(
200
272
  d_in: DeviceArrayLike | IteratorBase,
201
273
  d_out: DeviceArrayLike | IteratorBase,
202
274
  op: Callable | OpKind,
203
- h_init: np.ndarray,
275
+ init_value: np.ndarray | DeviceArrayLike | GpuStruct | None,
204
276
  ):
205
277
  """Computes a device-wide scan using the specified binary ``op`` and initial value ``init_value``.
206
278
 
207
279
  Example:
208
280
  Below, ``make_inclusive_scan`` is used to create an inclusive scan object that can be reused.
209
281
 
210
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/scan/inclusive_scan_object.py
282
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/scan/inclusive_scan_object.py
211
283
  :language: python
212
284
  :start-after: # example-begin
213
285
 
@@ -216,19 +288,19 @@ def make_inclusive_scan(
216
288
  d_in: Device array or iterator containing the input sequence of data items
217
289
  d_out: Device array that will store the result of the scan
218
290
  op: Callable or OpKind representing the binary operator to apply
219
- init: Numpy array storing initial value of the scan
291
+ init_value: Numpy array, device array, or GPU struct storing initial value of the scan, or None for no initial value
220
292
 
221
293
  Returns:
222
294
  A callable object that can be used to perform the scan
223
295
  """
224
- return _Scan(d_in, d_out, op, h_init, True)
296
+ return _Scan(d_in, d_out, op, init_value, True)
225
297
 
226
298
 
227
299
  def inclusive_scan(
228
300
  d_in: DeviceArrayLike | IteratorBase,
229
301
  d_out: DeviceArrayLike | IteratorBase,
230
302
  op: Callable | OpKind,
231
- h_init: np.ndarray | GpuStruct,
303
+ init_value: np.ndarray | DeviceArrayLike | GpuStruct | None,
232
304
  num_items: int,
233
305
  stream=None,
234
306
  ):
@@ -240,7 +312,7 @@ def inclusive_scan(
240
312
  Example:
241
313
  Below, ``inclusive_scan`` is used to compute an inclusive scan (prefix sum).
242
314
 
243
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/scan/inclusive_scan_custom.py
315
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/scan/inclusive_scan_custom.py
244
316
  :language: python
245
317
  :start-after: # example-begin
246
318
 
@@ -249,11 +321,11 @@ def inclusive_scan(
249
321
  d_in: Device array or iterator containing the input sequence of data items
250
322
  d_out: Device array or iterator to store the result of the scan
251
323
  op: Binary scan operator
252
- h_init: Initial value for the scan
324
+ init_value: Initial value for the scan
253
325
  num_items: Number of items to scan
254
326
  stream: CUDA stream for the operation (optional)
255
327
  """
256
- scanner = make_inclusive_scan(d_in, d_out, op, h_init)
257
- tmp_storage_bytes = scanner(None, d_in, d_out, num_items, h_init, stream)
328
+ scanner = make_inclusive_scan(d_in, d_out, op, init_value)
329
+ tmp_storage_bytes = scanner(None, d_in, d_out, num_items, init_value, stream)
258
330
  tmp_storage = TempStorageBuffer(tmp_storage_bytes, stream)
259
- scanner(tmp_storage, d_in, d_out, num_items, h_init, stream)
331
+ scanner(tmp_storage, d_in, d_out, num_items, init_value, stream)
@@ -179,7 +179,7 @@ def make_segmented_reduce(
179
179
  Example:
180
180
  Below, ``make_segmented_reduce`` is used to create a segmented reduction object that can be reused.
181
181
 
182
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/segmented/segmented_reduce_object.py
182
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/segmented/segmented_reduce_object.py
183
183
  :language: python
184
184
  :start-after: # example-begin
185
185
 
@@ -216,7 +216,7 @@ def segmented_reduce(
216
216
  Example:
217
217
  Below, ``segmented_reduce`` is used to compute the minimum value of segments in a sequence of integers.
218
218
 
219
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/segmented/segmented_reduce_basic.py
219
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/segmented/segmented_reduce_basic.py
220
220
  :language: python
221
221
  :start-after: # example-begin
222
222