cuda-cccl 0.3.0__cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.2__cp312-cp312-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (294)
  1. cuda/cccl/cooperative/__init__.py +7 -1
  2. cuda/cccl/cooperative/experimental/__init__.py +21 -5
  3. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
  4. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
  5. cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
  6. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
  7. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
  8. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
  9. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
  10. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
  11. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
  12. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
  13. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
  14. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
  15. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
  16. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
  17. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
  18. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
  19. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
  20. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
  21. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
  22. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
  23. cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
  24. cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
  25. cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
  26. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +4 -2
  27. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
  28. cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
  29. cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
  30. cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
  31. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
  32. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
  33. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
  34. cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
  35. cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
  36. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +170 -260
  37. cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
  38. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +6 -7
  39. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
  40. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +12 -29
  41. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +2 -7
  42. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
  43. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
  44. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
  45. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
  46. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
  47. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
  48. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
  49. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
  50. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
  51. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
  52. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
  53. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
  54. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
  55. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
  56. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
  57. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
  58. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
  59. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
  60. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
  61. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
  62. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
  63. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
  64. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
  65. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
  66. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
  67. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
  68. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
  69. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
  70. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
  71. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
  72. cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
  73. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
  74. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
  75. cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
  76. cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
  77. cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
  78. cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
  79. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
  80. cuda/cccl/headers/include/cuda/__cccl_config +1 -0
  81. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
  82. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
  83. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
  84. cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
  85. cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
  86. cuda/cccl/headers/include/cuda/__device/arch_traits.h +247 -323
  87. cuda/cccl/headers/include/cuda/__device/attributes.h +174 -123
  88. cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
  89. cuda/cccl/headers/include/cuda/__device/device_ref.h +27 -49
  90. cuda/cccl/headers/include/cuda/__device/physical_device.h +100 -96
  91. cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
  92. cuda/cccl/headers/include/cuda/__event/event.h +27 -26
  93. cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
  94. cuda/cccl/headers/include/cuda/__event/timed_event.h +10 -7
  95. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  96. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
  97. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
  98. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
  99. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
  100. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
  101. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
  102. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
  103. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
  104. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +21 -137
  105. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
  106. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
  107. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
  108. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
  109. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
  110. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
  111. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
  112. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
  113. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
  114. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
  115. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
  116. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +9 -7
  117. cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
  118. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -16
  119. cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
  120. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  121. cuda/cccl/headers/include/cuda/algorithm +1 -1
  122. cuda/cccl/headers/include/cuda/cmath +1 -0
  123. cuda/cccl/headers/include/cuda/devices +13 -0
  124. cuda/cccl/headers/include/cuda/iterator +1 -0
  125. cuda/cccl/headers/include/cuda/memory +1 -0
  126. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
  127. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
  128. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
  129. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
  130. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
  131. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
  132. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
  133. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
  134. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
  135. cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
  136. cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
  137. cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
  138. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  139. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
  140. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
  141. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
  142. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  143. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  144. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
  145. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
  146. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
  147. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
  148. cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
  149. cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
  150. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
  151. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
  152. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
  153. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
  154. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
  155. cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
  156. cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
  157. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
  158. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
  159. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
  160. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
  161. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
  162. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
  163. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
  164. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
  165. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
  166. cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
  167. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
  168. cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
  169. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
  170. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
  171. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
  172. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
  173. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
  174. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
  175. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
  176. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
  177. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
  178. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
  179. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
  180. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
  181. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
  182. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
  183. cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
  184. cuda/cccl/headers/include/cuda/std/bitset +1 -1
  185. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
  186. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
  187. cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
  188. cuda/cccl/headers/include/cuda/std/numbers +5 -0
  189. cuda/cccl/headers/include/cuda/std/string_view +155 -13
  190. cuda/cccl/headers/include/cuda/std/version +1 -4
  191. cuda/cccl/headers/include/cuda/stream_ref +5 -0
  192. cuda/cccl/headers/include/cuda/utility +1 -0
  193. cuda/cccl/headers/include/nv/target +7 -2
  194. cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
  195. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
  196. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
  197. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
  198. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
  199. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
  200. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
  201. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
  202. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
  203. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
  204. cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
  205. cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
  206. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
  207. cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
  208. cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
  209. cuda/cccl/headers/include/thrust/device_delete.h +18 -3
  210. cuda/cccl/headers/include/thrust/device_free.h +16 -3
  211. cuda/cccl/headers/include/thrust/device_new.h +29 -8
  212. cuda/cccl/headers/include/thrust/host_vector.h +1 -1
  213. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
  214. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
  215. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
  216. cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
  217. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
  218. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
  219. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
  220. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
  221. cuda/cccl/parallel/experimental/__init__.py +21 -74
  222. cuda/compute/__init__.py +79 -0
  223. cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +43 -1
  224. cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +157 -8
  225. cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
  226. cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
  227. cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
  228. cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
  229. cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +112 -40
  230. cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
  231. cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
  232. cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +36 -15
  233. cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
  234. cuda/compute/cu12/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
  235. cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
  236. cuda/compute/cu13/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
  237. cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
  238. cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +2 -0
  239. cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +36 -8
  240. cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +206 -1
  241. cuda/{cccl/parallel/experimental → compute}/numba_utils.py +2 -2
  242. cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
  243. cuda/{cccl/parallel/experimental → compute}/typing.py +2 -0
  244. cuda/coop/__init__.py +8 -0
  245. cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
  246. cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
  247. cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
  248. cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
  249. cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
  250. cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
  251. cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
  252. cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
  253. cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
  254. cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
  255. cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
  256. cuda/coop/warp/__init__.py +9 -0
  257. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
  258. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
  259. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
  260. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
  261. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +275 -276
  262. cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
  263. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
  264. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
  265. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
  266. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
  267. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
  268. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
  269. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
  270. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
  271. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
  272. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
  273. cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
  274. cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
  275. cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
  276. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
  277. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
  278. cuda/cccl/parallel/experimental/.gitignore +0 -4
  279. cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
  280. cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
  281. /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
  282. /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
  283. /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
  284. /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
  285. /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
  286. /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
  287. /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
  288. /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
  289. /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
  290. /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
  291. /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
  292. /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
  293. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
  294. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
@@ -48,6 +48,13 @@
48
48
  #include <thrust/system/cuda/detail/util.h>
49
49
  #include <thrust/type_traits/is_trivially_relocatable.h>
50
50
 
51
+ #if _CCCL_HAS_CUDA_COMPILER()
52
+ # include <cub/device/dispatch/tuning/tuning_transform.cuh>
53
+ #endif // _CCCL_HAS_CUDA_COMPILER()
54
+
55
+ #include <cuda/__fwd/zip_iterator.h>
56
+ #include <cuda/std/tuple>
57
+
51
58
  THRUST_NAMESPACE_BEGIN
52
59
  namespace cuda_cub
53
60
  {
@@ -61,6 +68,21 @@ template <class Derived, class InputIt, class OutputIt, class TransformOp>
61
68
  OutputIt _CCCL_API _CCCL_FORCEINLINE
62
69
  transform(execution_policy<Derived>& policy, InputIt first, InputIt last, OutputIt result, TransformOp transform_op);
63
70
 
71
+ // Forward declare to work around a cyclic include, since "cuda/detail/transform.h" includes this header
72
+ // We want this to unwrap zip_transform_iterator
73
+ namespace __transform
74
+ {
75
+ _CCCL_EXEC_CHECK_DISABLE
76
+ template <class Derived, class Offset, class... InputIts, class OutputIt, class TransformOp, class Predicate>
77
+ OutputIt _CCCL_API _CCCL_FORCEINLINE cub_transform_many(
78
+ execution_policy<Derived>& policy,
79
+ ::cuda::std::tuple<InputIts...> firsts,
80
+ OutputIt result,
81
+ Offset num_items,
82
+ TransformOp transform_op,
83
+ Predicate pred);
84
+ } // namespace __transform
85
+
64
86
  namespace __copy
65
87
  {
66
88
  template <class H, class D, class T, class Size>
@@ -190,6 +212,17 @@ device_to_device(execution_policy<Derived>& policy, InputIt first, InputIt last,
190
212
 
191
213
  return result + n;
192
214
  }
215
+ else if constexpr (::cuda::__is_zip_transform_iterator<InputIt>)
216
+ {
217
+ const auto n = ::cuda::std::distance(first, last);
218
+ return cuda_cub::__transform::cub_transform_many(
219
+ policy,
220
+ ::cuda::std::move(first).__base(),
221
+ result,
222
+ n,
223
+ ::cuda::std::move(first).__pred(),
224
+ cub::detail::transform::always_true_predicate{});
225
+ }
193
226
  else
194
227
  {
195
228
  return cuda_cub::transform(
@@ -39,11 +39,13 @@
39
39
  #if _CCCL_HAS_CUDA_COMPILER()
40
40
  # include <thrust/system/cuda/config.h>
41
41
 
42
- # include <thrust/distance.h>
43
- # include <thrust/iterator/counting_iterator.h>
44
- # include <thrust/iterator/transform_iterator.h>
45
42
  # include <thrust/system/cuda/detail/execution_policy.h>
46
43
 
44
+ # include <cuda/__iterator/counting_iterator.h>
45
+ # include <cuda/__iterator/transform_iterator.h>
46
+ # include <cuda/__iterator/zip_iterator.h>
47
+ # include <cuda/std/__iterator/distance.h>
48
+
47
49
  THRUST_NAMESPACE_BEGIN
48
50
  namespace cuda_cub
49
51
  {
@@ -62,7 +64,6 @@ InputIt _CCCL_HOST_DEVICE find(execution_policy<Derived>& policy, InputIt first,
62
64
  }; // namespace cuda_cub
63
65
  THRUST_NAMESPACE_END
64
66
 
65
- # include <thrust/iterator/zip_iterator.h>
66
67
  # include <thrust/system/cuda/detail/reduce.h>
67
68
 
68
69
  THRUST_NAMESPACE_BEGIN
@@ -92,109 +93,13 @@ struct functor
92
93
  }
93
94
  }
94
95
  };
95
-
96
- template <class ValueType, class InputIt, class UnaryOp>
97
- struct transform_input_iterator_t
98
- {
99
- using self_t = transform_input_iterator_t;
100
- using difference_type = thrust::detail::it_difference_t<InputIt>;
101
- using value_type = ValueType;
102
- using pointer = void;
103
- using reference = value_type;
104
- using iterator_category = ::cuda::std::random_access_iterator_tag;
105
-
106
- InputIt input;
107
- mutable UnaryOp op;
108
-
109
- _CCCL_HOST_DEVICE _CCCL_FORCEINLINE transform_input_iterator_t(InputIt input, UnaryOp op)
110
- : input(input)
111
- , op(op)
112
- {}
113
-
114
- transform_input_iterator_t(const self_t&) = default;
115
-
116
- // UnaryOp might not be copy assignable, such as when it is a lambda. Define
117
- // an explicit copy assignment operator that doesn't try to assign it.
118
- _CCCL_HOST_DEVICE self_t& operator=(const self_t& o)
119
- {
120
- input = o.input;
121
- return *this;
122
- }
123
-
124
- _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_t operator++(int)
125
- {
126
- self_t retval = *this;
127
- ++input;
128
- return retval;
129
- }
130
-
131
- _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_t operator++()
132
- {
133
- ++input;
134
- return *this;
135
- }
136
-
137
- _CCCL_HOST_DEVICE _CCCL_FORCEINLINE reference operator*() const
138
- {
139
- thrust::detail::it_value_t<InputIt> x = *input;
140
- return op(x);
141
- }
142
-
143
- _CCCL_HOST_DEVICE _CCCL_FORCEINLINE reference operator*()
144
- {
145
- thrust::detail::it_value_t<InputIt> x = *input;
146
- return op(x);
147
- }
148
-
149
- _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_t operator+(difference_type n) const
150
- {
151
- return self_t(input + n, op);
152
- }
153
-
154
- _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_t& operator+=(difference_type n)
155
- {
156
- input += n;
157
- return *this;
158
- }
159
-
160
- _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_t operator-(difference_type n) const
161
- {
162
- return self_t(input - n, op);
163
- }
164
-
165
- _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_t& operator-=(difference_type n)
166
- {
167
- input -= n;
168
- return *this;
169
- }
170
-
171
- _CCCL_HOST_DEVICE _CCCL_FORCEINLINE difference_type operator-(self_t other) const
172
- {
173
- return input - other.input;
174
- }
175
-
176
- _CCCL_HOST_DEVICE _CCCL_FORCEINLINE reference operator[](difference_type n) const
177
- {
178
- return op(input[n]);
179
- }
180
-
181
- _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator==(const self_t& rhs) const
182
- {
183
- return (input == rhs.input);
184
- }
185
-
186
- _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator!=(const self_t& rhs) const
187
- {
188
- return (input != rhs.input);
189
- }
190
- };
191
96
  } // namespace __find_if
192
97
 
193
98
  template <class Derived, class InputIt, class Size, class Predicate>
194
99
  InputIt _CCCL_HOST_DEVICE
195
100
  find_if_n(execution_policy<Derived>& policy, InputIt first, Size num_items, Predicate predicate)
196
101
  {
197
- using result_type = typename thrust::tuple<bool, Size>;
102
+ using result_type = ::cuda::std::tuple<bool, Size>;
198
103
 
199
104
  // empty sequence
200
105
  if (num_items == 0)
@@ -212,27 +117,20 @@ find_if_n(execution_policy<Derived>& policy, InputIt first, Size num_items, Pred
212
117
  const Size interval_threshold = 1 << 20;
213
118
  const Size interval_size = (::cuda::std::min) (interval_threshold, num_items);
214
119
 
215
- // FIXME(bgruber): we should also be able to use transform_iterator here, but it makes nvc++ hang. See:
216
- // https://github.com/NVIDIA/cccl/issues/3594. The problem does not occur with nvcc, so we could not add a test :/
217
- using XfrmIterator = __find_if::transform_input_iterator_t<bool, InputIt, Predicate>;
218
- // using XfrmIterator = transform_iterator<Predicate, InputIt>;
219
- using IteratorTuple = thrust::tuple<XfrmIterator, counting_iterator<Size>>;
220
- using ZipIterator = thrust::zip_iterator<IteratorTuple>;
221
-
222
- IteratorTuple iter_tuple = thrust::make_tuple(XfrmIterator(first, predicate), counting_iterator<Size>(0));
223
-
224
- ZipIterator begin = thrust::make_zip_iterator(iter_tuple);
225
- ZipIterator end = begin + num_items;
120
+ const auto begin = ::cuda::make_zip_iterator(
121
+ ::cuda::make_transform_iterator(try_unwrap_contiguous_iterator(first), predicate),
122
+ ::cuda::counting_iterator<Size>(0));
123
+ const auto end = begin + num_items;
226
124
 
227
- for (ZipIterator interval_begin = begin; interval_begin < end; interval_begin += interval_size)
125
+ for (auto interval_begin = begin; interval_begin < end; interval_begin += interval_size)
228
126
  {
229
- ZipIterator interval_end = interval_begin + interval_size;
127
+ auto interval_end = interval_begin + interval_size;
230
128
  if (end < interval_end)
231
129
  {
232
130
  interval_end = end;
233
131
  } // end if
234
132
 
235
- result_type result = reduce(
133
+ const result_type result = reduce(
236
134
  policy, interval_begin, interval_end, result_type(false, interval_end - begin), __find_if::functor<result_type>());
237
135
 
238
136
  // see if we found something
@@ -73,12 +73,14 @@ struct transform_pair_of_input_iterators_t
73
73
  using value_type = ValueType;
74
74
  using pointer = void;
75
75
  using reference = value_type;
76
- using iterator_category = std::random_access_iterator_tag;
76
+ using iterator_category = ::cuda::std::random_access_iterator_tag;
77
77
 
78
78
  InputIt1 input1;
79
79
  InputIt2 input2;
80
80
  mutable BinaryOp op;
81
81
 
82
+ transform_pair_of_input_iterators_t() = default;
83
+
82
84
  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE
83
85
  transform_pair_of_input_iterators_t(InputIt1 input1_, InputIt2 input2_, BinaryOp op_)
84
86
  : input1(input1_)
@@ -107,7 +109,7 @@ struct transform_pair_of_input_iterators_t
107
109
  }
108
110
 
109
111
  /// Prefix increment
110
- _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_t operator++()
112
+ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_t& operator++()
111
113
  {
112
114
  ++input1;
113
115
  ++input2;
@@ -177,6 +179,10 @@ struct transform_pair_of_input_iterators_t
177
179
  return (input1 != rhs.input1) || (input2 != rhs.input2);
178
180
  }
179
181
 
182
+ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator<(const self_t& rhs) const
183
+ {
184
+ return input1 < rhs.input1;
185
+ }
180
186
  }; // struct transform_pair_of_input_iterators_t
181
187
  } // namespace detail
182
188
 
@@ -79,7 +79,7 @@ namespace detail
79
79
  template <typename Iterator>
80
80
  inline constexpr bool is_libcxx_wrap_iter_v = false;
81
81
 
82
- #if defined(_LIBCPP_VERSION)
82
+ #if _CCCL_HOST_STD_LIB(LIBCXX)
83
83
  template <typename Iterator>
84
84
  inline constexpr bool is_libcxx_wrap_iter_v<
85
85
  # if _LIBCPP_VERSION < 14000
@@ -88,23 +88,23 @@ inline constexpr bool is_libcxx_wrap_iter_v<
88
88
  std::__wrap_iter<Iterator>
89
89
  # endif
90
90
  > = true;
91
- #endif
91
+ #endif // _CCCL_HOST_STD_LIB(LIBCXX)
92
92
 
93
93
  template <typename Iterator>
94
94
  inline constexpr bool is_libstdcxx_normal_iterator_v = false;
95
95
 
96
- #if defined(__GLIBCXX__)
96
+ #if _CCCL_HOST_STD_LIB(LIBSTDCXX)
97
97
  template <typename Iterator, typename Container>
98
98
  inline constexpr bool is_libstdcxx_normal_iterator_v<::__gnu_cxx::__normal_iterator<Iterator, Container>> = true;
99
- #endif
99
+ #endif // _CCCL_HOST_STD_LIB(LIBSTDCXX)
100
100
 
101
- #if _CCCL_COMPILER(MSVC)
101
+ #if _CCCL_HOST_STD_LIB(STL)
102
102
  template <typename Iterator>
103
103
  inline constexpr bool is_msvc_contiguous_iterator_v = ::cuda::std::is_pointer_v<::std::_Unwrapped_t<Iterator>>;
104
- #else
104
+ #else // ^^^ _CCCL_HOST_STD_LIB(STL) ^^^ / vvv !_CCCL_HOST_STD_LIB(STL) vvv
105
105
  template <typename Iterator>
106
106
  inline constexpr bool is_msvc_contiguous_iterator_v = false;
107
- #endif
107
+ #endif // ^^^ !_CCCL_HOST_STD_LIB(STL) ^^^
108
108
 
109
109
  template <typename Iterator>
110
110
  inline constexpr bool is_contiguous_iterator_impl_v =
@@ -1,77 +1,24 @@
1
- # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
1
+ # Copyright (c) 2025, NVIDIA CORPORATION.
2
2
  #
3
- # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
4
14
 
5
- from .algorithms import (
6
- DoubleBuffer,
7
- SortOrder,
8
- binary_transform,
9
- exclusive_scan,
10
- histogram_even,
11
- inclusive_scan,
12
- make_binary_transform,
13
- make_exclusive_scan,
14
- make_histogram_even,
15
- make_inclusive_scan,
16
- make_merge_sort,
17
- make_radix_sort,
18
- make_reduce_into,
19
- make_segmented_reduce,
20
- make_three_way_partition,
21
- make_unary_transform,
22
- make_unique_by_key,
23
- merge_sort,
24
- radix_sort,
25
- reduce_into,
26
- segmented_reduce,
27
- three_way_partition,
28
- unary_transform,
29
- unique_by_key,
30
- )
31
- from .iterators import (
32
- CacheModifiedInputIterator,
33
- ConstantIterator,
34
- CountingIterator,
35
- ReverseIterator,
36
- TransformIterator,
37
- TransformOutputIterator,
38
- ZipIterator,
39
- )
40
- from .op import OpKind
41
- from .struct import gpu_struct
15
+ # alias for backwards compatibility
42
16
 
43
- __all__ = [
44
- "binary_transform",
45
- "CacheModifiedInputIterator",
46
- "ConstantIterator",
47
- "CountingIterator",
48
- "DoubleBuffer",
49
- "exclusive_scan",
50
- "gpu_struct",
51
- "histogram_even",
52
- "inclusive_scan",
53
- "make_binary_transform",
54
- "make_exclusive_scan",
55
- "make_histogram_even",
56
- "make_inclusive_scan",
57
- "make_merge_sort",
58
- "make_radix_sort",
59
- "make_reduce_into",
60
- "make_segmented_reduce",
61
- "make_three_way_partition",
62
- "make_unary_transform",
63
- "make_unique_by_key",
64
- "merge_sort",
65
- "OpKind",
66
- "radix_sort",
67
- "reduce_into",
68
- "ReverseIterator",
69
- "segmented_reduce",
70
- "SortOrder",
71
- "TransformIterator",
72
- "three_way_partition",
73
- "TransformOutputIterator",
74
- "unary_transform",
75
- "unique_by_key",
76
- "ZipIterator",
77
- ]
17
+ from warnings import warn
18
+
19
+ from cuda.compute import * # noqa: F403
20
+
21
+ warn(
22
+ "The module cuda.cccl.parallel.experimental is deprecated. Use cuda.compute instead.",
23
+ FutureWarning,
24
+ )
@@ -0,0 +1,79 @@
1
+ # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
2
+ #
3
+ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4
+
5
+ from .algorithms import (
6
+ DoubleBuffer,
7
+ SortOrder,
8
+ binary_transform,
9
+ exclusive_scan,
10
+ histogram_even,
11
+ inclusive_scan,
12
+ make_binary_transform,
13
+ make_exclusive_scan,
14
+ make_histogram_even,
15
+ make_inclusive_scan,
16
+ make_merge_sort,
17
+ make_radix_sort,
18
+ make_reduce_into,
19
+ make_segmented_reduce,
20
+ make_three_way_partition,
21
+ make_unary_transform,
22
+ make_unique_by_key,
23
+ merge_sort,
24
+ radix_sort,
25
+ reduce_into,
26
+ segmented_reduce,
27
+ three_way_partition,
28
+ unary_transform,
29
+ unique_by_key,
30
+ )
31
+ from .iterators import (
32
+ CacheModifiedInputIterator,
33
+ ConstantIterator,
34
+ CountingIterator,
35
+ PermutationIterator,
36
+ ReverseIterator,
37
+ TransformIterator,
38
+ TransformOutputIterator,
39
+ ZipIterator,
40
+ )
41
+ from .op import OpKind
42
+ from .struct import gpu_struct
43
+
44
+ __all__ = [
45
+ "binary_transform",
46
+ "CacheModifiedInputIterator",
47
+ "ConstantIterator",
48
+ "CountingIterator",
49
+ "DoubleBuffer",
50
+ "exclusive_scan",
51
+ "gpu_struct",
52
+ "histogram_even",
53
+ "inclusive_scan",
54
+ "make_binary_transform",
55
+ "make_exclusive_scan",
56
+ "make_histogram_even",
57
+ "make_inclusive_scan",
58
+ "make_merge_sort",
59
+ "make_radix_sort",
60
+ "make_reduce_into",
61
+ "make_segmented_reduce",
62
+ "make_three_way_partition",
63
+ "make_unary_transform",
64
+ "make_unique_by_key",
65
+ "merge_sort",
66
+ "OpKind",
67
+ "PermutationIterator",
68
+ "radix_sort",
69
+ "reduce_into",
70
+ "ReverseIterator",
71
+ "segmented_reduce",
72
+ "SortOrder",
73
+ "TransformIterator",
74
+ "TransformOutputIterator",
75
+ "three_way_partition",
76
+ "unary_transform",
77
+ "unique_by_key",
78
+ "ZipIterator",
79
+ ]
@@ -57,6 +57,12 @@ class SortOrder(IntEnum):
57
57
  ASCENDING = ...
58
58
  DESCENDING = ...
59
59
 
60
+ class InitKind(IntEnum):
61
+ _value_: int
62
+ NO_INIT = ...
63
+ FUTURE_VALUE_INIT = ...
64
+ VALUE_INIT = ...
65
+
60
66
  class Op:
61
67
  def __init__(
62
68
  self,
@@ -133,6 +139,8 @@ class Iterator:
133
139
  def state(self, value) -> None: ...
134
140
  @property
135
141
  def type(self) -> IteratorKind: ...
142
+ @property
143
+ def value_type(self) -> TypeInfo: ...
136
144
  def as_bytes(self) -> bytes: ...
137
145
  def is_kind_pointer(self) -> bool: ...
138
146
  def is_kind_iterator(self) -> bool: ...
@@ -197,8 +205,9 @@ class DeviceScanBuildResult:
197
205
  d_in: Iterator,
198
206
  d_out: Iterator,
199
207
  binary_op: Op,
200
- h_init: Value,
208
+ init_type: TypeInfo,
201
209
  force_inclusive: bool,
210
+ init_kind: InitKind,
202
211
  info: CommonData,
203
212
  ): ...
204
213
  def compute_inclusive(
@@ -223,6 +232,39 @@ class DeviceScanBuildResult:
223
232
  h_init: Value,
224
233
  stream,
225
234
  ) -> int: ...
235
+ def compute_inclusive_future_value(
236
+ self,
237
+ temp_storage_ptr: int | None,
238
+ temp_storage_nbytes: int,
239
+ d_in: Iterator,
240
+ d_out: Iterator,
241
+ num_items: int,
242
+ binary_op: Op,
243
+ h_init: Iterator,
244
+ stream,
245
+ ) -> int: ...
246
+ def compute_exclusive_future_value(
247
+ self,
248
+ temp_storage_ptr: int | None,
249
+ temp_storage_nbytes: int,
250
+ d_in: Iterator,
251
+ d_out: Iterator,
252
+ num_items: int,
253
+ binary_op: Op,
254
+ h_init: Iterator,
255
+ stream,
256
+ ) -> int: ...
257
+ def compute_inclusive_no_init(
258
+ self,
259
+ temp_storage_ptr: int | None,
260
+ temp_storage_nbytes: int,
261
+ d_in: Iterator,
262
+ d_out: Iterator,
263
+ num_items: int,
264
+ binary_op: Op,
265
+ h_init: None,
266
+ stream,
267
+ ) -> int: ...
226
268
 
227
269
  # ---------------------
228
270
  # DeviceSegmentedReduce