cuda-cccl 0.3.0__cp313-cp313-manylinux_2_24_aarch64.whl → 0.3.2__cp313-cp313-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (294)
  1. cuda/cccl/cooperative/__init__.py +7 -1
  2. cuda/cccl/cooperative/experimental/__init__.py +21 -5
  3. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
  4. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
  5. cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
  6. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
  7. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
  8. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
  9. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
  10. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
  11. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
  12. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
  13. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
  14. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
  15. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
  16. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
  17. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
  18. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
  19. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
  20. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
  21. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
  22. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
  23. cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
  24. cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
  25. cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
  26. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +4 -2
  27. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
  28. cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
  29. cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
  30. cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
  31. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
  32. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
  33. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
  34. cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
  35. cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
  36. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +170 -260
  37. cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
  38. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +6 -7
  39. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
  40. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +12 -29
  41. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +2 -7
  42. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
  43. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
  44. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
  45. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
  46. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
  47. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
  48. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
  49. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
  50. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
  51. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
  52. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
  53. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
  54. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
  55. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
  56. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
  57. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
  58. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
  59. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
  60. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
  61. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
  62. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
  63. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
  64. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
  65. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
  66. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
  67. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
  68. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
  69. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
  70. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
  71. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
  72. cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
  73. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
  74. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
  75. cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
  76. cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
  77. cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
  78. cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
  79. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
  80. cuda/cccl/headers/include/cuda/__cccl_config +1 -0
  81. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
  82. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
  83. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
  84. cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
  85. cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
  86. cuda/cccl/headers/include/cuda/__device/arch_traits.h +247 -323
  87. cuda/cccl/headers/include/cuda/__device/attributes.h +174 -123
  88. cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
  89. cuda/cccl/headers/include/cuda/__device/device_ref.h +27 -49
  90. cuda/cccl/headers/include/cuda/__device/physical_device.h +100 -96
  91. cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
  92. cuda/cccl/headers/include/cuda/__event/event.h +27 -26
  93. cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
  94. cuda/cccl/headers/include/cuda/__event/timed_event.h +10 -7
  95. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  96. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
  97. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
  98. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
  99. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
  100. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
  101. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
  102. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
  103. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
  104. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +21 -137
  105. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
  106. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
  107. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
  108. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
  109. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
  110. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
  111. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
  112. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
  113. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
  114. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
  115. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
  116. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +9 -7
  117. cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
  118. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -16
  119. cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
  120. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  121. cuda/cccl/headers/include/cuda/algorithm +1 -1
  122. cuda/cccl/headers/include/cuda/cmath +1 -0
  123. cuda/cccl/headers/include/cuda/devices +13 -0
  124. cuda/cccl/headers/include/cuda/iterator +1 -0
  125. cuda/cccl/headers/include/cuda/memory +1 -0
  126. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
  127. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
  128. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
  129. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
  130. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
  131. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
  132. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
  133. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
  134. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
  135. cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
  136. cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
  137. cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
  138. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  139. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
  140. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
  141. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
  142. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  143. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  144. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
  145. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
  146. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
  147. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
  148. cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
  149. cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
  150. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
  151. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
  152. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
  153. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
  154. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
  155. cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
  156. cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
  157. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
  158. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
  159. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
  160. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
  161. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
  162. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
  163. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
  164. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
  165. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
  166. cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
  167. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
  168. cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
  169. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
  170. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
  171. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
  172. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
  173. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
  174. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
  175. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
  176. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
  177. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
  178. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
  179. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
  180. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
  181. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
  182. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
  183. cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
  184. cuda/cccl/headers/include/cuda/std/bitset +1 -1
  185. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
  186. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
  187. cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
  188. cuda/cccl/headers/include/cuda/std/numbers +5 -0
  189. cuda/cccl/headers/include/cuda/std/string_view +155 -13
  190. cuda/cccl/headers/include/cuda/std/version +1 -4
  191. cuda/cccl/headers/include/cuda/stream_ref +5 -0
  192. cuda/cccl/headers/include/cuda/utility +1 -0
  193. cuda/cccl/headers/include/nv/target +7 -2
  194. cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
  195. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
  196. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
  197. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
  198. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
  199. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
  200. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
  201. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
  202. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
  203. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
  204. cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
  205. cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
  206. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
  207. cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
  208. cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
  209. cuda/cccl/headers/include/thrust/device_delete.h +18 -3
  210. cuda/cccl/headers/include/thrust/device_free.h +16 -3
  211. cuda/cccl/headers/include/thrust/device_new.h +29 -8
  212. cuda/cccl/headers/include/thrust/host_vector.h +1 -1
  213. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
  214. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
  215. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
  216. cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
  217. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
  218. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
  219. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
  220. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
  221. cuda/cccl/parallel/experimental/__init__.py +21 -74
  222. cuda/compute/__init__.py +79 -0
  223. cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +43 -1
  224. cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +157 -8
  225. cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
  226. cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
  227. cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
  228. cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
  229. cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +112 -40
  230. cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
  231. cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
  232. cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +36 -15
  233. cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
  234. cuda/compute/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  235. cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
  236. cuda/compute/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  237. cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
  238. cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +2 -0
  239. cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +36 -8
  240. cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +206 -1
  241. cuda/{cccl/parallel/experimental → compute}/numba_utils.py +2 -2
  242. cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
  243. cuda/{cccl/parallel/experimental → compute}/typing.py +2 -0
  244. cuda/coop/__init__.py +8 -0
  245. cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
  246. cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
  247. cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
  248. cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
  249. cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
  250. cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
  251. cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
  252. cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
  253. cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
  254. cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
  255. cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
  256. cuda/coop/warp/__init__.py +9 -0
  257. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
  258. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
  259. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
  260. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
  261. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +275 -276
  262. cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
  263. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
  264. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
  265. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
  266. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
  267. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
  268. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
  269. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
  270. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
  271. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
  272. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
  273. cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
  274. cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
  275. cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
  276. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
  277. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
  278. cuda/cccl/parallel/experimental/.gitignore +0 -4
  279. cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  280. cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  281. /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
  282. /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
  283. /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
  284. /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
  285. /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
  286. /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
  287. /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
  288. /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
  289. /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
  290. /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
  291. /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
  292. /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
  293. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
  294. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
@@ -165,7 +165,7 @@ def make_three_way_partition(
165
165
  Example:
166
166
  Below, ``make_three_way_partition`` is used to create a three-way partition object that can be reused.
167
167
 
168
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/three_way_partition/three_way_partition_object.py
168
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/three_way_partition/three_way_partition_object.py
169
169
  :language: python
170
170
  :start-after: # example-begin
171
171
 
@@ -214,7 +214,7 @@ def three_way_partition(
214
214
  Example:
215
215
  Below, ``three_way_partition`` is used to partition a sequence of integers into three parts.
216
216
 
217
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/three_way_partition/three_way_partition_basic.py
217
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/three_way_partition/three_way_partition_basic.py
218
218
  :language: python
219
219
  :start-after: # example-begin
220
220
 
@@ -11,7 +11,7 @@ from .._caching import CachableFunction, cache_with_key
11
11
  from .._cccl_interop import set_cccl_iterator_state
12
12
  from .._utils import protocols
13
13
  from ..iterators._iterators import IteratorBase
14
- from ..numba_utils import get_inferred_return_type
14
+ from ..numba_utils import get_inferred_return_type, signature_from_annotations
15
15
  from ..op import OpKind
16
16
  from ..typing import DeviceArrayLike
17
17
 
@@ -32,16 +32,20 @@ class _UnaryTransform:
32
32
  ):
33
33
  self.d_in_cccl = cccl.to_cccl_input_iter(d_in)
34
34
  self.d_out_cccl = cccl.to_cccl_output_iter(d_out)
35
- in_value_type = cccl.get_value_type(d_in)
36
- out_value_type = cccl.get_value_type(d_out)
37
35
 
38
36
  # For well-known operations, we don't need a signature
39
37
  if isinstance(op, OpKind):
40
38
  self.op_wrapper = cccl.to_cccl_op(op, None)
41
39
  else:
42
- if not out_value_type.is_internal:
43
- out_value_type = get_inferred_return_type(op, (in_value_type,))
44
- sig = out_value_type(in_value_type)
40
+ try:
41
+ sig = signature_from_annotations(op)
42
+ except ValueError:
43
+ in_value_type = cccl.get_value_type(d_in)
44
+ out_value_type = cccl.get_value_type(d_out)
45
+ if not out_value_type.is_internal:
46
+ out_value_type = get_inferred_return_type(op, (in_value_type,))
47
+ sig = out_value_type(in_value_type)
48
+
45
49
  self.op_wrapper = cccl.to_cccl_op(op, sig=sig)
46
50
  self.build_result = cccl.call_build(
47
51
  _bindings.DeviceUnaryTransform,
@@ -97,11 +101,14 @@ class _BinaryTransform:
97
101
  if isinstance(op, OpKind):
98
102
  self.op_wrapper = cccl.to_cccl_op(op, None)
99
103
  else:
100
- if not out_value_type.is_internal:
101
- out_value_type = get_inferred_return_type(
102
- op, (in1_value_type, in2_value_type)
103
- )
104
- sig = out_value_type(in1_value_type, in2_value_type)
104
+ try:
105
+ sig = signature_from_annotations(op)
106
+ except ValueError:
107
+ if not out_value_type.is_internal:
108
+ out_value_type = get_inferred_return_type(
109
+ op, (in1_value_type, in2_value_type)
110
+ )
111
+ sig = out_value_type(in1_value_type, in2_value_type)
105
112
  self.op_wrapper = cccl.to_cccl_op(op, sig=sig)
106
113
  self.build_result = cccl.call_build(
107
114
  _bindings.DeviceBinaryTransform,
@@ -196,7 +203,7 @@ def make_unary_transform(
196
203
  storage allocation. For simpler usage, consider using :func:`unary_transform`.
197
204
 
198
205
  Example:
199
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/transform/unary_transform_object.py
206
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/transform/unary_transform_object.py
200
207
  :language: python
201
208
  :start-after: # example-begin
202
209
 
@@ -227,7 +234,7 @@ def make_binary_transform(
227
234
  storage allocation. For simpler usage, consider using :func:`binary_transform`.
228
235
 
229
236
  Example:
230
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/transform/binary_transform_object.py
237
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/transform/binary_transform_object.py
231
238
  :language: python
232
239
  :start-after: # example-begin
233
240
 
@@ -259,7 +266,14 @@ def unary_transform(
259
266
  Example:
260
267
  Below, ``unary_transform`` is used to apply a transformation to each element of the input.
261
268
 
262
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/transform/unary_transform_basic.py
269
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/transform/unary_transform_basic.py
270
+ :language: python
271
+ :start-after: # example-begin
272
+
273
+ When working with custom struct types, you need to provide type annotations
274
+ to help with type inference. See the binary transform struct example for reference:
275
+
276
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/transform/binary_transform_struct.py
263
277
  :language: python
264
278
  :start-after: # example-begin
265
279
 
@@ -291,7 +305,14 @@ def binary_transform(
291
305
  Example:
292
306
  Below, ``binary_transform`` is used to apply a transformation to pairs of elements from two input sequences.
293
307
 
294
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/transform/binary_transform_basic.py
308
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/transform/binary_transform_basic.py
309
+ :language: python
310
+ :start-after: # example-begin
311
+
312
+ When working with custom struct types, you need to provide type annotations
313
+ to help with type inference. See the following example:
314
+
315
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/transform/binary_transform_struct.py
295
316
  :language: python
296
317
  :start-after: # example-begin
297
318
 
@@ -171,7 +171,7 @@ def make_unique_by_key(
171
171
  Example:
172
172
  Below, ``make_unique_by_key`` is used to create a unique by key object that can be reused.
173
173
 
174
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/unique/unique_by_key_object.py
174
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/unique/unique_by_key_object.py
175
175
  :language: python
176
176
  :start-after: # example-begin
177
177
 
@@ -211,7 +211,7 @@ def unique_by_key(
211
211
  Example:
212
212
  Below, ``unique_by_key`` is used to populate the arrays of output keys and items with the first key and its corresponding item from each sequence of equal keys. It also outputs the number of items selected.
213
213
 
214
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/unique/unique_by_key_basic.py
214
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/unique/unique_by_key_basic.py
215
215
  :language: python
216
216
  :start-after: # example-begin
217
217
 
@@ -2,6 +2,7 @@ from ._factories import (
2
2
  CacheModifiedInputIterator,
3
3
  ConstantIterator,
4
4
  CountingIterator,
5
+ PermutationIterator,
5
6
  ReverseIterator,
6
7
  TransformIterator,
7
8
  TransformOutputIterator,
@@ -12,6 +13,7 @@ __all__ = [
12
13
  "CacheModifiedInputIterator",
13
14
  "ConstantIterator",
14
15
  "CountingIterator",
16
+ "PermutationIterator",
15
17
  "ReverseIterator",
16
18
  "TransformIterator",
17
19
  "TransformOutputIterator",
@@ -10,6 +10,7 @@ from ._iterators import (
10
10
  CountingIterator as _CountingIterator,
11
11
  )
12
12
  from ._iterators import (
13
+ make_permutation_iterator,
13
14
  make_reverse_iterator,
14
15
  make_transform_iterator,
15
16
  )
@@ -26,7 +27,7 @@ def CacheModifiedInputIterator(device_array, modifier):
26
27
  Example:
27
28
  The code snippet below demonstrates the usage of a ``CacheModifiedInputIterator``:
28
29
 
29
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/cache_modified_iterator_basic.py
30
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/cache_modified_iterator_basic.py
30
31
  :language: python
31
32
  :start-after: # example-begin
32
33
 
@@ -55,7 +56,7 @@ def ConstantIterator(value):
55
56
  The code snippet below demonstrates the usage of a ``ConstantIterator``
56
57
  representing a sequence of constant values:
57
58
 
58
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/constant_iterator_basic.py
59
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/constant_iterator_basic.py
59
60
  :language: python
60
61
  :start-after: # example-begin
61
62
 
@@ -78,7 +79,7 @@ def CountingIterator(offset):
78
79
  The code snippet below demonstrates the usage of a ``CountingIterator``
79
80
  representing the sequence ``[10, 11, 12]``:
80
81
 
81
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/counting_iterator_basic.py
82
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/counting_iterator_basic.py
82
83
  :language: python
83
84
  :start-after: # example-begin
84
85
 
@@ -100,13 +101,13 @@ def ReverseIterator(sequence):
100
101
  Examples:
101
102
  The code snippet below demonstrates the usage of a ``ReverseIterator`` as an input iterator:
102
103
 
103
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/reverse_input_iterator.py
104
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/reverse_input_iterator.py
104
105
  :language: python
105
106
  :start-after: # example-begin
106
107
 
107
108
  The code snippet below demonstrates the usage of a ``ReverseIterator`` as an output iterator:
108
109
 
109
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/reverse_output_iterator.py
110
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/reverse_output_iterator.py
110
111
  :language: python
111
112
  :start-after: # example-begin
112
113
 
@@ -129,7 +130,7 @@ def TransformIterator(it, op):
129
130
  The code snippet below demonstrates the usage of a ``TransformIterator`` composed with a ``CountingIterator``
130
131
  to transform the input before performing a reduction.
131
132
 
132
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/transform_iterator_basic.py
133
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/transform_iterator_basic.py
133
134
  :language: python
134
135
  :start-after: # example-begin
135
136
  Args:
@@ -151,7 +152,7 @@ def TransformOutputIterator(it, op):
151
152
  The code snippet below demonstrates the usage of a ``TransformOutputIterator`` to transform the output
152
153
  of a reduction before writing to an output array.
153
154
 
154
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/transform_output_iterator.py
155
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/transform_output_iterator.py
155
156
  :language: python
156
157
  :start-after: # example-begin
157
158
 
@@ -165,6 +166,33 @@ def TransformOutputIterator(it, op):
165
166
  return make_transform_iterator(it, op, "output")
166
167
 
167
168
 
169
+ def PermutationIterator(values, indices):
170
+ """Returns an Iterator that accesses values through an index mapping.
171
+
172
+ Similar to https://nvidia.github.io/cccl/thrust/api/classthrust_1_1permutation__iterator.html
173
+
174
+ The permutation iterator accesses elements from the values collection using indices
175
+ from the indices collection, effectively computing values[indices[i]] at position i.
176
+ This is useful for gather/scatter operations and indirect array access patterns.
177
+
178
+ Example:
179
+ The code snippet below demonstrates the usage of a ``PermutationIterator``
180
+ to access values in a permuted order:
181
+
182
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/permutation_iterator_basic.py
183
+ :language: python
184
+ :start-after: # example-begin
185
+
186
+ Args:
187
+ values: The values array or iterator to be permuted
188
+ indices: An iterator or device array providing the indices for permutation
189
+
190
+ Returns:
191
+ A ``PermutationIterator`` object that yields values[indices[i]] at position i
192
+ """
193
+ return make_permutation_iterator(values, indices)
194
+
195
+
168
196
  def ZipIterator(*iterators):
169
197
  """Returns an Iterator representing a zipped sequence of values from N iterators.
170
198
 
@@ -178,7 +206,7 @@ def ZipIterator(*iterators):
178
206
  The code snippet below demonstrates the usage of a ``ZipIterator``
179
207
  combining two device arrays:
180
208
 
181
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/zip_iterator_elementwise.py
209
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/zip_iterator_elementwise.py
182
210
  :language: python
183
211
  :start-after: # example-begin
184
212
 
@@ -207,7 +207,15 @@ def pointer_add_intrinsic(context, ptr, offset):
207
207
  def codegen(context, builder, sig, args):
208
208
  ptr, index = args
209
209
  base = builder.ptrtoint(ptr, ir.IntType(_DEVICE_POINTER_BITWIDTH))
210
- offset = builder.mul(index, sizeof_pointee(context, ptr))
210
+ sizeof = sizeof_pointee(context, ptr)
211
+ # Cast index to match sizeof type if needed
212
+ if index.type != sizeof.type:
213
+ index = (
214
+ builder.sext(index, sizeof.type)
215
+ if index.type.width < sizeof.type.width
216
+ else builder.trunc(index, sizeof.type)
217
+ )
218
+ offset = builder.mul(index, sizeof)
211
219
  result = builder.add(base, offset)
212
220
  return builder.inttoptr(result, ptr.type)
213
221
 
@@ -610,3 +618,200 @@ def _get_last_element_ptr(device_array) -> int:
610
618
 
611
619
  ptr = get_data_pointer(device_array)
612
620
  return ptr + offset_to_last_element
621
+
622
+
623
+ class PermutationIteratorKind(IteratorKind):
624
+ pass
625
+
626
+
627
+ def make_permutation_iterator(values, indices):
628
+ """
629
+ Create a PermutationIterator that accesses values through an index mapping.
630
+
631
+ The permutation iterator accesses elements from `values` using indices from `indices`,
632
+ effectively computing values[indices[i]] at position i.
633
+
634
+ Args:
635
+ values: The values array or iterator to permute
636
+ indices: The indices array or iterator specifying the permutation
637
+
638
+ Returns:
639
+ PermutationIterator: Iterator that yields permuted values
640
+ """
641
+ # Convert arrays to iterators if needed
642
+ if hasattr(values, "__cuda_array_interface__"):
643
+ values = pointer(values, numba.from_dtype(get_dtype(values)))
644
+ elif not isinstance(values, IteratorBase):
645
+ raise TypeError("values must be a device array or iterator")
646
+
647
+ if hasattr(indices, "__cuda_array_interface__"):
648
+ indices = pointer(indices, numba.from_dtype(get_dtype(indices)))
649
+ elif not isinstance(indices, IteratorBase):
650
+ raise TypeError("indices must be an iterator or device array")
651
+
652
+ # JIT compile value advance/dereference methods
653
+ value_dtype = values.value_type
654
+ values_state_type = values.state_type
655
+ index_type = indices.value_type
656
+ value_advance = cuda.jit(values.advance, device=True)
657
+ value_input_dereference = cuda.jit(values.input_dereference, device=True)
658
+
659
+ try:
660
+ output_deref = values.output_dereference
661
+ if output_deref is not None:
662
+ value_output_dereference = cuda.jit(output_deref, device=True)
663
+ values_is_output_iterator = True
664
+ else:
665
+ values_is_output_iterator = False
666
+ except AttributeError:
667
+ values_is_output_iterator = False
668
+
669
+ # JIT compile index advance/dereference methods
670
+ index_advance = cuda.jit(indices.advance, device=True)
671
+ index_input_dereference = cuda.jit(indices.input_dereference, device=True)
672
+
673
+ # The cvalue and state for PermutationIterator are
674
+ # structs composed of the cvalues and states of the
675
+ # value and index iterators.
676
+ from ..struct import gpu_struct_from_numba_types
677
+
678
+ class PermutationCValueStruct(ctypes.Structure):
679
+ _fields_ = [
680
+ ("value_state", values.cvalue.__class__),
681
+ ("index_state", indices.cvalue.__class__),
682
+ ]
683
+
684
+ PermutationState = gpu_struct_from_numba_types(
685
+ "PermutationState",
686
+ ("value_state", "index_state"),
687
+ (values_state_type, indices.state_type),
688
+ )
689
+
690
+ cvalue = PermutationCValueStruct(values.cvalue, indices.cvalue)
691
+ state_type = PermutationState._numba_type
692
+ value_type = value_dtype
693
+
694
+ # Define intrinsics for accessing struct fields
695
+ @intrinsic
696
+ def get_value_state_field_ptr(context, struct_ptr_type):
697
+ def codegen(context, builder, sig, args):
698
+ struct_ptr = args[0]
699
+ # Use GEP to get pointer to field at index 0 (value_state)
700
+ field_ptr = builder.gep(
701
+ struct_ptr,
702
+ [ir.Constant(ir.IntType(32), 0), ir.Constant(ir.IntType(32), 0)],
703
+ )
704
+ return field_ptr
705
+
706
+ from numba.core.datamodel.registry import default_manager
707
+
708
+ struct_model = default_manager.lookup(struct_ptr_type.dtype)
709
+ field_type = struct_model._members[0]
710
+ return types.CPointer(field_type)(struct_ptr_type), codegen
711
+
712
+ @intrinsic
713
+ def get_index_state_field_ptr(context, struct_ptr_type):
714
+ def codegen(context, builder, sig, args):
715
+ struct_ptr = args[0]
716
+ # Use GEP to get pointer to field at index 1 (index_state)
717
+ field_ptr = builder.gep(
718
+ struct_ptr,
719
+ [ir.Constant(ir.IntType(32), 0), ir.Constant(ir.IntType(32), 1)],
720
+ )
721
+ return field_ptr
722
+
723
+ from numba.core.datamodel.registry import default_manager
724
+
725
+ struct_model = default_manager.lookup(struct_ptr_type.dtype)
726
+ field_type = struct_model._members[1]
727
+ return types.CPointer(field_type)(struct_ptr_type), codegen
728
+
729
+ # Create intrinsic for allocating temporary storage for index
730
+ @intrinsic
731
+ def alloca_temp_for_index_type(context):
732
+ def codegen(context, builder, sig, args):
733
+ temp_value_type = context.get_value_type(index_type)
734
+ temp_ptr = builder.alloca(temp_value_type)
735
+ return temp_ptr
736
+
737
+ return types.CPointer(index_type)(), codegen
738
+
739
+ # Create intrinsic for allocating temporary storage for value state
740
+ @intrinsic
741
+ def alloca_temp_for_value_state(context):
742
+ def codegen(context, builder, sig, args):
743
+ temp_state_type = context.get_value_type(values_state_type)
744
+ temp_ptr = builder.alloca(temp_state_type)
745
+ return temp_ptr
746
+
747
+ return types.CPointer(values_state_type)(), codegen
748
+
749
+ class PermutationIterator(IteratorBase):
750
+ iterator_kind_type = PermutationIteratorKind
751
+
752
+ def __init__(self, values_it, indices_it):
753
+ self._values = values_it
754
+ self._indices = indices_it
755
+ super().__init__(
756
+ cvalue=cvalue,
757
+ state_type=state_type,
758
+ value_type=value_type,
759
+ )
760
+ self._kind = self.__class__.iterator_kind_type(
761
+ (value_type, values_it.kind, indices_it.kind), state_type
762
+ )
763
+
764
+ @property
765
+ def advance(self):
766
+ return PermutationIterator._advance
767
+
768
+ @property
769
+ def input_dereference(self):
770
+ return PermutationIterator._input_dereference
771
+
772
+ @property
773
+ def output_dereference(self):
774
+ if not values_is_output_iterator:
775
+ raise AttributeError(
776
+ "PermutationIterator cannot be used as output iterator "
777
+ "when values iterator does not support output"
778
+ )
779
+ return PermutationIterator._output_dereference
780
+
781
+ @staticmethod
782
+ def _advance(state, distance):
783
+ # advance the index iterator
784
+ index_state_ptr = get_index_state_field_ptr(state)
785
+ index_advance(index_state_ptr, distance)
786
+
787
+ @staticmethod
788
+ def _input_dereference(state, result):
789
+ # dereference index to get the index value
790
+ index_state_ptr = get_index_state_field_ptr(state)
791
+ temp_index = alloca_temp_for_index_type()
792
+ index_input_dereference(index_state_ptr, temp_index)
793
+
794
+ # copy the value state (which always points to position 0)
795
+ # and advance it by the index value
796
+ value_state_ptr = get_value_state_field_ptr(state)
797
+ temp_value_state = alloca_temp_for_value_state()
798
+ temp_value_state[0] = value_state_ptr[0]
799
+ value_advance(temp_value_state, temp_index[0])
800
+ value_input_dereference(temp_value_state, result)
801
+
802
+ @staticmethod
803
+ def _output_dereference(state, x):
804
+ # dereference index to get the index value
805
+ index_state_ptr = get_index_state_field_ptr(state)
806
+ temp_index = alloca_temp_for_index_type()
807
+ index_input_dereference(index_state_ptr, temp_index)
808
+
809
+ # copy the value state (which always points to position 0)
810
+ # and advance it by the index value
811
+ value_state_ptr = get_value_state_field_ptr(state)
812
+ temp_value_state = alloca_temp_for_value_state()
813
+ temp_value_state[0] = value_state_ptr[0]
814
+ value_advance(temp_value_state, temp_index[0])
815
+ value_output_dereference(temp_value_state, x)
816
+
817
+ return PermutationIterator(values, indices)
@@ -39,10 +39,10 @@ def signature_from_annotations(func) -> numba.core.typing.Signature:
39
39
  argspec = inspect.getfullargspec(func)
40
40
  num_args = len(argspec.args)
41
41
  try:
42
- retty = to_numba_type(argspec.annotations["return"])
42
+ ret_ann = argspec.annotations["return"]
43
43
  except KeyError:
44
44
  raise ValueError("Function has incomplete annotations: missing return type")
45
-
45
+ retty = to_numba_type(ret_ann)
46
46
  if num_args != len(argspec.annotations) - 1: # -1 for the return type
47
47
  raise ValueError("One or more arguments are missing type annotations")
48
48
  argtys = tuple(
@@ -207,7 +207,7 @@ def gpu_struct(this: type) -> Type[GpuStruct]:
207
207
  to a dataclass). The type of each field must be a subclass of
208
208
  `np.number`, like `np.int32` or `np.float64`.
209
209
 
210
- Arrays of GPUStruct objects can be used as inputs to cuda.cccl.parallel
210
+ Arrays of GPUStruct objects can be used as inputs to cuda.compute
211
211
  algorithms.
212
212
 
213
213
  Example:
@@ -216,7 +216,7 @@ def gpu_struct(this: type) -> Type[GpuStruct]:
216
216
  a reduction on an input array of floating point values to compute
217
217
  the smallest and the largest absolute values:
218
218
 
219
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/reduction/minmax_reduction.py
219
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/reduction/minmax_reduction.py
220
220
  :language: python
221
221
  :start-after: # example-begin
222
222
 
@@ -7,9 +7,11 @@ from typing import Any
7
7
 
8
8
  from typing_extensions import (
9
9
  Protocol,
10
+ runtime_checkable,
10
11
  ) # TODO: typing_extensions required for Python 3.7 docs env
11
12
 
12
13
 
14
+ @runtime_checkable
13
15
  class DeviceArrayLike(Protocol):
14
16
  """
15
17
  Objects representing a device array, having a `.__cuda_array_interface__`
cuda/coop/__init__.py ADDED
@@ -0,0 +1,8 @@
1
+ # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
2
+ #
3
+ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4
+
5
+ from . import block, warp
6
+ from ._types import StatefulFunction
7
+
8
+ __all__ = ["block", "warp", "StatefulFunction"]
@@ -5,8 +5,9 @@
5
5
  import functools
6
6
 
7
7
  from cuda.bindings import nvrtc
8
- from cuda.cccl.cooperative.experimental._caching import disk_cache
9
- from cuda.cccl.cooperative.experimental._common import check_in, version
8
+
9
+ from ._caching import disk_cache
10
+ from ._common import check_in, version
10
11
 
11
12
 
12
13
  def CHECK_NVRTC(err, prog):
@@ -3,8 +3,8 @@
3
3
  # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4
4
 
5
5
  """
6
- cuda.cccl.cooperative.experimental._scan_op
7
- ======================================
6
+ cuda.coop._scan_op
7
+ ==================
8
8
 
9
9
  This module implements the ``ScanOp`` class and related functions.
10
10
  """
@@ -14,7 +14,7 @@ from enum import Enum
14
14
 
15
15
  import numpy as np
16
16
 
17
- from cuda.cccl.cooperative.experimental._typing import (
17
+ from ._typing import (
18
18
  ScanOpType,
19
19
  )
20
20
 
@@ -17,8 +17,8 @@ from numba.core.typing import signature
17
17
  from numba.cuda import LTOIR
18
18
  from numba.cuda.cudadrv import driver as cuda_driver
19
19
 
20
- import cuda.cccl.cooperative.experimental._nvrtc as nvrtc
21
- from cuda.cccl.cooperative.experimental._common import find_unsigned
20
+ from . import _nvrtc as nvrtc
21
+ from ._common import find_unsigned
22
22
 
23
23
  NUMBA_TYPES_TO_CPP = {
24
24
  types.boolean: "bool",
@@ -9,7 +9,7 @@ if TYPE_CHECKING:
9
9
  import numba
10
10
  import numpy as np
11
11
 
12
- from cuda.cccl.cooperative.experimental._common import dim3
12
+ from ._common import dim3
13
13
 
14
14
  # Type alias for dimension parameters that can be passed to CUDA functions.
15
15
  DimType = Union["dim3", int, Tuple[int, int], Tuple[int, int, int]]
@@ -2,18 +2,18 @@
2
2
  #
3
3
  # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4
4
 
5
- from cuda.cccl.cooperative.experimental.block._block_exchange import (
5
+ from ._block_exchange import (
6
6
  BlockExchangeType,
7
7
  exchange,
8
8
  )
9
- from cuda.cccl.cooperative.experimental.block._block_load_store import load, store
10
- from cuda.cccl.cooperative.experimental.block._block_merge_sort import merge_sort_keys
11
- from cuda.cccl.cooperative.experimental.block._block_radix_sort import (
9
+ from ._block_load_store import load, store
10
+ from ._block_merge_sort import merge_sort_keys
11
+ from ._block_radix_sort import (
12
12
  radix_sort_keys,
13
13
  radix_sort_keys_descending,
14
14
  )
15
- from cuda.cccl.cooperative.experimental.block._block_reduce import reduce, sum
16
- from cuda.cccl.cooperative.experimental.block._block_scan import (
15
+ from ._block_reduce import reduce, sum
16
+ from ._block_scan import (
17
17
  exclusive_scan,
18
18
  exclusive_sum,
19
19
  inclusive_scan,
@@ -3,7 +3,7 @@
3
3
  # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4
4
 
5
5
  """
6
- cuda.cccl.cooperative.block_exchange
6
+ cuda.coop.block_exchange
7
7
  ====================================
8
8
 
9
9
  This module provides a set of :ref:`collective <collective-primitives>` methods
@@ -105,13 +105,13 @@ def exchange(
105
105
  perform. Currently, only :py:attr:`StripedToBlocked` is supported.
106
106
 
107
107
  :param dtype: Supplies the data type of the input and output arrays.
108
- :type dtype: :py:class:`cuda.cccl.cooperative.experimental._typing.DtypeType`
108
+ :type dtype: :py:class:`cuda.coop._typing.DtypeType`
109
109
 
110
110
  :param threads_per_block: Supplies the number of threads in the block,
111
111
  either as an integer for a 1D block or a tuple of two or three integers
112
112
  for a 2D or 3D block, respectively.
113
113
  :type threads_per_block:
114
- :py:class:`cuda.cccl.cooperative.experimental._typing.DimType`
114
+ :py:class:`cuda.coop._typing.DimType`
115
115
 
116
116
  :param items_per_thread: Supplies the number of items partitioned onto each
117
117
  thread.
@@ -137,7 +137,7 @@ def exchange(
137
137
  :raises ValueError: If ``items_per_thread`` is greater than 1 and
138
138
  ``methods`` is not *None* (i.e. a user-defined type is being used).
139
139
 
140
- :returns: An :py:class:`cuda.cccl.cooperative.experimental._types.Invocable`
140
+ :returns: An :py:class:`cuda.coop._types.Invocable`
141
141
  object representing the specialized kernel that can be called from
142
142
  a Numba JIT'd CUDA kernel.
143
143