cuda-cccl 0.1.3.2.0.dev438__cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.1__cp312-cp312-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (177) hide show
  1. cuda/cccl/cooperative/__init__.py +7 -1
  2. cuda/cccl/cooperative/experimental/__init__.py +21 -5
  3. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
  4. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
  5. cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
  6. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
  7. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
  8. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +25 -5
  9. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
  10. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
  11. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
  12. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
  13. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
  14. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
  15. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
  16. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
  17. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +24 -19
  18. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
  19. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
  20. cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
  21. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +3 -2
  22. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
  23. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
  24. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +18 -26
  25. cuda/cccl/headers/include/cub/device/device_copy.cuh +116 -27
  26. cuda/cccl/headers/include/cub/device/device_partition.cuh +5 -1
  27. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +158 -247
  28. cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
  29. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +4 -4
  30. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
  31. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +8 -26
  32. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -6
  33. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
  34. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +320 -262
  35. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +10 -5
  36. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
  37. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
  38. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +57 -10
  39. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +37 -13
  40. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
  41. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
  42. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
  43. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
  44. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
  45. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
  46. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
  47. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
  48. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
  49. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
  50. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
  51. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +204 -55
  52. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
  53. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +55 -19
  54. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
  55. cuda/cccl/headers/include/cub/util_device.cuh +51 -35
  56. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
  57. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
  58. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
  59. cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
  60. cuda/cccl/headers/include/cuda/__algorithm/copy.h +4 -4
  61. cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
  62. cuda/cccl/headers/include/cuda/__device/all_devices.h +47 -147
  63. cuda/cccl/headers/include/cuda/__device/arch_traits.h +51 -49
  64. cuda/cccl/headers/include/cuda/__device/attributes.h +177 -127
  65. cuda/cccl/headers/include/cuda/__device/device_ref.h +32 -51
  66. cuda/cccl/headers/include/cuda/__device/physical_device.h +120 -91
  67. cuda/cccl/headers/include/cuda/__driver/driver_api.h +330 -36
  68. cuda/cccl/headers/include/cuda/__event/event.h +8 -8
  69. cuda/cccl/headers/include/cuda/__event/event_ref.h +4 -5
  70. cuda/cccl/headers/include/cuda/__event/timed_event.h +4 -4
  71. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  72. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
  73. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +3 -3
  74. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +3 -3
  75. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +3 -3
  76. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +158 -0
  77. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +8 -120
  78. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +593 -0
  79. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +14 -10
  80. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +4 -3
  81. cuda/cccl/headers/include/cuda/__runtime/types.h +1 -1
  82. cuda/cccl/headers/include/cuda/__stream/stream.h +2 -3
  83. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -12
  84. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +2 -2
  85. cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
  86. cuda/cccl/headers/include/cuda/algorithm +1 -1
  87. cuda/cccl/headers/include/cuda/devices +10 -0
  88. cuda/cccl/headers/include/cuda/iterator +1 -0
  89. cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
  90. cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
  91. cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
  92. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +2 -2
  93. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +2 -0
  94. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
  95. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
  96. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
  97. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +10 -5
  98. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +44 -17
  99. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +1 -1
  100. cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +12 -12
  101. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +1 -8
  102. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +15 -12
  103. cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +3 -0
  104. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
  105. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +2 -1
  106. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
  107. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
  108. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
  109. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +3 -2
  110. cuda/cccl/headers/include/cuda/std/string_view +12 -5
  111. cuda/cccl/headers/include/cuda/std/version +1 -4
  112. cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
  113. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
  114. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
  115. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +8 -22
  116. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +15 -48
  117. cuda/cccl/parallel/experimental/__init__.py +21 -70
  118. cuda/compute/__init__.py +77 -0
  119. cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +28 -0
  120. cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +141 -1
  121. cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +4 -0
  122. cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
  123. cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
  124. cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
  125. cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -4
  126. cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +4 -6
  127. cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
  128. cuda/compute/algorithms/_three_way_partition.py +261 -0
  129. cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +4 -4
  130. cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
  131. cuda/compute/cu12/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
  132. cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
  133. cuda/compute/cu13/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
  134. cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
  135. cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +8 -8
  136. cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
  137. cuda/coop/__init__.py +8 -0
  138. cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
  139. cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
  140. cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
  141. cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
  142. cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
  143. cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
  144. cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
  145. cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
  146. cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
  147. cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
  148. cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
  149. cuda/coop/warp/__init__.py +9 -0
  150. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
  151. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
  152. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
  153. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/METADATA +1 -1
  154. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/RECORD +171 -166
  155. cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
  156. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
  157. cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +0 -72
  158. cuda/cccl/parallel/experimental/.gitignore +0 -4
  159. cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
  160. cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
  161. /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
  162. /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
  163. /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
  164. /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
  165. /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
  166. /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
  167. /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
  168. /cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +0 -0
  169. /cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +0 -0
  170. /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
  171. /cuda/{cccl/parallel/experimental → compute}/numba_utils.py +0 -0
  172. /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
  173. /cuda/{cccl/parallel/experimental → compute}/typing.py +0 -0
  174. /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
  175. /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
  176. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/WHEEL +0 -0
  177. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,261 @@
1
+ # Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
2
+ #
3
+ #
4
+ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
5
+
6
+ from typing import Callable
7
+
8
+ import numba
9
+
10
+ from .. import _bindings
11
+ from .. import _cccl_interop as cccl
12
+ from .._caching import CachableFunction, cache_with_key
13
+ from .._cccl_interop import call_build, set_cccl_iterator_state
14
+ from .._utils import protocols
15
+ from .._utils.temp_storage_buffer import TempStorageBuffer
16
+ from ..iterators._iterators import IteratorBase
17
+ from ..typing import DeviceArrayLike
18
+
19
+
20
def make_cache_key(
    d_in: DeviceArrayLike | IteratorBase,
    d_first_part_out: DeviceArrayLike | IteratorBase,
    d_second_part_out: DeviceArrayLike | IteratorBase,
    d_unselected_out: DeviceArrayLike | IteratorBase,
    d_num_selected_out: DeviceArrayLike | IteratorBase,
    select_first_part_op: Callable,
    select_second_part_op: Callable,
):
    """
    Build the cache key used to memoize ``make_three_way_partition``.

    Every array/iterator argument contributes one entry: its ``kind`` when it
    is an ``IteratorBase``, otherwise the result of ``protocols.get_dtype``.
    Each selection operator contributes a ``CachableFunction`` wrapping it.

    Returns:
        A 7-tuple laid out in the same order as the parameters.
    """

    def _operand_key(operand):
        # Iterators are keyed by their ``kind``; anything else by its dtype.
        if isinstance(operand, IteratorBase):
            return operand.kind
        return protocols.get_dtype(operand)

    operand_keys = tuple(
        _operand_key(operand)
        for operand in (
            d_in,
            d_first_part_out,
            d_second_part_out,
            d_unselected_out,
            d_num_selected_out,
        )
    )
    op_keys = (
        CachableFunction(select_first_part_op),
        CachableFunction(select_second_part_op),
    )
    return operand_keys + op_keys
63
+
64
+
65
class _ThreeWayPartition:
    """
    Reusable three-way partition object.

    Construction wraps the operands and the two selection operators into
    their CCCL representations and builds the algorithm via ``call_build``.
    Calling the instance refreshes the iterator state from the supplied
    operands and forwards everything to ``build_result.compute``.
    """

    __slots__ = [
        "build_result",
        "d_in_cccl",
        "d_first_part_out_cccl",
        "d_second_part_out_cccl",
        "d_unselected_out_cccl",
        "d_num_selected_out_cccl",
        "select_first_part_op_wrapper",
        "select_second_part_op_wrapper",
    ]

    def __init__(
        self,
        d_in: DeviceArrayLike | IteratorBase,
        d_first_part_out: DeviceArrayLike | IteratorBase,
        d_second_part_out: DeviceArrayLike | IteratorBase,
        d_unselected_out: DeviceArrayLike | IteratorBase,
        d_num_selected_out: DeviceArrayLike | IteratorBase,
        select_first_part_op: Callable,
        select_second_part_op: Callable,
    ):
        """
        Wrap the operands and operators, then build the algorithm.

        Args:
            d_in: Input array or iterator.
            d_first_part_out: Output for items selected by ``select_first_part_op``.
            d_second_part_out: Output for items selected by ``select_second_part_op``.
            d_unselected_out: Output for items selected by neither operator.
            d_num_selected_out: Output for the selected-item counts.
            select_first_part_op: Unary selection operator for the first part.
            select_second_part_op: Unary selection operator for the second part.
        """
        as_output = cccl.to_cccl_output_iter

        # One input iterator, four output iterators.
        self.d_in_cccl = cccl.to_cccl_input_iter(d_in)
        self.d_first_part_out_cccl = as_output(d_first_part_out)
        self.d_second_part_out_cccl = as_output(d_second_part_out)
        self.d_unselected_out_cccl = as_output(d_unselected_out)
        self.d_num_selected_out_cccl = as_output(d_num_selected_out)

        # Both operators share one signature: they take a value of the input's
        # value type and produce a uint8.
        predicate_sig = numba.types.uint8(cccl.get_value_type(d_in))

        # There are no well-known operations that can be used with three_way_partition
        self.select_first_part_op_wrapper = cccl.to_cccl_op(
            select_first_part_op, predicate_sig
        )
        self.select_second_part_op_wrapper = cccl.to_cccl_op(
            select_second_part_op, predicate_sig
        )

        self.build_result = call_build(
            _bindings.DeviceThreeWayPartitionBuildResult,
            self.d_in_cccl,
            self.d_first_part_out_cccl,
            self.d_second_part_out_cccl,
            self.d_unselected_out_cccl,
            self.d_num_selected_out_cccl,
            self.select_first_part_op_wrapper,
            self.select_second_part_op_wrapper,
        )

    def __call__(
        self,
        temp_storage,
        d_in,
        d_first_part_out,
        d_second_part_out,
        d_unselected_out,
        d_num_selected_out,
        num_items: int,
        stream=None,
    ):
        """
        Run the built algorithm against the given operands.

        Args:
            temp_storage: Temporary storage buffer, or ``None``. With ``None``
                a zero pointer and a zero size are passed to ``compute``.
            d_in: Input array or iterator.
            d_first_part_out: Output for the first part.
            d_second_part_out: Output for the second part.
            d_unselected_out: Output for the unselected items.
            d_num_selected_out: Output for the selected-item counts.
            num_items: Number of items to partition.
            stream: Optional stream, validated by
                ``protocols.validate_and_get_stream``.

        Returns:
            The value returned by ``build_result.compute``; callers in this
            module use it as the temporary storage size in bytes.
        """
        # Point each wrapped iterator at the operand supplied for this call.
        refresh_plan = (
            (self.d_in_cccl, d_in),
            (self.d_first_part_out_cccl, d_first_part_out),
            (self.d_second_part_out_cccl, d_second_part_out),
            (self.d_unselected_out_cccl, d_unselected_out),
            (self.d_num_selected_out_cccl, d_num_selected_out),
        )
        for cccl_iter, operand in refresh_plan:
            set_cccl_iterator_state(cccl_iter, operand)

        stream_handle = protocols.validate_and_get_stream(stream)

        if temp_storage is not None:
            storage_nbytes = temp_storage.nbytes
            storage_ptr = protocols.get_data_pointer(temp_storage)
        else:
            storage_nbytes = 0
            storage_ptr = 0

        return self.build_result.compute(
            storage_ptr,
            storage_nbytes,
            self.d_in_cccl,
            self.d_first_part_out_cccl,
            self.d_second_part_out_cccl,
            self.d_unselected_out_cccl,
            self.d_num_selected_out_cccl,
            self.select_first_part_op_wrapper,
            self.select_second_part_op_wrapper,
            num_items,
            stream_handle,
        )
150
+
151
+
152
@cache_with_key(make_cache_key)
def make_three_way_partition(
    d_in: DeviceArrayLike | IteratorBase,
    d_first_part_out: DeviceArrayLike | IteratorBase,
    d_second_part_out: DeviceArrayLike | IteratorBase,
    d_unselected_out: DeviceArrayLike | IteratorBase,
    d_num_selected_out: DeviceArrayLike | IteratorBase,
    select_first_part_op: Callable,
    select_second_part_op: Callable,
):
    """
    Creates a reusable device-wide three-way partition object driven by the unary ``select_first_part_op`` and ``select_second_part_op`` operators.

    Results are cached using the key produced by ``make_cache_key``.

    Example:
        Below, ``make_three_way_partition`` is used to create a three-way partition object that can be reused.

        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/three_way_partition/three_way_partition_object.py
            :language: python
            :start-after: # example-begin

    Args:
        d_in: Device array or iterator containing the input sequence of data items
        d_first_part_out: Device array or iterator to store the first part of the output
        d_second_part_out: Device array or iterator to store the second part of the output
        d_unselected_out: Device array or iterator to store the unselected items
        d_num_selected_out: Device array to store the number of items selected. The total number of items selected by ``select_first_part_op`` and ``select_second_part_op`` is stored in ``d_num_selected_out[0]`` and ``d_num_selected_out[1]``, respectively.
        select_first_part_op: Callable representing the unary operator to select the first part
        select_second_part_op: Callable representing the unary operator to select the second part

    Returns:
        A callable object that can be used to perform the three-way partition
    """
    return _ThreeWayPartition(
        d_in=d_in,
        d_first_part_out=d_first_part_out,
        d_second_part_out=d_second_part_out,
        d_unselected_out=d_unselected_out,
        d_num_selected_out=d_num_selected_out,
        select_first_part_op=select_first_part_op,
        select_second_part_op=select_second_part_op,
    )
193
+
194
+
195
def three_way_partition(
    d_in: DeviceArrayLike | IteratorBase,
    d_first_part_out: DeviceArrayLike | IteratorBase,
    d_second_part_out: DeviceArrayLike | IteratorBase,
    d_unselected_out: DeviceArrayLike | IteratorBase,
    d_num_selected_out: DeviceArrayLike | IteratorBase,
    select_first_part_op: Callable,
    select_second_part_op: Callable,
    num_items: int,
    stream=None,
):
    """
    Performs device-wide three-way partition. Given an input sequence of data items, it partitions the items into three parts:

    - The first part is selected by the ``select_first_part_op`` operator.
    - The second part is selected by the ``select_second_part_op`` operator.
    - The unselected items are not selected by either operator.

    This function automatically handles temporary storage allocation and execution.

    Example:
        Below, ``three_way_partition`` is used to partition a sequence of integers into three parts.

        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/three_way_partition/three_way_partition_basic.py
            :language: python
            :start-after: # example-begin

    Args:
        d_in: Device array or iterator containing the input sequence of data items
        d_first_part_out: Device array or iterator to store the first part of the output
        d_second_part_out: Device array or iterator to store the second part of the output
        d_unselected_out: Device array or iterator to store the unselected items
        d_num_selected_out: Device array to store the number of items selected. The total number of items selected by ``select_first_part_op`` and ``select_second_part_op`` is stored in ``d_num_selected_out[0]`` and ``d_num_selected_out[1]``, respectively.
        select_first_part_op: Callable representing the unary operator to select the first part
        select_second_part_op: Callable representing the unary operator to select the second part
        num_items: Number of items to partition
        stream: CUDA stream for the operation (optional)
    """
    # The same five operands are used to build the partitioner and in both calls.
    operands = (
        d_in,
        d_first_part_out,
        d_second_part_out,
        d_unselected_out,
        d_num_selected_out,
    )
    partitioner = make_three_way_partition(
        *operands, select_first_part_op, select_second_part_op
    )

    # First call, without temporary storage: yields the required size in bytes.
    required_bytes = partitioner(None, *operands, num_items, stream)

    # Second call, with a buffer of that size: performs the partition.
    scratch = TempStorageBuffer(required_bytes, stream)
    partitioner(scratch, *operands, num_items, stream)
@@ -196,7 +196,7 @@ def make_unary_transform(
196
196
  storage allocation. For simpler usage, consider using :func:`unary_transform`.
197
197
 
198
198
  Example:
199
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/transform/unary_transform_object.py
199
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/transform/unary_transform_object.py
200
200
  :language: python
201
201
  :start-after: # example-begin
202
202
 
@@ -227,7 +227,7 @@ def make_binary_transform(
227
227
  storage allocation. For simpler usage, consider using :func:`binary_transform`.
228
228
 
229
229
  Example:
230
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/transform/binary_transform_object.py
230
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/transform/binary_transform_object.py
231
231
  :language: python
232
232
  :start-after: # example-begin
233
233
 
@@ -259,7 +259,7 @@ def unary_transform(
259
259
  Example:
260
260
  Below, ``unary_transform`` is used to apply a transformation to each element of the input.
261
261
 
262
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/transform/unary_transform_basic.py
262
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/transform/unary_transform_basic.py
263
263
  :language: python
264
264
  :start-after: # example-begin
265
265
 
@@ -291,7 +291,7 @@ def binary_transform(
291
291
  Example:
292
292
  Below, ``binary_transform`` is used to apply a transformation to pairs of elements from two input sequences.
293
293
 
294
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/transform/binary_transform_basic.py
294
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/transform/binary_transform_basic.py
295
295
  :language: python
296
296
  :start-after: # example-begin
297
297
 
@@ -171,7 +171,7 @@ def make_unique_by_key(
171
171
  Example:
172
172
  Below, ``make_unique_by_key`` is used to create a unique by key object that can be reused.
173
173
 
174
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/unique/unique_by_key_object.py
174
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/unique/unique_by_key_object.py
175
175
  :language: python
176
176
  :start-after: # example-begin
177
177
 
@@ -211,7 +211,7 @@ def unique_by_key(
211
211
  Example:
212
212
  Below, ``unique_by_key`` is used to populate the arrays of output keys and items with the first key and its corresponding item from each sequence of equal keys. It also outputs the number of items selected.
213
213
 
214
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/unique/unique_by_key_basic.py
214
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/unique/unique_by_key_basic.py
215
215
  :language: python
216
216
  :start-after: # example-begin
217
217
 
@@ -26,7 +26,7 @@ def CacheModifiedInputIterator(device_array, modifier):
26
26
  Example:
27
27
  The code snippet below demonstrates the usage of a ``CacheModifiedInputIterator``:
28
28
 
29
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/cache_modified_iterator_basic.py
29
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/cache_modified_iterator_basic.py
30
30
  :language: python
31
31
  :start-after: # example-begin
32
32
 
@@ -55,7 +55,7 @@ def ConstantIterator(value):
55
55
  The code snippet below demonstrates the usage of a ``ConstantIterator``
56
56
  representing a sequence of constant values:
57
57
 
58
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/constant_iterator_basic.py
58
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/constant_iterator_basic.py
59
59
  :language: python
60
60
  :start-after: # example-begin
61
61
 
@@ -78,7 +78,7 @@ def CountingIterator(offset):
78
78
  The code snippet below demonstrates the usage of a ``CountingIterator``
79
79
  representing the sequence ``[10, 11, 12]``:
80
80
 
81
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/counting_iterator_basic.py
81
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/counting_iterator_basic.py
82
82
  :language: python
83
83
  :start-after: # example-begin
84
84
 
@@ -100,13 +100,13 @@ def ReverseIterator(sequence):
100
100
  Examples:
101
101
  The code snippet below demonstrates the usage of a ``ReverseIterator`` as an input iterator:
102
102
 
103
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/reverse_input_iterator.py
103
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/reverse_input_iterator.py
104
104
  :language: python
105
105
  :start-after: # example-begin
106
106
 
107
107
  The code snippet below demonstrates the usage of a ``ReverseIterator`` as an output iterator:
108
108
 
109
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/reverse_output_iterator.py
109
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/reverse_output_iterator.py
110
110
  :language: python
111
111
  :start-after: # example-begin
112
112
 
@@ -129,7 +129,7 @@ def TransformIterator(it, op):
129
129
  The code snippet below demonstrates the usage of a ``TransformIterator`` composed with a ``CountingIterator``
130
130
  to transform the input before performing a reduction.
131
131
 
132
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/transform_iterator_basic.py
132
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/transform_iterator_basic.py
133
133
  :language: python
134
134
  :start-after: # example-begin
135
135
  Args:
@@ -151,7 +151,7 @@ def TransformOutputIterator(it, op):
151
151
  The code snippet below demonstrates the usage of a ``TransformOutputIterator`` to transform the output
152
152
  of a reduction before writing to an output array.
153
153
 
154
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/transform_output_iterator.py
154
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/transform_output_iterator.py
155
155
  :language: python
156
156
  :start-after: # example-begin
157
157
 
@@ -178,7 +178,7 @@ def ZipIterator(*iterators):
178
178
  The code snippet below demonstrates the usage of a ``ZipIterator``
179
179
  combining two device arrays:
180
180
 
181
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/iterator/zip_iterator_elementwise.py
181
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/zip_iterator_elementwise.py
182
182
  :language: python
183
183
  :start-after: # example-begin
184
184
 
@@ -207,7 +207,7 @@ def gpu_struct(this: type) -> Type[GpuStruct]:
207
207
  to a dataclass). The type of each field must be a subclass of
208
208
  `np.number`, like `np.int32` or `np.float64`.
209
209
 
210
- Arrays of GPUStruct objects can be used as inputs to cuda.cccl.parallel
210
+ Arrays of GPUStruct objects can be used as inputs to cuda.compute
211
211
  algorithms.
212
212
 
213
213
  Example:
@@ -216,7 +216,7 @@ def gpu_struct(this: type) -> Type[GpuStruct]:
216
216
  a reduction on an input array of floating point values to compute its
217
217
  the smallest and the largest absolute values:
218
218
 
219
- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/reduction/minmax_reduction.py
219
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/reduction/minmax_reduction.py
220
220
  :language: python
221
221
  :start-after: # example-begin
222
222
 
cuda/coop/__init__.py ADDED
@@ -0,0 +1,8 @@
1
+ # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
2
+ #
3
+ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4
+
5
+ from . import block, warp
6
+ from ._types import StatefulFunction
7
+
8
+ __all__ = ["block", "warp", "StatefulFunction"]
@@ -5,8 +5,9 @@
5
5
  import functools
6
6
 
7
7
  from cuda.bindings import nvrtc
8
- from cuda.cccl.cooperative.experimental._caching import disk_cache
9
- from cuda.cccl.cooperative.experimental._common import check_in, version
8
+
9
+ from ._caching import disk_cache
10
+ from ._common import check_in, version
10
11
 
11
12
 
12
13
  def CHECK_NVRTC(err, prog):
@@ -3,8 +3,8 @@
3
3
  # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4
4
 
5
5
  """
6
- cuda.cccl.cooperative.experimental._scan_op
7
- ======================================
6
+ cuda.coop._scan_op
7
+ ==================
8
8
 
9
9
  This module implements the ``ScanOp`` class and related functions.
10
10
  """
@@ -14,7 +14,7 @@ from enum import Enum
14
14
 
15
15
  import numpy as np
16
16
 
17
- from cuda.cccl.cooperative.experimental._typing import (
17
+ from ._typing import (
18
18
  ScanOpType,
19
19
  )
20
20
 
@@ -17,8 +17,8 @@ from numba.core.typing import signature
17
17
  from numba.cuda import LTOIR
18
18
  from numba.cuda.cudadrv import driver as cuda_driver
19
19
 
20
- import cuda.cccl.cooperative.experimental._nvrtc as nvrtc
21
- from cuda.cccl.cooperative.experimental._common import find_unsigned
20
+ from . import _nvrtc as nvrtc
21
+ from ._common import find_unsigned
22
22
 
23
23
  NUMBA_TYPES_TO_CPP = {
24
24
  types.boolean: "bool",
@@ -9,7 +9,7 @@ if TYPE_CHECKING:
9
9
  import numba
10
10
  import numpy as np
11
11
 
12
- from cuda.cccl.cooperative.experimental._common import dim3
12
+ from ._common import dim3
13
13
 
14
14
  # Type alias for dimension parameters that can be passed to CUDA functions.
15
15
  DimType = Union["dim3", int, Tuple[int, int], Tuple[int, int, int]]
@@ -2,18 +2,18 @@
2
2
  #
3
3
  # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4
4
 
5
- from cuda.cccl.cooperative.experimental.block._block_exchange import (
5
+ from ._block_exchange import (
6
6
  BlockExchangeType,
7
7
  exchange,
8
8
  )
9
- from cuda.cccl.cooperative.experimental.block._block_load_store import load, store
10
- from cuda.cccl.cooperative.experimental.block._block_merge_sort import merge_sort_keys
11
- from cuda.cccl.cooperative.experimental.block._block_radix_sort import (
9
+ from ._block_load_store import load, store
10
+ from ._block_merge_sort import merge_sort_keys
11
+ from ._block_radix_sort import (
12
12
  radix_sort_keys,
13
13
  radix_sort_keys_descending,
14
14
  )
15
- from cuda.cccl.cooperative.experimental.block._block_reduce import reduce, sum
16
- from cuda.cccl.cooperative.experimental.block._block_scan import (
15
+ from ._block_reduce import reduce, sum
16
+ from ._block_scan import (
17
17
  exclusive_scan,
18
18
  exclusive_sum,
19
19
  inclusive_scan,
@@ -3,7 +3,7 @@
3
3
  # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4
4
 
5
5
  """
6
- cuda.cccl.cooperative.block_exchange
6
+ cuda.coop.block_exchange
7
7
  ====================================
8
8
 
9
9
  This module provides a set of :ref:`collective <collective-primitives>` methods
@@ -105,13 +105,13 @@ def exchange(
105
105
  perform. Currently, only :py:attr:`StripedToBlocked` is supported.
106
106
 
107
107
  :param dtype: Supplies the data type of the input and output arrays.
108
- :type dtype: :py:class:`cuda.cccl.cooperative.experimental._typing.DtypeType`
108
+ :type dtype: :py:class:`cuda.coop._typing.DtypeType`
109
109
 
110
110
  :param threads_per_block: Supplies the number of threads in the block,
111
111
  either as an integer for a 1D block or a tuple of two or three integers
112
112
  for a 2D or 3D block, respectively.
113
113
  :type threads_per_block:
114
- :py:class:`cuda.cccl.cooperative.experimental._typing.DimType`
114
+ :py:class:`cuda.coop._typing.DimType`
115
115
 
116
116
  :param items_per_thread: Supplies the number of items partitioned onto each
117
117
  thread.
@@ -137,7 +137,7 @@ def exchange(
137
137
  :raises ValueError: If ``items_per_thread`` is greater than 1 and
138
138
  ``methods`` is not *None* (i.e. a user-defined type is being used).
139
139
 
140
- :returns: An :py:class:`cuda.cccl.cooperative.experimental._types.Invocable`
140
+ :returns: An :py:class:`cuda.coop._types.Invocable`
141
141
  object representing the specialized kernel that can be called from
142
142
  a Numba JIT'd CUDA kernel.
143
143
 
@@ -5,12 +5,12 @@
5
5
 
6
6
  import numba
7
7
 
8
- from cuda.cccl.cooperative.experimental._common import (
8
+ from .._common import (
9
9
  make_binary_tempfile,
10
10
  normalize_dim_param,
11
11
  normalize_dtype_param,
12
12
  )
13
- from cuda.cccl.cooperative.experimental._types import (
13
+ from .._types import (
14
14
  Algorithm,
15
15
  Dependency,
16
16
  DependentArray,
@@ -70,13 +70,13 @@ def load(dtype, threads_per_block, items_per_thread=1, algorithm="direct"):
70
70
  The code snippet below illustrates a striped load and store of 128 integer items by 32 threads, with
71
71
  each thread handling 4 integers.
72
72
 
73
- .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_load_store_api.py
73
+ .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_load_store_api.py
74
74
  :language: python
75
75
  :dedent:
76
76
  :start-after: example-begin imports
77
77
  :end-before: example-end imports
78
78
 
79
- .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_load_store_api.py
79
+ .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_load_store_api.py
80
80
  :language: python
81
81
  :dedent:
82
82
  :start-after: example-begin load_store
@@ -158,13 +158,13 @@ def store(dtype, threads_per_block, items_per_thread=1, algorithm="direct"):
158
158
  The code snippet below illustrates a striped load and store of 128 integer items by 32 threads, with
159
159
  each thread handling 4 integers.
160
160
 
161
- .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_load_store_api.py
161
+ .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_load_store_api.py
162
162
  :language: python
163
163
  :dedent:
164
164
  :start-after: example-begin imports
165
165
  :end-before: example-end imports
166
166
 
167
- .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_load_store_api.py
167
+ .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_load_store_api.py
168
168
  :language: python
169
169
  :dedent:
170
170
  :start-after: example-begin load_store
@@ -6,12 +6,12 @@ from typing import TYPE_CHECKING, Callable, Literal, Union
6
6
 
7
7
  import numba
8
8
 
9
- from cuda.cccl.cooperative.experimental._common import (
9
+ from .._common import (
10
10
  make_binary_tempfile,
11
11
  normalize_dim_param,
12
12
  normalize_dtype_param,
13
13
  )
14
- from cuda.cccl.cooperative.experimental._types import (
14
+ from .._types import (
15
15
  Algorithm,
16
16
  Constant,
17
17
  Dependency,
@@ -41,7 +41,7 @@ def merge_sort_keys(
41
41
  are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
42
42
  where each thread owns 4 consecutive keys. We start by importing necessary modules:
43
43
 
44
- .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_merge_sort_api.py
44
+ .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_merge_sort_api.py
45
45
  :language: python
46
46
  :dedent:
47
47
  :start-after: example-begin imports
@@ -49,7 +49,7 @@ def merge_sort_keys(
49
49
 
50
50
  Below is the code snippet that demonstrates the usage of the ``merge_sort_keys`` API:
51
51
 
52
- .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_merge_sort_api.py
52
+ .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_merge_sort_api.py
53
53
  :language: python
54
54
  :dedent:
55
55
  :start-after: example-begin merge-sort