cuda-cccl 0.1.3.2.0.dev438__cp310-cp310-manylinux_2_24_aarch64.whl → 0.3.0__cp310-cp310-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +23 -0
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +22 -14
- cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +18 -26
- cuda/cccl/headers/include/cub/device/device_copy.cuh +116 -27
- cuda/cccl/headers/include/cub/device/device_partition.cuh +5 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +321 -262
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +8 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +57 -10
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +37 -13
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +203 -51
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +55 -19
- cuda/cccl/headers/include/cub/util_device.cuh +51 -35
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +3 -3
- cuda/cccl/headers/include/cuda/__device/all_devices.h +3 -6
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +3 -3
- cuda/cccl/headers/include/cuda/__device/attributes.h +7 -7
- cuda/cccl/headers/include/cuda/__device/device_ref.h +3 -10
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +225 -33
- cuda/cccl/headers/include/cuda/__event/event.h +7 -8
- cuda/cccl/headers/include/cuda/__event/event_ref.h +4 -5
- cuda/cccl/headers/include/cuda/__event/timed_event.h +3 -4
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +3 -3
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +3 -3
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +3 -3
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +14 -10
- cuda/cccl/headers/include/cuda/__runtime/types.h +1 -1
- cuda/cccl/headers/include/cuda/__stream/stream.h +2 -3
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +17 -12
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +2 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +10 -5
- cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +44 -17
- cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +1 -1
- cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +12 -12
- cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +1 -8
- cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +15 -12
- cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +3 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +2 -1
- cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +3 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +8 -22
- cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +15 -48
- cuda/cccl/parallel/experimental/__init__.py +4 -0
- cuda/cccl/parallel/experimental/_bindings.pyi +28 -0
- cuda/cccl/parallel/experimental/_bindings_impl.pyx +140 -0
- cuda/cccl/parallel/experimental/algorithms/__init__.py +4 -0
- cuda/cccl/parallel/experimental/algorithms/_reduce.py +0 -2
- cuda/cccl/parallel/experimental/algorithms/_scan.py +0 -2
- cuda/cccl/parallel/experimental/algorithms/_three_way_partition.py +261 -0
- cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/cu13/cccl/libcccl.c.parallel.so +0 -0
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.0.dist-info}/METADATA +1 -1
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.0.dist-info}/RECORD +59 -57
- cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +0 -72
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.0.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
|
|
2
|
+
#
|
|
3
|
+
#
|
|
4
|
+
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
5
|
+
|
|
6
|
+
from typing import Callable
|
|
7
|
+
|
|
8
|
+
import numba
|
|
9
|
+
|
|
10
|
+
from .. import _bindings
|
|
11
|
+
from .. import _cccl_interop as cccl
|
|
12
|
+
from .._caching import CachableFunction, cache_with_key
|
|
13
|
+
from .._cccl_interop import call_build, set_cccl_iterator_state
|
|
14
|
+
from .._utils import protocols
|
|
15
|
+
from .._utils.temp_storage_buffer import TempStorageBuffer
|
|
16
|
+
from ..iterators._iterators import IteratorBase
|
|
17
|
+
from ..typing import DeviceArrayLike
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def make_cache_key(
    d_in: DeviceArrayLike | IteratorBase,
    d_first_part_out: DeviceArrayLike | IteratorBase,
    d_second_part_out: DeviceArrayLike | IteratorBase,
    d_unselected_out: DeviceArrayLike | IteratorBase,
    d_num_selected_out: DeviceArrayLike | IteratorBase,
    select_first_part_op: Callable,
    select_second_part_op: Callable,
):
    """
    Build the tuple that identifies a three-way-partition configuration.

    Each array/iterator argument contributes one entry: its ``kind`` when it
    is an ``IteratorBase``, otherwise the result of ``protocols.get_dtype``.
    Each selection operator contributes a ``CachableFunction`` wrapping it.

    Args:
        d_in: Input device array or iterator
        d_first_part_out: Output for the first part
        d_second_part_out: Output for the second part
        d_unselected_out: Output for the unselected items
        d_num_selected_out: Output for the selected-item counts
        select_first_part_op: Unary operator selecting the first part
        select_second_part_op: Unary operator selecting the second part

    Returns:
        A 7-tuple: the five operand keys in argument order, followed by the
        two wrapped operators.
    """

    def _operand_key(operand):
        # Iterators are keyed by ``kind``; everything else by its dtype.
        if isinstance(operand, IteratorBase):
            return operand.kind
        return protocols.get_dtype(operand)

    operands = (
        d_in,
        d_first_part_out,
        d_second_part_out,
        d_unselected_out,
        d_num_selected_out,
    )
    operand_keys = tuple(_operand_key(operand) for operand in operands)

    first_op_key = CachableFunction(select_first_part_op)
    second_op_key = CachableFunction(select_second_part_op)
    return (*operand_keys, first_op_key, second_op_key)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class _ThreeWayPartition:
    """
    Reusable three-way-partition object.

    Construction converts the example arguments into CCCL iterators and
    operator wrappers and hands them to ``call_build``. Calling the instance
    rebinds the iterators to the supplied arguments and invokes ``compute``
    on the stored build result.
    """

    __slots__ = [
        "build_result",
        "d_in_cccl",
        "d_first_part_out_cccl",
        "d_second_part_out_cccl",
        "d_unselected_out_cccl",
        "d_num_selected_out_cccl",
        "select_first_part_op_wrapper",
        "select_second_part_op_wrapper",
    ]

    def __init__(
        self,
        d_in: DeviceArrayLike | IteratorBase,
        d_first_part_out: DeviceArrayLike | IteratorBase,
        d_second_part_out: DeviceArrayLike | IteratorBase,
        d_unselected_out: DeviceArrayLike | IteratorBase,
        d_num_selected_out: DeviceArrayLike | IteratorBase,
        select_first_part_op: Callable,
        select_second_part_op: Callable,
    ):
        """
        Wrap the arguments for CCCL and build the partition.

        Args:
            d_in: Input device array or iterator
            d_first_part_out: Output for the first part
            d_second_part_out: Output for the second part
            d_unselected_out: Output for the unselected items
            d_num_selected_out: Output for the selected-item counts
            select_first_part_op: Unary operator selecting the first part
            select_second_part_op: Unary operator selecting the second part
        """
        # One input iterator, four output iterators.
        self.d_in_cccl = cccl.to_cccl_input_iter(d_in)
        self.d_first_part_out_cccl = cccl.to_cccl_output_iter(d_first_part_out)
        self.d_second_part_out_cccl = cccl.to_cccl_output_iter(d_second_part_out)
        self.d_unselected_out_cccl = cccl.to_cccl_output_iter(d_unselected_out)
        self.d_num_selected_out_cccl = cccl.to_cccl_output_iter(d_num_selected_out)

        # Both predicates share one signature: uint8(<input value type>).
        predicate_sig = numba.types.uint8(cccl.get_value_type(d_in))

        # There are no well-known operations that can be used with three_way_partition
        self.select_first_part_op_wrapper = cccl.to_cccl_op(
            select_first_part_op, predicate_sig
        )
        self.select_second_part_op_wrapper = cccl.to_cccl_op(
            select_second_part_op, predicate_sig
        )

        self.build_result = call_build(
            _bindings.DeviceThreeWayPartitionBuildResult,
            self.d_in_cccl,
            self.d_first_part_out_cccl,
            self.d_second_part_out_cccl,
            self.d_unselected_out_cccl,
            self.d_num_selected_out_cccl,
            self.select_first_part_op_wrapper,
            self.select_second_part_op_wrapper,
        )

    def __call__(
        self,
        temp_storage,
        d_in,
        d_first_part_out,
        d_second_part_out,
        d_unselected_out,
        d_num_selected_out,
        num_items: int,
        stream=None,
    ):
        """
        Rebind the iterators to the given arguments and invoke ``compute``.

        Args:
            temp_storage: Temporary storage buffer, or ``None``. With ``None``
                a zero pointer and zero size are passed to ``compute``.
            d_in: Input device array or iterator
            d_first_part_out: Output for the first part
            d_second_part_out: Output for the second part
            d_unselected_out: Output for the unselected items
            d_num_selected_out: Output for the selected-item counts
            num_items: Number of items to partition
            stream: CUDA stream for the operation (optional)

        Returns:
            The value returned by ``build_result.compute``, treated as the
            temporary-storage size in bytes.
        """
        rebindings = (
            (self.d_in_cccl, d_in),
            (self.d_first_part_out_cccl, d_first_part_out),
            (self.d_second_part_out_cccl, d_second_part_out),
            (self.d_unselected_out_cccl, d_unselected_out),
            (self.d_num_selected_out_cccl, d_num_selected_out),
        )
        for cccl_iter, user_arg in rebindings:
            set_cccl_iterator_state(cccl_iter, user_arg)

        stream_handle = protocols.validate_and_get_stream(stream)

        # Defaults cover the temp_storage=None case.
        scratch_nbytes = 0
        scratch_ptr = 0
        if temp_storage is not None:
            scratch_nbytes = temp_storage.nbytes
            scratch_ptr = protocols.get_data_pointer(temp_storage)

        return self.build_result.compute(
            scratch_ptr,
            scratch_nbytes,
            self.d_in_cccl,
            self.d_first_part_out_cccl,
            self.d_second_part_out_cccl,
            self.d_unselected_out_cccl,
            self.d_num_selected_out_cccl,
            self.select_first_part_op_wrapper,
            self.select_second_part_op_wrapper,
            num_items,
            stream_handle,
        )
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
@cache_with_key(make_cache_key)
def make_three_way_partition(
    d_in: DeviceArrayLike | IteratorBase,
    d_first_part_out: DeviceArrayLike | IteratorBase,
    d_second_part_out: DeviceArrayLike | IteratorBase,
    d_unselected_out: DeviceArrayLike | IteratorBase,
    d_num_selected_out: DeviceArrayLike | IteratorBase,
    select_first_part_op: Callable,
    select_second_part_op: Callable,
):
    """
    Creates a reusable device-wide three-way partition object driven by the unary ``select_first_part_op`` and ``select_second_part_op`` operators.

    Example:
        Below, ``make_three_way_partition`` is used to create a three-way partition object that can be reused.

        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/three_way_partition/three_way_partition_object.py
            :language: python
            :start-after: # example-begin

    Args:
        d_in: Device array or iterator containing the input sequence of data items
        d_first_part_out: Device array or iterator to store the first part of the output
        d_second_part_out: Device array or iterator to store the second part of the output
        d_unselected_out: Device array or iterator to store the unselected items
        d_num_selected_out: Device array to store the number of items selected. The total number of items selected by ``select_first_part_op`` and ``select_second_part_op`` is stored in ``d_num_selected_out[0]`` and ``d_num_selected_out[1]``, respectively.
        select_first_part_op: Callable representing the unary operator to select the first part
        select_second_part_op: Callable representing the unary operator to select the second part

    Returns:
        A callable object that can be used to perform the three-way partition
    """
    partitioner = _ThreeWayPartition(
        d_in,
        d_first_part_out,
        d_second_part_out,
        d_unselected_out,
        d_num_selected_out,
        select_first_part_op,
        select_second_part_op,
    )
    return partitioner
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def three_way_partition(
    d_in: DeviceArrayLike | IteratorBase,
    d_first_part_out: DeviceArrayLike | IteratorBase,
    d_second_part_out: DeviceArrayLike | IteratorBase,
    d_unselected_out: DeviceArrayLike | IteratorBase,
    d_num_selected_out: DeviceArrayLike | IteratorBase,
    select_first_part_op: Callable,
    select_second_part_op: Callable,
    num_items: int,
    stream=None,
):
    """
    Performs device-wide three-way partition. The input sequence is split into three parts:

    - Items selected by the ``select_first_part_op`` operator form the first part.
    - Items selected by the ``select_second_part_op`` operator form the second part.
    - Items selected by neither operator are the unselected items.

    Temporary storage is handled here: the partitioner is first called with
    ``None`` as temporary storage to obtain the required size, a
    ``TempStorageBuffer`` of that size is created, and the partitioner is
    called again with that buffer.

    Example:
        Below, ``three_way_partition`` is used to partition a sequence of integers into three parts.

        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/three_way_partition/three_way_partition_basic.py
            :language: python
            :start-after: # example-begin

    Args:
        d_in: Device array or iterator containing the input sequence of data items
        d_first_part_out: Device array or iterator to store the first part of the output
        d_second_part_out: Device array or iterator to store the second part of the output
        d_unselected_out: Device array or iterator to store the unselected items
        d_num_selected_out: Device array to store the number of items selected. The total number of items selected by ``select_first_part_op`` and ``select_second_part_op`` is stored in ``d_num_selected_out[0]`` and ``d_num_selected_out[1]``, respectively.
        select_first_part_op: Callable representing the unary operator to select the first part
        select_second_part_op: Callable representing the unary operator to select the second part
        num_items: Number of items to partition
        stream: CUDA stream for the operation (optional)
    """
    partitioner = make_three_way_partition(
        d_in,
        d_first_part_out,
        d_second_part_out,
        d_unselected_out,
        d_num_selected_out,
        select_first_part_op,
        select_second_part_op,
    )

    # Everything after the temp-storage slot is identical for both calls.
    run_args = (
        d_in,
        d_first_part_out,
        d_second_part_out,
        d_unselected_out,
        d_num_selected_out,
        num_items,
        stream,
    )

    # Pass 1: no temp storage, just ask how many bytes are needed.
    required_bytes = partitioner(None, *run_args)

    # Pass 2: run with a buffer of the requested size.
    scratch = TempStorageBuffer(required_bytes, stream)
    partitioner(scratch, *run_args)
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|