cuda-cccl 0.1.3.2.0.dev438__cp311-cp311-manylinux_2_24_aarch64.whl → 0.3.0__cp311-cp311-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of cuda-cccl has been flagged as possibly problematic. See the package's registry page for more details.

Files changed (60) hide show
  1. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +23 -0
  2. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +22 -14
  3. cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
  4. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +18 -26
  5. cuda/cccl/headers/include/cub/device/device_copy.cuh +116 -27
  6. cuda/cccl/headers/include/cub/device/device_partition.cuh +5 -1
  7. cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
  8. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +321 -262
  9. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +8 -0
  10. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +57 -10
  11. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +37 -13
  12. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +203 -51
  13. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +55 -19
  14. cuda/cccl/headers/include/cub/util_device.cuh +51 -35
  15. cuda/cccl/headers/include/cuda/__algorithm/copy.h +3 -3
  16. cuda/cccl/headers/include/cuda/__device/all_devices.h +3 -6
  17. cuda/cccl/headers/include/cuda/__device/arch_traits.h +3 -3
  18. cuda/cccl/headers/include/cuda/__device/attributes.h +7 -7
  19. cuda/cccl/headers/include/cuda/__device/device_ref.h +3 -10
  20. cuda/cccl/headers/include/cuda/__driver/driver_api.h +225 -33
  21. cuda/cccl/headers/include/cuda/__event/event.h +7 -8
  22. cuda/cccl/headers/include/cuda/__event/event_ref.h +4 -5
  23. cuda/cccl/headers/include/cuda/__event/timed_event.h +3 -4
  24. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +3 -3
  25. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +3 -3
  26. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +3 -3
  27. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +14 -10
  28. cuda/cccl/headers/include/cuda/__runtime/types.h +1 -1
  29. cuda/cccl/headers/include/cuda/__stream/stream.h +2 -3
  30. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +17 -12
  31. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +2 -2
  32. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +2 -2
  33. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +2 -0
  34. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +10 -5
  35. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +44 -17
  36. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +1 -1
  37. cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +12 -12
  38. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +1 -8
  39. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +15 -12
  40. cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +3 -0
  41. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +2 -1
  42. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +3 -2
  43. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +8 -22
  44. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +15 -48
  45. cuda/cccl/parallel/experimental/__init__.py +4 -0
  46. cuda/cccl/parallel/experimental/_bindings.pyi +28 -0
  47. cuda/cccl/parallel/experimental/_bindings_impl.pyx +140 -0
  48. cuda/cccl/parallel/experimental/algorithms/__init__.py +4 -0
  49. cuda/cccl/parallel/experimental/algorithms/_reduce.py +0 -2
  50. cuda/cccl/parallel/experimental/algorithms/_scan.py +0 -2
  51. cuda/cccl/parallel/experimental/algorithms/_three_way_partition.py +261 -0
  52. cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
  53. cuda/cccl/parallel/experimental/cu12/cccl/libcccl.c.parallel.so +0 -0
  54. cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
  55. cuda/cccl/parallel/experimental/cu13/cccl/libcccl.c.parallel.so +0 -0
  56. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.0.dist-info}/METADATA +1 -1
  57. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.0.dist-info}/RECORD +59 -57
  58. cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +0 -72
  59. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.0.dist-info}/WHEEL +0 -0
  60. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,261 @@
1
+ # Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
2
+ #
3
+ #
4
+ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
5
+
6
+ from typing import Callable
7
+
8
+ import numba
9
+
10
+ from .. import _bindings
11
+ from .. import _cccl_interop as cccl
12
+ from .._caching import CachableFunction, cache_with_key
13
+ from .._cccl_interop import call_build, set_cccl_iterator_state
14
+ from .._utils import protocols
15
+ from .._utils.temp_storage_buffer import TempStorageBuffer
16
+ from ..iterators._iterators import IteratorBase
17
+ from ..typing import DeviceArrayLike
18
+
19
+
20
def make_cache_key(
    d_in: DeviceArrayLike | IteratorBase,
    d_first_part_out: DeviceArrayLike | IteratorBase,
    d_second_part_out: DeviceArrayLike | IteratorBase,
    d_unselected_out: DeviceArrayLike | IteratorBase,
    d_num_selected_out: DeviceArrayLike | IteratorBase,
    select_first_part_op: Callable,
    select_second_part_op: Callable,
):
    """
    Build a hashable cache key for ``make_three_way_partition``.

    Each array/iterator argument contributes its ``kind`` (for iterators)
    or its dtype (for device arrays), so distinct storage types never
    share a cached build result. The two selection callables are wrapped
    in ``CachableFunction`` to make them hashable and comparable by code
    identity.
    """

    def _key(arg):
        # Iterators are distinguished by their kind; device arrays by dtype.
        return arg.kind if isinstance(arg, IteratorBase) else protocols.get_dtype(arg)

    return (
        _key(d_in),
        _key(d_first_part_out),
        _key(d_second_part_out),
        _key(d_unselected_out),
        _key(d_num_selected_out),
        CachableFunction(select_first_part_op),
        CachableFunction(select_second_part_op),
    )
63
+
64
+
65
class _ThreeWayPartition:
    # Holds the compiled CCCL build result together with the iterator
    # descriptors and operator wrappers needed to invoke it repeatedly.
    __slots__ = [
        "build_result",
        "d_in_cccl",
        "d_first_part_out_cccl",
        "d_second_part_out_cccl",
        "d_unselected_out_cccl",
        "d_num_selected_out_cccl",
        "select_first_part_op_wrapper",
        "select_second_part_op_wrapper",
    ]

    def __init__(
        self,
        d_in: DeviceArrayLike | IteratorBase,
        d_first_part_out: DeviceArrayLike | IteratorBase,
        d_second_part_out: DeviceArrayLike | IteratorBase,
        d_unselected_out: DeviceArrayLike | IteratorBase,
        d_num_selected_out: DeviceArrayLike | IteratorBase,
        select_first_part_op: Callable,
        select_second_part_op: Callable,
    ):
        # Translate every array/iterator argument into its CCCL descriptor.
        self.d_in_cccl = cccl.to_cccl_input_iter(d_in)
        self.d_first_part_out_cccl = cccl.to_cccl_output_iter(d_first_part_out)
        self.d_second_part_out_cccl = cccl.to_cccl_output_iter(d_second_part_out)
        self.d_unselected_out_cccl = cccl.to_cccl_output_iter(d_unselected_out)
        self.d_num_selected_out_cccl = cccl.to_cccl_output_iter(d_num_selected_out)

        # Both predicates map one input value to a uint8 truth value.
        sig = numba.types.uint8(cccl.get_value_type(d_in))

        # There are no well-known operations that can be used with three_way_partition
        self.select_first_part_op_wrapper = cccl.to_cccl_op(select_first_part_op, sig)
        self.select_second_part_op_wrapper = cccl.to_cccl_op(select_second_part_op, sig)

        self.build_result = call_build(
            _bindings.DeviceThreeWayPartitionBuildResult,
            self.d_in_cccl,
            self.d_first_part_out_cccl,
            self.d_second_part_out_cccl,
            self.d_unselected_out_cccl,
            self.d_num_selected_out_cccl,
            self.select_first_part_op_wrapper,
            self.select_second_part_op_wrapper,
        )

    def __call__(
        self,
        temp_storage,
        d_in,
        d_first_part_out,
        d_second_part_out,
        d_unselected_out,
        d_num_selected_out,
        num_items: int,
        stream=None,
    ):
        # Rebind each cached descriptor to the arrays supplied for this call.
        bindings = (
            (self.d_in_cccl, d_in),
            (self.d_first_part_out_cccl, d_first_part_out),
            (self.d_second_part_out_cccl, d_second_part_out),
            (self.d_unselected_out_cccl, d_unselected_out),
            (self.d_num_selected_out_cccl, d_num_selected_out),
        )
        for descriptor, array in bindings:
            set_cccl_iterator_state(descriptor, array)
        stream_handle = protocols.validate_and_get_stream(stream)

        # A None temp_storage means "size query only": pass a null pointer
        # and zero bytes so the build result reports the required size.
        if temp_storage is None:
            d_temp_storage = 0
            temp_storage_bytes = 0
        else:
            d_temp_storage = protocols.get_data_pointer(temp_storage)
            temp_storage_bytes = temp_storage.nbytes

        temp_storage_bytes = self.build_result.compute(
            d_temp_storage,
            temp_storage_bytes,
            self.d_in_cccl,
            self.d_first_part_out_cccl,
            self.d_second_part_out_cccl,
            self.d_unselected_out_cccl,
            self.d_num_selected_out_cccl,
            self.select_first_part_op_wrapper,
            self.select_second_part_op_wrapper,
            num_items,
            stream_handle,
        )
        return temp_storage_bytes
150
+
151
+
152
@cache_with_key(make_cache_key)
def make_three_way_partition(
    d_in: DeviceArrayLike | IteratorBase,
    d_first_part_out: DeviceArrayLike | IteratorBase,
    d_second_part_out: DeviceArrayLike | IteratorBase,
    d_unselected_out: DeviceArrayLike | IteratorBase,
    d_num_selected_out: DeviceArrayLike | IteratorBase,
    select_first_part_op: Callable,
    select_second_part_op: Callable,
):
    """
    Computes a device-wide three-way partition using the specified unary ``select_first_part_op`` and ``select_second_part_op`` operators.

    Results are cached (keyed by ``make_cache_key``), so repeated calls with
    equivalent argument types reuse the compiled build.

    Example:
        Below, ``make_three_way_partition`` is used to create a three-way partition object that can be reused.

        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/three_way_partition/three_way_partition_object.py
            :language: python
            :start-after: # example-begin

    Args:
        d_in: Device array or iterator containing the input sequence of data items
        d_first_part_out: Device array or iterator to store the first part of the output
        d_second_part_out: Device array or iterator to store the second part of the output
        d_unselected_out: Device array or iterator to store the unselected items
        d_num_selected_out: Device array to store the number of items selected. The total number of items selected by ``select_first_part_op`` and ``select_second_part_op`` is stored in ``d_num_selected_out[0]`` and ``d_num_selected_out[1]``, respectively.
        select_first_part_op: Callable representing the unary operator to select the first part
        select_second_part_op: Callable representing the unary operator to select the second part

    Returns:
        A callable object that can be used to perform the three-way partition
    """
    partitioner = _ThreeWayPartition(
        d_in,
        d_first_part_out,
        d_second_part_out,
        d_unselected_out,
        d_num_selected_out,
        select_first_part_op,
        select_second_part_op,
    )
    return partitioner
193
+
194
+
195
def three_way_partition(
    d_in: DeviceArrayLike | IteratorBase,
    d_first_part_out: DeviceArrayLike | IteratorBase,
    d_second_part_out: DeviceArrayLike | IteratorBase,
    d_unselected_out: DeviceArrayLike | IteratorBase,
    d_num_selected_out: DeviceArrayLike | IteratorBase,
    select_first_part_op: Callable,
    select_second_part_op: Callable,
    num_items: int,
    stream=None,
):
    """
    Performs device-wide three-way partition. Given an input sequence of data items, it partitions the items into three parts:
    - The first part is selected by the ``select_first_part_op`` operator.
    - The second part is selected by the ``select_second_part_op`` operator.
    - The unselected items are not selected by either operator.

    This function automatically handles temporary storage allocation and execution.

    Example:
        Below, ``three_way_partition`` is used to partition a sequence of integers into three parts.

        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/three_way_partition/three_way_partition_basic.py
            :language: python
            :start-after: # example-begin

    Args:
        d_in: Device array or iterator containing the input sequence of data items
        d_first_part_out: Device array or iterator to store the first part of the output
        d_second_part_out: Device array or iterator to store the second part of the output
        d_unselected_out: Device array or iterator to store the unselected items
        d_num_selected_out: Device array to store the number of items selected. The total number of items selected by ``select_first_part_op`` and ``select_second_part_op`` is stored in ``d_num_selected_out[0]`` and ``d_num_selected_out[1]``, respectively.
        select_first_part_op: Callable representing the unary operator to select the first part
        select_second_part_op: Callable representing the unary operator to select the second part
        num_items: Number of items to partition
        stream: CUDA stream for the operation (optional)
    """
    # The same five array/iterator arguments are threaded through every call.
    data_args = (
        d_in,
        d_first_part_out,
        d_second_part_out,
        d_unselected_out,
        d_num_selected_out,
    )
    partitioner = make_three_way_partition(
        *data_args,
        select_first_part_op,
        select_second_part_op,
    )
    # First invocation (null temp storage) is a size query only.
    tmp_storage_bytes = partitioner(None, *data_args, num_items, stream)
    tmp_storage = TempStorageBuffer(tmp_storage_bytes, stream)
    # Second invocation performs the actual partition.
    partitioner(tmp_storage, *data_args, num_items, stream)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cuda-cccl
3
- Version: 0.1.3.2.0.dev438
3
+ Version: 0.3.0
4
4
  Summary: CUDA Core Library for Python
5
5
  Author: NVIDIA Corporation
6
6
  Classifier: Programming Language :: Python :: 3 :: Only