cuda-cccl 0.3.0__cp313-cp313-manylinux_2_24_aarch64.whl → 0.3.1__cp313-cp313-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/cooperative/__init__.py +7 -1
- cuda/cccl/cooperative/experimental/__init__.py +21 -5
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +3 -2
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +158 -247
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +4 -4
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +8 -26
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -6
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
- cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
- cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +48 -46
- cuda/cccl/headers/include/cuda/__device/attributes.h +171 -121
- cuda/cccl/headers/include/cuda/__device/device_ref.h +30 -42
- cuda/cccl/headers/include/cuda/__device/physical_device.h +120 -91
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
- cuda/cccl/headers/include/cuda/__event/event.h +1 -0
- cuda/cccl/headers/include/cuda/__event/timed_event.h +1 -0
- cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
- cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +158 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +8 -120
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +593 -0
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +4 -3
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +1 -0
- cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
- cuda/cccl/headers/include/cuda/algorithm +1 -1
- cuda/cccl/headers/include/cuda/devices +10 -0
- cuda/cccl/headers/include/cuda/iterator +1 -0
- cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
- cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
- cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
- cuda/cccl/headers/include/cuda/std/string_view +12 -5
- cuda/cccl/headers/include/cuda/std/version +1 -4
- cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
- cuda/cccl/parallel/experimental/__init__.py +21 -74
- cuda/compute/__init__.py +77 -0
- cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +1 -1
- cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
- cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +4 -4
- cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +4 -4
- cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +8 -8
- cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
- cuda/coop/__init__.py +8 -0
- cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
- cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
- cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
- cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
- cuda/coop/warp/__init__.py +9 -0
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/METADATA +1 -1
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/RECORD +141 -138
- cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
- cuda/cccl/parallel/experimental/.gitignore +0 -4
- /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/numba_utils.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/typing.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -166,7 +166,7 @@ def make_merge_sort(
|
|
|
166
166
|
Example:
|
|
167
167
|
Below, ``make_merge_sort`` is used to create a merge sort object that can be reused.
|
|
168
168
|
|
|
169
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
169
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/sort/merge_sort_object.py
|
|
170
170
|
:language: python
|
|
171
171
|
:start-after: # example-begin
|
|
172
172
|
|
|
@@ -201,7 +201,7 @@ def merge_sort(
|
|
|
201
201
|
Example:
|
|
202
202
|
Below, ``merge_sort`` is used to sort a sequence of keys inplace. It also rearranges the items according to the keys' order.
|
|
203
203
|
|
|
204
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
204
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/sort/merge_sort_basic.py
|
|
205
205
|
:language: python
|
|
206
206
|
:start-after: # example-begin
|
|
207
207
|
|
|
@@ -222,7 +222,7 @@ def make_radix_sort(
|
|
|
222
222
|
Example:
|
|
223
223
|
Below, ``make_radix_sort`` is used to create a radix sort object that can be reused.
|
|
224
224
|
|
|
225
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
225
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/sort/radix_sort_object.py
|
|
226
226
|
:language: python
|
|
227
227
|
:start-after: # example-begin
|
|
228
228
|
|
|
@@ -259,14 +259,14 @@ def radix_sort(
|
|
|
259
259
|
Example:
|
|
260
260
|
Below, ``radix_sort`` is used to sort a sequence of keys. It also rearranges the values according to the keys' order.
|
|
261
261
|
|
|
262
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
262
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/sort/radix_sort_basic.py
|
|
263
263
|
:language: python
|
|
264
264
|
:start-after: # example-begin
|
|
265
265
|
|
|
266
266
|
|
|
267
267
|
In the following example, ``radix_sort`` is used to sort a sequence of keys with a ``DoubleBuffer`` for reduced temporary storage.
|
|
268
268
|
|
|
269
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
269
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/sort/radix_sort_buffer.py
|
|
270
270
|
:language: python
|
|
271
271
|
:start-after: # example-begin
|
|
272
272
|
|
|
@@ -130,7 +130,7 @@ def make_reduce_into(
|
|
|
130
130
|
Example:
|
|
131
131
|
Below, ``make_reduce_into`` is used to create a reduction object that can be reused.
|
|
132
132
|
|
|
133
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
133
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/reduction/reduce_object.py
|
|
134
134
|
:language: python
|
|
135
135
|
:start-after: # example-begin
|
|
136
136
|
|
|
@@ -163,7 +163,7 @@ def reduce_into(
|
|
|
163
163
|
Example:
|
|
164
164
|
Below, ``reduce_into`` is used to compute the sum of a sequence of integers.
|
|
165
165
|
|
|
166
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
166
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/reduction/sum_reduction.py
|
|
167
167
|
:language: python
|
|
168
168
|
:start-after: # example-begin
|
|
169
169
|
|
|
@@ -141,7 +141,7 @@ def make_exclusive_scan(
|
|
|
141
141
|
Example:
|
|
142
142
|
Below, ``make_exclusive_scan`` is used to create an exclusive scan object that can be reused.
|
|
143
143
|
|
|
144
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
144
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/scan/exclusive_scan_object.py
|
|
145
145
|
:language: python
|
|
146
146
|
:start-after: # example-begin
|
|
147
147
|
|
|
@@ -174,7 +174,7 @@ def exclusive_scan(
|
|
|
174
174
|
Example:
|
|
175
175
|
Below, ``exclusive_scan`` is used to compute an exclusive scan with max operation.
|
|
176
176
|
|
|
177
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
177
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/scan/exclusive_scan_max.py
|
|
178
178
|
:language: python
|
|
179
179
|
:start-after: # example-begin
|
|
180
180
|
|
|
@@ -207,7 +207,7 @@ def make_inclusive_scan(
|
|
|
207
207
|
Example:
|
|
208
208
|
Below, ``make_inclusive_scan`` is used to create an inclusive scan object that can be reused.
|
|
209
209
|
|
|
210
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
210
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/scan/inclusive_scan_object.py
|
|
211
211
|
:language: python
|
|
212
212
|
:start-after: # example-begin
|
|
213
213
|
|
|
@@ -240,7 +240,7 @@ def inclusive_scan(
|
|
|
240
240
|
Example:
|
|
241
241
|
Below, ``inclusive_scan`` is used to compute an inclusive scan (prefix sum).
|
|
242
242
|
|
|
243
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
243
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/scan/inclusive_scan_custom.py
|
|
244
244
|
:language: python
|
|
245
245
|
:start-after: # example-begin
|
|
246
246
|
|
|
@@ -179,7 +179,7 @@ def make_segmented_reduce(
|
|
|
179
179
|
Example:
|
|
180
180
|
Below, ``make_segmented_reduce`` is used to create a segmented reduction object that can be reused.
|
|
181
181
|
|
|
182
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
182
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/segmented/segmented_reduce_object.py
|
|
183
183
|
:language: python
|
|
184
184
|
:start-after: # example-begin
|
|
185
185
|
|
|
@@ -216,7 +216,7 @@ def segmented_reduce(
|
|
|
216
216
|
Example:
|
|
217
217
|
Below, ``segmented_reduce`` is used to compute the minimum value of segments in a sequence of integers.
|
|
218
218
|
|
|
219
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
219
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/segmented/segmented_reduce_basic.py
|
|
220
220
|
:language: python
|
|
221
221
|
:start-after: # example-begin
|
|
222
222
|
|
|
@@ -165,7 +165,7 @@ def make_three_way_partition(
|
|
|
165
165
|
Example:
|
|
166
166
|
Below, ``make_three_way_partition`` is used to create a three-way partition object that can be reused.
|
|
167
167
|
|
|
168
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
168
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/three_way_partition/three_way_partition_object.py
|
|
169
169
|
:language: python
|
|
170
170
|
:start-after: # example-begin
|
|
171
171
|
|
|
@@ -214,7 +214,7 @@ def three_way_partition(
|
|
|
214
214
|
Example:
|
|
215
215
|
Below, ``three_way_partition`` is used to partition a sequence of integers into three parts.
|
|
216
216
|
|
|
217
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
217
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/three_way_partition/three_way_partition_basic.py
|
|
218
218
|
:language: python
|
|
219
219
|
:start-after: # example-begin
|
|
220
220
|
|
|
@@ -196,7 +196,7 @@ def make_unary_transform(
|
|
|
196
196
|
storage allocation. For simpler usage, consider using :func:`unary_transform`.
|
|
197
197
|
|
|
198
198
|
Example:
|
|
199
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
199
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/transform/unary_transform_object.py
|
|
200
200
|
:language: python
|
|
201
201
|
:start-after: # example-begin
|
|
202
202
|
|
|
@@ -227,7 +227,7 @@ def make_binary_transform(
|
|
|
227
227
|
storage allocation. For simpler usage, consider using :func:`binary_transform`.
|
|
228
228
|
|
|
229
229
|
Example:
|
|
230
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
230
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/transform/binary_transform_object.py
|
|
231
231
|
:language: python
|
|
232
232
|
:start-after: # example-begin
|
|
233
233
|
|
|
@@ -259,7 +259,7 @@ def unary_transform(
|
|
|
259
259
|
Example:
|
|
260
260
|
Below, ``unary_transform`` is used to apply a transformation to each element of the input.
|
|
261
261
|
|
|
262
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
262
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/transform/unary_transform_basic.py
|
|
263
263
|
:language: python
|
|
264
264
|
:start-after: # example-begin
|
|
265
265
|
|
|
@@ -291,7 +291,7 @@ def binary_transform(
|
|
|
291
291
|
Example:
|
|
292
292
|
Below, ``binary_transform`` is used to apply a transformation to pairs of elements from two input sequences.
|
|
293
293
|
|
|
294
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
294
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/transform/binary_transform_basic.py
|
|
295
295
|
:language: python
|
|
296
296
|
:start-after: # example-begin
|
|
297
297
|
|
|
@@ -171,7 +171,7 @@ def make_unique_by_key(
|
|
|
171
171
|
Example:
|
|
172
172
|
Below, ``make_unique_by_key`` is used to create a unique by key object that can be reused.
|
|
173
173
|
|
|
174
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
174
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/unique/unique_by_key_object.py
|
|
175
175
|
:language: python
|
|
176
176
|
:start-after: # example-begin
|
|
177
177
|
|
|
@@ -211,7 +211,7 @@ def unique_by_key(
|
|
|
211
211
|
Example:
|
|
212
212
|
Below, ``unique_by_key`` is used to populate the arrays of output keys and items with the first key and its corresponding item from each sequence of equal keys. It also outputs the number of items selected.
|
|
213
213
|
|
|
214
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
214
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/unique/unique_by_key_basic.py
|
|
215
215
|
:language: python
|
|
216
216
|
:start-after: # example-begin
|
|
217
217
|
|
cuda/{cccl/parallel/experimental → compute}/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so
RENAMED
|
Binary file
|
|
Binary file
|
cuda/{cccl/parallel/experimental → compute}/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so
RENAMED
|
Binary file
|
|
Binary file
|
|
@@ -26,7 +26,7 @@ def CacheModifiedInputIterator(device_array, modifier):
|
|
|
26
26
|
Example:
|
|
27
27
|
The code snippet below demonstrates the usage of a ``CacheModifiedInputIterator``:
|
|
28
28
|
|
|
29
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
29
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/cache_modified_iterator_basic.py
|
|
30
30
|
:language: python
|
|
31
31
|
:start-after: # example-begin
|
|
32
32
|
|
|
@@ -55,7 +55,7 @@ def ConstantIterator(value):
|
|
|
55
55
|
The code snippet below demonstrates the usage of a ``ConstantIterator``
|
|
56
56
|
representing a sequence of constant values:
|
|
57
57
|
|
|
58
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
58
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/constant_iterator_basic.py
|
|
59
59
|
:language: python
|
|
60
60
|
:start-after: # example-begin
|
|
61
61
|
|
|
@@ -78,7 +78,7 @@ def CountingIterator(offset):
|
|
|
78
78
|
The code snippet below demonstrates the usage of a ``CountingIterator``
|
|
79
79
|
representing the sequence ``[10, 11, 12]``:
|
|
80
80
|
|
|
81
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
81
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/counting_iterator_basic.py
|
|
82
82
|
:language: python
|
|
83
83
|
:start-after: # example-begin
|
|
84
84
|
|
|
@@ -100,13 +100,13 @@ def ReverseIterator(sequence):
|
|
|
100
100
|
Examples:
|
|
101
101
|
The code snippet below demonstrates the usage of a ``ReverseIterator`` as an input iterator:
|
|
102
102
|
|
|
103
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
103
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/reverse_input_iterator.py
|
|
104
104
|
:language: python
|
|
105
105
|
:start-after: # example-begin
|
|
106
106
|
|
|
107
107
|
The code snippet below demonstrates the usage of a ``ReverseIterator`` as an output iterator:
|
|
108
108
|
|
|
109
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
109
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/reverse_output_iterator.py
|
|
110
110
|
:language: python
|
|
111
111
|
:start-after: # example-begin
|
|
112
112
|
|
|
@@ -129,7 +129,7 @@ def TransformIterator(it, op):
|
|
|
129
129
|
The code snippet below demonstrates the usage of a ``TransformIterator`` composed with a ``CountingIterator``
|
|
130
130
|
to transform the input before performing a reduction.
|
|
131
131
|
|
|
132
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
132
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/transform_iterator_basic.py
|
|
133
133
|
:language: python
|
|
134
134
|
:start-after: # example-begin
|
|
135
135
|
Args:
|
|
@@ -151,7 +151,7 @@ def TransformOutputIterator(it, op):
|
|
|
151
151
|
The code snippet below demonstrates the usage of a ``TransformOutputIterator`` to transform the output
|
|
152
152
|
of a reduction before writing to an output array.
|
|
153
153
|
|
|
154
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
154
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/transform_output_iterator.py
|
|
155
155
|
:language: python
|
|
156
156
|
:start-after: # example-begin
|
|
157
157
|
|
|
@@ -178,7 +178,7 @@ def ZipIterator(*iterators):
|
|
|
178
178
|
The code snippet below demonstrates the usage of a ``ZipIterator``
|
|
179
179
|
combining two device arrays:
|
|
180
180
|
|
|
181
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
181
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/zip_iterator_elementwise.py
|
|
182
182
|
:language: python
|
|
183
183
|
:start-after: # example-begin
|
|
184
184
|
|
|
@@ -207,7 +207,7 @@ def gpu_struct(this: type) -> Type[GpuStruct]:
|
|
|
207
207
|
to a dataclass). The type of each field must be a subclass of
|
|
208
208
|
`np.number`, like `np.int32` or `np.float64`.
|
|
209
209
|
|
|
210
|
-
Arrays of GPUStruct objects can be used as inputs to cuda.
|
|
210
|
+
Arrays of GPUStruct objects can be used as inputs to cuda.compute
|
|
211
211
|
algorithms.
|
|
212
212
|
|
|
213
213
|
Example:
|
|
@@ -216,7 +216,7 @@ def gpu_struct(this: type) -> Type[GpuStruct]:
|
|
|
216
216
|
a reduction on an input array of floating point values to compute
|
|
217
217
|
the smallest and the largest absolute values:
|
|
218
218
|
|
|
219
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
219
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/reduction/minmax_reduction.py
|
|
220
220
|
:language: python
|
|
221
221
|
:start-after: # example-begin
|
|
222
222
|
|
cuda/coop/__init__.py
ADDED
|
@@ -5,8 +5,9 @@
|
|
|
5
5
|
import functools
|
|
6
6
|
|
|
7
7
|
from cuda.bindings import nvrtc
|
|
8
|
-
|
|
9
|
-
from
|
|
8
|
+
|
|
9
|
+
from ._caching import disk_cache
|
|
10
|
+
from ._common import check_in, version
|
|
10
11
|
|
|
11
12
|
|
|
12
13
|
def CHECK_NVRTC(err, prog):
|
|
@@ -3,8 +3,8 @@
|
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
4
4
|
|
|
5
5
|
"""
|
|
6
|
-
cuda.
|
|
7
|
-
|
|
6
|
+
cuda.coop._scan_op
|
|
7
|
+
==================
|
|
8
8
|
|
|
9
9
|
This module implements the ``ScanOp`` class and related functions.
|
|
10
10
|
"""
|
|
@@ -14,7 +14,7 @@ from enum import Enum
|
|
|
14
14
|
|
|
15
15
|
import numpy as np
|
|
16
16
|
|
|
17
|
-
from
|
|
17
|
+
from ._typing import (
|
|
18
18
|
ScanOpType,
|
|
19
19
|
)
|
|
20
20
|
|
|
@@ -17,8 +17,8 @@ from numba.core.typing import signature
|
|
|
17
17
|
from numba.cuda import LTOIR
|
|
18
18
|
from numba.cuda.cudadrv import driver as cuda_driver
|
|
19
19
|
|
|
20
|
-
import
|
|
21
|
-
from
|
|
20
|
+
from . import _nvrtc as nvrtc
|
|
21
|
+
from ._common import find_unsigned
|
|
22
22
|
|
|
23
23
|
NUMBA_TYPES_TO_CPP = {
|
|
24
24
|
types.boolean: "bool",
|
|
@@ -9,7 +9,7 @@ if TYPE_CHECKING:
|
|
|
9
9
|
import numba
|
|
10
10
|
import numpy as np
|
|
11
11
|
|
|
12
|
-
from
|
|
12
|
+
from ._common import dim3
|
|
13
13
|
|
|
14
14
|
# Type alias for dimension parameters that can be passed to CUDA functions.
|
|
15
15
|
DimType = Union["dim3", int, Tuple[int, int], Tuple[int, int, int]]
|
|
@@ -2,18 +2,18 @@
|
|
|
2
2
|
#
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
4
4
|
|
|
5
|
-
from
|
|
5
|
+
from ._block_exchange import (
|
|
6
6
|
BlockExchangeType,
|
|
7
7
|
exchange,
|
|
8
8
|
)
|
|
9
|
-
from
|
|
10
|
-
from
|
|
11
|
-
from
|
|
9
|
+
from ._block_load_store import load, store
|
|
10
|
+
from ._block_merge_sort import merge_sort_keys
|
|
11
|
+
from ._block_radix_sort import (
|
|
12
12
|
radix_sort_keys,
|
|
13
13
|
radix_sort_keys_descending,
|
|
14
14
|
)
|
|
15
|
-
from
|
|
16
|
-
from
|
|
15
|
+
from ._block_reduce import reduce, sum
|
|
16
|
+
from ._block_scan import (
|
|
17
17
|
exclusive_scan,
|
|
18
18
|
exclusive_sum,
|
|
19
19
|
inclusive_scan,
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
4
4
|
|
|
5
5
|
"""
|
|
6
|
-
cuda.
|
|
6
|
+
cuda.coop.block_exchange
|
|
7
7
|
====================================
|
|
8
8
|
|
|
9
9
|
This module provides a set of :ref:`collective <collective-primitives>` methods
|
|
@@ -105,13 +105,13 @@ def exchange(
|
|
|
105
105
|
perform. Currently, only :py:attr:`StripedToBlocked` is supported.
|
|
106
106
|
|
|
107
107
|
:param dtype: Supplies the data type of the input and output arrays.
|
|
108
|
-
:type dtype: :py:class:`cuda.
|
|
108
|
+
:type dtype: :py:class:`cuda.coop._typing.DtypeType`
|
|
109
109
|
|
|
110
110
|
:param threads_per_block: Supplies the number of threads in the block,
|
|
111
111
|
either as an integer for a 1D block or a tuple of two or three integers
|
|
112
112
|
for a 2D or 3D block, respectively.
|
|
113
113
|
:type threads_per_block:
|
|
114
|
-
:py:class:`cuda.
|
|
114
|
+
:py:class:`cuda.coop._typing.DimType`
|
|
115
115
|
|
|
116
116
|
:param items_per_thread: Supplies the number of items partitioned onto each
|
|
117
117
|
thread.
|
|
@@ -137,7 +137,7 @@ def exchange(
|
|
|
137
137
|
:raises ValueError: If ``items_per_thread`` is greater than 1 and
|
|
138
138
|
``methods`` is not *None* (i.e. a user-defined type is being used).
|
|
139
139
|
|
|
140
|
-
:returns: An :py:class:`cuda.
|
|
140
|
+
:returns: An :py:class:`cuda.coop._types.Invocable`
|
|
141
141
|
object representing the specialized kernel that can be called from
|
|
142
142
|
a Numba JIT'd CUDA kernel.
|
|
143
143
|
|
|
@@ -5,12 +5,12 @@
|
|
|
5
5
|
|
|
6
6
|
import numba
|
|
7
7
|
|
|
8
|
-
from
|
|
8
|
+
from .._common import (
|
|
9
9
|
make_binary_tempfile,
|
|
10
10
|
normalize_dim_param,
|
|
11
11
|
normalize_dtype_param,
|
|
12
12
|
)
|
|
13
|
-
from
|
|
13
|
+
from .._types import (
|
|
14
14
|
Algorithm,
|
|
15
15
|
Dependency,
|
|
16
16
|
DependentArray,
|
|
@@ -70,13 +70,13 @@ def load(dtype, threads_per_block, items_per_thread=1, algorithm="direct"):
|
|
|
70
70
|
The code snippet below illustrates a striped load and store of 128 integer items by 32 threads, with
|
|
71
71
|
each thread handling 4 integers.
|
|
72
72
|
|
|
73
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
73
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_load_store_api.py
|
|
74
74
|
:language: python
|
|
75
75
|
:dedent:
|
|
76
76
|
:start-after: example-begin imports
|
|
77
77
|
:end-before: example-end imports
|
|
78
78
|
|
|
79
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
79
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_load_store_api.py
|
|
80
80
|
:language: python
|
|
81
81
|
:dedent:
|
|
82
82
|
:start-after: example-begin load_store
|
|
@@ -158,13 +158,13 @@ def store(dtype, threads_per_block, items_per_thread=1, algorithm="direct"):
|
|
|
158
158
|
The code snippet below illustrates a striped load and store of 128 integer items by 32 threads, with
|
|
159
159
|
each thread handling 4 integers.
|
|
160
160
|
|
|
161
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
161
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_load_store_api.py
|
|
162
162
|
:language: python
|
|
163
163
|
:dedent:
|
|
164
164
|
:start-after: example-begin imports
|
|
165
165
|
:end-before: example-end imports
|
|
166
166
|
|
|
167
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
167
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_load_store_api.py
|
|
168
168
|
:language: python
|
|
169
169
|
:dedent:
|
|
170
170
|
:start-after: example-begin load_store
|
|
@@ -6,12 +6,12 @@ from typing import TYPE_CHECKING, Callable, Literal, Union
|
|
|
6
6
|
|
|
7
7
|
import numba
|
|
8
8
|
|
|
9
|
-
from
|
|
9
|
+
from .._common import (
|
|
10
10
|
make_binary_tempfile,
|
|
11
11
|
normalize_dim_param,
|
|
12
12
|
normalize_dtype_param,
|
|
13
13
|
)
|
|
14
|
-
from
|
|
14
|
+
from .._types import (
|
|
15
15
|
Algorithm,
|
|
16
16
|
Constant,
|
|
17
17
|
Dependency,
|
|
@@ -41,7 +41,7 @@ def merge_sort_keys(
|
|
|
41
41
|
are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
|
|
42
42
|
where each thread owns 4 consecutive keys. We start by importing necessary modules:
|
|
43
43
|
|
|
44
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
44
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_merge_sort_api.py
|
|
45
45
|
:language: python
|
|
46
46
|
:dedent:
|
|
47
47
|
:start-after: example-begin imports
|
|
@@ -49,7 +49,7 @@ def merge_sort_keys(
|
|
|
49
49
|
|
|
50
50
|
Below is the code snippet that demonstrates the usage of the ``merge_sort_keys`` API:
|
|
51
51
|
|
|
52
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
52
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_merge_sort_api.py
|
|
53
53
|
:language: python
|
|
54
54
|
:dedent:
|
|
55
55
|
:start-after: example-begin merge-sort
|
|
@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Tuple, Union
|
|
|
6
6
|
|
|
7
7
|
import numba
|
|
8
8
|
|
|
9
|
-
from
|
|
9
|
+
from .._common import (
|
|
10
10
|
CUB_BLOCK_SCAN_ALGOS,
|
|
11
11
|
CudaSharedMemConfig,
|
|
12
12
|
dim3,
|
|
@@ -14,7 +14,7 @@ from cuda.cccl.cooperative.experimental._common import (
|
|
|
14
14
|
normalize_dim_param,
|
|
15
15
|
normalize_dtype_param,
|
|
16
16
|
)
|
|
17
|
-
from
|
|
17
|
+
from .._types import (
|
|
18
18
|
Algorithm,
|
|
19
19
|
Dependency,
|
|
20
20
|
DependentArray,
|
|
@@ -140,7 +140,7 @@ def radix_sort_keys(dtype, threads_per_block, items_per_thread):
|
|
|
140
140
|
are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
|
|
141
141
|
where each thread owns 4 consecutive keys. We start by importing necessary modules:
|
|
142
142
|
|
|
143
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
143
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_radix_sort_api.py
|
|
144
144
|
:language: python
|
|
145
145
|
:dedent:
|
|
146
146
|
:start-after: example-begin imports
|
|
@@ -148,7 +148,7 @@ def radix_sort_keys(dtype, threads_per_block, items_per_thread):
|
|
|
148
148
|
|
|
149
149
|
Below is the code snippet that demonstrates the usage of the ``radix_sort_keys`` API:
|
|
150
150
|
|
|
151
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
151
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_radix_sort_api.py
|
|
152
152
|
:language: python
|
|
153
153
|
:dedent:
|
|
154
154
|
:start-after: example-begin radix-sort
|
|
@@ -181,7 +181,7 @@ def radix_sort_keys_descending(dtype, threads_per_block, items_per_thread):
|
|
|
181
181
|
are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
|
|
182
182
|
where each thread owns 4 consecutive keys. We start by importing necessary modules:
|
|
183
183
|
|
|
184
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
184
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_radix_sort_api.py
|
|
185
185
|
:language: python
|
|
186
186
|
:dedent:
|
|
187
187
|
:start-after: example-begin imports
|
|
@@ -189,7 +189,7 @@ def radix_sort_keys_descending(dtype, threads_per_block, items_per_thread):
|
|
|
189
189
|
|
|
190
190
|
Below is the code snippet that demonstrates the usage of the ``radix_sort_keys_descending`` API:
|
|
191
191
|
|
|
192
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
192
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_radix_sort_api.py
|
|
193
193
|
:language: python
|
|
194
194
|
:dedent:
|
|
195
195
|
:start-after: example-begin radix-sort-descending
|
|
@@ -6,13 +6,13 @@ from typing import TYPE_CHECKING, Callable, Literal, Tuple, Union
|
|
|
6
6
|
|
|
7
7
|
import numba
|
|
8
8
|
|
|
9
|
-
from
|
|
9
|
+
from .._common import (
|
|
10
10
|
CUB_BLOCK_REDUCE_ALGOS,
|
|
11
11
|
make_binary_tempfile,
|
|
12
12
|
normalize_dim_param,
|
|
13
13
|
normalize_dtype_param,
|
|
14
14
|
)
|
|
15
|
-
from
|
|
15
|
+
from .._types import (
|
|
16
16
|
Algorithm,
|
|
17
17
|
Dependency,
|
|
18
18
|
DependentArray,
|
|
@@ -208,13 +208,13 @@ def reduce(
|
|
|
208
208
|
The code snippet below illustrates a max reduction of 128 integer items that are
|
|
209
209
|
partitioned across 128 threads.
|
|
210
210
|
|
|
211
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
211
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_reduce_api.py
|
|
212
212
|
:language: python
|
|
213
213
|
:dedent:
|
|
214
214
|
:start-after: example-begin imports
|
|
215
215
|
:end-before: example-end imports
|
|
216
216
|
|
|
217
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
217
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_reduce_api.py
|
|
218
218
|
:language: python
|
|
219
219
|
:dedent:
|
|
220
220
|
:start-after: example-begin reduce
|
|
@@ -269,13 +269,13 @@ def sum(
|
|
|
269
269
|
The code snippet below illustrates a sum of 128 integer items that are partitioned
|
|
270
270
|
across 128 threads.
|
|
271
271
|
|
|
272
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
272
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_reduce_api.py
|
|
273
273
|
:language: python
|
|
274
274
|
:dedent:
|
|
275
275
|
:start-after: example-begin imports
|
|
276
276
|
:end-before: example-end imports
|
|
277
277
|
|
|
278
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
278
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_reduce_api.py
|
|
279
279
|
:language: python
|
|
280
280
|
:dedent:
|
|
281
281
|
:start-after: example-begin sum
|