cuda-cccl 0.3.0__cp313-cp313-manylinux_2_24_aarch64.whl → 0.3.1__cp313-cp313-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/cooperative/__init__.py +7 -1
- cuda/cccl/cooperative/experimental/__init__.py +21 -5
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +3 -2
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +158 -247
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +4 -4
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +8 -26
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -6
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
- cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
- cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +48 -46
- cuda/cccl/headers/include/cuda/__device/attributes.h +171 -121
- cuda/cccl/headers/include/cuda/__device/device_ref.h +30 -42
- cuda/cccl/headers/include/cuda/__device/physical_device.h +120 -91
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
- cuda/cccl/headers/include/cuda/__event/event.h +1 -0
- cuda/cccl/headers/include/cuda/__event/timed_event.h +1 -0
- cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
- cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +158 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +8 -120
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +593 -0
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +4 -3
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +1 -0
- cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
- cuda/cccl/headers/include/cuda/algorithm +1 -1
- cuda/cccl/headers/include/cuda/devices +10 -0
- cuda/cccl/headers/include/cuda/iterator +1 -0
- cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
- cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
- cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
- cuda/cccl/headers/include/cuda/std/string_view +12 -5
- cuda/cccl/headers/include/cuda/std/version +1 -4
- cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
- cuda/cccl/parallel/experimental/__init__.py +21 -74
- cuda/compute/__init__.py +77 -0
- cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +1 -1
- cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
- cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +4 -4
- cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +4 -4
- cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +8 -8
- cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
- cuda/coop/__init__.py +8 -0
- cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
- cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
- cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
- cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
- cuda/coop/warp/__init__.py +9 -0
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/METADATA +1 -1
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/RECORD +141 -138
- cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
- cuda/cccl/parallel/experimental/.gitignore +0 -4
- /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/numba_utils.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/typing.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
4
4
|
|
|
5
5
|
"""
|
|
6
|
-
cuda.
|
|
6
|
+
cuda.coop.block_scan
|
|
7
7
|
===========================
|
|
8
8
|
|
|
9
9
|
This module provides a set of :ref:`collective <collective-primitives>`
|
|
@@ -73,16 +73,16 @@ from typing import Any, Callable, Literal
|
|
|
73
73
|
|
|
74
74
|
import numba
|
|
75
75
|
|
|
76
|
-
from cuda.cccl.cooperative.experimental._common import (
|
|
76
|
+
from .._common import (
|
|
77
77
|
CUB_BLOCK_SCAN_ALGOS,
|
|
78
78
|
make_binary_tempfile,
|
|
79
79
|
normalize_dim_param,
|
|
80
80
|
normalize_dtype_param,
|
|
81
81
|
)
|
|
82
|
-
from cuda.cccl.cooperative.experimental._scan_op import (
|
|
82
|
+
from .._scan_op import (
|
|
83
83
|
ScanOp,
|
|
84
84
|
)
|
|
85
|
-
from cuda.cccl.cooperative.experimental._types import (
|
|
85
|
+
from .._types import (
|
|
86
86
|
Algorithm,
|
|
87
87
|
Dependency,
|
|
88
88
|
DependentArray,
|
|
@@ -94,7 +94,7 @@ from cuda.cccl.cooperative.experimental._types import (
|
|
|
94
94
|
TemplateParameter,
|
|
95
95
|
numba_type_to_wrapper,
|
|
96
96
|
)
|
|
97
|
-
from cuda.cccl.cooperative.experimental._typing import (
|
|
97
|
+
from .._typing import (
|
|
98
98
|
DimType,
|
|
99
99
|
DtypeType,
|
|
100
100
|
ScanOpType,
|
|
@@ -669,7 +669,7 @@ def exclusive_sum(
|
|
|
669
669
|
:ref:`blocked arrangement <flexible-data-arrangement>` across 128
|
|
670
670
|
threads where each thread owns 4 consecutive items.
|
|
671
671
|
|
|
672
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
672
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_scan_api.py
|
|
673
673
|
:language: python
|
|
674
674
|
:dedent:
|
|
675
675
|
:start-after: example-begin imports
|
|
@@ -678,7 +678,7 @@ def exclusive_sum(
|
|
|
678
678
|
Below is the code snippet that demonstrates the usage of the
|
|
679
679
|
``exclusive_sum`` API:
|
|
680
680
|
|
|
681
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
681
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_scan_api.py
|
|
682
682
|
:language: python
|
|
683
683
|
:dedent:
|
|
684
684
|
:start-after: example-begin exclusive-sum
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
|
|
2
|
+
#
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
4
|
+
|
|
5
|
+
from ._warp_merge_sort import merge_sort_keys
|
|
6
|
+
from ._warp_reduce import reduce, sum
|
|
7
|
+
from ._warp_scan import exclusive_sum
|
|
8
|
+
|
|
9
|
+
__all__ = ["exclusive_sum", "reduce", "sum", "merge_sort_keys"]
|
|
@@ -4,8 +4,8 @@
|
|
|
4
4
|
|
|
5
5
|
import numba
|
|
6
6
|
|
|
7
|
-
from cuda.cccl.cooperative.experimental._common import make_binary_tempfile
|
|
8
|
-
from cuda.cccl.cooperative.experimental._types import (
|
|
7
|
+
from .._common import make_binary_tempfile
|
|
8
|
+
from .._types import (
|
|
9
9
|
Algorithm,
|
|
10
10
|
Constant,
|
|
11
11
|
Dependency,
|
|
@@ -30,7 +30,7 @@ def merge_sort_keys(
|
|
|
30
30
|
|
|
31
31
|
Below is the code snippet that demonstrates the usage of the ``merge_sort_keys`` API:
|
|
32
32
|
|
|
33
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
33
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_warp_merge_sort_api.py
|
|
34
34
|
:language: python
|
|
35
35
|
:dedent:
|
|
36
36
|
:start-after: example-begin merge-sort
|
|
@@ -4,8 +4,8 @@
|
|
|
4
4
|
|
|
5
5
|
import numba
|
|
6
6
|
|
|
7
|
-
from cuda.cccl.cooperative.experimental._common import make_binary_tempfile
|
|
8
|
-
from cuda.cccl.cooperative.experimental._types import (
|
|
7
|
+
from .._common import make_binary_tempfile
|
|
8
|
+
from .._types import (
|
|
9
9
|
Algorithm,
|
|
10
10
|
Dependency,
|
|
11
11
|
DependentPythonOperator,
|
|
@@ -28,7 +28,7 @@ def reduce(dtype, binary_op, threads_in_warp=32, methods=None):
|
|
|
28
28
|
The code snippet below illustrates a max reduction of 32 integer items that
|
|
29
29
|
are partitioned across a warp of threads.
|
|
30
30
|
|
|
31
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
31
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_warp_reduce_api.py
|
|
32
32
|
:language: python
|
|
33
33
|
:dedent:
|
|
34
34
|
:start-after: example-begin imports
|
|
@@ -36,7 +36,7 @@ def reduce(dtype, binary_op, threads_in_warp=32, methods=None):
|
|
|
36
36
|
|
|
37
37
|
Below is the code snippet that demonstrates the usage of the ``reduce`` API:
|
|
38
38
|
|
|
39
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
39
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_warp_reduce_api.py
|
|
40
40
|
:language: python
|
|
41
41
|
:dedent:
|
|
42
42
|
:start-after: example-begin reduce
|
|
@@ -100,7 +100,7 @@ def sum(dtype, threads_in_warp=32):
|
|
|
100
100
|
The code snippet below illustrates a reduction of 32 integer items that
|
|
101
101
|
are partitioned across a warp of threads.
|
|
102
102
|
|
|
103
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
103
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_warp_reduce_api.py
|
|
104
104
|
:language: python
|
|
105
105
|
:dedent:
|
|
106
106
|
:start-after: example-begin imports
|
|
@@ -108,7 +108,7 @@ def sum(dtype, threads_in_warp=32):
|
|
|
108
108
|
|
|
109
109
|
Below is the code snippet that demonstrates the usage of the ``sum`` API:
|
|
110
110
|
|
|
111
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
111
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_warp_reduce_api.py
|
|
112
112
|
:language: python
|
|
113
113
|
:dedent:
|
|
114
114
|
:start-after: example-begin sum
|
|
@@ -5,8 +5,8 @@
|
|
|
5
5
|
|
|
6
6
|
import numba
|
|
7
7
|
|
|
8
|
-
from cuda.cccl.cooperative.experimental._common import make_binary_tempfile
|
|
9
|
-
from cuda.cccl.cooperative.experimental._types import (
|
|
8
|
+
from .._common import make_binary_tempfile
|
|
9
|
+
from .._types import (
|
|
10
10
|
Algorithm,
|
|
11
11
|
Dependency,
|
|
12
12
|
DependentReference,
|
|
@@ -23,7 +23,7 @@ def exclusive_sum(dtype, threads_in_warp=32):
|
|
|
23
23
|
Example:
|
|
24
24
|
The code snippet below illustrates an exclusive prefix sum of 32 integer items:
|
|
25
25
|
|
|
26
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
26
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_warp_scan_api.py
|
|
27
27
|
:language: python
|
|
28
28
|
:dedent:
|
|
29
29
|
:start-after: example-begin imports
|
|
@@ -31,7 +31,7 @@ def exclusive_sum(dtype, threads_in_warp=32):
|
|
|
31
31
|
|
|
32
32
|
Below is the code snippet that demonstrates the usage of the ``exclusive_sum`` API:
|
|
33
33
|
|
|
34
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
34
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_warp_scan_api.py
|
|
35
35
|
:language: python
|
|
36
36
|
:dedent:
|
|
37
37
|
:start-after: example-begin exclusive-sum
|