PyPI - cuda-cccl - Versions diffs - 0.3.0__cp311-cp311-manylinux_2_24_aarch64.whl → 0.3.2__cp311-cp311-manylinux_2_24_aarch64.whl - Mend

cuda-cccl 0.3.0__cp311-cp311-manylinux_2_24_aarch64.whl → 0.3.2__cp311-cp311-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cuda-cccl might be problematic. Click here for more details.

Files changed (294)

cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py RENAMED Viewed

@@ -5,12 +5,12 @@
 import numba
-from cuda.cccl.cooperative.experimental._common import (
+from .._common import (
     make_binary_tempfile,
     normalize_dim_param,
     normalize_dtype_param,
 )
-from cuda.cccl.cooperative.experimental._types import (
+from .._types import (
     Algorithm,
     Dependency,
     DependentArray,
@@ -70,13 +70,13 @@ def load(dtype, threads_per_block, items_per_thread=1, algorithm="direct"):
         The code snippet below illustrates a striped load and store of 128 integer items by 32 threads, with
         each thread handling 4 integers.
-        .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_load_store_api.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_load_store_api.py
             :language: python
             :dedent:
             :start-after: example-begin imports
             :end-before: example-end imports
-        .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_load_store_api.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_load_store_api.py
             :language: python
             :dedent:
             :start-after: example-begin load_store
@@ -158,13 +158,13 @@ def store(dtype, threads_per_block, items_per_thread=1, algorithm="direct"):
         The code snippet below illustrates a striped load and store of 128 integer items by 32 threads, with
         each thread handling 4 integers.
-        .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_load_store_api.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_load_store_api.py
             :language: python
             :dedent:
             :start-after: example-begin imports
             :end-before: example-end imports
-        .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_load_store_api.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_load_store_api.py
             :language: python
             :dedent:
             :start-after: example-begin load_store

cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py RENAMED Viewed

@@ -6,12 +6,12 @@ from typing import TYPE_CHECKING, Callable, Literal, Union
 import numba
-from cuda.cccl.cooperative.experimental._common import (
+from .._common import (
     make_binary_tempfile,
     normalize_dim_param,
     normalize_dtype_param,
 )
-from cuda.cccl.cooperative.experimental._types import (
+from .._types import (
     Algorithm,
     Constant,
     Dependency,
@@ -41,7 +41,7 @@ def merge_sort_keys(
         are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
         where each thread owns 4 consecutive keys. We start by importing necessary modules:
-        .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_merge_sort_api.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_merge_sort_api.py
             :language: python
             :dedent:
             :start-after: example-begin imports
@@ -49,7 +49,7 @@ def merge_sort_keys(
         Below is the code snippet that demonstrates the usage of the ``merge_sort_keys`` API:
-        .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_merge_sort_api.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_merge_sort_api.py
             :language: python
             :dedent:
             :start-after: example-begin merge-sort

cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py RENAMED Viewed

@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Tuple, Union
 import numba
-from cuda.cccl.cooperative.experimental._common import (
+from .._common import (
     CUB_BLOCK_SCAN_ALGOS,
     CudaSharedMemConfig,
     dim3,
@@ -14,7 +14,7 @@ from cuda.cccl.cooperative.experimental._common import (
     normalize_dim_param,
     normalize_dtype_param,
 )
-from cuda.cccl.cooperative.experimental._types import (
+from .._types import (
     Algorithm,
     Dependency,
     DependentArray,
@@ -140,7 +140,7 @@ def radix_sort_keys(dtype, threads_per_block, items_per_thread):
         are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
         where each thread owns 4 consecutive keys. We start by importing necessary modules:
-        .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_radix_sort_api.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_radix_sort_api.py
             :language: python
             :dedent:
             :start-after: example-begin imports
@@ -148,7 +148,7 @@ def radix_sort_keys(dtype, threads_per_block, items_per_thread):
         Below is the code snippet that demonstrates the usage of the ``radix_sort_keys`` API:
-        .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_radix_sort_api.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_radix_sort_api.py
             :language: python
             :dedent:
             :start-after: example-begin radix-sort
@@ -181,7 +181,7 @@ def radix_sort_keys_descending(dtype, threads_per_block, items_per_thread):
         are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
         where each thread owns 4 consecutive keys. We start by importing necessary modules:
-        .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_radix_sort_api.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_radix_sort_api.py
             :language: python
             :dedent:
             :start-after: example-begin imports
@@ -189,7 +189,7 @@ def radix_sort_keys_descending(dtype, threads_per_block, items_per_thread):
         Below is the code snippet that demonstrates the usage of the ``radix_sort_keys_descending`` API:
-        .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_radix_sort_api.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_radix_sort_api.py
             :language: python
             :dedent:
             :start-after: example-begin radix-sort-descending

cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py RENAMED Viewed

@@ -6,13 +6,13 @@ from typing import TYPE_CHECKING, Callable, Literal, Tuple, Union
 import numba
-from cuda.cccl.cooperative.experimental._common import (
+from .._common import (
     CUB_BLOCK_REDUCE_ALGOS,
     make_binary_tempfile,
     normalize_dim_param,
     normalize_dtype_param,
 )
-from cuda.cccl.cooperative.experimental._types import (
+from .._types import (
     Algorithm,
     Dependency,
     DependentArray,
@@ -208,13 +208,13 @@ def reduce(
         The code snippet below illustrates a max reduction of 128 integer items that are
         partitioned across 128 threads.
-        .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_reduce_api.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_reduce_api.py
             :language: python
             :dedent:
             :start-after: example-begin imports
             :end-before: example-end imports
-        .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_reduce_api.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_reduce_api.py
             :language: python
             :dedent:
             :start-after: example-begin reduce
@@ -269,13 +269,13 @@ def sum(
         The code snippet below illustrates a sum of 128 integer items that are partitioned
         across 128 threads.
-        .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_reduce_api.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_reduce_api.py
             :language: python
             :dedent:
             :start-after: example-begin imports
             :end-before: example-end imports
-        .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_reduce_api.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_reduce_api.py
             :language: python
             :dedent:
             :start-after: example-begin sum

cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py RENAMED Viewed

@@ -3,7 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 """
-cuda.cccl.cooperative.block_scan
+cuda.coop.block_scan
 ===========================
 This module provides a set of :ref:`collective <collective-primitives>`
@@ -73,16 +73,16 @@ from typing import Any, Callable, Literal
 import numba
-from cuda.cccl.cooperative.experimental._common import (
+from .._common import (
     CUB_BLOCK_SCAN_ALGOS,
     make_binary_tempfile,
     normalize_dim_param,
     normalize_dtype_param,
 )
-from cuda.cccl.cooperative.experimental._scan_op import (
+from .._scan_op import (
     ScanOp,
 )
-from cuda.cccl.cooperative.experimental._types import (
+from .._types import (
     Algorithm,
     Dependency,
     DependentArray,
@@ -94,7 +94,7 @@ from cuda.cccl.cooperative.experimental._types import (
     TemplateParameter,
     numba_type_to_wrapper,
 )
-from cuda.cccl.cooperative.experimental._typing import (
+from .._typing import (
     DimType,
     DtypeType,
     ScanOpType,
@@ -669,7 +669,7 @@ def exclusive_sum(
         :ref:`blocked arrangement <flexible-data-arrangement>` across 128
         threads where each thread owns 4 consecutive items.
-        .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_scan_api.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_scan_api.py
             :language: python
             :dedent:
             :start-after: example-begin imports
@@ -678,7 +678,7 @@ def exclusive_sum(
         Below is the code snippet that demonstrates the usage of the
         ``exclusive_sum`` API:
-        .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_block_scan_api.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_scan_api.py
             :language: python
             :dedent:
             :start-after: example-begin exclusive-sum

cuda/coop/warp/__init__.py ADDED Viewed

@@ -0,0 +1,9 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+from ._warp_merge_sort import merge_sort_keys
+from ._warp_reduce import reduce, sum
+from ._warp_scan import exclusive_sum
+__all__ = ["exclusive_sum", "reduce", "sum", "merge_sort_keys"]

cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py RENAMED Viewed

@@ -4,8 +4,8 @@
 import numba
-from cuda.cccl.cooperative.experimental._common import make_binary_tempfile
-from cuda.cccl.cooperative.experimental._types import (
+from .._common import make_binary_tempfile
+from .._types import (
     Algorithm,
     Constant,
     Dependency,
@@ -30,7 +30,7 @@ def merge_sort_keys(
         Below is the code snippet that demonstrates the usage of the ``merge_sort_keys`` API:
-        .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_warp_merge_sort_api.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_warp_merge_sort_api.py
             :language: python
             :dedent:
             :start-after: example-begin merge-sort

cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py RENAMED Viewed

@@ -4,8 +4,8 @@
 import numba
-from cuda.cccl.cooperative.experimental._common import make_binary_tempfile
-from cuda.cccl.cooperative.experimental._types import (
+from .._common import make_binary_tempfile
+from .._types import (
     Algorithm,
     Dependency,
     DependentPythonOperator,
@@ -28,7 +28,7 @@ def reduce(dtype, binary_op, threads_in_warp=32, methods=None):
         The code snippet below illustrates a max reduction of 32 integer items that
         are partitioned across a warp of threads.
-        .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_warp_reduce_api.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_warp_reduce_api.py
             :language: python
             :dedent:
             :start-after: example-begin imports
@@ -36,7 +36,7 @@ def reduce(dtype, binary_op, threads_in_warp=32, methods=None):
         Below is the code snippet that demonstrates the usage of the ``reduce`` API:
-        .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_warp_reduce_api.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_warp_reduce_api.py
             :language: python
             :dedent:
             :start-after: example-begin reduce
@@ -100,7 +100,7 @@ def sum(dtype, threads_in_warp=32):
         The code snippet below illustrates a reduction of 32 integer items that
         are partitioned across a warp of threads.
-        .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_warp_reduce_api.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_warp_reduce_api.py
             :language: python
             :dedent:
             :start-after: example-begin imports
@@ -108,7 +108,7 @@ def sum(dtype, threads_in_warp=32):
         Below is the code snippet that demonstrates the usage of the ``sum`` API:
-        .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_warp_reduce_api.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_warp_reduce_api.py
             :language: python
             :dedent:
             :start-after: example-begin sum

cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py RENAMED Viewed

@@ -5,8 +5,8 @@
 import numba
-from cuda.cccl.cooperative.experimental._common import make_binary_tempfile
-from cuda.cccl.cooperative.experimental._types import (
+from .._common import make_binary_tempfile
+from .._types import (
     Algorithm,
     Dependency,
     DependentReference,
@@ -23,7 +23,7 @@ def exclusive_sum(dtype, threads_in_warp=32):
     Example:
         The code snippet below illustrates an exclusive prefix sum of 32 integer items:
-        .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_warp_scan_api.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_warp_scan_api.py
             :language: python
             :dedent:
             :start-after: example-begin imports
@@ -31,7 +31,7 @@ def exclusive_sum(dtype, threads_in_warp=32):
         Below is the code snippet that demonstrates the usage of the ``exclusive_sum`` API:
-        .. literalinclude:: ../../python/cuda_cccl/tests/cooperative/test_warp_scan_api.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_warp_scan_api.py
             :language: python
             :dedent:
             :start-after: example-begin exclusive-sum

{cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cuda-cccl
-Version: 0.3.0
+Version: 0.3.2
 Summary: CUDA Core Library for Python
 Author: NVIDIA Corporation
 Classifier: Programming Language :: Python :: 3 :: Only