PyPI - cuda-cccl - Versions diffs - 0.1.3.2.0.dev438__cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.1__cp312-cp312-manylinux_2_24_aarch64.whl - Mend

cuda-cccl 0.1.3.2.0.dev438__cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.1__cp312-cp312-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cuda-cccl might be problematic. Click here for more details.

Files changed (177) hide show

cuda/compute/__init__.py ADDED Viewed

@@ -0,0 +1,77 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+from .algorithms import (
+    DoubleBuffer,
+    SortOrder,
+    binary_transform,
+    exclusive_scan,
+    histogram_even,
+    inclusive_scan,
+    make_binary_transform,
+    make_exclusive_scan,
+    make_histogram_even,
+    make_inclusive_scan,
+    make_merge_sort,
+    make_radix_sort,
+    make_reduce_into,
+    make_segmented_reduce,
+    make_three_way_partition,
+    make_unary_transform,
+    make_unique_by_key,
+    merge_sort,
+    radix_sort,
+    reduce_into,
+    segmented_reduce,
+    three_way_partition,
+    unary_transform,
+    unique_by_key,
+)
+from .iterators import (
+    CacheModifiedInputIterator,
+    ConstantIterator,
+    CountingIterator,
+    ReverseIterator,
+    TransformIterator,
+    TransformOutputIterator,
+    ZipIterator,
+)
+from .op import OpKind
+from .struct import gpu_struct
+__all__ = [
+    "binary_transform",
+    "CacheModifiedInputIterator",
+    "ConstantIterator",
+    "CountingIterator",
+    "DoubleBuffer",
+    "exclusive_scan",
+    "gpu_struct",
+    "histogram_even",
+    "inclusive_scan",
+    "make_binary_transform",
+    "make_exclusive_scan",
+    "make_histogram_even",
+    "make_inclusive_scan",
+    "make_merge_sort",
+    "make_radix_sort",
+    "make_reduce_into",
+    "make_segmented_reduce",
+    "make_three_way_partition",
+    "make_unary_transform",
+    "make_unique_by_key",
+    "merge_sort",
+    "OpKind",
+    "radix_sort",
+    "reduce_into",
+    "ReverseIterator",
+    "segmented_reduce",
+    "SortOrder",
+    "TransformIterator",
+    "TransformOutputIterator",
+    "three_way_partition",
+    "unary_transform",
+    "unique_by_key",
+    "ZipIterator",
+]

cuda/{cccl/parallel/experimental → compute}/_bindings.pyi RENAMED Viewed

@@ -390,6 +390,7 @@ class DeviceHistogramBuildResult:
         num_rows: int,
         row_stride_samples: int,
         is_evenly_segmented: bool,
+        info: CommonData,
     ): ...
     def compute_even(
         self,
@@ -403,3 +404,30 @@ class DeviceHistogramBuildResult:
         row_stride_samples: int,
         stream,
     ) -> None: ...
+# ---------------------
+# DeviceThreeWayPartition
+# ---------------------
+class DeviceThreeWayPartitionBuildResult:
+    def __init__(
+        self,
+        d_in: Iterator,
+        d_first_part_out: Iterator,
+        d_second_part_out: Iterator,
+        d_unselected_out: Iterator,
+        d_num_selected_out: Iterator,
+        select_first_part_op: Op,
+        select_second_part_op: Op,
+        info: CommonData,
+    ): ...
+    def compute(
+        self,
+        d_in: Iterator,
+        d_first_part_out: Iterator,
+        d_second_part_out: Iterator,
+        d_unselected_out: Iterator,
+        d_num_selected_out: Iterator,
+        num_items: int,
+        stream,
+    ) -> int: ...

cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx RENAMED Viewed

@@ -4,7 +4,7 @@
 # Python signatures are declared in the companion Python stub file _bindings.pyi
 # Make sure to update PYI with change to Python API to ensure that Python
-# static type checker tools like mypy green-lights cuda.cccl.parallel
+# static type checker tools like mypy green-lights cuda.compute
 from libc.string cimport memset, memcpy
 from libc.stdint cimport uint8_t, uint32_t, uint64_t, int64_t, uintptr_t
@@ -1982,3 +1982,143 @@ cdef class DeviceHistogramBuildResult:
             <const char*>self.build_data.cubin,
             self.build_data.cubin_size
         )
+# ----------------------------------
+# DeviceThreeWayPartitionBuildResult
+# ----------------------------------
+# C declarations from the CCCL C library header for three-way partition.
+# All three functions are declared ``nogil`` so they can be called inside
+# ``with nogil:`` blocks.
+cdef extern from "cccl/c/three_way_partition.h":
+    # Only the fields this module reads (see ``_get_cubin``) are declared here.
+    # NOTE(review): the C struct may contain further members - confirm against the header.
+    cdef struct cccl_device_three_way_partition_build_result_t 'cccl_device_three_way_partition_build_result_t':
+        const char* cubin
+        size_t cubin_size
+    # Build step. The trailing unnamed parameters receive, in call order in this
+    # module: cc_major, cc_minor, cub_path, thrust_path, libcudacxx_path, ctk_path.
+    cdef CUresult cccl_device_three_way_partition_build(
+        cccl_device_three_way_partition_build_result_t *build_ptr,
+        cccl_iterator_t d_in,
+        cccl_iterator_t d_first_part_out,
+        cccl_iterator_t d_second_part_out,
+        cccl_iterator_t d_unselected_out,
+        cccl_iterator_t d_num_selected_out,
+        cccl_op_t select_first_part_op,
+        cccl_op_t select_second_part_op,
+        int, int, const char *, const char *, const char *, const char *
+    ) nogil
+    # Execution step. ``temp_storage_bytes`` is passed by pointer, so the callee
+    # is able to write the byte count back; ``num_items`` is a signed 64-bit value.
+    CUresult cccl_device_three_way_partition(
+        cccl_device_three_way_partition_build_result_t build,
+        void* d_temp_storage,
+        size_t* temp_storage_bytes,
+        cccl_iterator_t d_in,
+        cccl_iterator_t d_first_part_out,
+        cccl_iterator_t d_second_part_out,
+        cccl_iterator_t d_unselected_out,
+        cccl_iterator_t d_num_selected_out,
+        cccl_op_t select_first_part_op,
+        cccl_op_t select_second_part_op,
+        int64_t num_items,
+        CUstream stream
+    ) nogil
+    # Cleanup step, invoked from ``__dealloc__`` of the wrapper class.
+    cdef CUresult cccl_device_three_way_partition_cleanup(
+        cccl_device_three_way_partition_build_result_t *build_ptr
+    ) nogil
+cdef class DeviceThreeWayPartitionBuildResult:
+    """Wrap a ``cccl_device_three_way_partition_build_result_t``: build it on construction, run it via ``compute`` and clean it up on deallocation."""
+    cdef cccl_device_three_way_partition_build_result_t build_data
+    def __dealloc__(DeviceThreeWayPartitionBuildResult self):
+        # A failed cleanup is only reported with print(); no exception is raised here.
+        cdef CUresult status = -1
+        with nogil:
+            status = cccl_device_three_way_partition_cleanup(&self.build_data)
+        if status != 0:
+            print(f"Return code {status} encountered during three_way_partition result cleanup")
+    def __cinit__(
+        DeviceThreeWayPartitionBuildResult self,
+        Iterator d_in,
+        Iterator d_first_part_out,
+        Iterator d_second_part_out,
+        Iterator d_unselected_out,
+        Iterator d_num_selected_out,
+        Op select_first_part_op,
+        Op select_second_part_op,
+        CommonData common_data
+    ):
+        # Builds the three-way-partition result from the iterator/operator
+        # descriptors plus the compute capability and include paths held by
+        # ``common_data``. Raises RuntimeError when the C build call returns a
+        # non-zero status.
+        cdef CUresult status = -1
+        # Everything needed from Python objects is read into C locals first,
+        # because the build call below runs without the GIL.
+        cdef int cc_major = common_data.get_cc_major()
+        cdef int cc_minor = common_data.get_cc_minor()
+        cdef const char *cub_path = common_data.cub_path_get_c_str()
+        cdef const char *thrust_path = common_data.thrust_path_get_c_str()
+        cdef const char *libcudacxx_path = common_data.libcudacxx_path_get_c_str()
+        cdef const char *ctk_path = common_data.ctk_path_get_c_str()
+        # Zero the struct before handing it to the C library.
+        memset(&self.build_data, 0, sizeof(cccl_device_three_way_partition_build_result_t))
+        with nogil:
+            status = cccl_device_three_way_partition_build(
+                &self.build_data,
+                d_in.iter_data,
+                d_first_part_out.iter_data,
+                d_second_part_out.iter_data,
+                d_unselected_out.iter_data,
+                d_num_selected_out.iter_data,
+                select_first_part_op.op_data,
+                select_second_part_op.op_data,
+                cc_major,
+                cc_minor,
+                cub_path,
+                thrust_path,
+                libcudacxx_path,
+                ctk_path,
+            )
+        if status != 0:
+            raise RuntimeError(
+                f"Failed building three_way_partition, error code: {status}"
+            )
+    # NOTE(review): ``storage_sz`` is a size_t but the declared return type is
+    # ``int``; byte counts that do not fit in a C int would not round-trip.
+    # The return type is left unchanged to preserve the existing C-level signature.
+    cpdef int compute(
+        DeviceThreeWayPartitionBuildResult self,
+        temp_storage_ptr,
+        temp_storage_bytes,
+        Iterator d_in,
+        Iterator d_first_part_out,
+        Iterator d_second_part_out,
+        Iterator d_unselected_out,
+        Iterator d_num_selected_out,
+        Op select_first_part_op,
+        Op select_second_part_op,
+        size_t num_items,
+        stream
+    ):
+        """Run the built three-way partition and return the temporary-storage byte count after the call; raises RuntimeError on a non-zero status."""
+        cdef CUresult status = -1
+        # Falsy ``temp_storage_ptr`` / ``stream`` values are forwarded as NULL;
+        # anything else is treated as an integer address.
+        cdef void *storage_ptr = (<void *><uintptr_t>temp_storage_ptr) if temp_storage_ptr else NULL
+        cdef size_t storage_sz = <size_t>temp_storage_bytes
+        cdef CUstream c_stream = <CUstream><uintptr_t>(stream) if stream else NULL
+        with nogil:
+            status = cccl_device_three_way_partition(
+                self.build_data,
+                storage_ptr,
+                # Passed by pointer: the C call is able to update the byte count.
+                &storage_sz,
+                d_in.iter_data,
+                d_first_part_out.iter_data,
+                d_second_part_out.iter_data,
+                d_unselected_out.iter_data,
+                d_num_selected_out.iter_data,
+                select_first_part_op.op_data,
+                select_second_part_op.op_data,
+                # Cast to the parameter type declared in the extern block (int64_t).
+                <int64_t>num_items,
+                c_stream
+            )
+        if status != 0:
+            raise RuntimeError(
+                f"Failed executing three_way_partition, error code: {status}"
+            )
+        return storage_sz
+    def _get_cubin(self):
+        # Copies ``cubin_size`` bytes starting at ``cubin`` into a Python bytes object.
+        return PyBytes_FromStringAndSize(
+            <const char*>self.build_data.cubin,
+            self.build_data.cubin_size
+        )

cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py RENAMED Viewed

@@ -18,6 +18,8 @@ from ._scan import make_exclusive_scan as make_exclusive_scan
 from ._scan import make_inclusive_scan as make_inclusive_scan
 from ._segmented_reduce import make_segmented_reduce as make_segmented_reduce
 from ._segmented_reduce import segmented_reduce
+from ._three_way_partition import make_three_way_partition as make_three_way_partition
+from ._three_way_partition import three_way_partition as three_way_partition
 from ._transform import binary_transform, unary_transform
 from ._transform import make_binary_transform as make_binary_transform
 from ._transform import make_unary_transform as make_unary_transform
@@ -45,6 +47,8 @@ __all__ = [
     "make_segmented_reduce",
     "unique_by_key",
     "make_unique_by_key",
+    "three_way_partition",
+    "make_three_way_partition",
     "DoubleBuffer",
     "SortOrder",
 ]

cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py RENAMED Viewed

@@ -148,7 +148,7 @@ def make_histogram_even(
     Example:
         Below, ``make_histogram_even`` is used to create a histogram object that can be reused.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/histogram/histogram_object.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/histogram/histogram_object.py
           :language: python
           :start-after: # example-begin
@@ -190,7 +190,7 @@ def histogram_even(
     Example:
         Below, ``histogram_even`` is used to compute a histogram with evenly-spaced bins.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/histogram/histogram_even_basic.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/histogram/histogram_even_basic.py
             :language: python
             :start-after: # example-begin
             :caption: Basic histogram example.

cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py RENAMED Viewed

@@ -166,7 +166,7 @@ def make_merge_sort(
     Example:
         Below, ``make_merge_sort`` is used to create a merge sort object that can be reused.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/sort/merge_sort_object.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/sort/merge_sort_object.py
           :language: python
           :start-after: # example-begin
@@ -201,7 +201,7 @@ def merge_sort(
     Example:
         Below, ``merge_sort`` is used to sort a sequence of keys inplace. It also rearranges the items according to the keys' order.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/sort/merge_sort_basic.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/sort/merge_sort_basic.py
             :language: python
             :start-after: # example-begin

cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py RENAMED Viewed

@@ -222,7 +222,7 @@ def make_radix_sort(
     Example:
         Below, ``make_radix_sort`` is used to create a radix sort object that can be reused.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/sort/radix_sort_object.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/sort/radix_sort_object.py
           :language: python
           :start-after: # example-begin
@@ -259,14 +259,14 @@ def radix_sort(
     Example:
         Below, ``radix_sort`` is used to sort a sequence of keys. It also rearranges the values according to the keys' order.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/sort/radix_sort_basic.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/sort/radix_sort_basic.py
             :language: python
             :start-after: # example-begin
         In the following example, ``radix_sort`` is used to sort a sequence of keys with a ``DoubleBuffer`` for reduced temporary storage.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/sort/radix_sort_buffer.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/sort/radix_sort_buffer.py
             :language: python
             :start-after: # example-begin

cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py RENAMED Viewed

@@ -3,8 +3,6 @@
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-from __future__ import annotations  # TODO: required for Python 3.7 docs env
 from typing import Callable, Union
 import numba
@@ -132,7 +130,7 @@ def make_reduce_into(
     Example:
         Below, ``make_reduce_into`` is used to create a reduction object that can be reused.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/reduction/reduce_object.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/reduction/reduce_object.py
             :language: python
             :start-after: # example-begin
@@ -165,7 +163,7 @@ def reduce_into(
     Example:
         Below, ``reduce_into`` is used to compute the sum of a sequence of integers.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/reduction/sum_reduction.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/reduction/sum_reduction.py
             :language: python
             :start-after: # example-begin

cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py RENAMED Viewed

@@ -3,8 +3,6 @@
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-from __future__ import annotations  # TODO: required for Python 3.7 docs env
 from typing import Callable, Union
 import numba
@@ -143,7 +141,7 @@ def make_exclusive_scan(
     Example:
         Below, ``make_exclusive_scan`` is used to create an exclusive scan object that can be reused.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/scan/exclusive_scan_object.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/scan/exclusive_scan_object.py
           :language: python
           :start-after: # example-begin
@@ -176,7 +174,7 @@ def exclusive_scan(
     Example:
         Below, ``exclusive_scan`` is used to compute an exclusive scan with max operation.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/scan/exclusive_scan_max.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/scan/exclusive_scan_max.py
             :language: python
             :start-after: # example-begin
@@ -209,7 +207,7 @@ def make_inclusive_scan(
     Example:
         Below, ``make_inclusive_scan`` is used to create an inclusive scan object that can be reused.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/scan/inclusive_scan_object.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/scan/inclusive_scan_object.py
           :language: python
           :start-after: # example-begin
@@ -242,7 +240,7 @@ def inclusive_scan(
     Example:
         Below, ``inclusive_scan`` is used to compute an inclusive scan (prefix sum).
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/scan/inclusive_scan_custom.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/scan/inclusive_scan_custom.py
             :language: python
             :start-after: # example-begin

cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py RENAMED Viewed

@@ -179,7 +179,7 @@ def make_segmented_reduce(
     Example:
         Below, ``make_segmented_reduce`` is used to create a segmented reduction object that can be reused.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/segmented/segmented_reduce_object.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/segmented/segmented_reduce_object.py
             :language: python
             :start-after: # example-begin
@@ -216,7 +216,7 @@ def segmented_reduce(
     Example:
         Below, ``segmented_reduce`` is used to compute the minimum value of segments in a sequence of integers.
-        .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/segmented/segmented_reduce_basic.py
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/segmented/segmented_reduce_basic.py
             :language: python
             :start-after: # example-begin