cuda-cccl 0.1.3.2.0.dev438__cp313-cp313-manylinux_2_24_aarch64.whl → 0.3.1__cp313-cp313-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/cooperative/__init__.py +7 -1
- cuda/cccl/cooperative/experimental/__init__.py +21 -5
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +25 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +24 -19
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
- cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +3 -2
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +18 -26
- cuda/cccl/headers/include/cub/device/device_copy.cuh +116 -27
- cuda/cccl/headers/include/cub/device/device_partition.cuh +5 -1
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +158 -247
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +4 -4
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +8 -26
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -6
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +320 -262
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +10 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +57 -10
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +37 -13
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +204 -55
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +55 -19
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
- cuda/cccl/headers/include/cub/util_device.cuh +51 -35
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
- cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +4 -4
- cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
- cuda/cccl/headers/include/cuda/__device/all_devices.h +47 -147
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +51 -49
- cuda/cccl/headers/include/cuda/__device/attributes.h +177 -127
- cuda/cccl/headers/include/cuda/__device/device_ref.h +32 -51
- cuda/cccl/headers/include/cuda/__device/physical_device.h +120 -91
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +330 -36
- cuda/cccl/headers/include/cuda/__event/event.h +8 -8
- cuda/cccl/headers/include/cuda/__event/event_ref.h +4 -5
- cuda/cccl/headers/include/cuda/__event/timed_event.h +4 -4
- cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
- cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +3 -3
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +3 -3
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +3 -3
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +158 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +8 -120
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +593 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +14 -10
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +4 -3
- cuda/cccl/headers/include/cuda/__runtime/types.h +1 -1
- cuda/cccl/headers/include/cuda/__stream/stream.h +2 -3
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -12
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +2 -2
- cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
- cuda/cccl/headers/include/cuda/algorithm +1 -1
- cuda/cccl/headers/include/cuda/devices +10 -0
- cuda/cccl/headers/include/cuda/iterator +1 -0
- cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
- cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
- cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
- cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +2 -0
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +10 -5
- cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +44 -17
- cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +1 -1
- cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +12 -12
- cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +1 -8
- cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +15 -12
- cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +3 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +2 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +3 -2
- cuda/cccl/headers/include/cuda/std/string_view +12 -5
- cuda/cccl/headers/include/cuda/std/version +1 -4
- cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +8 -22
- cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +15 -48
- cuda/cccl/parallel/experimental/__init__.py +21 -70
- cuda/compute/__init__.py +77 -0
- cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +28 -0
- cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +141 -1
- cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +4 -0
- cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
- cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -4
- cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +4 -6
- cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
- cuda/compute/algorithms/_three_way_partition.py +261 -0
- cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +4 -4
- cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
- cuda/compute/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +8 -8
- cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
- cuda/coop/__init__.py +8 -0
- cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
- cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
- cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
- cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
- cuda/coop/warp/__init__.py +9 -0
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/METADATA +1 -1
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/RECORD +171 -166
- cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
- cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +0 -72
- cuda/cccl/parallel/experimental/.gitignore +0 -4
- cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/numba_utils.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/typing.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/licenses/LICENSE +0 -0
cuda/compute/__init__.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
|
|
2
|
+
#
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
4
|
+
|
|
5
|
+
from .algorithms import (
|
|
6
|
+
DoubleBuffer,
|
|
7
|
+
SortOrder,
|
|
8
|
+
binary_transform,
|
|
9
|
+
exclusive_scan,
|
|
10
|
+
histogram_even,
|
|
11
|
+
inclusive_scan,
|
|
12
|
+
make_binary_transform,
|
|
13
|
+
make_exclusive_scan,
|
|
14
|
+
make_histogram_even,
|
|
15
|
+
make_inclusive_scan,
|
|
16
|
+
make_merge_sort,
|
|
17
|
+
make_radix_sort,
|
|
18
|
+
make_reduce_into,
|
|
19
|
+
make_segmented_reduce,
|
|
20
|
+
make_three_way_partition,
|
|
21
|
+
make_unary_transform,
|
|
22
|
+
make_unique_by_key,
|
|
23
|
+
merge_sort,
|
|
24
|
+
radix_sort,
|
|
25
|
+
reduce_into,
|
|
26
|
+
segmented_reduce,
|
|
27
|
+
three_way_partition,
|
|
28
|
+
unary_transform,
|
|
29
|
+
unique_by_key,
|
|
30
|
+
)
|
|
31
|
+
from .iterators import (
|
|
32
|
+
CacheModifiedInputIterator,
|
|
33
|
+
ConstantIterator,
|
|
34
|
+
CountingIterator,
|
|
35
|
+
ReverseIterator,
|
|
36
|
+
TransformIterator,
|
|
37
|
+
TransformOutputIterator,
|
|
38
|
+
ZipIterator,
|
|
39
|
+
)
|
|
40
|
+
from .op import OpKind
|
|
41
|
+
from .struct import gpu_struct
|
|
42
|
+
|
|
43
|
+
__all__ = [
|
|
44
|
+
"binary_transform",
|
|
45
|
+
"CacheModifiedInputIterator",
|
|
46
|
+
"ConstantIterator",
|
|
47
|
+
"CountingIterator",
|
|
48
|
+
"DoubleBuffer",
|
|
49
|
+
"exclusive_scan",
|
|
50
|
+
"gpu_struct",
|
|
51
|
+
"histogram_even",
|
|
52
|
+
"inclusive_scan",
|
|
53
|
+
"make_binary_transform",
|
|
54
|
+
"make_exclusive_scan",
|
|
55
|
+
"make_histogram_even",
|
|
56
|
+
"make_inclusive_scan",
|
|
57
|
+
"make_merge_sort",
|
|
58
|
+
"make_radix_sort",
|
|
59
|
+
"make_reduce_into",
|
|
60
|
+
"make_segmented_reduce",
|
|
61
|
+
"make_three_way_partition",
|
|
62
|
+
"make_unary_transform",
|
|
63
|
+
"make_unique_by_key",
|
|
64
|
+
"merge_sort",
|
|
65
|
+
"OpKind",
|
|
66
|
+
"radix_sort",
|
|
67
|
+
"reduce_into",
|
|
68
|
+
"ReverseIterator",
|
|
69
|
+
"segmented_reduce",
|
|
70
|
+
"SortOrder",
|
|
71
|
+
"TransformIterator",
|
|
72
|
+
"TransformOutputIterator",
|
|
73
|
+
"three_way_partition",
|
|
74
|
+
"unary_transform",
|
|
75
|
+
"unique_by_key",
|
|
76
|
+
"ZipIterator",
|
|
77
|
+
]
|
|
@@ -390,6 +390,7 @@ class DeviceHistogramBuildResult:
|
|
|
390
390
|
num_rows: int,
|
|
391
391
|
row_stride_samples: int,
|
|
392
392
|
is_evenly_segmented: bool,
|
|
393
|
+
info: CommonData,
|
|
393
394
|
): ...
|
|
394
395
|
def compute_even(
|
|
395
396
|
self,
|
|
@@ -403,3 +404,30 @@ class DeviceHistogramBuildResult:
|
|
|
403
404
|
row_stride_samples: int,
|
|
404
405
|
stream,
|
|
405
406
|
) -> None: ...
|
|
407
|
+
|
|
408
|
+
# ---------------------
|
|
409
|
+
# DeviceThreeWayPartition
|
|
410
|
+
# ---------------------
|
|
411
|
+
|
|
412
|
+
class DeviceThreeWayPartitionBuildResult:
|
|
413
|
+
def __init__(
|
|
414
|
+
self,
|
|
415
|
+
d_in: Iterator,
|
|
416
|
+
d_first_part_out: Iterator,
|
|
417
|
+
d_second_part_out: Iterator,
|
|
418
|
+
d_unselected_out: Iterator,
|
|
419
|
+
d_num_selected_out: Iterator,
|
|
420
|
+
select_first_part_op: Op,
|
|
421
|
+
select_second_part_op: Op,
|
|
422
|
+
info: CommonData,
|
|
423
|
+
): ...
|
|
424
|
+
def compute(
|
|
425
|
+
self,
|
|
426
|
+
d_in: Iterator,
|
|
427
|
+
d_first_part_out: Iterator,
|
|
428
|
+
d_second_part_out: Iterator,
|
|
429
|
+
d_unselected_out: Iterator,
|
|
430
|
+
d_num_selected_out: Iterator,
|
|
431
|
+
num_items: int,
|
|
432
|
+
stream,
|
|
433
|
+
) -> int: ...
|
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
|
|
5
5
|
# Python signatures are declared in the companion Python stub file _bindings.pyi
|
|
6
6
|
# Make sure to update PYI with change to Python API to ensure that Python
|
|
7
|
-
# static type checker tools like mypy green-lights cuda.
|
|
7
|
+
# static type checker tools like mypy green-light cuda.compute
|
|
8
8
|
|
|
9
9
|
from libc.string cimport memset, memcpy
|
|
10
10
|
from libc.stdint cimport uint8_t, uint32_t, uint64_t, int64_t, uintptr_t
|
|
@@ -1982,3 +1982,143 @@ cdef class DeviceHistogramBuildResult:
|
|
|
1982
1982
|
<const char*>self.build_data.cubin,
|
|
1983
1983
|
self.build_data.cubin_size
|
|
1984
1984
|
)
|
|
1985
|
+
|
|
1986
|
+
|
|
1987
|
+
# ----------------------------------
|
|
1988
|
+
# DeviceThreeWayPartitionBuildResult
|
|
1989
|
+
# ----------------------------------
|
|
1990
|
+
cdef extern from "cccl/c/three_way_partition.h":
|
|
1991
|
+
cdef struct cccl_device_three_way_partition_build_result_t 'cccl_device_three_way_partition_build_result_t':
|
|
1992
|
+
const char* cubin
|
|
1993
|
+
size_t cubin_size
|
|
1994
|
+
|
|
1995
|
+
cdef CUresult cccl_device_three_way_partition_build(
|
|
1996
|
+
cccl_device_three_way_partition_build_result_t *build_ptr,
|
|
1997
|
+
cccl_iterator_t d_in,
|
|
1998
|
+
cccl_iterator_t d_first_part_out,
|
|
1999
|
+
cccl_iterator_t d_second_part_out,
|
|
2000
|
+
cccl_iterator_t d_unselected_out,
|
|
2001
|
+
cccl_iterator_t d_num_selected_out,
|
|
2002
|
+
cccl_op_t select_first_part_op,
|
|
2003
|
+
cccl_op_t select_second_part_op,
|
|
2004
|
+
int, int, const char *, const char *, const char *, const char *
|
|
2005
|
+
) nogil
|
|
2006
|
+
|
|
2007
|
+
CUresult cccl_device_three_way_partition(
|
|
2008
|
+
cccl_device_three_way_partition_build_result_t build,
|
|
2009
|
+
void* d_temp_storage,
|
|
2010
|
+
size_t* temp_storage_bytes,
|
|
2011
|
+
cccl_iterator_t d_in,
|
|
2012
|
+
cccl_iterator_t d_first_part_out,
|
|
2013
|
+
cccl_iterator_t d_second_part_out,
|
|
2014
|
+
cccl_iterator_t d_unselected_out,
|
|
2015
|
+
cccl_iterator_t d_num_selected_out,
|
|
2016
|
+
cccl_op_t select_first_part_op,
|
|
2017
|
+
cccl_op_t select_second_part_op,
|
|
2018
|
+
int64_t num_items,
|
|
2019
|
+
CUstream stream
|
|
2020
|
+
) nogil
|
|
2021
|
+
|
|
2022
|
+
cdef CUresult cccl_device_three_way_partition_cleanup(
|
|
2023
|
+
cccl_device_three_way_partition_build_result_t *build_ptr
|
|
2024
|
+
) nogil
|
|
2025
|
+
|
|
2026
|
+
|
|
2027
|
+
cdef class DeviceThreeWayPartitionBuildResult:
|
|
2028
|
+
cdef cccl_device_three_way_partition_build_result_t build_data
|
|
2029
|
+
|
|
2030
|
+
def __dealloc__(DeviceThreeWayPartitionBuildResult self):
|
|
2031
|
+
cdef CUresult status = -1
|
|
2032
|
+
with nogil:
|
|
2033
|
+
status = cccl_device_three_way_partition_cleanup(&self.build_data)
|
|
2034
|
+
if (status != 0):
|
|
2035
|
+
print(f"Return code {status} encountered during three_way_partition result cleanup")
|
|
2036
|
+
|
|
2037
|
+
|
|
2038
|
+
def __cinit__(
|
|
2039
|
+
DeviceThreeWayPartitionBuildResult self,
|
|
2040
|
+
Iterator d_in,
|
|
2041
|
+
Iterator d_first_part_out,
|
|
2042
|
+
Iterator d_second_part_out,
|
|
2043
|
+
Iterator d_unselected_out,
|
|
2044
|
+
Iterator d_num_selected_out,
|
|
2045
|
+
Op select_first_part_op,
|
|
2046
|
+
Op select_second_part_op,
|
|
2047
|
+
CommonData common_data
|
|
2048
|
+
):
|
|
2049
|
+
cdef CUresult status = -1
|
|
2050
|
+
cdef int cc_major = common_data.get_cc_major()
|
|
2051
|
+
cdef int cc_minor = common_data.get_cc_minor()
|
|
2052
|
+
cdef const char *cub_path = common_data.cub_path_get_c_str()
|
|
2053
|
+
cdef const char *thrust_path = common_data.thrust_path_get_c_str()
|
|
2054
|
+
cdef const char *libcudacxx_path = common_data.libcudacxx_path_get_c_str()
|
|
2055
|
+
cdef const char *ctk_path = common_data.ctk_path_get_c_str()
|
|
2056
|
+
|
|
2057
|
+
memset(&self.build_data, 0, sizeof(cccl_device_three_way_partition_build_result_t))
|
|
2058
|
+
with nogil:
|
|
2059
|
+
status = cccl_device_three_way_partition_build(
|
|
2060
|
+
&self.build_data,
|
|
2061
|
+
d_in.iter_data,
|
|
2062
|
+
d_first_part_out.iter_data,
|
|
2063
|
+
d_second_part_out.iter_data,
|
|
2064
|
+
d_unselected_out.iter_data,
|
|
2065
|
+
d_num_selected_out.iter_data,
|
|
2066
|
+
select_first_part_op.op_data,
|
|
2067
|
+
select_second_part_op.op_data,
|
|
2068
|
+
cc_major,
|
|
2069
|
+
cc_minor,
|
|
2070
|
+
cub_path,
|
|
2071
|
+
thrust_path,
|
|
2072
|
+
libcudacxx_path,
|
|
2073
|
+
ctk_path,
|
|
2074
|
+
)
|
|
2075
|
+
if status != 0:
|
|
2076
|
+
raise RuntimeError(
|
|
2077
|
+
f"Failed building three_way_partition, error code: {status}"
|
|
2078
|
+
)
|
|
2079
|
+
|
|
2080
|
+
cpdef int compute(
|
|
2081
|
+
DeviceThreeWayPartitionBuildResult self,
|
|
2082
|
+
temp_storage_ptr,
|
|
2083
|
+
temp_storage_bytes,
|
|
2084
|
+
Iterator d_in,
|
|
2085
|
+
Iterator d_first_part_out,
|
|
2086
|
+
Iterator d_second_part_out,
|
|
2087
|
+
Iterator d_unselected_out,
|
|
2088
|
+
Iterator d_num_selected_out,
|
|
2089
|
+
Op select_first_part_op,
|
|
2090
|
+
Op select_second_part_op,
|
|
2091
|
+
size_t num_items,
|
|
2092
|
+
stream
|
|
2093
|
+
):
|
|
2094
|
+
cdef CUresult status = -1
|
|
2095
|
+
cdef void *storage_ptr = (<void *><uintptr_t>temp_storage_ptr) if temp_storage_ptr else NULL
|
|
2096
|
+
cdef size_t storage_sz = <size_t>temp_storage_bytes
|
|
2097
|
+
cdef CUstream c_stream = <CUstream><uintptr_t>(stream) if stream else NULL
|
|
2098
|
+
|
|
2099
|
+
with nogil:
|
|
2100
|
+
status = cccl_device_three_way_partition(
|
|
2101
|
+
self.build_data,
|
|
2102
|
+
storage_ptr,
|
|
2103
|
+
&storage_sz,
|
|
2104
|
+
d_in.iter_data,
|
|
2105
|
+
d_first_part_out.iter_data,
|
|
2106
|
+
d_second_part_out.iter_data,
|
|
2107
|
+
d_unselected_out.iter_data,
|
|
2108
|
+
d_num_selected_out.iter_data,
|
|
2109
|
+
select_first_part_op.op_data,
|
|
2110
|
+
select_second_part_op.op_data,
|
|
2111
|
+
<uint64_t>num_items,
|
|
2112
|
+
c_stream
|
|
2113
|
+
)
|
|
2114
|
+
if status != 0:
|
|
2115
|
+
raise RuntimeError(
|
|
2116
|
+
f"Failed executing three_way_partition, error code: {status}"
|
|
2117
|
+
)
|
|
2118
|
+
return storage_sz
|
|
2119
|
+
|
|
2120
|
+
def _get_cubin(self):
|
|
2121
|
+
return PyBytes_FromStringAndSize(
|
|
2122
|
+
<const char*>self.build_data.cubin,
|
|
2123
|
+
self.build_data.cubin_size
|
|
2124
|
+
)
|
|
@@ -18,6 +18,8 @@ from ._scan import make_exclusive_scan as make_exclusive_scan
|
|
|
18
18
|
from ._scan import make_inclusive_scan as make_inclusive_scan
|
|
19
19
|
from ._segmented_reduce import make_segmented_reduce as make_segmented_reduce
|
|
20
20
|
from ._segmented_reduce import segmented_reduce
|
|
21
|
+
from ._three_way_partition import make_three_way_partition as make_three_way_partition
|
|
22
|
+
from ._three_way_partition import three_way_partition as three_way_partition
|
|
21
23
|
from ._transform import binary_transform, unary_transform
|
|
22
24
|
from ._transform import make_binary_transform as make_binary_transform
|
|
23
25
|
from ._transform import make_unary_transform as make_unary_transform
|
|
@@ -45,6 +47,8 @@ __all__ = [
|
|
|
45
47
|
"make_segmented_reduce",
|
|
46
48
|
"unique_by_key",
|
|
47
49
|
"make_unique_by_key",
|
|
50
|
+
"three_way_partition",
|
|
51
|
+
"make_three_way_partition",
|
|
48
52
|
"DoubleBuffer",
|
|
49
53
|
"SortOrder",
|
|
50
54
|
]
|
|
@@ -148,7 +148,7 @@ def make_histogram_even(
|
|
|
148
148
|
Example:
|
|
149
149
|
Below, ``make_histogram_even`` is used to create a histogram object that can be reused.
|
|
150
150
|
|
|
151
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
151
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/histogram/histogram_object.py
|
|
152
152
|
:language: python
|
|
153
153
|
:start-after: # example-begin
|
|
154
154
|
|
|
@@ -190,7 +190,7 @@ def histogram_even(
|
|
|
190
190
|
Example:
|
|
191
191
|
Below, ``histogram_even`` is used to compute a histogram with evenly-spaced bins.
|
|
192
192
|
|
|
193
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
193
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/histogram/histogram_even_basic.py
|
|
194
194
|
:language: python
|
|
195
195
|
:start-after: # example-begin
|
|
196
196
|
:caption: Basic histogram example.
|
|
@@ -166,7 +166,7 @@ def make_merge_sort(
|
|
|
166
166
|
Example:
|
|
167
167
|
Below, ``make_merge_sort`` is used to create a merge sort object that can be reused.
|
|
168
168
|
|
|
169
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
169
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/sort/merge_sort_object.py
|
|
170
170
|
:language: python
|
|
171
171
|
:start-after: # example-begin
|
|
172
172
|
|
|
@@ -201,7 +201,7 @@ def merge_sort(
|
|
|
201
201
|
Example:
|
|
202
202
|
Below, ``merge_sort`` is used to sort a sequence of keys inplace. It also rearranges the items according to the keys' order.
|
|
203
203
|
|
|
204
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
204
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/sort/merge_sort_basic.py
|
|
205
205
|
:language: python
|
|
206
206
|
:start-after: # example-begin
|
|
207
207
|
|
|
@@ -222,7 +222,7 @@ def make_radix_sort(
|
|
|
222
222
|
Example:
|
|
223
223
|
Below, ``make_radix_sort`` is used to create a radix sort object that can be reused.
|
|
224
224
|
|
|
225
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
225
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/sort/radix_sort_object.py
|
|
226
226
|
:language: python
|
|
227
227
|
:start-after: # example-begin
|
|
228
228
|
|
|
@@ -259,14 +259,14 @@ def radix_sort(
|
|
|
259
259
|
Example:
|
|
260
260
|
Below, ``radix_sort`` is used to sort a sequence of keys. It also rearranges the values according to the keys' order.
|
|
261
261
|
|
|
262
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
262
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/sort/radix_sort_basic.py
|
|
263
263
|
:language: python
|
|
264
264
|
:start-after: # example-begin
|
|
265
265
|
|
|
266
266
|
|
|
267
267
|
In the following example, ``radix_sort`` is used to sort a sequence of keys with a ``DoubleBuffer`` for reduced temporary storage.
|
|
268
268
|
|
|
269
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
269
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/sort/radix_sort_buffer.py
|
|
270
270
|
:language: python
|
|
271
271
|
:start-after: # example-begin
|
|
272
272
|
|
|
@@ -3,8 +3,6 @@
|
|
|
3
3
|
#
|
|
4
4
|
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
5
5
|
|
|
6
|
-
from __future__ import annotations # TODO: required for Python 3.7 docs env
|
|
7
|
-
|
|
8
6
|
from typing import Callable, Union
|
|
9
7
|
|
|
10
8
|
import numba
|
|
@@ -132,7 +130,7 @@ def make_reduce_into(
|
|
|
132
130
|
Example:
|
|
133
131
|
Below, ``make_reduce_into`` is used to create a reduction object that can be reused.
|
|
134
132
|
|
|
135
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
133
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/reduction/reduce_object.py
|
|
136
134
|
:language: python
|
|
137
135
|
:start-after: # example-begin
|
|
138
136
|
|
|
@@ -165,7 +163,7 @@ def reduce_into(
|
|
|
165
163
|
Example:
|
|
166
164
|
Below, ``reduce_into`` is used to compute the sum of a sequence of integers.
|
|
167
165
|
|
|
168
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
166
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/reduction/sum_reduction.py
|
|
169
167
|
:language: python
|
|
170
168
|
:start-after: # example-begin
|
|
171
169
|
|
|
@@ -3,8 +3,6 @@
|
|
|
3
3
|
#
|
|
4
4
|
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
5
5
|
|
|
6
|
-
from __future__ import annotations # TODO: required for Python 3.7 docs env
|
|
7
|
-
|
|
8
6
|
from typing import Callable, Union
|
|
9
7
|
|
|
10
8
|
import numba
|
|
@@ -143,7 +141,7 @@ def make_exclusive_scan(
|
|
|
143
141
|
Example:
|
|
144
142
|
Below, ``make_exclusive_scan`` is used to create an exclusive scan object that can be reused.
|
|
145
143
|
|
|
146
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
144
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/scan/exclusive_scan_object.py
|
|
147
145
|
:language: python
|
|
148
146
|
:start-after: # example-begin
|
|
149
147
|
|
|
@@ -176,7 +174,7 @@ def exclusive_scan(
|
|
|
176
174
|
Example:
|
|
177
175
|
Below, ``exclusive_scan`` is used to compute an exclusive scan with max operation.
|
|
178
176
|
|
|
179
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
177
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/scan/exclusive_scan_max.py
|
|
180
178
|
:language: python
|
|
181
179
|
:start-after: # example-begin
|
|
182
180
|
|
|
@@ -209,7 +207,7 @@ def make_inclusive_scan(
|
|
|
209
207
|
Example:
|
|
210
208
|
Below, ``make_inclusive_scan`` is used to create an inclusive scan object that can be reused.
|
|
211
209
|
|
|
212
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
210
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/scan/inclusive_scan_object.py
|
|
213
211
|
:language: python
|
|
214
212
|
:start-after: # example-begin
|
|
215
213
|
|
|
@@ -242,7 +240,7 @@ def inclusive_scan(
|
|
|
242
240
|
Example:
|
|
243
241
|
Below, ``inclusive_scan`` is used to compute an inclusive scan (prefix sum).
|
|
244
242
|
|
|
245
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
243
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/scan/inclusive_scan_custom.py
|
|
246
244
|
:language: python
|
|
247
245
|
:start-after: # example-begin
|
|
248
246
|
|
|
@@ -179,7 +179,7 @@ def make_segmented_reduce(
|
|
|
179
179
|
Example:
|
|
180
180
|
Below, ``make_segmented_reduce`` is used to create a segmented reduction object that can be reused.
|
|
181
181
|
|
|
182
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
182
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/segmented/segmented_reduce_object.py
|
|
183
183
|
:language: python
|
|
184
184
|
:start-after: # example-begin
|
|
185
185
|
|
|
@@ -216,7 +216,7 @@ def segmented_reduce(
|
|
|
216
216
|
Example:
|
|
217
217
|
Below, ``segmented_reduce`` is used to compute the minimum value of segments in a sequence of integers.
|
|
218
218
|
|
|
219
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
219
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/segmented/segmented_reduce_basic.py
|
|
220
220
|
:language: python
|
|
221
221
|
:start-after: # example-begin
|
|
222
222
|
|