cuda-cccl 0.1.3.2.0.dev438__cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.1__cp312-cp312-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this version of cuda-cccl has been flagged as a potentially problematic release.
- cuda/cccl/cooperative/__init__.py +7 -1
- cuda/cccl/cooperative/experimental/__init__.py +21 -5
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +25 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +24 -19
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
- cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +3 -2
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +18 -26
- cuda/cccl/headers/include/cub/device/device_copy.cuh +116 -27
- cuda/cccl/headers/include/cub/device/device_partition.cuh +5 -1
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +158 -247
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +4 -4
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +8 -26
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -6
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +320 -262
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +10 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +57 -10
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +37 -13
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +204 -55
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +55 -19
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
- cuda/cccl/headers/include/cub/util_device.cuh +51 -35
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
- cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +4 -4
- cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
- cuda/cccl/headers/include/cuda/__device/all_devices.h +47 -147
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +51 -49
- cuda/cccl/headers/include/cuda/__device/attributes.h +177 -127
- cuda/cccl/headers/include/cuda/__device/device_ref.h +32 -51
- cuda/cccl/headers/include/cuda/__device/physical_device.h +120 -91
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +330 -36
- cuda/cccl/headers/include/cuda/__event/event.h +8 -8
- cuda/cccl/headers/include/cuda/__event/event_ref.h +4 -5
- cuda/cccl/headers/include/cuda/__event/timed_event.h +4 -4
- cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
- cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +3 -3
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +3 -3
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +3 -3
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +158 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +8 -120
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +593 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +14 -10
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +4 -3
- cuda/cccl/headers/include/cuda/__runtime/types.h +1 -1
- cuda/cccl/headers/include/cuda/__stream/stream.h +2 -3
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -12
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +2 -2
- cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
- cuda/cccl/headers/include/cuda/algorithm +1 -1
- cuda/cccl/headers/include/cuda/devices +10 -0
- cuda/cccl/headers/include/cuda/iterator +1 -0
- cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
- cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
- cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
- cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +2 -0
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +10 -5
- cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +44 -17
- cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +1 -1
- cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +12 -12
- cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +1 -8
- cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +15 -12
- cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +3 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +2 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +3 -2
- cuda/cccl/headers/include/cuda/std/string_view +12 -5
- cuda/cccl/headers/include/cuda/std/version +1 -4
- cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +8 -22
- cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +15 -48
- cuda/cccl/parallel/experimental/__init__.py +21 -70
- cuda/compute/__init__.py +77 -0
- cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +28 -0
- cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +141 -1
- cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +4 -0
- cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
- cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -4
- cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +4 -6
- cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
- cuda/compute/algorithms/_three_way_partition.py +261 -0
- cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +4 -4
- cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
- cuda/compute/cu12/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/cu13/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +8 -8
- cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
- cuda/coop/__init__.py +8 -0
- cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
- cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
- cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
- cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
- cuda/coop/warp/__init__.py +9 -0
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/METADATA +1 -1
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/RECORD +171 -166
- cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
- cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +0 -72
- cuda/cccl/parallel/experimental/.gitignore +0 -4
- cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/numba_utils.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/typing.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/licenses/LICENSE +0 -0
cuda/compute/algorithms/_three_way_partition.py (ADDED)
@@ -0,0 +1,261 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+from typing import Callable
+
+import numba
+
+from .. import _bindings
+from .. import _cccl_interop as cccl
+from .._caching import CachableFunction, cache_with_key
+from .._cccl_interop import call_build, set_cccl_iterator_state
+from .._utils import protocols
+from .._utils.temp_storage_buffer import TempStorageBuffer
+from ..iterators._iterators import IteratorBase
+from ..typing import DeviceArrayLike
+
+
+def make_cache_key(
+    d_in: DeviceArrayLike | IteratorBase,
+    d_first_part_out: DeviceArrayLike | IteratorBase,
+    d_second_part_out: DeviceArrayLike | IteratorBase,
+    d_unselected_out: DeviceArrayLike | IteratorBase,
+    d_num_selected_out: DeviceArrayLike | IteratorBase,
+    select_first_part_op: Callable,
+    select_second_part_op: Callable,
+):
+    d_in_key = (
+        d_in.kind if isinstance(d_in, IteratorBase) else protocols.get_dtype(d_in)
+    )
+    d_first_part_out_key = (
+        d_first_part_out.kind
+        if isinstance(d_first_part_out, IteratorBase)
+        else protocols.get_dtype(d_first_part_out)
+    )
+    d_second_part_out_key = (
+        d_second_part_out.kind
+        if isinstance(d_second_part_out, IteratorBase)
+        else protocols.get_dtype(d_second_part_out)
+    )
+    d_unselected_out_key = (
+        d_unselected_out.kind
+        if isinstance(d_unselected_out, IteratorBase)
+        else protocols.get_dtype(d_unselected_out)
+    )
+    d_num_selected_out_key = (
+        d_num_selected_out.kind
+        if isinstance(d_num_selected_out, IteratorBase)
+        else protocols.get_dtype(d_num_selected_out)
+    )
+    select_first_part_op_key = CachableFunction(select_first_part_op)
+    select_second_part_op_key = CachableFunction(select_second_part_op)
+    return (
+        d_in_key,
+        d_first_part_out_key,
+        d_second_part_out_key,
+        d_unselected_out_key,
+        d_num_selected_out_key,
+        select_first_part_op_key,
+        select_second_part_op_key,
+    )
+
+
+class _ThreeWayPartition:
+    __slots__ = [
+        "build_result",
+        "d_in_cccl",
+        "d_first_part_out_cccl",
+        "d_second_part_out_cccl",
+        "d_unselected_out_cccl",
+        "d_num_selected_out_cccl",
+        "select_first_part_op_wrapper",
+        "select_second_part_op_wrapper",
+    ]
+
+    def __init__(
+        self,
+        d_in: DeviceArrayLike | IteratorBase,
+        d_first_part_out: DeviceArrayLike | IteratorBase,
+        d_second_part_out: DeviceArrayLike | IteratorBase,
+        d_unselected_out: DeviceArrayLike | IteratorBase,
+        d_num_selected_out: DeviceArrayLike | IteratorBase,
+        select_first_part_op: Callable,
+        select_second_part_op: Callable,
+    ):
+        self.d_in_cccl = cccl.to_cccl_input_iter(d_in)
+        self.d_first_part_out_cccl = cccl.to_cccl_output_iter(d_first_part_out)
+        self.d_second_part_out_cccl = cccl.to_cccl_output_iter(d_second_part_out)
+        self.d_unselected_out_cccl = cccl.to_cccl_output_iter(d_unselected_out)
+        self.d_num_selected_out_cccl = cccl.to_cccl_output_iter(d_num_selected_out)
+
+        value_type = cccl.get_value_type(d_in)
+        sig = numba.types.uint8(value_type)
+
+        # There are no well-known operations that can be used with three_way_partition
+        self.select_first_part_op_wrapper = cccl.to_cccl_op(select_first_part_op, sig)
+        self.select_second_part_op_wrapper = cccl.to_cccl_op(select_second_part_op, sig)
+
+        self.build_result = call_build(
+            _bindings.DeviceThreeWayPartitionBuildResult,
+            self.d_in_cccl,
+            self.d_first_part_out_cccl,
+            self.d_second_part_out_cccl,
+            self.d_unselected_out_cccl,
+            self.d_num_selected_out_cccl,
+            self.select_first_part_op_wrapper,
+            self.select_second_part_op_wrapper,
+        )
+
+    def __call__(
+        self,
+        temp_storage,
+        d_in,
+        d_first_part_out,
+        d_second_part_out,
+        d_unselected_out,
+        d_num_selected_out,
+        num_items: int,
+        stream=None,
+    ):
+        set_cccl_iterator_state(self.d_in_cccl, d_in)
+        set_cccl_iterator_state(self.d_first_part_out_cccl, d_first_part_out)
+        set_cccl_iterator_state(self.d_second_part_out_cccl, d_second_part_out)
+        set_cccl_iterator_state(self.d_unselected_out_cccl, d_unselected_out)
+        set_cccl_iterator_state(self.d_num_selected_out_cccl, d_num_selected_out)
+        stream_handle = protocols.validate_and_get_stream(stream)
+
+        if temp_storage is None:
+            temp_storage_bytes = 0
+            d_temp_storage = 0
+        else:
+            temp_storage_bytes = temp_storage.nbytes
+            d_temp_storage = protocols.get_data_pointer(temp_storage)
+
+        temp_storage_bytes = self.build_result.compute(
+            d_temp_storage,
+            temp_storage_bytes,
+            self.d_in_cccl,
+            self.d_first_part_out_cccl,
+            self.d_second_part_out_cccl,
+            self.d_unselected_out_cccl,
+            self.d_num_selected_out_cccl,
+            self.select_first_part_op_wrapper,
+            self.select_second_part_op_wrapper,
+            num_items,
+            stream_handle,
+        )
+        return temp_storage_bytes
+
+
+@cache_with_key(make_cache_key)
+def make_three_way_partition(
+    d_in: DeviceArrayLike | IteratorBase,
+    d_first_part_out: DeviceArrayLike | IteratorBase,
+    d_second_part_out: DeviceArrayLike | IteratorBase,
+    d_unselected_out: DeviceArrayLike | IteratorBase,
+    d_num_selected_out: DeviceArrayLike | IteratorBase,
+    select_first_part_op: Callable,
+    select_second_part_op: Callable,
+):
+    """
+    Computes a device-wide three-way partition using the specified unary ``select_first_part_op`` and ``select_second_part_op`` operators.
+
+    Example:
+        Below, ``make_three_way_partition`` is used to create a three-way partition object that can be reused.
+
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/three_way_partition/three_way_partition_object.py
+            :language: python
+            :start-after: # example-begin
+
+    Args:
+        d_in: Device array or iterator containing the input sequence of data items
+        d_first_part_out: Device array or iterator to store the first part of the output
+        d_second_part_out: Device array or iterator to store the second part of the output
+        d_unselected_out: Device array or iterator to store the unselected items
+        d_num_selected_out: Device array to store the number of items selected. The total number of items selected by ``select_first_part_op`` and ``select_second_part_op`` is stored in ``d_num_selected_out[0]`` and ``d_num_selected_out[1]``, respectively.
+        select_first_part_op: Callable representing the unary operator to select the first part
+        select_second_part_op: Callable representing the unary operator to select the second part
+
+    Returns:
+        A callable object that can be used to perform the three-way partition
+    """
+    return _ThreeWayPartition(
+        d_in,
+        d_first_part_out,
+        d_second_part_out,
+        d_unselected_out,
+        d_num_selected_out,
+        select_first_part_op,
+        select_second_part_op,
+    )
+
+
+def three_way_partition(
+    d_in: DeviceArrayLike | IteratorBase,
+    d_first_part_out: DeviceArrayLike | IteratorBase,
+    d_second_part_out: DeviceArrayLike | IteratorBase,
+    d_unselected_out: DeviceArrayLike | IteratorBase,
+    d_num_selected_out: DeviceArrayLike | IteratorBase,
+    select_first_part_op: Callable,
+    select_second_part_op: Callable,
+    num_items: int,
+    stream=None,
+):
+    """
+    Performs device-wide three-way partition. Given an input sequence of data items, it partitions the items into three parts:
+    - The first part is selected by the ``select_first_part_op`` operator.
+    - The second part is selected by the ``select_second_part_op`` operator.
+    - The unselected items are not selected by either operator.
+
+    This function automatically handles temporary storage allocation and execution.
+
+    Example:
+        Below, ``three_way_partition`` is used to partition a sequence of integers into three parts.
+
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/three_way_partition/three_way_partition_basic.py
+            :language: python
+            :start-after: # example-begin
+
+    Args:
+        d_in: Device array or iterator containing the input sequence of data items
+        d_first_part_out: Device array or iterator to store the first part of the output
+        d_second_part_out: Device array or iterator to store the second part of the output
+        d_unselected_out: Device array or iterator to store the unselected items
+        d_num_selected_out: Device array to store the number of items selected. The total number of items selected by ``select_first_part_op`` and ``select_second_part_op`` is stored in ``d_num_selected_out[0]`` and ``d_num_selected_out[1]``, respectively.
+        select_first_part_op: Callable representing the unary operator to select the first part
+        select_second_part_op: Callable representing the unary operator to select the second part
+        num_items: Number of items to partition
+        stream: CUDA stream for the operation (optional)
+    """
+    partitioner = make_three_way_partition(
+        d_in,
+        d_first_part_out,
+        d_second_part_out,
+        d_unselected_out,
+        d_num_selected_out,
+        select_first_part_op,
+        select_second_part_op,
+    )
+    tmp_storage_bytes = partitioner(
+        None,
+        d_in,
+        d_first_part_out,
+        d_second_part_out,
+        d_unselected_out,
+        d_num_selected_out,
+        num_items,
+        stream,
+    )
+    tmp_storage = TempStorageBuffer(tmp_storage_bytes, stream)
+    partitioner(
+        tmp_storage,
+        d_in,
+        d_first_part_out,
+        d_second_part_out,
+        d_unselected_out,
+        d_num_selected_out,
+        num_items,
+        stream,
+    )
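For orientation, a minimal usage sketch of the new one-shot entry point follows, based only on the signature and docstring in the hunk above. The cuda.compute.algorithms import path and the use of CuPy arrays as DeviceArrayLike inputs are assumptions made for illustration, not something this diff confirms:

# Hypothetical usage sketch (import path and array library are assumptions).
import cupy as cp
import numpy as np

from cuda.compute.algorithms import three_way_partition  # assumed re-export


def select_small(x):
    return x < 10  # first-part predicate; wrapped as a uint8-returning op per the source above


def select_large(x):
    return x > 90  # second-part predicate


d_in = cp.arange(100, dtype=np.int32)
d_first_part = cp.empty_like(d_in)
d_second_part = cp.empty_like(d_in)
d_unselected = cp.empty_like(d_in)
d_num_selected = cp.empty(2, dtype=np.int64)  # counts for the two selected parts (dtype assumed)

# One-shot form: temporary storage is sized and allocated internally
# (see the two calls through TempStorageBuffer in the hunk above).
three_way_partition(
    d_in,
    d_first_part,
    d_second_part,
    d_unselected,
    d_num_selected,
    select_small,
    select_large,
    d_in.size,
)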
cuda/compute/algorithms/_transform.py
@@ -196,7 +196,7 @@ def make_unary_transform(
     storage allocation. For simpler usage, consider using :func:`unary_transform`.
 
     Example:
-        .. literalinclude:: ../../python/cuda_cccl/tests/
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/transform/unary_transform_object.py
            :language: python
            :start-after: # example-begin
 
@@ -227,7 +227,7 @@ def make_binary_transform(
     storage allocation. For simpler usage, consider using :func:`binary_transform`.
 
     Example:
-        .. literalinclude:: ../../python/cuda_cccl/tests/
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/transform/binary_transform_object.py
            :language: python
            :start-after: # example-begin
 
@@ -259,7 +259,7 @@ def unary_transform(
     Example:
        Below, ``unary_transform`` is used to apply a transformation to each element of the input.
 
-        .. literalinclude:: ../../python/cuda_cccl/tests/
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/transform/unary_transform_basic.py
            :language: python
            :start-after: # example-begin
 
@@ -291,7 +291,7 @@ def binary_transform(
     Example:
        Below, ``binary_transform`` is used to apply a transformation to pairs of elements from two input sequences.
 
-        .. literalinclude:: ../../python/cuda_cccl/tests/
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/transform/binary_transform_basic.py
            :language: python
            :start-after: # example-begin
 

cuda/compute/algorithms/_unique_by_key.py
@@ -171,7 +171,7 @@ def make_unique_by_key(
     Example:
        Below, ``make_unique_by_key`` is used to create a unique by key object that can be reused.
 
-        .. literalinclude:: ../../python/cuda_cccl/tests/
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/unique/unique_by_key_object.py
            :language: python
            :start-after: # example-begin
 
@@ -211,7 +211,7 @@ def unique_by_key(
     Example:
        Below, ``unique_by_key`` is used to populate the arrays of output keys and items with the first key and its corresponding item from each sequence of equal keys. It also outputs the number of items selected.
 
-        .. literalinclude:: ../../python/cuda_cccl/tests/
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/unique/unique_by_key_basic.py
            :language: python
            :start-after: # example-begin
 

Binary files changed (4); contents not shown.
cuda/compute/iterators/_factories.py
@@ -26,7 +26,7 @@ def CacheModifiedInputIterator(device_array, modifier):
     Example:
        The code snippet below demonstrates the usage of a ``CacheModifiedInputIterator``:
 
-        .. literalinclude:: ../../python/cuda_cccl/tests/
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/cache_modified_iterator_basic.py
            :language: python
            :start-after: # example-begin
 
@@ -55,7 +55,7 @@ def ConstantIterator(value):
        The code snippet below demonstrates the usage of a ``ConstantIterator``
        representing a sequence of constant values:
 
-        .. literalinclude:: ../../python/cuda_cccl/tests/
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/constant_iterator_basic.py
            :language: python
            :start-after: # example-begin
 
@@ -78,7 +78,7 @@ def CountingIterator(offset):
        The code snippet below demonstrates the usage of a ``CountingIterator``
        representing the sequence ``[10, 11, 12]``:
 
-        .. literalinclude:: ../../python/cuda_cccl/tests/
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/counting_iterator_basic.py
            :language: python
            :start-after: # example-begin
 
@@ -100,13 +100,13 @@ def ReverseIterator(sequence):
     Examples:
        The code snippet below demonstrates the usage of a ``ReverseIterator`` as an input iterator:
 
-        .. literalinclude:: ../../python/cuda_cccl/tests/
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/reverse_input_iterator.py
            :language: python
            :start-after: # example-begin
 
        The code snippet below demonstrates the usage of a ``ReverseIterator`` as an output iterator:
 
-        .. literalinclude:: ../../python/cuda_cccl/tests/
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/reverse_output_iterator.py
            :language: python
            :start-after: # example-begin
 
@@ -129,7 +129,7 @@ def TransformIterator(it, op):
        The code snippet below demonstrates the usage of a ``TransformIterator`` composed with a ``CountingIterator``
        to transform the input before performing a reduction.
 
-        .. literalinclude:: ../../python/cuda_cccl/tests/
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/transform_iterator_basic.py
            :language: python
            :start-after: # example-begin
     Args:
@@ -151,7 +151,7 @@ def TransformOutputIterator(it, op):
        The code snippet below demonstrates the usage of a ``TransformOutputIterator`` to transform the output
        of a reduction before writing to an output array.
 
-        .. literalinclude:: ../../python/cuda_cccl/tests/
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/transform_output_iterator.py
            :language: python
            :start-after: # example-begin
 
@@ -178,7 +178,7 @@ def ZipIterator(*iterators):
        The code snippet below demonstrates the usage of a ``ZipIterator``
        combining two device arrays:
 
-        .. literalinclude:: ../../python/cuda_cccl/tests/
+        .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/zip_iterator_elementwise.py
            :language: python
            :start-after: # example-begin
 
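The factory signatures in the hunk headers above (CountingIterator(offset), TransformIterator(it, op), ZipIterator(*iterators), and so on) are enough to sketch how these iterators compose. The cuda.compute.iterators import path and the CuPy array are assumptions for illustration:

# Hypothetical composition sketch; import path and array library are assumptions.
import cupy as cp
import numpy as np

from cuda.compute.iterators import (
    ConstantIterator,
    CountingIterator,
    TransformIterator,
    ZipIterator,
)


def square(x):
    return x * x


counting = CountingIterator(np.int32(10))      # lazy sequence 10, 11, 12, ...
squares = TransformIterator(counting, square)  # 100, 121, 144, ... computed on the fly
ones = ConstantIterator(np.int32(1))           # 1, 1, 1, ...

d_a = cp.arange(8, dtype=np.int32)
pairs = ZipIterator(d_a, squares)              # yields (d_a[i], (10 + i) ** 2) pairs

# Any of these can then be passed wherever a cuda.compute algorithm accepts an
# input iterator, for example as d_in of a reduction or transform.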
cuda/compute/struct.py
@@ -207,7 +207,7 @@ def gpu_struct(this: type) -> Type[GpuStruct]:
     to a dataclass). The type of each field must be a subclass of
     `np.number`, like `np.int32` or `np.float64`.
 
-    Arrays of GPUStruct objects can be used as inputs to cuda.
+    Arrays of GPUStruct objects can be used as inputs to cuda.compute
     algorithms.
 
     Example:
@@ -216,7 +216,7 @@ def gpu_struct(this: type) -> Type[GpuStruct]:
     a reduction on an input array of floating point values to compute its
     the smallest and the largest absolute values:
 
-    .. literalinclude:: ../../python/cuda_cccl/tests/
+    .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/reduction/minmax_reduction.py
        :language: python
        :start-after: # example-begin
 
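Based on the docstring context above (fields must be subclasses of np.number; arrays of GpuStruct values feed cuda.compute algorithms), a minimal sketch of the decorator looks like the following. The import location is an assumption; the diff only shows struct.py moving under cuda/compute:

# Hypothetical sketch of the gpu_struct decorator; import path is assumed.
import numpy as np

from cuda.compute.struct import gpu_struct  # assumed location of the decorator


@gpu_struct
class MinMax:
    # Each field must be a subclass of np.number, per the docstring above.
    min_val: np.float64
    max_val: np.float64


# A host-side instance such as this can serve as the initial value of the
# min/max reduction referenced by the minmax_reduction.py example above.
h_init = MinMax(np.inf, -np.inf)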
cuda/coop/__init__.py (ADDED)

cuda/coop/_nvrtc.py
@@ -5,8 +5,9 @@
 import functools
 
 from cuda.bindings import nvrtc
-
-from
+
+from ._caching import disk_cache
+from ._common import check_in, version
 
 
 def CHECK_NVRTC(err, prog):

cuda/coop/_scan_op.py
@@ -3,8 +3,8 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 """
-cuda.
-
+cuda.coop._scan_op
+==================
 
 This module implements the ``ScanOp`` class and related functions.
 """
@@ -14,7 +14,7 @@ from enum import Enum
 
 import numpy as np
 
-from
+from ._typing import (
     ScanOpType,
 )
 

cuda/coop/_types.py
@@ -17,8 +17,8 @@ from numba.core.typing import signature
 from numba.cuda import LTOIR
 from numba.cuda.cudadrv import driver as cuda_driver
 
-import
-from
+from . import _nvrtc as nvrtc
+from ._common import find_unsigned
 
 NUMBA_TYPES_TO_CPP = {
     types.boolean: "bool",

cuda/coop/_typing.py
@@ -9,7 +9,7 @@ if TYPE_CHECKING:
     import numba
     import numpy as np
 
-    from
+    from ._common import dim3
 
 # Type alias for dimension parameters that can be passed to CUDA functions.
 DimType = Union["dim3", int, Tuple[int, int], Tuple[int, int, int]]

cuda/coop/block/__init__.py
@@ -2,18 +2,18 @@
 #
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-from
+from ._block_exchange import (
     BlockExchangeType,
     exchange,
 )
-from
-from
-from
+from ._block_load_store import load, store
+from ._block_merge_sort import merge_sort_keys
+from ._block_radix_sort import (
     radix_sort_keys,
     radix_sort_keys_descending,
 )
-from
-from
+from ._block_reduce import reduce, sum
+from ._block_scan import (
     exclusive_scan,
     exclusive_sum,
     inclusive_scan,

cuda/coop/block/_block_exchange.py
@@ -3,7 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 """
-cuda.
+cuda.coop.block_exchange
 ====================================
 
 This module provides a set of :ref:`collective <collective-primitives>` methods
@@ -105,13 +105,13 @@ def exchange(
         perform. Currently, only :py:attr:`StripedToBlocked` is supported.
 
     :param dtype: Supplies the data type of the input and output arrays.
-    :type dtype: :py:class:`cuda.
+    :type dtype: :py:class:`cuda.coop._typing.DtypeType`
 
     :param threads_per_block: Supplies the number of threads in the block,
         either as an integer for a 1D block or a tuple of two or three integers
         for a 2D or 3D block, respectively.
     :type threads_per_block:
-        :py:class:`cuda.
+        :py:class:`cuda.coop._typing.DimType`
 
     :param items_per_thread: Supplies the number of items partitioned onto each
         thread.
@@ -137,7 +137,7 @@ def exchange(
     :raises ValueError: If ``items_per_thread`` is greater than 1 and
         ``methods`` is not *None* (i.e. a user-defined type is being used).
 
-    :returns: An :py:class:`cuda.
+    :returns: An :py:class:`cuda.coop._types.Invocable`
         object representing the specialized kernel that call be called from
         a Numba JIT'd CUDA kernel.
 

cuda/coop/block/_block_load_store.py
@@ -5,12 +5,12 @@
 
 import numba
 
-from
+from .._common import (
     make_binary_tempfile,
     normalize_dim_param,
     normalize_dtype_param,
 )
-from
+from .._types import (
     Algorithm,
     Dependency,
     DependentArray,
@@ -70,13 +70,13 @@ def load(dtype, threads_per_block, items_per_thread=1, algorithm="direct"):
     The code snippet below illustrates a striped load and store of 128 integer items by 32 threads, with
     each thread handling 4 integers.
 
-    .. literalinclude:: ../../python/cuda_cccl/tests/
+    .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_load_store_api.py
        :language: python
        :dedent:
        :start-after: example-begin imports
        :end-before: example-end imports
 
-    .. literalinclude:: ../../python/cuda_cccl/tests/
+    .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_load_store_api.py
        :language: python
        :dedent:
        :start-after: example-begin load_store
@@ -158,13 +158,13 @@ def store(dtype, threads_per_block, items_per_thread=1, algorithm="direct"):
     The code snippet below illustrates a striped load and store of 128 integer items by 32 threads, with
     each thread handling 4 integers.
 
-    .. literalinclude:: ../../python/cuda_cccl/tests/
+    .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_load_store_api.py
        :language: python
        :dedent:
        :start-after: example-begin imports
        :end-before: example-end imports
 
-    .. literalinclude:: ../../python/cuda_cccl/tests/
+    .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_load_store_api.py
        :language: python
        :dedent:
        :start-after: example-begin load_store

cuda/coop/block/_block_merge_sort.py
@@ -6,12 +6,12 @@ from typing import TYPE_CHECKING, Callable, Literal, Union
 
 import numba
 
-from
+from .._common import (
     make_binary_tempfile,
     normalize_dim_param,
     normalize_dtype_param,
 )
-from
+from .._types import (
     Algorithm,
     Constant,
     Dependency,
@@ -41,7 +41,7 @@ def merge_sort_keys(
     are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
     where each thread owns 4 consecutive keys. We start by importing necessary modules:
 
-    .. literalinclude:: ../../python/cuda_cccl/tests/
+    .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_merge_sort_api.py
        :language: python
        :dedent:
        :start-after: example-begin imports
@@ -49,7 +49,7 @@ def merge_sort_keys(
 
     Below is the code snippet that demonstrates the usage of the ``merge_sort_keys`` API:
 
-    .. literalinclude:: ../../python/cuda_cccl/tests/
+    .. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_merge_sort_api.py
        :language: python
        :dedent:
        :start-after: example-begin merge-sort