cuda-cccl 0.3.0__cp310-cp310-manylinux_2_26_x86_64.whl → 0.3.1__cp310-cp310-manylinux_2_26_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/cooperative/__init__.py +7 -1
- cuda/cccl/cooperative/experimental/__init__.py +21 -5
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +3 -2
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +158 -247
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +4 -4
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +8 -26
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -6
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
- cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
- cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +48 -46
- cuda/cccl/headers/include/cuda/__device/attributes.h +171 -121
- cuda/cccl/headers/include/cuda/__device/device_ref.h +30 -42
- cuda/cccl/headers/include/cuda/__device/physical_device.h +120 -91
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
- cuda/cccl/headers/include/cuda/__event/event.h +1 -0
- cuda/cccl/headers/include/cuda/__event/timed_event.h +1 -0
- cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
- cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +158 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +8 -120
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +593 -0
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +4 -3
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +1 -0
- cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
- cuda/cccl/headers/include/cuda/algorithm +1 -1
- cuda/cccl/headers/include/cuda/devices +10 -0
- cuda/cccl/headers/include/cuda/iterator +1 -0
- cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
- cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
- cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
- cuda/cccl/headers/include/cuda/std/string_view +12 -5
- cuda/cccl/headers/include/cuda/std/version +1 -4
- cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
- cuda/cccl/parallel/experimental/__init__.py +21 -74
- cuda/compute/__init__.py +77 -0
- cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +1 -1
- cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
- cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +4 -4
- cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +4 -4
- cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
- cuda/compute/cu12/_bindings_impl.cpython-310-x86_64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/cu13/_bindings_impl.cpython-310-x86_64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +8 -8
- cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
- cuda/coop/__init__.py +8 -0
- cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
- cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
- cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
- cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
- cuda/coop/warp/__init__.py +9 -0
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/METADATA +1 -1
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/RECORD +141 -138
- cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
- cuda/cccl/parallel/experimental/.gitignore +0 -4
- cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-310-x86_64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-310-x86_64-linux-gnu.so +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/numba_utils.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/typing.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,3 +1,9 @@
|
|
|
1
1
|
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
|
|
2
2
|
#
|
|
3
|
-
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
4
|
+
|
|
5
|
+
from . import experimental
|
|
6
|
+
|
|
7
|
+
__all__ = [
|
|
8
|
+
"experimental",
|
|
9
|
+
]
|
|
@@ -1,8 +1,24 @@
|
|
|
1
|
-
# Copyright (c)
|
|
1
|
+
# Copyright (c) 2025, NVIDIA CORPORATION.
|
|
2
2
|
#
|
|
3
|
-
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
# alias for backwards compatibility
|
|
16
|
+
|
|
17
|
+
from warnings import warn
|
|
4
18
|
|
|
5
|
-
from cuda.
|
|
6
|
-
from cuda.cccl.cooperative.experimental._types import StatefulFunction
|
|
19
|
+
from cuda.coop import * # noqa: F403
|
|
7
20
|
|
|
8
|
-
|
|
21
|
+
warn(
|
|
22
|
+
"The module cuda.cccl.cooperative.experimental is deprecated. Use cuda.coop instead.",
|
|
23
|
+
FutureWarning,
|
|
24
|
+
)
|
|
@@ -64,9 +64,7 @@ struct AgentAdjacentDifferencePolicy
|
|
|
64
64
|
static constexpr cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;
|
|
65
65
|
};
|
|
66
66
|
|
|
67
|
-
namespace detail
|
|
68
|
-
{
|
|
69
|
-
namespace adjacent_difference
|
|
67
|
+
namespace detail::adjacent_difference
|
|
70
68
|
{
|
|
71
69
|
|
|
72
70
|
template <typename Policy,
|
|
@@ -256,7 +254,6 @@ struct AgentDifferenceInit
|
|
|
256
254
|
}
|
|
257
255
|
};
|
|
258
256
|
|
|
259
|
-
} // namespace adjacent_difference
|
|
260
|
-
} // namespace detail
|
|
257
|
+
} // namespace detail::adjacent_difference
|
|
261
258
|
|
|
262
259
|
CUB_NAMESPACE_END
|
|
@@ -62,9 +62,7 @@
|
|
|
62
62
|
|
|
63
63
|
CUB_NAMESPACE_BEGIN
|
|
64
64
|
|
|
65
|
-
namespace detail
|
|
66
|
-
{
|
|
67
|
-
namespace batch_memcpy
|
|
65
|
+
namespace detail::batch_memcpy
|
|
68
66
|
{
|
|
69
67
|
template <bool PTR_IS_FOUR_BYTE_ALIGNED>
|
|
70
68
|
_CCCL_FORCEINLINE _CCCL_DEVICE void
|
|
@@ -1179,7 +1177,6 @@ private:
|
|
|
1179
1177
|
// buffers
|
|
1180
1178
|
BLevBlockOffsetTileState blev_block_scan_state;
|
|
1181
1179
|
};
|
|
1182
|
-
} // namespace batch_memcpy
|
|
1183
|
-
} // namespace detail
|
|
1180
|
+
} // namespace detail::batch_memcpy
|
|
1184
1181
|
|
|
1185
1182
|
CUB_NAMESPACE_END
|
|
@@ -42,9 +42,7 @@
|
|
|
42
42
|
|
|
43
43
|
CUB_NAMESPACE_BEGIN
|
|
44
44
|
|
|
45
|
-
namespace detail
|
|
46
|
-
{
|
|
47
|
-
namespace for_each
|
|
45
|
+
namespace detail::for_each
|
|
48
46
|
{
|
|
49
47
|
|
|
50
48
|
template <int BlockThreads, int ItemsPerThread>
|
|
@@ -78,7 +76,6 @@ struct agent_block_striped_t
|
|
|
78
76
|
}
|
|
79
77
|
};
|
|
80
78
|
|
|
81
|
-
} // namespace for_each
|
|
82
|
-
} // namespace detail
|
|
79
|
+
} // namespace detail::for_each
|
|
83
80
|
|
|
84
81
|
CUB_NAMESPACE_END
|
|
@@ -53,14 +53,8 @@ struct agent_t
|
|
|
53
53
|
using policy = Policy;
|
|
54
54
|
|
|
55
55
|
// key and value type are taken from the first input sequence (consistent with old Thrust behavior)
|
|
56
|
-
using key_type
|
|
57
|
-
using item_type
|
|
58
|
-
|
|
59
|
-
using keys_load_it1 = try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, KeysIt1>;
|
|
60
|
-
using keys_load_it2 = try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, KeysIt2>;
|
|
61
|
-
using items_load_it1 = try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, ItemsIt1>;
|
|
62
|
-
using items_load_it2 = try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, ItemsIt2>;
|
|
63
|
-
|
|
56
|
+
using key_type = it_value_t<KeysIt1>;
|
|
57
|
+
using item_type = it_value_t<ItemsIt1>;
|
|
64
58
|
using block_store_keys = typename BlockStoreType<Policy, KeysOutputIt, key_type>::type;
|
|
65
59
|
using block_store_items = typename BlockStoreType<Policy, ItemsOutputIt, item_type>::type;
|
|
66
60
|
|
|
@@ -84,11 +78,11 @@ struct agent_t
|
|
|
84
78
|
|
|
85
79
|
// Per thread data
|
|
86
80
|
temp_storages& storage;
|
|
87
|
-
|
|
88
|
-
|
|
81
|
+
KeysIt1 keys1_in;
|
|
82
|
+
ItemsIt1 items1_in;
|
|
89
83
|
Offset keys1_count;
|
|
90
|
-
|
|
91
|
-
|
|
84
|
+
KeysIt2 keys2_in;
|
|
85
|
+
ItemsIt2 items2_in;
|
|
92
86
|
Offset keys2_count;
|
|
93
87
|
KeysOutputIt keys_out;
|
|
94
88
|
ItemsOutputIt items_out;
|
|
@@ -128,10 +122,14 @@ struct agent_t
|
|
|
128
122
|
}
|
|
129
123
|
|
|
130
124
|
key_type keys_loc[items_per_thread];
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
125
|
+
{
|
|
126
|
+
auto keys1_in_cm = try_make_cache_modified_iterator<Policy::LOAD_MODIFIER>(keys1_in);
|
|
127
|
+
auto keys2_in_cm = try_make_cache_modified_iterator<Policy::LOAD_MODIFIER>(keys2_in);
|
|
128
|
+
merge_sort::gmem_to_reg<threads_per_block, IsFullTile>(
|
|
129
|
+
keys_loc, keys1_in_cm + keys1_beg, keys2_in_cm + keys2_beg, keys1_count_tile, keys2_count_tile);
|
|
130
|
+
merge_sort::reg_to_shared<threads_per_block>(&storage.keys_shared[0], keys_loc);
|
|
131
|
+
__syncthreads();
|
|
132
|
+
}
|
|
135
133
|
|
|
136
134
|
// now find the merge path for each thread.
|
|
137
135
|
// we can use int type here, because the number of items in shared memory is limited
|
|
@@ -186,11 +184,15 @@ struct agent_t
|
|
|
186
184
|
if constexpr (have_items)
|
|
187
185
|
{
|
|
188
186
|
item_type items_loc[items_per_thread];
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
187
|
+
{
|
|
188
|
+
auto items1_in_cm = try_make_cache_modified_iterator<Policy::LOAD_MODIFIER>(items1_in);
|
|
189
|
+
auto items2_in_cm = try_make_cache_modified_iterator<Policy::LOAD_MODIFIER>(items2_in);
|
|
190
|
+
merge_sort::gmem_to_reg<threads_per_block, IsFullTile>(
|
|
191
|
+
items_loc, items1_in_cm + keys1_beg, items2_in_cm + keys2_beg, keys1_count_tile, keys2_count_tile);
|
|
192
|
+
__syncthreads(); // block_store_keys above uses SMEM, so make sure all threads are done before we write to it
|
|
193
|
+
merge_sort::reg_to_shared<threads_per_block>(&storage.items_shared[0], items_loc);
|
|
194
|
+
__syncthreads();
|
|
195
|
+
}
|
|
194
196
|
|
|
195
197
|
// gather items from shared mem
|
|
196
198
|
_CCCL_PRAGMA_UNROLL_FULL()
|
|
@@ -66,9 +66,28 @@ struct AgentMergeSortPolicy
|
|
|
66
66
|
static constexpr cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;
|
|
67
67
|
};
|
|
68
68
|
|
|
69
|
+
#if defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
|
|
69
70
|
namespace detail
|
|
70
71
|
{
|
|
71
|
-
|
|
72
|
+
// Only define this when needed.
|
|
73
|
+
// Because of overload woes, this depends on C++20 concepts. util_device.h checks that concepts are available when
|
|
74
|
+
// either runtime policies or PTX JSON information are enabled, so if they are, this is always valid. The generic
|
|
75
|
+
// version is always defined, and that's the only one needed for regular CUB operations.
|
|
76
|
+
//
|
|
77
|
+
// TODO: enable this unconditionally once concepts are always available
|
|
78
|
+
CUB_DETAIL_POLICY_WRAPPER_DEFINE(
|
|
79
|
+
MergeSortAgentPolicy,
|
|
80
|
+
(GenericAgentPolicy),
|
|
81
|
+
(BLOCK_THREADS, BlockThreads, int),
|
|
82
|
+
(ITEMS_PER_THREAD, ItemsPerThread, int),
|
|
83
|
+
(ITEMS_PER_TILE, ItemsPerTile, int),
|
|
84
|
+
(LOAD_ALGORITHM, LoadAlgorithm, cub::BlockLoadAlgorithm),
|
|
85
|
+
(LOAD_MODIFIER, LoadModifier, cub::CacheLoadModifier),
|
|
86
|
+
(STORE_ALGORITHM, StoreAlgorithm, cub::BlockStoreAlgorithm))
|
|
87
|
+
} // namespace detail
|
|
88
|
+
#endif // defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
|
|
89
|
+
|
|
90
|
+
namespace detail::merge_sort
|
|
72
91
|
{
|
|
73
92
|
|
|
74
93
|
template <typename Policy,
|
|
@@ -724,7 +743,6 @@ struct AgentMerge
|
|
|
724
743
|
}
|
|
725
744
|
};
|
|
726
745
|
|
|
727
|
-
} // namespace merge_sort
|
|
728
|
-
} // namespace detail
|
|
746
|
+
} // namespace detail::merge_sort
|
|
729
747
|
|
|
730
748
|
CUB_NAMESPACE_END
|
|
@@ -146,9 +146,7 @@ CUB_DETAIL_POLICY_WRAPPER_DEFINE(
|
|
|
146
146
|
* Thread block abstractions
|
|
147
147
|
******************************************************************************/
|
|
148
148
|
|
|
149
|
-
namespace detail
|
|
150
|
-
{
|
|
151
|
-
namespace radix_sort
|
|
149
|
+
namespace detail::radix_sort
|
|
152
150
|
{
|
|
153
151
|
|
|
154
152
|
/**
|
|
@@ -783,7 +781,6 @@ struct AgentRadixSortDownsweep
|
|
|
783
781
|
}
|
|
784
782
|
};
|
|
785
783
|
|
|
786
|
-
} // namespace radix_sort
|
|
787
|
-
} // namespace detail
|
|
784
|
+
} // namespace detail::radix_sort
|
|
788
785
|
|
|
789
786
|
CUB_NAMESPACE_END
|
|
@@ -85,9 +85,7 @@ struct AgentRadixSortExclusiveSumPolicy
|
|
|
85
85
|
};
|
|
86
86
|
};
|
|
87
87
|
|
|
88
|
-
namespace detail
|
|
89
|
-
{
|
|
90
|
-
namespace radix_sort
|
|
88
|
+
namespace detail::radix_sort
|
|
91
89
|
{
|
|
92
90
|
|
|
93
91
|
template <typename AgentRadixSortHistogramPolicy,
|
|
@@ -283,7 +281,6 @@ struct AgentRadixSortHistogram
|
|
|
283
281
|
}
|
|
284
282
|
};
|
|
285
283
|
|
|
286
|
-
} // namespace radix_sort
|
|
287
|
-
} // namespace detail
|
|
284
|
+
} // namespace detail::radix_sort
|
|
288
285
|
|
|
289
286
|
CUB_NAMESPACE_END
|
|
@@ -100,9 +100,7 @@ struct AgentRadixSortOnesweepPolicy : ScalingType
|
|
|
100
100
|
static constexpr RadixSortStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;
|
|
101
101
|
};
|
|
102
102
|
|
|
103
|
-
namespace detail
|
|
104
|
-
{
|
|
105
|
-
namespace radix_sort
|
|
103
|
+
namespace detail::radix_sort
|
|
106
104
|
{
|
|
107
105
|
|
|
108
106
|
template <typename AgentRadixSortOnesweepPolicy,
|
|
@@ -700,7 +698,6 @@ struct AgentRadixSortOnesweep
|
|
|
700
698
|
}
|
|
701
699
|
};
|
|
702
700
|
|
|
703
|
-
} // namespace radix_sort
|
|
704
|
-
} // namespace detail
|
|
701
|
+
} // namespace detail::radix_sort
|
|
705
702
|
|
|
706
703
|
CUB_NAMESPACE_END
|
|
@@ -103,9 +103,7 @@ struct AgentRadixSortUpsweepPolicy : ScalingType
|
|
|
103
103
|
* Thread block abstractions
|
|
104
104
|
******************************************************************************/
|
|
105
105
|
|
|
106
|
-
namespace detail
|
|
107
|
-
{
|
|
108
|
-
namespace radix_sort
|
|
106
|
+
namespace detail::radix_sort
|
|
109
107
|
{
|
|
110
108
|
|
|
111
109
|
/**
|
|
@@ -552,7 +550,6 @@ struct AgentRadixSortUpsweep
|
|
|
552
550
|
}
|
|
553
551
|
};
|
|
554
552
|
|
|
555
|
-
} // namespace radix_sort
|
|
556
|
-
} // namespace detail
|
|
553
|
+
} // namespace detail::radix_sort
|
|
557
554
|
|
|
558
555
|
CUB_NAMESPACE_END
|
|
@@ -134,9 +134,7 @@ struct AgentRlePolicy
|
|
|
134
134
|
* Thread block abstractions
|
|
135
135
|
******************************************************************************/
|
|
136
136
|
|
|
137
|
-
namespace detail
|
|
138
|
-
{
|
|
139
|
-
namespace rle
|
|
137
|
+
namespace detail::rle
|
|
140
138
|
{
|
|
141
139
|
|
|
142
140
|
/**
|
|
@@ -1121,7 +1119,6 @@ struct AgentRle
|
|
|
1121
1119
|
}
|
|
1122
1120
|
};
|
|
1123
1121
|
|
|
1124
|
-
} // namespace rle
|
|
1125
|
-
} // namespace detail
|
|
1122
|
+
} // namespace detail::rle
|
|
1126
1123
|
|
|
1127
1124
|
CUB_NAMESPACE_END
|
|
@@ -51,6 +51,10 @@
|
|
|
51
51
|
#include <cub/iterator/cache_modified_input_iterator.cuh>
|
|
52
52
|
#include <cub/util_device.cuh>
|
|
53
53
|
|
|
54
|
+
#if defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
|
|
55
|
+
# include <cub/agent/agent_unique_by_key.cuh> // for UniqueByKeyAgentPolicy
|
|
56
|
+
#endif
|
|
57
|
+
|
|
54
58
|
#include <cuda/std/__type_traits/conditional.h>
|
|
55
59
|
#include <cuda/std/__type_traits/is_pointer.h>
|
|
56
60
|
#include <cuda/std/__type_traits/is_same.h>
|
|
@@ -123,7 +127,7 @@ namespace detail
|
|
|
123
127
|
// TODO: enable this unconditionally once concepts are always available
|
|
124
128
|
CUB_DETAIL_POLICY_WRAPPER_DEFINE(
|
|
125
129
|
ScanAgentPolicy,
|
|
126
|
-
(
|
|
130
|
+
(UniqueByKeyAgentPolicy),
|
|
127
131
|
(BLOCK_THREADS, BlockThreads, int),
|
|
128
132
|
(ITEMS_PER_THREAD, ItemsPerThread, int),
|
|
129
133
|
(LOAD_ALGORITHM, LoadAlgorithm, cub::BlockLoadAlgorithm),
|
|
@@ -96,9 +96,7 @@ struct AgentScanByKeyPolicy
|
|
|
96
96
|
* Thread block abstractions
|
|
97
97
|
******************************************************************************/
|
|
98
98
|
|
|
99
|
-
namespace detail
|
|
100
|
-
{
|
|
101
|
-
namespace scan_by_key
|
|
99
|
+
namespace detail::scan_by_key
|
|
102
100
|
{
|
|
103
101
|
|
|
104
102
|
/**
|
|
@@ -471,7 +469,6 @@ struct AgentScanByKey
|
|
|
471
469
|
}
|
|
472
470
|
};
|
|
473
471
|
|
|
474
|
-
} // namespace scan_by_key
|
|
475
|
-
} // namespace detail
|
|
472
|
+
} // namespace detail::scan_by_key
|
|
476
473
|
|
|
477
474
|
CUB_NAMESPACE_END
|
|
@@ -45,9 +45,7 @@
|
|
|
45
45
|
|
|
46
46
|
CUB_NAMESPACE_BEGIN
|
|
47
47
|
|
|
48
|
-
namespace detail
|
|
49
|
-
{
|
|
50
|
-
namespace radix_sort
|
|
48
|
+
namespace detail::radix_sort
|
|
51
49
|
{
|
|
52
50
|
|
|
53
51
|
/**
|
|
@@ -286,7 +284,6 @@ struct AgentSegmentedRadixSort
|
|
|
286
284
|
}
|
|
287
285
|
};
|
|
288
286
|
|
|
289
|
-
} // namespace radix_sort
|
|
290
|
-
} // namespace detail
|
|
287
|
+
} // namespace detail::radix_sort
|
|
291
288
|
|
|
292
289
|
CUB_NAMESPACE_END
|
|
@@ -126,9 +126,7 @@ struct AgentSelectIfPolicy
|
|
|
126
126
|
* Thread block abstractions
|
|
127
127
|
******************************************************************************/
|
|
128
128
|
|
|
129
|
-
namespace detail
|
|
130
|
-
{
|
|
131
|
-
namespace select
|
|
129
|
+
namespace detail::select
|
|
132
130
|
{
|
|
133
131
|
|
|
134
132
|
template <typename EqualityOpT>
|
|
@@ -1114,7 +1112,6 @@ struct AgentSelectIf
|
|
|
1114
1112
|
}
|
|
1115
1113
|
};
|
|
1116
1114
|
|
|
1117
|
-
} // namespace select
|
|
1118
|
-
} // namespace detail
|
|
1115
|
+
} // namespace detail::select
|
|
1119
1116
|
|
|
1120
1117
|
CUB_NAMESPACE_END
|
|
@@ -84,9 +84,7 @@ CUB_DETAIL_POLICY_WRAPPER_DEFINE(
|
|
|
84
84
|
} // namespace detail
|
|
85
85
|
#endif // defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
|
|
86
86
|
|
|
87
|
-
namespace detail
|
|
88
|
-
{
|
|
89
|
-
namespace sub_warp_merge_sort
|
|
87
|
+
namespace detail::sub_warp_merge_sort
|
|
90
88
|
{
|
|
91
89
|
|
|
92
90
|
/**
|
|
@@ -343,7 +341,6 @@ private:
|
|
|
343
341
|
}
|
|
344
342
|
};
|
|
345
343
|
|
|
346
|
-
} // namespace sub_warp_merge_sort
|
|
347
|
-
} // namespace detail
|
|
344
|
+
} // namespace detail::sub_warp_merge_sort
|
|
348
345
|
|
|
349
346
|
CUB_NAMESPACE_END
|
|
@@ -91,9 +91,7 @@ CUB_DETAIL_POLICY_WRAPPER_DEFINE(
|
|
|
91
91
|
} // namespace detail
|
|
92
92
|
#endif // defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
|
|
93
93
|
|
|
94
|
-
namespace detail
|
|
95
|
-
{
|
|
96
|
-
namespace three_way_partition
|
|
94
|
+
namespace detail::three_way_partition
|
|
97
95
|
{
|
|
98
96
|
|
|
99
97
|
template <class OffsetT>
|
|
@@ -603,7 +601,6 @@ struct AgentThreeWayPartition
|
|
|
603
601
|
}
|
|
604
602
|
};
|
|
605
603
|
|
|
606
|
-
} // namespace three_way_partition
|
|
607
|
-
} // namespace detail
|
|
604
|
+
} // namespace detail::three_way_partition
|
|
608
605
|
|
|
609
606
|
CUB_NAMESPACE_END
|
|
@@ -85,13 +85,31 @@ struct AgentUniqueByKeyPolicy
|
|
|
85
85
|
};
|
|
86
86
|
};
|
|
87
87
|
|
|
88
|
+
#if defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
|
|
89
|
+
namespace detail
|
|
90
|
+
{
|
|
91
|
+
// Only define this when needed.
|
|
92
|
+
// Because of overload woes, this depends on C++20 concepts. util_device.h checks that concepts are available when
|
|
93
|
+
// either runtime policies or PTX JSON information are enabled, so if they are, this is always valid. The generic
|
|
94
|
+
// version is always defined, and that's the only one needed for regular CUB operations.
|
|
95
|
+
//
|
|
96
|
+
// TODO: enable this unconditionally once concepts are always available
|
|
97
|
+
CUB_DETAIL_POLICY_WRAPPER_DEFINE(
|
|
98
|
+
UniqueByKeyAgentPolicy,
|
|
99
|
+
(GenericAgentPolicy),
|
|
100
|
+
(BLOCK_THREADS, BlockThreads, int),
|
|
101
|
+
(ITEMS_PER_THREAD, ItemsPerThread, int),
|
|
102
|
+
(LOAD_ALGORITHM, LoadAlgorithm, cub::BlockLoadAlgorithm),
|
|
103
|
+
(LOAD_MODIFIER, LoadModifier, cub::CacheLoadModifier),
|
|
104
|
+
(SCAN_ALGORITHM, ScanAlgorithm, cub::BlockScanAlgorithm))
|
|
105
|
+
} // namespace detail
|
|
106
|
+
#endif // defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
|
|
107
|
+
|
|
88
108
|
/******************************************************************************
|
|
89
109
|
* Thread block abstractions
|
|
90
110
|
******************************************************************************/
|
|
91
111
|
|
|
92
|
-
namespace detail
|
|
93
|
-
{
|
|
94
|
-
namespace unique_by_key
|
|
112
|
+
namespace detail::unique_by_key
|
|
95
113
|
{
|
|
96
114
|
|
|
97
115
|
/**
|
|
@@ -608,7 +626,6 @@ struct AgentUniqueByKey
|
|
|
608
626
|
}
|
|
609
627
|
};
|
|
610
628
|
|
|
611
|
-
} // namespace unique_by_key
|
|
612
|
-
} // namespace detail
|
|
629
|
+
} // namespace detail::unique_by_key
|
|
613
630
|
|
|
614
631
|
CUB_NAMESPACE_END
|
|
@@ -50,6 +50,7 @@
|
|
|
50
50
|
|
|
51
51
|
#include <cuda/__ptx/instructions/get_sreg.h>
|
|
52
52
|
#include <cuda/std/__algorithm/max.h>
|
|
53
|
+
#include <cuda/std/__bit/integral.h>
|
|
53
54
|
#include <cuda/std/__functional/operations.h>
|
|
54
55
|
#include <cuda/std/__type_traits/conditional.h>
|
|
55
56
|
#include <cuda/std/__type_traits/is_same.h>
|
|
@@ -1072,7 +1073,7 @@ struct BlockRadixRankMatchEarlyCounts
|
|
|
1072
1073
|
atomicOr(p_match_mask, lane_mask);
|
|
1073
1074
|
__syncwarp(WARP_MASK);
|
|
1074
1075
|
int bin_mask = *p_match_mask;
|
|
1075
|
-
int leader = (
|
|
1076
|
+
int leader = ::cuda::std::__bit_log2(static_cast<unsigned>(bin_mask));
|
|
1076
1077
|
int warp_offset = 0;
|
|
1077
1078
|
int popc = __popc(bin_mask & ::cuda::ptx::get_sreg_lanemask_le());
|
|
1078
1079
|
if (lane == leader)
|
|
@@ -1102,7 +1103,7 @@ struct BlockRadixRankMatchEarlyCounts
|
|
|
1102
1103
|
::cuda::std::uint32_t bin = Digit(keys[u]);
|
|
1103
1104
|
int bin_mask =
|
|
1104
1105
|
detail::warp_in_block_matcher_t<RADIX_BITS, PARTIAL_WARP_THREADS, BLOCK_WARPS - 1>::match_any(bin, warp);
|
|
1105
|
-
int leader = (
|
|
1106
|
+
int leader = ::cuda::std::__bit_log2(static_cast<unsigned>(bin_mask));
|
|
1106
1107
|
int warp_offset = 0;
|
|
1107
1108
|
int popc = __popc(bin_mask & ::cuda::ptx::get_sreg_lanemask_le());
|
|
1108
1109
|
if (lane == leader)
|
|
@@ -174,10 +174,12 @@ CUB_NAMESPACE_BEGIN
|
|
|
174
174
|
//!
|
|
175
175
|
//! .. code-block:: python
|
|
176
176
|
//!
|
|
177
|
-
//!
|
|
177
|
+
//! from cuda import coop
|
|
178
|
+
//! from pynvjitlink import patch
|
|
179
|
+
//! patch.patch_numba_linker(lto=True)
|
|
178
180
|
//!
|
|
179
181
|
//! # Specialize radix sort for a 1D block of 128 threads owning 4 integer items each
|
|
180
|
-
//! block_radix_sort =
|
|
182
|
+
//! block_radix_sort = coop.block.radix_sort_keys(numba.int32, 128, 4)
|
|
181
183
|
//! temp_storage_bytes = block_radix_sort.temp_storage_bytes
|
|
182
184
|
//!
|
|
183
185
|
//! @cuda.jit(link=block_radix_sort.files)
|