cuda-cccl 0.3.0__cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.1__cp312-cp312-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/cooperative/__init__.py +7 -1
- cuda/cccl/cooperative/experimental/__init__.py +21 -5
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +3 -2
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +158 -247
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +4 -4
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +8 -26
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -6
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
- cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
- cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +48 -46
- cuda/cccl/headers/include/cuda/__device/attributes.h +171 -121
- cuda/cccl/headers/include/cuda/__device/device_ref.h +30 -42
- cuda/cccl/headers/include/cuda/__device/physical_device.h +120 -91
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
- cuda/cccl/headers/include/cuda/__event/event.h +1 -0
- cuda/cccl/headers/include/cuda/__event/timed_event.h +1 -0
- cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
- cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +158 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +8 -120
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +593 -0
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +4 -3
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +1 -0
- cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
- cuda/cccl/headers/include/cuda/algorithm +1 -1
- cuda/cccl/headers/include/cuda/devices +10 -0
- cuda/cccl/headers/include/cuda/iterator +1 -0
- cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
- cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
- cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
- cuda/cccl/headers/include/cuda/std/string_view +12 -5
- cuda/cccl/headers/include/cuda/std/version +1 -4
- cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
- cuda/cccl/parallel/experimental/__init__.py +21 -74
- cuda/compute/__init__.py +77 -0
- cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +1 -1
- cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
- cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +4 -4
- cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +4 -4
- cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/cu12/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu13/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +8 -8
- cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
- cuda/coop/__init__.py +8 -0
- cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
- cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
- cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
- cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
- cuda/coop/warp/__init__.py +9 -0
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/METADATA +1 -1
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/RECORD +141 -138
- cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
- cuda/cccl/parallel/experimental/.gitignore +0 -4
- /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/numba_utils.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/typing.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -328,11 +328,6 @@ struct DispatchReduceDeterministic
|
|
|
328
328
|
// Alias the allocation for the privatized per-block reductions
|
|
329
329
|
deterministic_accum_t* d_block_reductions = (deterministic_accum_t*) allocations[0];
|
|
330
330
|
|
|
331
|
-
if (num_chunks > 1 && !detail::all_iterators_support_add_assign_operator(::cuda::std::int32_t{}, d_in))
|
|
332
|
-
{
|
|
333
|
-
return cudaErrorInvalidValue;
|
|
334
|
-
}
|
|
335
|
-
|
|
336
331
|
auto d_chunk_block_reductions = d_block_reductions;
|
|
337
332
|
for (int chunk_index = 0; chunk_index < num_chunks; chunk_index++)
|
|
338
333
|
{
|
|
@@ -372,7 +367,7 @@ struct DispatchReduceDeterministic
|
|
|
372
367
|
|
|
373
368
|
if (chunk_index + 1 < num_chunks)
|
|
374
369
|
{
|
|
375
|
-
|
|
370
|
+
d_in += num_current_items;
|
|
376
371
|
d_chunk_block_reductions += current_grid_size;
|
|
377
372
|
}
|
|
378
373
|
|
|
@@ -20,7 +20,6 @@
|
|
|
20
20
|
|
|
21
21
|
#include <cub/detail/launcher/cuda_runtime.cuh>
|
|
22
22
|
#include <cub/detail/type_traits.cuh> // for cub::detail::invoke_result_t
|
|
23
|
-
#include <cub/device/dispatch/dispatch_advance_iterators.cuh>
|
|
24
23
|
#include <cub/device/dispatch/kernels/reduce.cuh>
|
|
25
24
|
#include <cub/device/dispatch/tuning/tuning_reduce.cuh>
|
|
26
25
|
#include <cub/grid/grid_even_share.cuh>
|
|
@@ -40,7 +40,6 @@
|
|
|
40
40
|
#include <cub/detail/device_double_buffer.cuh>
|
|
41
41
|
#include <cub/detail/temporary_storage.cuh>
|
|
42
42
|
#include <cub/device/device_partition.cuh>
|
|
43
|
-
#include <cub/device/dispatch/dispatch_advance_iterators.cuh>
|
|
44
43
|
#include <cub/device/dispatch/kernels/segmented_sort.cuh>
|
|
45
44
|
#include <cub/device/dispatch/tuning/tuning_segmented_sort.cuh>
|
|
46
45
|
#include <cub/util_debug.cuh>
|
|
@@ -764,8 +763,8 @@ private:
|
|
|
764
763
|
BeginOffsetIteratorT current_begin_offset = d_begin_offsets;
|
|
765
764
|
EndOffsetIteratorT current_end_offset = d_end_offsets;
|
|
766
765
|
|
|
767
|
-
|
|
768
|
-
|
|
766
|
+
current_begin_offset += current_seg_offset;
|
|
767
|
+
current_end_offset += current_seg_offset;
|
|
769
768
|
|
|
770
769
|
auto medium_indices_iterator =
|
|
771
770
|
::cuda::std::make_reverse_iterator(large_and_medium_segments_indices.get() + current_num_segments);
|
|
@@ -47,9 +47,7 @@
|
|
|
47
47
|
|
|
48
48
|
CUB_NAMESPACE_BEGIN
|
|
49
49
|
|
|
50
|
-
namespace detail
|
|
51
|
-
{
|
|
52
|
-
namespace reduce
|
|
50
|
+
namespace detail::reduce
|
|
53
51
|
{
|
|
54
52
|
|
|
55
53
|
/**
|
|
@@ -580,7 +578,6 @@ CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(int(
|
|
|
580
578
|
}
|
|
581
579
|
}
|
|
582
580
|
|
|
583
|
-
} // namespace reduce
|
|
584
|
-
} // namespace detail
|
|
581
|
+
} // namespace detail::reduce
|
|
585
582
|
|
|
586
583
|
CUB_NAMESPACE_END
|
|
@@ -42,9 +42,7 @@
|
|
|
42
42
|
|
|
43
43
|
CUB_NAMESPACE_BEGIN
|
|
44
44
|
|
|
45
|
-
namespace detail
|
|
46
|
-
{
|
|
47
|
-
namespace scan
|
|
45
|
+
namespace detail::scan
|
|
48
46
|
{
|
|
49
47
|
|
|
50
48
|
/******************************************************************************
|
|
@@ -186,7 +184,6 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS))
|
|
|
186
184
|
AgentScanT(temp_storage, d_in, d_out, scan_op, real_init_value).ConsumeRange(num_items, tile_state, start_tile);
|
|
187
185
|
}
|
|
188
186
|
|
|
189
|
-
} // namespace scan
|
|
190
|
-
} // namespace detail
|
|
187
|
+
} // namespace detail::scan
|
|
191
188
|
|
|
192
189
|
CUB_NAMESPACE_END
|
|
@@ -43,9 +43,7 @@
|
|
|
43
43
|
|
|
44
44
|
CUB_NAMESPACE_BEGIN
|
|
45
45
|
|
|
46
|
-
namespace detail
|
|
47
|
-
{
|
|
48
|
-
namespace reduce
|
|
46
|
+
namespace detail::reduce
|
|
49
47
|
{
|
|
50
48
|
|
|
51
49
|
/// Normalize input iterator to segment offset
|
|
@@ -318,7 +316,6 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)
|
|
|
318
316
|
}
|
|
319
317
|
}
|
|
320
318
|
|
|
321
|
-
} // namespace reduce
|
|
322
|
-
} // namespace detail
|
|
319
|
+
} // namespace detail::reduce
|
|
323
320
|
|
|
324
321
|
CUB_NAMESPACE_END
|
|
@@ -43,9 +43,7 @@
|
|
|
43
43
|
|
|
44
44
|
CUB_NAMESPACE_BEGIN
|
|
45
45
|
|
|
46
|
-
namespace detail
|
|
47
|
-
{
|
|
48
|
-
namespace adjacent_difference
|
|
46
|
+
namespace detail::adjacent_difference
|
|
49
47
|
{
|
|
50
48
|
template <typename InputIteratorT, bool MayAlias>
|
|
51
49
|
struct policy_hub
|
|
@@ -64,7 +62,6 @@ struct policy_hub
|
|
|
64
62
|
|
|
65
63
|
using MaxPolicy = Policy500;
|
|
66
64
|
};
|
|
67
|
-
} // namespace adjacent_difference
|
|
68
|
-
} // namespace detail
|
|
65
|
+
} // namespace detail::adjacent_difference
|
|
69
66
|
|
|
70
67
|
CUB_NAMESPACE_END
|
|
@@ -43,9 +43,7 @@
|
|
|
43
43
|
|
|
44
44
|
CUB_NAMESPACE_BEGIN
|
|
45
45
|
|
|
46
|
-
namespace detail
|
|
47
|
-
{
|
|
48
|
-
namespace batch_memcpy
|
|
46
|
+
namespace detail::batch_memcpy
|
|
49
47
|
{
|
|
50
48
|
/**
|
|
51
49
|
* Parameterizable tuning policy type for AgentBatchMemcpy
|
|
@@ -115,7 +113,6 @@ struct policy_hub
|
|
|
115
113
|
|
|
116
114
|
using MaxPolicy = Policy700;
|
|
117
115
|
};
|
|
118
|
-
} // namespace batch_memcpy
|
|
119
|
-
} // namespace detail
|
|
116
|
+
} // namespace detail::batch_memcpy
|
|
120
117
|
|
|
121
118
|
CUB_NAMESPACE_END
|
|
@@ -42,9 +42,7 @@
|
|
|
42
42
|
|
|
43
43
|
CUB_NAMESPACE_BEGIN
|
|
44
44
|
|
|
45
|
-
namespace detail
|
|
46
|
-
{
|
|
47
|
-
namespace for_each
|
|
45
|
+
namespace detail::for_each
|
|
48
46
|
{
|
|
49
47
|
|
|
50
48
|
struct policy_hub_t
|
|
@@ -57,7 +55,6 @@ struct policy_hub_t
|
|
|
57
55
|
using MaxPolicy = policy_500_t;
|
|
58
56
|
};
|
|
59
57
|
|
|
60
|
-
} // namespace for_each
|
|
61
|
-
} // namespace detail
|
|
58
|
+
} // namespace detail::for_each
|
|
62
59
|
|
|
63
60
|
CUB_NAMESPACE_END
|
|
@@ -46,9 +46,7 @@
|
|
|
46
46
|
|
|
47
47
|
CUB_NAMESPACE_BEGIN
|
|
48
48
|
|
|
49
|
-
namespace detail
|
|
50
|
-
{
|
|
51
|
-
namespace histogram
|
|
49
|
+
namespace detail::histogram
|
|
52
50
|
{
|
|
53
51
|
enum class primitive_sample
|
|
54
52
|
{
|
|
@@ -272,7 +270,6 @@ struct policy_hub
|
|
|
272
270
|
|
|
273
271
|
using MaxPolicy = Policy1000;
|
|
274
272
|
};
|
|
275
|
-
} // namespace histogram
|
|
276
|
-
} // namespace detail
|
|
273
|
+
} // namespace detail::histogram
|
|
277
274
|
|
|
278
275
|
CUB_NAMESPACE_END
|
|
@@ -42,9 +42,7 @@
|
|
|
42
42
|
|
|
43
43
|
CUB_NAMESPACE_BEGIN
|
|
44
44
|
|
|
45
|
-
namespace detail
|
|
46
|
-
{
|
|
47
|
-
namespace merge
|
|
45
|
+
namespace detail::merge
|
|
48
46
|
{
|
|
49
47
|
template <typename KeyT, typename ValueT>
|
|
50
48
|
struct policy_hub
|
|
@@ -73,7 +71,6 @@ struct policy_hub
|
|
|
73
71
|
|
|
74
72
|
using max_policy = policy600;
|
|
75
73
|
};
|
|
76
|
-
} // namespace merge
|
|
77
|
-
} // namespace detail
|
|
74
|
+
} // namespace detail::merge
|
|
78
75
|
|
|
79
76
|
CUB_NAMESPACE_END
|
|
@@ -62,6 +62,14 @@ struct MergeSortPolicyWrapper<StaticPolicyT, ::cuda::std::void_t<decltype(Static
|
|
|
62
62
|
{}
|
|
63
63
|
|
|
64
64
|
CUB_DEFINE_SUB_POLICY_GETTER(MergeSort);
|
|
65
|
+
|
|
66
|
+
#if defined(CUB_ENABLE_POLICY_PTX_JSON)
|
|
67
|
+
_CCCL_DEVICE static constexpr auto EncodedPolicy()
|
|
68
|
+
{
|
|
69
|
+
using namespace ptx_json;
|
|
70
|
+
return object<key<"MergeSortPolicy">() = MergeSort().EncodedPolicy()>();
|
|
71
|
+
}
|
|
72
|
+
#endif
|
|
65
73
|
};
|
|
66
74
|
|
|
67
75
|
template <typename PolicyT>
|
|
@@ -46,9 +46,7 @@
|
|
|
46
46
|
|
|
47
47
|
CUB_NAMESPACE_BEGIN
|
|
48
48
|
|
|
49
|
-
namespace detail
|
|
50
|
-
{
|
|
51
|
-
namespace radix
|
|
49
|
+
namespace detail::radix
|
|
52
50
|
{
|
|
53
51
|
// sm90 default
|
|
54
52
|
template <size_t KeySize, size_t ValueSize, size_t OffsetSize>
|
|
@@ -1062,7 +1060,6 @@ struct policy_hub
|
|
|
1062
1060
|
using MaxPolicy = Policy1000;
|
|
1063
1061
|
};
|
|
1064
1062
|
|
|
1065
|
-
} // namespace radix
|
|
1066
|
-
} // namespace detail
|
|
1063
|
+
} // namespace detail::radix
|
|
1067
1064
|
|
|
1068
1065
|
CUB_NAMESPACE_END
|
|
@@ -50,9 +50,7 @@
|
|
|
50
50
|
|
|
51
51
|
CUB_NAMESPACE_BEGIN
|
|
52
52
|
|
|
53
|
-
namespace detail
|
|
54
|
-
{
|
|
55
|
-
namespace reduce_by_key
|
|
53
|
+
namespace detail::reduce_by_key
|
|
56
54
|
{
|
|
57
55
|
enum class primitive_key
|
|
58
56
|
{
|
|
@@ -939,7 +937,6 @@ struct policy_hub
|
|
|
939
937
|
};
|
|
940
938
|
using MaxPolicy = Policy1000;
|
|
941
939
|
};
|
|
942
|
-
} // namespace reduce_by_key
|
|
943
|
-
} // namespace detail
|
|
940
|
+
} // namespace detail::reduce_by_key
|
|
944
941
|
|
|
945
942
|
CUB_NAMESPACE_END
|
|
@@ -52,9 +52,7 @@
|
|
|
52
52
|
|
|
53
53
|
CUB_NAMESPACE_BEGIN
|
|
54
54
|
|
|
55
|
-
namespace detail
|
|
56
|
-
{
|
|
57
|
-
namespace rle
|
|
55
|
+
namespace detail::rle
|
|
58
56
|
{
|
|
59
57
|
enum class primitive_key
|
|
60
58
|
{
|
|
@@ -670,7 +668,6 @@ struct policy_hub
|
|
|
670
668
|
using MaxPolicy = Policy1000;
|
|
671
669
|
};
|
|
672
670
|
} // namespace non_trivial_runs
|
|
673
|
-
} // namespace rle
|
|
674
|
-
} // namespace detail
|
|
671
|
+
} // namespace detail::rle
|
|
675
672
|
|
|
676
673
|
CUB_NAMESPACE_END
|
|
@@ -53,9 +53,7 @@
|
|
|
53
53
|
|
|
54
54
|
CUB_NAMESPACE_BEGIN
|
|
55
55
|
|
|
56
|
-
namespace detail
|
|
57
|
-
{
|
|
58
|
-
namespace scan
|
|
56
|
+
namespace detail::scan
|
|
59
57
|
{
|
|
60
58
|
enum class keep_rejects
|
|
61
59
|
{
|
|
@@ -615,7 +613,6 @@ struct policy_hub
|
|
|
615
613
|
|
|
616
614
|
using MaxPolicy = Policy1000;
|
|
617
615
|
};
|
|
618
|
-
} // namespace scan
|
|
619
|
-
} // namespace detail
|
|
616
|
+
} // namespace detail::scan
|
|
620
617
|
|
|
621
618
|
CUB_NAMESPACE_END
|
|
@@ -49,9 +49,7 @@
|
|
|
49
49
|
|
|
50
50
|
CUB_NAMESPACE_BEGIN
|
|
51
51
|
|
|
52
|
-
namespace detail
|
|
53
|
-
{
|
|
54
|
-
namespace scan_by_key
|
|
52
|
+
namespace detail::scan_by_key
|
|
55
53
|
{
|
|
56
54
|
enum class primitive_accum
|
|
57
55
|
{
|
|
@@ -1007,7 +1005,6 @@ struct policy_hub
|
|
|
1007
1005
|
|
|
1008
1006
|
using MaxPolicy = Policy1000;
|
|
1009
1007
|
};
|
|
1010
|
-
} // namespace scan_by_key
|
|
1011
|
-
} // namespace detail
|
|
1008
|
+
} // namespace detail::scan_by_key
|
|
1012
1009
|
|
|
1013
1010
|
CUB_NAMESPACE_END
|
|
@@ -43,9 +43,7 @@
|
|
|
43
43
|
|
|
44
44
|
CUB_NAMESPACE_BEGIN
|
|
45
45
|
|
|
46
|
-
namespace detail
|
|
47
|
-
{
|
|
48
|
-
namespace segmented_sort
|
|
46
|
+
namespace detail::segmented_sort
|
|
49
47
|
{
|
|
50
48
|
|
|
51
49
|
template <typename PolicyT, typename = void>
|
|
@@ -395,7 +393,6 @@ struct policy_hub
|
|
|
395
393
|
|
|
396
394
|
using MaxPolicy = Policy860;
|
|
397
395
|
};
|
|
398
|
-
} // namespace segmented_sort
|
|
399
|
-
} // namespace detail
|
|
396
|
+
} // namespace detail::segmented_sort
|
|
400
397
|
|
|
401
398
|
CUB_NAMESPACE_END
|
|
@@ -47,9 +47,7 @@
|
|
|
47
47
|
|
|
48
48
|
CUB_NAMESPACE_BEGIN
|
|
49
49
|
|
|
50
|
-
namespace detail
|
|
51
|
-
{
|
|
52
|
-
namespace three_way_partition
|
|
50
|
+
namespace detail::three_way_partition
|
|
53
51
|
{
|
|
54
52
|
|
|
55
53
|
template <typename PolicyT, typename = void>
|
|
@@ -437,7 +435,6 @@ struct policy_hub
|
|
|
437
435
|
|
|
438
436
|
using MaxPolicy = Policy1000;
|
|
439
437
|
};
|
|
440
|
-
} // namespace three_way_partition
|
|
441
|
-
} // namespace detail
|
|
438
|
+
} // namespace detail::three_way_partition
|
|
442
439
|
|
|
443
440
|
CUB_NAMESPACE_END
|
|
@@ -788,6 +788,16 @@ struct UniqueByKeyPolicyWrapper<StaticPolicyT,
|
|
|
788
788
|
{
|
|
789
789
|
return cub::detail::MakePolicyWrapper(typename StaticPolicyT::UniqueByKeyPolicyT());
|
|
790
790
|
}
|
|
791
|
+
|
|
792
|
+
#if defined(CUB_ENABLE_POLICY_PTX_JSON)
|
|
793
|
+
_CCCL_DEVICE static constexpr auto EncodedPolicy()
|
|
794
|
+
{
|
|
795
|
+
using namespace ptx_json;
|
|
796
|
+
return object<key<"UniqueByKeyPolicyT">() = UniqueByKey().EncodedPolicy(),
|
|
797
|
+
key<"DelayConstructor">() =
|
|
798
|
+
StaticPolicyT::UniqueByKeyPolicyT::detail::delay_constructor_t::EncodedConstructor()>();
|
|
799
|
+
}
|
|
800
|
+
#endif
|
|
791
801
|
};
|
|
792
802
|
|
|
793
803
|
template <typename PolicyT>
|
|
@@ -51,6 +51,7 @@
|
|
|
51
51
|
#include <cuda/__functional/maximum.h>
|
|
52
52
|
#include <cuda/__functional/minimum.h>
|
|
53
53
|
#include <cuda/__ptx/instructions/get_sreg.h>
|
|
54
|
+
#include <cuda/std/__bit/countr.h>
|
|
54
55
|
#include <cuda/std/__functional/operations.h>
|
|
55
56
|
#include <cuda/std/__type_traits/enable_if.h>
|
|
56
57
|
#include <cuda/std/__type_traits/integral_constant.h>
|
|
@@ -701,7 +702,7 @@ struct WarpReduceShfl
|
|
|
701
702
|
_CCCL_DEVICE _CCCL_FORCEINLINE T SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op)
|
|
702
703
|
{
|
|
703
704
|
// Get the start flags for each thread in the warp.
|
|
704
|
-
|
|
705
|
+
unsigned warp_flags = __ballot_sync(member_mask, flag);
|
|
705
706
|
|
|
706
707
|
// Convert to tail-segmented
|
|
707
708
|
if (HEAD_SEGMENTED)
|
|
@@ -722,7 +723,7 @@ struct WarpReduceShfl
|
|
|
722
723
|
warp_flags |= 1u << (LOGICAL_WARP_THREADS - 1);
|
|
723
724
|
|
|
724
725
|
// Find the next set flag
|
|
725
|
-
int last_lane =
|
|
726
|
+
int last_lane = ::cuda::std::countr_zero(warp_flags);
|
|
726
727
|
|
|
727
728
|
T output = input;
|
|
728
729
|
// Template-iterate reduction steps
|
|
@@ -49,6 +49,7 @@
|
|
|
49
49
|
#include <cub/util_type.cuh>
|
|
50
50
|
|
|
51
51
|
#include <cuda/__ptx/instructions/get_sreg.h>
|
|
52
|
+
#include <cuda/std/__bit/countr.h>
|
|
52
53
|
#include <cuda/std/__type_traits/integral_constant.h>
|
|
53
54
|
|
|
54
55
|
CUB_NAMESPACE_BEGIN
|
|
@@ -215,7 +216,7 @@ struct WarpReduceSmem
|
|
|
215
216
|
SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op, ::cuda::std::true_type /*has_ballot*/)
|
|
216
217
|
{
|
|
217
218
|
// Get the start flags for each thread in the warp.
|
|
218
|
-
|
|
219
|
+
unsigned warp_flags = __ballot_sync(member_mask, flag);
|
|
219
220
|
|
|
220
221
|
if (!HEAD_SEGMENTED)
|
|
221
222
|
{
|
|
@@ -232,7 +233,7 @@ struct WarpReduceSmem
|
|
|
232
233
|
}
|
|
233
234
|
|
|
234
235
|
// Find next flag
|
|
235
|
-
int next_flag =
|
|
236
|
+
int next_flag = ::cuda::std::countr_zero(warp_flags);
|
|
236
237
|
|
|
237
238
|
// Clip the next segment at the warp boundary if necessary
|
|
238
239
|
if (LOGICAL_WARP_THREADS != 32)
|
|
@@ -50,8 +50,8 @@
|
|
|
50
50
|
|
|
51
51
|
#include <cuda/__ptx/instructions/get_sreg.h>
|
|
52
52
|
#include <cuda/std/__algorithm/clamp.h>
|
|
53
|
-
#include <cuda/std/__algorithm/max.h>
|
|
54
53
|
#include <cuda/std/__bit/has_single_bit.h>
|
|
54
|
+
#include <cuda/std/__bit/integral.h>
|
|
55
55
|
#include <cuda/std/__functional/operations.h>
|
|
56
56
|
#include <cuda/std/__type_traits/integral_constant.h>
|
|
57
57
|
#include <cuda/std/__type_traits/is_integral.h>
|
|
@@ -630,7 +630,7 @@ struct WarpScanShfl
|
|
|
630
630
|
ballot = ballot & ::cuda::ptx::get_sreg_lanemask_le();
|
|
631
631
|
|
|
632
632
|
// Find index of first set bit
|
|
633
|
-
int segment_first_lane = ::cuda::std::
|
|
633
|
+
int segment_first_lane = ::cuda::std::__bit_log2(ballot);
|
|
634
634
|
|
|
635
635
|
// Iterate scan steps
|
|
636
636
|
_CCCL_PRAGMA_UNROLL_FULL()
|