cuda-cccl 0.3.1-cp310-cp310-manylinux_2_24_aarch64.whl → 0.3.2-cp310-cp310-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic.
- cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
- cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
- cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
- cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
- cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1 -0
- cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
- cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
- cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
- cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
- cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +12 -13
- cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +2 -3
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +4 -3
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
- cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
- cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
- cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
- cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
- cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
- cuda/cccl/headers/include/cuda/__cccl_config +1 -0
- cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
- cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
- cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
- cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +239 -317
- cuda/cccl/headers/include/cuda/__device/attributes.h +4 -3
- cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
- cuda/cccl/headers/include/cuda/__device/device_ref.h +0 -10
- cuda/cccl/headers/include/cuda/__device/physical_device.h +1 -26
- cuda/cccl/headers/include/cuda/__event/event.h +26 -26
- cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
- cuda/cccl/headers/include/cuda/__event/timed_event.h +9 -7
- cuda/cccl/headers/include/cuda/__fwd/devices.h +4 -4
- cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
- cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
- cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +2 -12
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +15 -19
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +59 -60
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
- cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
- cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
- cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
- cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
- cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
- cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
- cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +5 -4
- cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +17 -16
- cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
- cuda/cccl/headers/include/cuda/cmath +1 -0
- cuda/cccl/headers/include/cuda/devices +3 -0
- cuda/cccl/headers/include/cuda/memory +1 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
- cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
- cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
- cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
- cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
- cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
- cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
- cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
- cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
- cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
- cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
- cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
- cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
- cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
- cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
- cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
- cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
- cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
- cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
- cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
- cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
- cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
- cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
- cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
- cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
- cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
- cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
- cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
- cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
- cuda/cccl/headers/include/cuda/std/bitset +1 -1
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
- cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
- cuda/cccl/headers/include/cuda/std/numbers +5 -0
- cuda/cccl/headers/include/cuda/std/string_view +146 -11
- cuda/cccl/headers/include/cuda/stream_ref +5 -0
- cuda/cccl/headers/include/cuda/utility +1 -0
- cuda/cccl/headers/include/nv/target +7 -2
- cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
- cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
- cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
- cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
- cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
- cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
- cuda/cccl/headers/include/thrust/device_delete.h +18 -3
- cuda/cccl/headers/include/thrust/device_free.h +16 -3
- cuda/cccl/headers/include/thrust/device_new.h +29 -8
- cuda/cccl/headers/include/thrust/host_vector.h +1 -1
- cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
- cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
- cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
- cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
- cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
- cuda/compute/__init__.py +2 -0
- cuda/compute/_bindings.pyi +43 -1
- cuda/compute/_bindings_impl.pyx +156 -7
- cuda/compute/algorithms/_scan.py +108 -36
- cuda/compute/algorithms/_transform.py +32 -11
- cuda/compute/cu12/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
- cuda/compute/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/cu13/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
- cuda/compute/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/iterators/__init__.py +2 -0
- cuda/compute/iterators/_factories.py +28 -0
- cuda/compute/iterators/_iterators.py +206 -1
- cuda/compute/numba_utils.py +2 -2
- cuda/compute/typing.py +2 -0
- {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
- {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +171 -175
- cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
- cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
- cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
- cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
- cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
- cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
- {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
cuda/cccl/headers/include/cub/agent/agent_histogram.cuh

@@ -26,11 +26,9 @@
  *
  ******************************************************************************/
 
-
-
-
- * histogram .
- */
+//! \file
+//! cub::AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide
+//! histogram.
 
 #pragma once
 

@@ -55,13 +53,6 @@
 
 CUB_NAMESPACE_BEGIN
 
-/******************************************************************************
- * Tuning policy
- ******************************************************************************/
-
-/**
- *
- */
 enum BlockHistogramMemoryPreference
 {
   GMEM,

@@ -69,114 +60,117 @@ enum BlockHistogramMemoryPreference
   BLEND
 };
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- bool
-
- bool _WORK_STEALING,
- int _VEC_SIZE = 4>
+//! Parameterizable tuning policy type for AgentHistogram
+//!
+//! @tparam BlockThreads
+//! Threads per thread block
+//!
+//! @tparam PixelsPerThread
+//! Pixels per thread (per tile of input)
+//!
+//! @tparam LoadAlgorithm
+//! The BlockLoad algorithm to use
+//!
+//! @tparam LoadModifier
+//! Cache load modifier for reading input elements
+//!
+//! @tparam RleCompress
+//! Whether to perform localized RLE to compress samples before histogramming
+//!
+//! @tparam MemoryPreference
+//! Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
+//!
+//! @tparam WorkStealing
+//! Whether to dequeue tiles from a global work queue
+//!
+//! @tparam VecSize
+//! Vector size for samples loading (1, 2, 4)
+template <int BlockThreads,
+          int PixelsPerThread,
+          BlockLoadAlgorithm LoadAlgorithm,
+          CacheLoadModifier LoadModifier,
+          bool RleCompress,
+          BlockHistogramMemoryPreference MemoryPreference,
+          bool WorkStealing,
+          int VecSize = 4>
 struct AgentHistogramPolicy
 {
   /// Threads per thread block
-  static constexpr int BLOCK_THREADS =
+  static constexpr int BLOCK_THREADS = BlockThreads;
   /// Pixels per thread (per tile of input)
-  static constexpr int PIXELS_PER_THREAD =
+  static constexpr int PIXELS_PER_THREAD = PixelsPerThread;
 
   /// Whether to perform localized RLE to compress samples before histogramming
-  static constexpr bool IS_RLE_COMPRESS =
+  static constexpr bool IS_RLE_COMPRESS = RleCompress;
 
   /// Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
-  static constexpr BlockHistogramMemoryPreference MEM_PREFERENCE =
+  static constexpr BlockHistogramMemoryPreference MEM_PREFERENCE = MemoryPreference;
 
   /// Whether to dequeue tiles from a global work queue
-  static constexpr bool IS_WORK_STEALING =
+  static constexpr bool IS_WORK_STEALING = WorkStealing;
 
   /// Vector size for samples loading (1, 2, 4)
-  static constexpr int VEC_SIZE =
+  static constexpr int VEC_SIZE = VecSize;
 
   ///< The BlockLoad algorithm to use
-  static constexpr BlockLoadAlgorithm LOAD_ALGORITHM =
+  static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = LoadAlgorithm;
 
   ///< Cache load modifier for reading input elements
-  static constexpr CacheLoadModifier LOAD_MODIFIER =
+  static constexpr CacheLoadModifier LOAD_MODIFIER = LoadModifier;
 };
 
-
- * Thread block abstractions
- ******************************************************************************/
-
-namespace detail
+namespace detail::histogram
 {
-
+// Return a native pixel pointer (specialized for CacheModifiedInputIterator types)
+template <CacheLoadModifier Modifier, typename ValueT, typename OffsetT>
+_CCCL_DEVICE _CCCL_FORCEINLINE auto NativePointer(CacheModifiedInputIterator<Modifier, ValueT, OffsetT> itr)
 {
+  return itr.ptr;
+}
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+// Return a native pixel pointer (specialized for other types)
+template <typename IteratorT>
+_CCCL_DEVICE _CCCL_FORCEINLINE auto NativePointer(IteratorT itr)
+{
+  return nullptr;
+}
+
+//! @brief AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating
+//! in device-wide histogram .
+//!
+//! @tparam AgentHistogramPolicyT
+//! Parameterized AgentHistogramPolicy tuning policy type
+//!
+//! @tparam PrivatizedSmemBins
+//! Number of privatized shared-memory histogram bins of any channel. Zero indicates privatized
+//! counters to be maintained in device-accessible memory.
+//!
+//! @tparam NumChannels
+//! Number of channels interleaved in the input data. Supports up to four channels.
+//!
+//! @tparam NumActiveChannels
+//! Number of channels actively being histogrammed
+//!
+//! @tparam SampleIteratorT
+//! Random-access input iterator type for reading samples
+//!
+//! @tparam CounterT
+//! Integer type for counting sample occurrences per histogram bin
+//!
+//! @tparam PrivatizedDecodeOpT
+//! The transform operator type for determining privatized counter indices from samples, one for
+//! each channel
+//!
+//! @tparam OutputDecodeOpT
+//! The transform operator type for determining output bin-ids from privatized counter indices, one
+//! for each channel
+//!
+//! @tparam OffsetT
+//! Signed integer type for global offsets
 template <typename AgentHistogramPolicyT,
-          int
-          int
-          int
+          int PrivatizedSmemBins,
+          int NumChannels,
+          int NumActiveChannels,
           typename SampleIteratorT,
           typename CounterT,
           typename PrivatizedDecodeOpT,

@@ -184,251 +178,137 @@ template <typename AgentHistogramPolicyT,
           typename OffsetT>
 struct AgentHistogram
 {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-  static constexpr int PIXELS_PER_THREAD = AgentHistogramPolicyT::PIXELS_PER_THREAD;
-  static constexpr int SAMPLES_PER_THREAD = PIXELS_PER_THREAD * NUM_CHANNELS;
-  static constexpr int VECS_PER_THREAD = SAMPLES_PER_THREAD / VecSize;
-
-  static constexpr int TILE_PIXELS = PIXELS_PER_THREAD * BLOCK_THREADS;
-  static constexpr int TILE_SAMPLES = SAMPLES_PER_THREAD * BLOCK_THREADS;
-
-  static constexpr bool IS_RLE_COMPRESS = AgentHistogramPolicyT::IS_RLE_COMPRESS;
-
-  static constexpr BlockHistogramMemoryPreference MEM_PREFERENCE =
-    (PRIVATIZED_SMEM_BINS > 0) ? AgentHistogramPolicyT::MEM_PREFERENCE : GMEM;
-
-  static constexpr bool IS_WORK_STEALING = AgentHistogramPolicyT::IS_WORK_STEALING;
-
-  /// Cache load modifier for reading input elements
-  static constexpr CacheLoadModifier LOAD_MODIFIER = AgentHistogramPolicyT::LOAD_MODIFIER;
+  static constexpr int vec_size = AgentHistogramPolicyT::VEC_SIZE;
+  static constexpr int block_threads = AgentHistogramPolicyT::BLOCK_THREADS;
+  static constexpr int pixels_per_thread = AgentHistogramPolicyT::PIXELS_PER_THREAD;
+  static constexpr int samples_per_thread = pixels_per_thread * NumChannels;
+  static constexpr int vecs_per_thread = samples_per_thread / vec_size;
+  static constexpr int tile_pixels = pixels_per_thread * block_threads;
+  static constexpr int tile_samples = samples_per_thread * block_threads;
+  static constexpr bool is_rle_compress = AgentHistogramPolicyT::IS_RLE_COMPRESS;
+  static constexpr bool is_work_stealing = AgentHistogramPolicyT::IS_WORK_STEALING;
+  static constexpr CacheLoadModifier load_modifier = AgentHistogramPolicyT::LOAD_MODIFIER;
+  static constexpr auto mem_preference =
+    (PrivatizedSmemBins > 0) ? BlockHistogramMemoryPreference{AgentHistogramPolicyT::MEM_PREFERENCE} : GMEM;
+
+  using SampleT = it_value_t<SampleIteratorT>;
+  using PixelT = typename CubVector<SampleT, NumChannels>::Type;
+  using VecT = typename CubVector<SampleT, vec_size>::Type;
 
   /// Input iterator wrapper type (for applying cache modifier)
-  // Wrap the native input pointer with CacheModifiedInputIterator
-  //
+  // Wrap the native input pointer with CacheModifiedInputIterator or directly use the supplied input iterator type
+  // TODO(bgruber): we can wrap all contiguous iterators, not just pointers
   using WrappedSampleIteratorT =
     ::cuda::std::_If<::cuda::std::is_pointer_v<SampleIteratorT>,
-                     CacheModifiedInputIterator<
+                     CacheModifiedInputIterator<load_modifier, SampleT, OffsetT>,
                      SampleIteratorT>;
+  using WrappedPixelIteratorT = CacheModifiedInputIterator<load_modifier, PixelT, OffsetT>;
+  using WrappedVecsIteratorT = CacheModifiedInputIterator<load_modifier, VecT, OffsetT>;
+  using BlockLoadSampleT = BlockLoad<SampleT, block_threads, samples_per_thread, AgentHistogramPolicyT::LOAD_ALGORITHM>;
+  using BlockLoadPixelT = BlockLoad<PixelT, block_threads, pixels_per_thread, AgentHistogramPolicyT::LOAD_ALGORITHM>;
+  using BlockLoadVecT = BlockLoad<VecT, block_threads, vecs_per_thread, AgentHistogramPolicyT::LOAD_ALGORITHM>;
 
-  /// Pixel input iterator type (for applying cache modifier)
-  using WrappedPixelIteratorT = CacheModifiedInputIterator<LOAD_MODIFIER, PixelT, OffsetT>;
-
-  /// Qaud input iterator type (for applying cache modifier)
-  using WrappedVecsIteratorT = CacheModifiedInputIterator<LOAD_MODIFIER, VecT, OffsetT>;
-
-  /// Parameterized BlockLoad type for samples
-  using BlockLoadSampleT = BlockLoad<SampleT, BLOCK_THREADS, SAMPLES_PER_THREAD, AgentHistogramPolicyT::LOAD_ALGORITHM>;
-
-  /// Parameterized BlockLoad type for pixels
-  using BlockLoadPixelT = BlockLoad<PixelT, BLOCK_THREADS, PIXELS_PER_THREAD, AgentHistogramPolicyT::LOAD_ALGORITHM>;
-
-  /// Parameterized BlockLoad type for vecs
-  using BlockLoadVecT = BlockLoad<VecT, BLOCK_THREADS, VECS_PER_THREAD, AgentHistogramPolicyT::LOAD_ALGORITHM>;
-
-  /// Shared memory type required by this thread block
   struct _TempStorage
   {
     // Smem needed for block-privatized smem histogram (with 1 word of padding)
-    CounterT histograms[
-
+    CounterT histograms[NumActiveChannels][PrivatizedSmemBins + 1];
     int tile_idx;
 
-
-    union Aliasable
+    union
    {
-      // Smem needed for loading a tile of samples
       typename BlockLoadSampleT::TempStorage sample_load;
-
-      // Smem needed for loading a tile of pixels
       typename BlockLoadPixelT::TempStorage pixel_load;
-
-      // Smem needed for loading a tile of vecs
       typename BlockLoadVecT::TempStorage vec_load;
-
-    } aliasable;
+    };
   };
 
-
-  struct TempStorage : Uninitialized<_TempStorage>
-  {};
-
-  //---------------------------------------------------------------------
-  // Per-thread fields
-  //---------------------------------------------------------------------
+  using TempStorage = Uninitialized<_TempStorage>;
 
-  /// Reference to temp_storage
   _TempStorage& temp_storage;
-
-
-
-
-
-
-
-
-
-
-
-  int* num_privatized_bins;
-
-  /// Copy of gmem privatized histograms for each channel
-  CounterT* d_privatized_histograms[NUM_ACTIVE_CHANNELS];
-
-  /// Reference to final output histograms (gmem)
-  CounterT** d_output_histograms;
-
-  /// The transform operator for determining output bin-ids from privatized counter indices, one for each channel
-  OutputDecodeOpT* output_decode_op;
-
-  /// The transform operator for determining privatized counter indices from samples, one for each channel
-  PrivatizedDecodeOpT* privatized_decode_op;
-
-  /// Whether to prefer privatized smem counters vs privatized global counters
-  bool prefer_smem;
-
-  //---------------------------------------------------------------------
-  // Initialize privatized bin counters
-  //---------------------------------------------------------------------
-
-  // Initialize privatized bin counters
-  _CCCL_DEVICE _CCCL_FORCEINLINE void InitBinCounters(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS])
+  WrappedSampleIteratorT d_wrapped_samples; // with cache modifier applied, if possible
+  SampleT* d_native_samples; // possibly nullptr if unavailable
+  int* num_output_bins; // one for each channel
+  int* num_privatized_bins; // one for each channel
+  CounterT* d_privatized_histograms[NumActiveChannels]; // one for each channel
+  CounterT** d_output_histograms; // in global memory
+  OutputDecodeOpT* output_decode_op; // determines output bin-id from privatized counter index, one for each channel
+  PrivatizedDecodeOpT* privatized_decode_op; // determines privatized counter index from sample, one for each channel
+  bool prefer_smem; // for privatized counterss
+
+  _CCCL_DEVICE _CCCL_FORCEINLINE void ZeroBinCounters(CounterT* privatized_histograms[NumActiveChannels])
   {
-    // Initialize histogram bin counts to zeros
     _CCCL_PRAGMA_UNROLL_FULL()
-    for (int
+    for (int ch = 0; ch < NumActiveChannels; ++ch)
     {
-      for (int
-           privatized_bin += BLOCK_THREADS)
+      for (int bin = threadIdx.x; bin < num_privatized_bins[ch]; bin += block_threads)
      {
-        privatized_histograms[
+        privatized_histograms[ch][bin] = 0;
      }
    }
 
+    // TODO(bgruber): do we also need the __syncthreads() when prefer_smem is false?
     // Barrier to make sure all threads are done updating counters
     __syncthreads();
   }
 
-  // Initialize privatized bin counters. Specialized for privatized shared-memory counters
-  _CCCL_DEVICE _CCCL_FORCEINLINE void InitSmemBinCounters()
-  {
-    CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];
-
-    for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-    {
-      privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];
-    }
-
-    InitBinCounters(privatized_histograms);
-  }
-
-  // Initialize privatized bin counters. Specialized for privatized global-memory counters
-  _CCCL_DEVICE _CCCL_FORCEINLINE void InitGmemBinCounters()
-  {
-    InitBinCounters(d_privatized_histograms);
-  }
-
-  //---------------------------------------------------------------------
-  // Update final output histograms
-  //---------------------------------------------------------------------
-
   // Update final output histograms from privatized histograms
-  _CCCL_DEVICE _CCCL_FORCEINLINE void StoreOutput(CounterT* privatized_histograms[
+  _CCCL_DEVICE _CCCL_FORCEINLINE void StoreOutput(CounterT* privatized_histograms[NumActiveChannels])
   {
     // Barrier to make sure all threads are done updating counters
     __syncthreads();
 
     // Apply privatized bin counts to output bin counts
     _CCCL_PRAGMA_UNROLL_FULL()
-    for (int
+    for (int ch = 0; ch < NumActiveChannels; ++ch)
    {
-      int channel_bins = num_privatized_bins[
-      for (int
+      const int channel_bins = num_privatized_bins[ch];
+      for (int bin = threadIdx.x; bin < channel_bins; bin += block_threads)
      {
-        int output_bin
-        CounterT count = privatized_histograms[
-        bool is_valid = count > 0;
-
-        output_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>((SampleT) privatized_bin, output_bin, is_valid);
+        int output_bin = -1;
+        const CounterT count = privatized_histograms[ch][bin];
+        const bool is_valid = count > 0;
+        output_decode_op[ch].template BinSelect<load_modifier>(static_cast<SampleT>(bin), output_bin, is_valid);
 
         if (output_bin >= 0)
        {
-          atomicAdd(&d_output_histograms[
+          atomicAdd(&d_output_histograms[ch][output_bin], count);
        }
      }
    }
   }
 
-  // Update final output histograms from privatized histograms. Specialized for privatized shared-memory counters
-  _CCCL_DEVICE _CCCL_FORCEINLINE void StoreSmemOutput()
-  {
-    CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];
-    for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-    {
-      privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];
-    }
-
-    StoreOutput(privatized_histograms);
-  }
-
-  // Update final output histograms from privatized histograms. Specialized for privatized global-memory counters
-  _CCCL_DEVICE _CCCL_FORCEINLINE void StoreGmemOutput()
-  {
-    StoreOutput(d_privatized_histograms);
-  }
-
-  //---------------------------------------------------------------------
-  // Tile accumulation
-  //---------------------------------------------------------------------
-
   // Accumulate pixels. Specialized for RLE compression.
   _CCCL_DEVICE _CCCL_FORCEINLINE void AccumulatePixels(
-    SampleT samples[
-    bool is_valid[
-    CounterT* privatized_histograms[
+    SampleT samples[pixels_per_thread][NumChannels],
+    bool is_valid[pixels_per_thread],
+    CounterT* privatized_histograms[NumActiveChannels],
     ::cuda::std::true_type is_rle_compress)
   {
     _CCCL_PRAGMA_UNROLL_FULL()
-    for (int
+    for (int ch = 0; ch < NumActiveChannels; ++ch)
    {
       // Bin pixels
-      int bins[
+      int bins[pixels_per_thread];
 
       _CCCL_PRAGMA_UNROLL_FULL()
-      for (int
+      for (int pixel = 0; pixel < pixels_per_thread; ++pixel)
      {
-        bins[
-        privatized_decode_op[
-          samples[PIXEL][CHANNEL], bins[PIXEL], is_valid[PIXEL]);
+        bins[pixel] = -1;
+        privatized_decode_op[ch].template BinSelect<load_modifier>(samples[pixel][ch], bins[pixel], is_valid[pixel]);
      }
 
       CounterT accumulator = 1;
 
       _CCCL_PRAGMA_UNROLL_FULL()
-      for (int
+      for (int pixel = 0; pixel < pixels_per_thread - 1; ++pixel)
      {
-        if (bins[
+        if (bins[pixel] != bins[pixel + 1])
        {
-          if (bins[
+          if (bins[pixel] >= 0)
          {
            NV_IF_TARGET(NV_PROVIDES_SM_60,
-                         (atomicAdd_block(privatized_histograms[
-                         (atomicAdd(privatized_histograms[
+                         (atomicAdd_block(privatized_histograms[ch] + bins[pixel], accumulator);),
+                         (atomicAdd(privatized_histograms[ch] + bins[pixel], accumulator);));
          }
 
           accumulator = 0;

@@ -437,234 +317,162 @@ struct AgentHistogram
      }
 
       // Last pixel
-      if (bins[
+      if (bins[pixels_per_thread - 1] >= 0)
      {
        NV_IF_TARGET(NV_PROVIDES_SM_60,
-                     (atomicAdd_block(privatized_histograms[
-                     (atomicAdd(privatized_histograms[
+                     (atomicAdd_block(privatized_histograms[ch] + bins[pixels_per_thread - 1], accumulator);),
+                     (atomicAdd(privatized_histograms[ch] + bins[pixels_per_thread - 1], accumulator);));
      }
    }
   }
 
   // Accumulate pixels. Specialized for individual accumulation of each pixel.
   _CCCL_DEVICE _CCCL_FORCEINLINE void AccumulatePixels(
-    SampleT samples[
-    bool is_valid[
-    CounterT* privatized_histograms[
+    SampleT samples[pixels_per_thread][NumChannels],
+    bool is_valid[pixels_per_thread],
+    CounterT* privatized_histograms[NumActiveChannels],
     ::cuda::std::false_type is_rle_compress)
   {
     _CCCL_PRAGMA_UNROLL_FULL()
-    for (int
+    for (int pixel = 0; pixel < pixels_per_thread; ++pixel)
    {
       _CCCL_PRAGMA_UNROLL_FULL()
-      for (int
+      for (int ch = 0; ch < NumActiveChannels; ++ch)
      {
         int bin = -1;
-        privatized_decode_op[
+        privatized_decode_op[ch].template BinSelect<load_modifier>(samples[pixel][ch], bin, is_valid[pixel]);
         if (bin >= 0)
        {
          NV_IF_TARGET(NV_PROVIDES_SM_60,
-                       (atomicAdd_block(privatized_histograms[
-                       (atomicAdd(privatized_histograms[
+                       (atomicAdd_block(privatized_histograms[ch] + bin, 1);),
+                       (atomicAdd(privatized_histograms[ch] + bin, 1);));
        }
      }
    }
   }
 
-
- * Accumulate pixel, specialized for smem privatized histogram
- */
+  // Load full, aligned tile using pixel iterator
   _CCCL_DEVICE _CCCL_FORCEINLINE void
-
+  LoadFullAlignedTile(OffsetT block_offset, SampleT (&samples)[pixels_per_thread][NumChannels])
   {
-
-
-    for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+    if constexpr (NumActiveChannels == 1)
    {
-
+      using AliasedVecs = VecT[vecs_per_thread];
+      WrappedVecsIteratorT d_wrapped_vecs(reinterpret_cast<VecT*>(d_native_samples + block_offset));
+      // Load using a wrapped vec iterator
+      BlockLoadVecT{temp_storage.vec_load}.Load(d_wrapped_vecs, reinterpret_cast<AliasedVecs&>(samples));
+    }
+    else
+    {
+      using AliasedPixels = PixelT[pixels_per_thread];
+      WrappedPixelIteratorT d_wrapped_pixels(reinterpret_cast<PixelT*>(d_native_samples + block_offset));
+      // Load using a wrapped pixel iterator
+      BlockLoadPixelT{temp_storage.pixel_load}.Load(d_wrapped_pixels, reinterpret_cast<AliasedPixels&>(samples));
    }
-
-    AccumulatePixels(samples, is_valid, privatized_histograms, ::cuda::std::bool_constant<IS_RLE_COMPRESS>{});
-  }
-
-  /**
-   * Accumulate pixel, specialized for gmem privatized histogram
-   */
-  _CCCL_DEVICE _CCCL_FORCEINLINE void
-  AccumulateGmemPixels(SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], bool is_valid[PIXELS_PER_THREAD])
-  {
-    AccumulatePixels(samples, is_valid, d_privatized_histograms, ::cuda::std::bool_constant<IS_RLE_COMPRESS>{});
-  }
-
-  //---------------------------------------------------------------------
-  // Tile loading
-  //---------------------------------------------------------------------
-
-  // Load full, aligned tile using pixel iterator (multi-channel)
-  template <int _NUM_ACTIVE_CHANNELS>
-  _CCCL_DEVICE _CCCL_FORCEINLINE void LoadFullAlignedTile(
-    OffsetT block_offset,
-    int valid_samples,
-    SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
-    constant_t<_NUM_ACTIVE_CHANNELS> num_active_channels)
-  {
-    using AliasedPixels = PixelT[PIXELS_PER_THREAD];
-
-    WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset));
-
-    // Load using a wrapped pixel iterator
-    BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load(d_wrapped_pixels, reinterpret_cast<AliasedPixels&>(samples));
-  }
-
-  // Load full, aligned tile using vec iterator (single-channel)
-  _CCCL_DEVICE _CCCL_FORCEINLINE void LoadFullAlignedTile(
-    OffsetT block_offset,
-    int valid_samples,
-    SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
-    constant_t<1> num_active_channels)
-  {
-    using AliasedVecs = VecT[VECS_PER_THREAD];
-
-    WrappedVecsIteratorT d_wrapped_vecs((VecT*) (d_native_samples + block_offset));
-
-    // Load using a wrapped vec iterator
-    BlockLoadVecT(temp_storage.aliasable.vec_load).Load(d_wrapped_vecs, reinterpret_cast<AliasedVecs&>(samples));
-  }
-
-  // Load full, aligned tile
-  _CCCL_DEVICE _CCCL_FORCEINLINE void LoadTile(
-    OffsetT block_offset,
-    int valid_samples,
-    SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
-    ::cuda::std::true_type is_full_tile,
-    ::cuda::std::true_type is_aligned)
-  {
-    LoadFullAlignedTile(block_offset, valid_samples, samples, constant_v<NUM_ACTIVE_CHANNELS>);
-  }
-
-  // Load full, mis-aligned tile using sample iterator
-  _CCCL_DEVICE _CCCL_FORCEINLINE void LoadTile(
-    OffsetT block_offset,
-    int valid_samples,
-    SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
-    ::cuda::std::true_type is_full_tile,
-    ::cuda::std::false_type is_aligned)
-  {
-    using AliasedSamples = SampleT[SAMPLES_PER_THREAD];
-
-    // Load using sample iterator
-    BlockLoadSampleT(temp_storage.aliasable.sample_load)
-      .Load(d_wrapped_samples + block_offset, reinterpret_cast<AliasedSamples&>(samples));
-  }
-
-  // Load partially-full, aligned tile using the pixel iterator
-  _CCCL_DEVICE _CCCL_FORCEINLINE void LoadTile(
-    OffsetT block_offset,
-    int valid_samples,
-    SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
-    ::cuda::std::false_type is_full_tile,
-    ::cuda::std::true_type is_aligned)
-  {
-    using AliasedPixels = PixelT[PIXELS_PER_THREAD];
-
-    WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset));
-
-    int valid_pixels = valid_samples / NUM_CHANNELS;
-
-    // Load using a wrapped pixel iterator
-    BlockLoadPixelT(temp_storage.aliasable.pixel_load)
-      .Load(d_wrapped_pixels, reinterpret_cast<AliasedPixels&>(samples), valid_pixels);
-  }
-
-  // Load partially-full, mis-aligned tile using sample iterator
-  _CCCL_DEVICE _CCCL_FORCEINLINE void LoadTile(
-    OffsetT block_offset,
-    int valid_samples,
-    SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
-    ::cuda::std::false_type is_full_tile,
-    ::cuda::std::false_type is_aligned)
-  {
-    using AliasedSamples = SampleT[SAMPLES_PER_THREAD];
-
-    BlockLoadSampleT(temp_storage.aliasable.sample_load)
-      .Load(d_wrapped_samples + block_offset, reinterpret_cast<AliasedSamples&>(samples), valid_samples);
   }
 
-  template <bool
+  template <bool IsFullTile, bool IsAligned>
   _CCCL_DEVICE _CCCL_FORCEINLINE void
-
+  LoadTile(OffsetT block_offset, int valid_samples, SampleT (&samples)[pixels_per_thread][NumChannels])
   {
-
-
+    if constexpr (IsFullTile)
+    {
+      if constexpr (IsAligned)
+      {
+        LoadFullAlignedTile(block_offset, samples);
+      }
+      else
+      {
+        // Load using sample iterator
+        using AliasedSamples = SampleT[samples_per_thread];
+        BlockLoadSampleT{temp_storage.sample_load}.Load(
+          d_wrapped_samples + block_offset, reinterpret_cast<AliasedSamples&>(samples));
+      }
+    }
+    else
    {
-
+      if constexpr (IsAligned)
+      {
+        // Load partially-full, aligned tile using the pixel iterator
+        using AliasedPixels = PixelT[pixels_per_thread];
+        WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset));
+        int valid_pixels = valid_samples / NumChannels;
+
+        // Load using a wrapped pixel iterator
+        BlockLoadPixelT{temp_storage.pixel_load}.Load(
+          d_wrapped_pixels, reinterpret_cast<AliasedPixels&>(samples), valid_pixels);
+      }
+      else
+      {
+        using AliasedSamples = SampleT[samples_per_thread];
+        BlockLoadSampleT{temp_storage.sample_load}.Load(
+          d_wrapped_samples + block_offset, reinterpret_cast<AliasedSamples&>(samples), valid_samples);
+      }
    }
   }
 
-  template <bool
-  _CCCL_DEVICE _CCCL_FORCEINLINE void
-  MarkValid(bool (&is_valid)[PIXELS_PER_THREAD], int valid_samples, ::cuda::std::true_type /* is_striped = true */)
+  template <bool IsFullTile, bool IsStriped>
+  _CCCL_DEVICE _CCCL_FORCEINLINE void MarkValid(bool (&is_valid)[pixels_per_thread], int valid_samples)
   {
     _CCCL_PRAGMA_UNROLL_FULL()
-    for (int
+    for (int pixel = 0; pixel < pixels_per_thread; ++pixel)
    {
-
+      if constexpr (IsStriped)
+      {
+        is_valid[pixel] = IsFullTile || (((threadIdx.x + block_threads * pixel) * NumChannels) < valid_samples);
+      }
+      else
+      {
+        is_valid[pixel] = IsFullTile || (((threadIdx.x * pixels_per_thread + pixel) * NumChannels) < valid_samples);
+      }
    }
   }
 
-
-
-
-
-
-
-
-
- * Whether the tile offset is aligned (vec-aligned for single-channel, pixel-aligned for multi-channel)
- *
- * @tparam IS_FULL_TILE
- Whether the tile is full
- */
-  template <bool IS_ALIGNED, bool IS_FULL_TILE>
+  //! @brief Consume a tile of data samples
+  //!
+  //! @tparam IsAligned
+  //! Whether the tile offset is aligned (vec-aligned for single-channel, pixel-aligned for multi-channel)
+  //!
+  //! @tparam IsFullTile
+  //! Whether the tile is full
+  template <bool IsAligned, bool IsFullTile>
   _CCCL_DEVICE _CCCL_FORCEINLINE void ConsumeTile(OffsetT block_offset, int valid_samples)
   {
-    SampleT samples[
-    bool is_valid[
-
-    // Load tile
-    LoadTile(block_offset, valid_samples, samples, bool_constant_v<IS_FULL_TILE>, bool_constant_v<IS_ALIGNED>);
+    SampleT samples[pixels_per_thread][NumChannels];
+    bool is_valid[pixels_per_thread];
 
-
-    MarkValid<
-      is_valid, valid_samples, bool_constant_v < AgentHistogramPolicyT::LOAD_ALGORITHM == BLOCK_LOAD_STRIPED >);
+    LoadTile<IsFullTile, IsAligned>(block_offset, valid_samples, samples);
+    MarkValid<IsFullTile, AgentHistogramPolicyT::LOAD_ALGORITHM == BLOCK_LOAD_STRIPED>(is_valid, valid_samples);
 
-    // Accumulate samples
     if (prefer_smem)
    {
-
+      CounterT* privatized_histograms[NumActiveChannels];
+      for (int ch = 0; ch < NumActiveChannels; ++ch)
+      {
+        privatized_histograms[ch] = temp_storage.histograms[ch];
+      }
+      AccumulatePixels(samples, is_valid, privatized_histograms, ::cuda::std::bool_constant<is_rle_compress>{});
    }
    else
    {
-
+      AccumulatePixels(samples, is_valid, d_privatized_histograms, ::cuda::std::bool_constant<is_rle_compress>{});
    }
   }
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- */
-  template <bool IS_ALIGNED>
+  //! @brief Consume row tiles. Specialized for work-stealing from queue
+  //!
+  //! @param num_row_pixels
+  //! The number of multi-channel pixels per row in the region of interest
+  //!
+  //! @param num_rows
+  //! The number of rows in the region of interest
+  //!
+  //! @param row_stride_samples
+  //! The number of samples between starts of consecutive rows in the region of interest
+  //!
+  //! @param tiles_per_row
+  //! Number of image tiles per row
+  template <bool IsAligned>
   _CCCL_DEVICE _CCCL_FORCEINLINE void ConsumeTiles(
     OffsetT num_row_pixels,
     OffsetT num_rows,

@@ -682,19 +490,19 @@
       int row = tile_idx / tiles_per_row;
       int col = tile_idx - (row * tiles_per_row);
       OffsetT row_offset = row * row_stride_samples;
-      OffsetT col_offset = (col *
+      OffsetT col_offset = (col * tile_samples);
       OffsetT tile_offset = row_offset + col_offset;
 
       if (col == tiles_per_row - 1)
      {
        // Consume a partially-full tile at the end of the row
-        OffsetT num_remaining = (num_row_pixels *
-        ConsumeTile<
+        OffsetT num_remaining = (num_row_pixels * NumChannels) - col_offset;
+        ConsumeTile<IsAligned, false>(tile_offset, num_remaining);
      }
      else
      {
        // Consume full tile
-        ConsumeTile<
+        ConsumeTile<IsAligned, true>(tile_offset, tile_samples);
      }
 
      __syncthreads();

@@ -711,50 +519,40 @@
    }
   }
 
-
-
-
-
-
-
-
-
-
-
-
- *
- * @param tiles_per_row
- * Number of image tiles per row
- */
-  template <bool IS_ALIGNED>
+  //! @brief Consume row tiles. Specialized for even-share (striped across thread blocks)
+  //!
+  //! @param num_row_pixels
+  //! The number of multi-channel pixels per row in the region of interest
+  //!
+  //! @param num_rows
+  //! The number of rows in the region of interest
+  //!
+  //! @param row_stride_samples
+  //! The number of samples between starts of consecutive rows in the region of interest
+  template <bool IsAligned>
   _CCCL_DEVICE _CCCL_FORCEINLINE void ConsumeTiles(
-    OffsetT num_row_pixels,
-    OffsetT num_rows,
-    OffsetT row_stride_samples,
-    int tiles_per_row,
-    GridQueue<int> tile_queue,
-    ::cuda::std::false_type is_work_stealing)
+    OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, int, GridQueue<int>, ::cuda::std::false_type)
   {
     for (int row = blockIdx.y; row < num_rows; row += gridDim.y)
    {
       OffsetT row_begin = row * row_stride_samples;
-      OffsetT row_end = row_begin + (num_row_pixels *
-      OffsetT tile_offset = row_begin + (blockIdx.x *
+      OffsetT row_end = row_begin + (num_row_pixels * NumChannels);
+      OffsetT tile_offset = row_begin + (blockIdx.x * tile_samples);
 
       while (tile_offset < row_end)
      {
        OffsetT num_remaining = row_end - tile_offset;
 
-        if (num_remaining <
+        if (num_remaining < tile_samples)
        {
          // Consume partial tile
-          ConsumeTile<
+          ConsumeTile<IsAligned, false>(tile_offset, num_remaining);
          break;
        }
 
        // Consume full tile
-        ConsumeTile<
-        tile_offset += gridDim.x *
+        ConsumeTile<IsAligned, true>(tile_offset, tile_samples);
+        tile_offset += gridDim.x * tile_samples;
      }
    }
   }

@@ -763,51 +561,31 @@
   // Parameter extraction
   //---------------------------------------------------------------------
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- * Input data to reduce
- *
- * @param num_output_bins
- * The number bins per final output histogram
- *
- * @param num_privatized_bins
- * The number bins per privatized histogram
- *
- * @param d_output_histograms
- * Reference to final output histograms
- *
- * @param d_privatized_histograms
- * Reference to privatized histograms
- *
- * @param output_decode_op
- * The transform operator for determining output bin-ids from privatized counter indices, one for each channel
- *
- * @param privatized_decode_op
- * The transform operator for determining privatized counter indices from samples, one for each channel
- */
+  //! @brief Constructor
+  //!
+  //! @param temp_storage
+  //! Reference to temp_storage
+  //!
+  //! @param d_samples
+  //! Input data to reduce
+  //!
+  //! @param num_output_bins
+  //! The number bins per final output histogram
+  //!
+  //! @param num_privatized_bins
+  //! The number bins per privatized histogram
+  //!
+  //! @param d_output_histograms
+  //! Reference to final output histograms
+  //!
+  //! @param d_privatized_histograms
+  //! Reference to privatized histograms
+  //!
+  //! @param output_decode_op
+  //! The transform operator for determining output bin-ids from privatized counter indices, one for each channel
+  //!
+  //! @param privatized_decode_op
+  //! The transform operator for determining privatized counter indices from samples, one for each channel
   _CCCL_DEVICE _CCCL_FORCEINLINE AgentHistogram(
     TempStorage& temp_storage,
     SampleIteratorT d_samples,

@@ -825,39 +603,37 @@
      , d_output_histograms(d_output_histograms)
      , output_decode_op(output_decode_op)
      , privatized_decode_op(privatized_decode_op)
-      , prefer_smem((
-                    (
+      , prefer_smem((mem_preference == SMEM) ? true : // prefer smem privatized histograms
+                    (mem_preference == GMEM) ? false
                     : // prefer gmem privatized histograms
                       blockIdx.x & 1) // prefer blended privatized histograms
   {
-    int blockId = (blockIdx.y * gridDim.x) + blockIdx.x;
+    const int blockId = (blockIdx.y * gridDim.x) + blockIdx.x;
 
+    // TODO(bgruber): d_privatized_histograms seems only used when !prefer_smem, can we skip it if prefer_smem?
     // Initialize the locations of this block's privatized histograms
-    for (int
+    for (int ch = 0; ch < NumActiveChannels; ++ch)
    {
-      this->d_privatized_histograms[
-        d_privatized_histograms[CHANNEL] + (blockId * num_privatized_bins[CHANNEL]);
+      this->d_privatized_histograms[ch] = d_privatized_histograms[ch] + (blockId * num_privatized_bins[ch]);
    }
   }
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- * Queue descriptor for assigning tiles of work to thread blocks
- */
+  //! @brief Consume image
+  //!
+  //! @param num_row_pixels
+  //! The number of multi-channel pixels per row in the region of interest
+  //!
+  //! @param num_rows
+  //! The number of rows in the region of interest
+  //!
+  //! @param row_stride_samples
+  //! The number of samples between starts of consecutive rows in the region of interest
+  //!
+  //! @param tiles_per_row
+  //! Number of image tiles per row
+  //!
+  //! @param tile_queue
+  //! Queue descriptor for assigning tiles of work to thread blocks
   _CCCL_DEVICE _CCCL_FORCEINLINE void ConsumeTiles(
     OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, int tiles_per_row, GridQueue<int> tile_queue)
   {

@@ -866,14 +642,16 @@
     int pixel_mask = AlignBytes<PixelT>::ALIGN_BYTES - 1;
     size_t row_bytes = sizeof(SampleT) * row_stride_samples;
 
-
-
+    // FIXME(bgruber): const changes SASS
+    /*const*/ bool vec_aligned_rows =
+      (NumChannels == 1) && (samples_per_thread % vec_size == 0) && // Single channel
      ((size_t(d_native_samples) & vec_mask) == 0) && // ptr is quad-aligned
      ((num_rows == 1) || ((row_bytes & vec_mask) == 0)); // number of row-samples is a multiple of the alignment of the
                                                          // quad
 
-
-
+    // FIXME(bgruber): const changes SASS
+    /*const*/ bool pixel_aligned_rows =
+      (NumChannels > 1) && // Multi channel
      ((size_t(d_native_samples) & pixel_mask) == 0) && // ptr is pixel-aligned
      ((row_bytes & pixel_mask) == 0); // number of row-samples is a multiple of the alignment of the pixel
 

@@ -881,47 +659,51 @@
     if ((d_native_samples != nullptr) && (vec_aligned_rows || pixel_aligned_rows))
    {
      ConsumeTiles<true>(
-        num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, bool_constant_v<
+        num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, bool_constant_v<is_work_stealing>);
    }
    else
    {
      ConsumeTiles<false>(
-        num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, bool_constant_v<
+        num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, bool_constant_v<is_work_stealing>);
    }
   }
 
-
- * Initialize privatized bin counters. Specialized for privatized shared-memory counters
- */
+  //! Initialize privatized bin counters. Specialized for privatized shared-memory counters
   _CCCL_DEVICE _CCCL_FORCEINLINE void InitBinCounters()
   {
     if (prefer_smem)
    {
-
+      CounterT* privatized_histograms[NumActiveChannels];
+      for (int ch = 0; ch < NumActiveChannels; ++ch)
+      {
+        privatized_histograms[ch] = temp_storage.histograms[ch];
+      }
+      ZeroBinCounters(privatized_histograms);
    }
    else
    {
-
+      ZeroBinCounters(d_privatized_histograms);
    }
   }
 
-
- * Store privatized histogram to device-accessible memory. Specialized for privatized shared-memory counters
- */
+  //! Store privatized histogram to device-accessible memory. Specialized for privatized shared-memory counters
   _CCCL_DEVICE _CCCL_FORCEINLINE void StoreOutput()
   {
     if (prefer_smem)
    {
-
+      CounterT* privatized_histograms[NumActiveChannels];
+      for (int ch = 0; ch < NumActiveChannels; ++ch)
+      {
+        privatized_histograms[ch] = temp_storage.histograms[ch];
+      }
+      StoreOutput(privatized_histograms);
    }
    else
    {
-
+      StoreOutput(d_privatized_histograms);
    }
   }
 };
-
-} // namespace histogram
-} // namespace detail
+} // namespace detail::histogram
 
 CUB_NAMESPACE_END