cuda-cccl 0.3.1-cp313-cp313-manylinux_2_26_x86_64.whl → 0.3.2-cp313-cp313-manylinux_2_26_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic.

Files changed (185)
  1. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
  2. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
  3. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
  4. cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
  5. cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
  6. cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
  7. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1 -0
  8. cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
  9. cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
  10. cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
  11. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
  12. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
  13. cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
  14. cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
  15. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +12 -13
  16. cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
  17. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +2 -3
  18. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +4 -3
  19. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -1
  20. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
  21. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
  22. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
  23. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
  24. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
  25. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
  26. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
  27. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
  28. cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
  29. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
  30. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
  31. cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
  32. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
  33. cuda/cccl/headers/include/cuda/__cccl_config +1 -0
  34. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
  35. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
  36. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
  37. cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
  38. cuda/cccl/headers/include/cuda/__device/arch_traits.h +239 -317
  39. cuda/cccl/headers/include/cuda/__device/attributes.h +4 -3
  40. cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
  41. cuda/cccl/headers/include/cuda/__device/device_ref.h +0 -10
  42. cuda/cccl/headers/include/cuda/__device/physical_device.h +1 -26
  43. cuda/cccl/headers/include/cuda/__event/event.h +26 -26
  44. cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
  45. cuda/cccl/headers/include/cuda/__event/timed_event.h +9 -7
  46. cuda/cccl/headers/include/cuda/__fwd/devices.h +4 -4
  47. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
  48. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
  49. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
  50. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
  51. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
  52. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
  53. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +2 -12
  54. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +15 -19
  55. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +59 -60
  56. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
  57. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
  58. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
  59. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
  60. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
  61. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
  62. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
  63. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
  64. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
  65. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
  66. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +5 -4
  67. cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
  68. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +17 -16
  69. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  70. cuda/cccl/headers/include/cuda/cmath +1 -0
  71. cuda/cccl/headers/include/cuda/devices +3 -0
  72. cuda/cccl/headers/include/cuda/memory +1 -0
  73. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
  74. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
  75. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
  76. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
  77. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
  78. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
  79. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
  80. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
  81. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
  82. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  83. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
  84. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
  85. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
  86. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  87. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  88. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
  89. cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
  90. cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
  91. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
  92. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
  93. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
  94. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
  95. cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
  96. cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
  97. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
  98. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
  99. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
  100. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
  101. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
  102. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
  103. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
  104. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
  105. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
  106. cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
  107. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
  108. cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
  109. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
  110. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
  111. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
  112. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
  113. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
  114. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
  115. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
  116. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
  117. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
  118. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
  119. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
  120. cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
  121. cuda/cccl/headers/include/cuda/std/bitset +1 -1
  122. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
  123. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
  124. cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
  125. cuda/cccl/headers/include/cuda/std/numbers +5 -0
  126. cuda/cccl/headers/include/cuda/std/string_view +146 -11
  127. cuda/cccl/headers/include/cuda/stream_ref +5 -0
  128. cuda/cccl/headers/include/cuda/utility +1 -0
  129. cuda/cccl/headers/include/nv/target +7 -2
  130. cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
  131. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
  132. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
  133. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
  134. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
  135. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
  136. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
  137. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
  138. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
  139. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
  140. cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
  141. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
  142. cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
  143. cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
  144. cuda/cccl/headers/include/thrust/device_delete.h +18 -3
  145. cuda/cccl/headers/include/thrust/device_free.h +16 -3
  146. cuda/cccl/headers/include/thrust/device_new.h +29 -8
  147. cuda/cccl/headers/include/thrust/host_vector.h +1 -1
  148. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
  149. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
  150. cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
  151. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
  152. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
  153. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
  154. cuda/compute/__init__.py +2 -0
  155. cuda/compute/_bindings.pyi +43 -1
  156. cuda/compute/_bindings_impl.pyx +156 -7
  157. cuda/compute/algorithms/_scan.py +108 -36
  158. cuda/compute/algorithms/_transform.py +32 -11
  159. cuda/compute/cu12/_bindings_impl.cpython-313-x86_64-linux-gnu.so +0 -0
  160. cuda/compute/cu12/cccl/libcccl.c.parallel.so +0 -0
  161. cuda/compute/cu13/_bindings_impl.cpython-313-x86_64-linux-gnu.so +0 -0
  162. cuda/compute/cu13/cccl/libcccl.c.parallel.so +0 -0
  163. cuda/compute/iterators/__init__.py +2 -0
  164. cuda/compute/iterators/_factories.py +28 -0
  165. cuda/compute/iterators/_iterators.py +206 -1
  166. cuda/compute/numba_utils.py +2 -2
  167. cuda/compute/typing.py +2 -0
  168. {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
  169. {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +171 -175
  170. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
  171. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
  172. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
  173. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
  174. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
  175. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
  176. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
  177. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
  178. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
  179. cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
  180. cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
  181. cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
  182. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
  183. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
  184. {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
  185. {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
@@ -26,11 +26,9 @@
  *
  ******************************************************************************/

- /**
- * \file
- * cub::AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide
- * histogram .
- */
+ //! \file
+ //! cub::AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide
+ //! histogram.

  #pragma once

@@ -55,13 +53,6 @@

  CUB_NAMESPACE_BEGIN

- /******************************************************************************
- * Tuning policy
- ******************************************************************************/
-
- /**
- *
- */
  enum BlockHistogramMemoryPreference
  {
  GMEM,
@@ -69,114 +60,117 @@ enum BlockHistogramMemoryPreference
  BLEND
  };

- /**
- * Parameterizable tuning policy type for AgentHistogram
- *
- * @tparam _BLOCK_THREADS
- * Threads per thread block
- *
- * @tparam _PIXELS_PER_THREAD
- * Pixels per thread (per tile of input)
- *
- * @tparam _LOAD_ALGORITHM
- * The BlockLoad algorithm to use
- *
- * @tparam _LOAD_MODIFIER
- * Cache load modifier for reading input elements
- *
- * @tparam _RLE_COMPRESS
- * Whether to perform localized RLE to compress samples before histogramming
- *
- * @tparam _MEM_PREFERENCE
- * Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
- *
- * @tparam _WORK_STEALING
- * Whether to dequeue tiles from a global work queue
- *
- * @tparam _VEC_SIZE
- * Vector size for samples loading (1, 2, 4)
- */
- template <int _BLOCK_THREADS,
- int _PIXELS_PER_THREAD,
- BlockLoadAlgorithm _LOAD_ALGORITHM,
- CacheLoadModifier _LOAD_MODIFIER,
- bool _RLE_COMPRESS,
- BlockHistogramMemoryPreference _MEM_PREFERENCE,
- bool _WORK_STEALING,
- int _VEC_SIZE = 4>
+ //! Parameterizable tuning policy type for AgentHistogram
+ //!
+ //! @tparam BlockThreads
+ //! Threads per thread block
+ //!
+ //! @tparam PixelsPerThread
+ //! Pixels per thread (per tile of input)
+ //!
+ //! @tparam LoadAlgorithm
+ //! The BlockLoad algorithm to use
+ //!
+ //! @tparam LoadModifier
+ //! Cache load modifier for reading input elements
+ //!
+ //! @tparam RleCompress
+ //! Whether to perform localized RLE to compress samples before histogramming
+ //!
+ //! @tparam MemoryPreference
+ //! Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
+ //!
+ //! @tparam WorkStealing
+ //! Whether to dequeue tiles from a global work queue
+ //!
+ //! @tparam VecSize
+ //! Vector size for samples loading (1, 2, 4)
+ template <int BlockThreads,
+ int PixelsPerThread,
+ BlockLoadAlgorithm LoadAlgorithm,
+ CacheLoadModifier LoadModifier,
+ bool RleCompress,
+ BlockHistogramMemoryPreference MemoryPreference,
+ bool WorkStealing,
+ int VecSize = 4>
  struct AgentHistogramPolicy
  {
  /// Threads per thread block
- static constexpr int BLOCK_THREADS = _BLOCK_THREADS;
+ static constexpr int BLOCK_THREADS = BlockThreads;
  /// Pixels per thread (per tile of input)
- static constexpr int PIXELS_PER_THREAD = _PIXELS_PER_THREAD;
+ static constexpr int PIXELS_PER_THREAD = PixelsPerThread;

  /// Whether to perform localized RLE to compress samples before histogramming
- static constexpr bool IS_RLE_COMPRESS = _RLE_COMPRESS;
+ static constexpr bool IS_RLE_COMPRESS = RleCompress;

  /// Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
- static constexpr BlockHistogramMemoryPreference MEM_PREFERENCE = _MEM_PREFERENCE;
+ static constexpr BlockHistogramMemoryPreference MEM_PREFERENCE = MemoryPreference;

  /// Whether to dequeue tiles from a global work queue
- static constexpr bool IS_WORK_STEALING = _WORK_STEALING;
+ static constexpr bool IS_WORK_STEALING = WorkStealing;

  /// Vector size for samples loading (1, 2, 4)
- static constexpr int VEC_SIZE = _VEC_SIZE;
+ static constexpr int VEC_SIZE = VecSize;

  ///< The BlockLoad algorithm to use
- static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM;
+ static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = LoadAlgorithm;

  ///< Cache load modifier for reading input elements
- static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;
+ static constexpr CacheLoadModifier LOAD_MODIFIER = LoadModifier;
  };

- /******************************************************************************
- * Thread block abstractions
- ******************************************************************************/
-
- namespace detail
+ namespace detail::histogram
  {
- namespace histogram
+ // Return a native pixel pointer (specialized for CacheModifiedInputIterator types)
+ template <CacheLoadModifier Modifier, typename ValueT, typename OffsetT>
+ _CCCL_DEVICE _CCCL_FORCEINLINE auto NativePointer(CacheModifiedInputIterator<Modifier, ValueT, OffsetT> itr)
  {
+ return itr.ptr;
+ }

- /**
- * @brief AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating
- * in device-wide histogram .
- *
- * @tparam AgentHistogramPolicyT
- * Parameterized AgentHistogramPolicy tuning policy type
- *
- * @tparam PRIVATIZED_SMEM_BINS
- * Number of privatized shared-memory histogram bins of any channel. Zero indicates privatized
- * counters to be maintained in device-accessible memory.
- *
- * @tparam NUM_CHANNELS
- * Number of channels interleaved in the input data. Supports up to four channels.
- *
- * @tparam NUM_ACTIVE_CHANNELS
- * Number of channels actively being histogrammed
- *
- * @tparam SampleIteratorT
- * Random-access input iterator type for reading samples
- *
- * @tparam CounterT
- * Integer type for counting sample occurrences per histogram bin
- *
- * @tparam PrivatizedDecodeOpT
- * The transform operator type for determining privatized counter indices from samples, one for
- * each channel
- *
- * @tparam OutputDecodeOpT
- * The transform operator type for determining output bin-ids from privatized counter indices, one
- * for each channel
- *
- * @tparam OffsetT
- * Signed integer type for global offsets
- */
+ // Return a native pixel pointer (specialized for other types)
+ template <typename IteratorT>
+ _CCCL_DEVICE _CCCL_FORCEINLINE auto NativePointer(IteratorT itr)
+ {
+ return nullptr;
+ }
+
+ //! @brief AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating
+ //! in device-wide histogram .
+ //!
+ //! @tparam AgentHistogramPolicyT
+ //! Parameterized AgentHistogramPolicy tuning policy type
+ //!
+ //! @tparam PrivatizedSmemBins
+ //! Number of privatized shared-memory histogram bins of any channel. Zero indicates privatized
+ //! counters to be maintained in device-accessible memory.
+ //!
+ //! @tparam NumChannels
+ //! Number of channels interleaved in the input data. Supports up to four channels.
+ //!
+ //! @tparam NumActiveChannels
+ //! Number of channels actively being histogrammed
+ //!
+ //! @tparam SampleIteratorT
+ //! Random-access input iterator type for reading samples
+ //!
+ //! @tparam CounterT
+ //! Integer type for counting sample occurrences per histogram bin
+ //!
+ //! @tparam PrivatizedDecodeOpT
+ //! The transform operator type for determining privatized counter indices from samples, one for
+ //! each channel
+ //!
+ //! @tparam OutputDecodeOpT
+ //! The transform operator type for determining output bin-ids from privatized counter indices, one
+ //! for each channel
+ //!
+ //! @tparam OffsetT
+ //! Signed integer type for global offsets
  template <typename AgentHistogramPolicyT,
- int PRIVATIZED_SMEM_BINS,
- int NUM_CHANNELS,
- int NUM_ACTIVE_CHANNELS,
+ int PrivatizedSmemBins,
+ int NumChannels,
+ int NumActiveChannels,
  typename SampleIteratorT,
  typename CounterT,
  typename PrivatizedDecodeOpT,
@@ -184,251 +178,137 @@ template <typename AgentHistogramPolicyT,
  typename OffsetT>
  struct AgentHistogram
  {
- //---------------------------------------------------------------------
- // Types and constants
- //---------------------------------------------------------------------
-
- /// The sample type of the input iterator
- using SampleT = cub::detail::it_value_t<SampleIteratorT>;
-
- /// The pixel type of SampleT
- using PixelT = typename CubVector<SampleT, NUM_CHANNELS>::Type;
-
- /// The vec type of SampleT
- static constexpr int VecSize = AgentHistogramPolicyT::VEC_SIZE;
- using VecT = typename CubVector<SampleT, VecSize>::Type;
-
- /// Constants
- static constexpr int BLOCK_THREADS = AgentHistogramPolicyT::BLOCK_THREADS;
-
- static constexpr int PIXELS_PER_THREAD = AgentHistogramPolicyT::PIXELS_PER_THREAD;
- static constexpr int SAMPLES_PER_THREAD = PIXELS_PER_THREAD * NUM_CHANNELS;
- static constexpr int VECS_PER_THREAD = SAMPLES_PER_THREAD / VecSize;
-
- static constexpr int TILE_PIXELS = PIXELS_PER_THREAD * BLOCK_THREADS;
- static constexpr int TILE_SAMPLES = SAMPLES_PER_THREAD * BLOCK_THREADS;
-
- static constexpr bool IS_RLE_COMPRESS = AgentHistogramPolicyT::IS_RLE_COMPRESS;
-
- static constexpr BlockHistogramMemoryPreference MEM_PREFERENCE =
- (PRIVATIZED_SMEM_BINS > 0) ? AgentHistogramPolicyT::MEM_PREFERENCE : GMEM;
-
- static constexpr bool IS_WORK_STEALING = AgentHistogramPolicyT::IS_WORK_STEALING;
-
- /// Cache load modifier for reading input elements
- static constexpr CacheLoadModifier LOAD_MODIFIER = AgentHistogramPolicyT::LOAD_MODIFIER;
+ static constexpr int vec_size = AgentHistogramPolicyT::VEC_SIZE;
+ static constexpr int block_threads = AgentHistogramPolicyT::BLOCK_THREADS;
+ static constexpr int pixels_per_thread = AgentHistogramPolicyT::PIXELS_PER_THREAD;
+ static constexpr int samples_per_thread = pixels_per_thread * NumChannels;
+ static constexpr int vecs_per_thread = samples_per_thread / vec_size;
+ static constexpr int tile_pixels = pixels_per_thread * block_threads;
+ static constexpr int tile_samples = samples_per_thread * block_threads;
+ static constexpr bool is_rle_compress = AgentHistogramPolicyT::IS_RLE_COMPRESS;
+ static constexpr bool is_work_stealing = AgentHistogramPolicyT::IS_WORK_STEALING;
+ static constexpr CacheLoadModifier load_modifier = AgentHistogramPolicyT::LOAD_MODIFIER;
+ static constexpr auto mem_preference =
+ (PrivatizedSmemBins > 0) ? BlockHistogramMemoryPreference{AgentHistogramPolicyT::MEM_PREFERENCE} : GMEM;
+
+ using SampleT = it_value_t<SampleIteratorT>;
+ using PixelT = typename CubVector<SampleT, NumChannels>::Type;
+ using VecT = typename CubVector<SampleT, vec_size>::Type;

  /// Input iterator wrapper type (for applying cache modifier)
- // Wrap the native input pointer with CacheModifiedInputIterator
- // or directly use the supplied input iterator type
+ // Wrap the native input pointer with CacheModifiedInputIterator or directly use the supplied input iterator type
+ // TODO(bgruber): we can wrap all contiguous iterators, not just pointers
  using WrappedSampleIteratorT =
  ::cuda::std::_If<::cuda::std::is_pointer_v<SampleIteratorT>,
- CacheModifiedInputIterator<LOAD_MODIFIER, SampleT, OffsetT>,
+ CacheModifiedInputIterator<load_modifier, SampleT, OffsetT>,
  SampleIteratorT>;
+ using WrappedPixelIteratorT = CacheModifiedInputIterator<load_modifier, PixelT, OffsetT>;
+ using WrappedVecsIteratorT = CacheModifiedInputIterator<load_modifier, VecT, OffsetT>;
+ using BlockLoadSampleT = BlockLoad<SampleT, block_threads, samples_per_thread, AgentHistogramPolicyT::LOAD_ALGORITHM>;
+ using BlockLoadPixelT = BlockLoad<PixelT, block_threads, pixels_per_thread, AgentHistogramPolicyT::LOAD_ALGORITHM>;
+ using BlockLoadVecT = BlockLoad<VecT, block_threads, vecs_per_thread, AgentHistogramPolicyT::LOAD_ALGORITHM>;

- /// Pixel input iterator type (for applying cache modifier)
- using WrappedPixelIteratorT = CacheModifiedInputIterator<LOAD_MODIFIER, PixelT, OffsetT>;
-
- /// Qaud input iterator type (for applying cache modifier)
- using WrappedVecsIteratorT = CacheModifiedInputIterator<LOAD_MODIFIER, VecT, OffsetT>;
-
- /// Parameterized BlockLoad type for samples
- using BlockLoadSampleT = BlockLoad<SampleT, BLOCK_THREADS, SAMPLES_PER_THREAD, AgentHistogramPolicyT::LOAD_ALGORITHM>;
-
- /// Parameterized BlockLoad type for pixels
- using BlockLoadPixelT = BlockLoad<PixelT, BLOCK_THREADS, PIXELS_PER_THREAD, AgentHistogramPolicyT::LOAD_ALGORITHM>;
-
- /// Parameterized BlockLoad type for vecs
- using BlockLoadVecT = BlockLoad<VecT, BLOCK_THREADS, VECS_PER_THREAD, AgentHistogramPolicyT::LOAD_ALGORITHM>;
-
- /// Shared memory type required by this thread block
  struct _TempStorage
  {
  // Smem needed for block-privatized smem histogram (with 1 word of padding)
- CounterT histograms[NUM_ACTIVE_CHANNELS][PRIVATIZED_SMEM_BINS + 1];
-
+ CounterT histograms[NumActiveChannels][PrivatizedSmemBins + 1];
  int tile_idx;

- // Aliasable storage layout
- union Aliasable
+ union
  {
- // Smem needed for loading a tile of samples
  typename BlockLoadSampleT::TempStorage sample_load;
-
- // Smem needed for loading a tile of pixels
  typename BlockLoadPixelT::TempStorage pixel_load;
-
- // Smem needed for loading a tile of vecs
  typename BlockLoadVecT::TempStorage vec_load;
-
- } aliasable;
+ };
  };

- /// Temporary storage type (unionable)
- struct TempStorage : Uninitialized<_TempStorage>
- {};
-
- //---------------------------------------------------------------------
- // Per-thread fields
- //---------------------------------------------------------------------
+ using TempStorage = Uninitialized<_TempStorage>;

- /// Reference to temp_storage
  _TempStorage& temp_storage;
-
- /// Sample input iterator (with cache modifier applied, if possible)
- WrappedSampleIteratorT d_wrapped_samples;
-
- /// Native pointer for input samples (possibly nullptr if unavailable)
- SampleT* d_native_samples;
-
- /// The number of output bins for each channel
- int* num_output_bins;
-
- /// The number of privatized bins for each channel
- int* num_privatized_bins;
-
- /// Copy of gmem privatized histograms for each channel
- CounterT* d_privatized_histograms[NUM_ACTIVE_CHANNELS];
-
- /// Reference to final output histograms (gmem)
- CounterT** d_output_histograms;
-
- /// The transform operator for determining output bin-ids from privatized counter indices, one for each channel
- OutputDecodeOpT* output_decode_op;
-
- /// The transform operator for determining privatized counter indices from samples, one for each channel
- PrivatizedDecodeOpT* privatized_decode_op;
-
- /// Whether to prefer privatized smem counters vs privatized global counters
- bool prefer_smem;
-
- //---------------------------------------------------------------------
- // Initialize privatized bin counters
- //---------------------------------------------------------------------
-
- // Initialize privatized bin counters
- _CCCL_DEVICE _CCCL_FORCEINLINE void InitBinCounters(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS])
+ WrappedSampleIteratorT d_wrapped_samples; // with cache modifier applied, if possible
+ SampleT* d_native_samples; // possibly nullptr if unavailable
+ int* num_output_bins; // one for each channel
+ int* num_privatized_bins; // one for each channel
+ CounterT* d_privatized_histograms[NumActiveChannels]; // one for each channel
+ CounterT** d_output_histograms; // in global memory
+ OutputDecodeOpT* output_decode_op; // determines output bin-id from privatized counter index, one for each channel
+ PrivatizedDecodeOpT* privatized_decode_op; // determines privatized counter index from sample, one for each channel
+ bool prefer_smem; // for privatized counterss
+
+ _CCCL_DEVICE _CCCL_FORCEINLINE void ZeroBinCounters(CounterT* privatized_histograms[NumActiveChannels])
  {
- // Initialize histogram bin counts to zeros
  _CCCL_PRAGMA_UNROLL_FULL()
- for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+ for (int ch = 0; ch < NumActiveChannels; ++ch)
  {
- for (int privatized_bin = threadIdx.x; privatized_bin < num_privatized_bins[CHANNEL];
- privatized_bin += BLOCK_THREADS)
+ for (int bin = threadIdx.x; bin < num_privatized_bins[ch]; bin += block_threads)
  {
- privatized_histograms[CHANNEL][privatized_bin] = 0;
+ privatized_histograms[ch][bin] = 0;
  }
  }

+ // TODO(bgruber): do we also need the __syncthreads() when prefer_smem is false?
  // Barrier to make sure all threads are done updating counters
  __syncthreads();
  }

- // Initialize privatized bin counters. Specialized for privatized shared-memory counters
- _CCCL_DEVICE _CCCL_FORCEINLINE void InitSmemBinCounters()
- {
- CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];
-
- for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
- {
- privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];
- }
-
- InitBinCounters(privatized_histograms);
- }
-
- // Initialize privatized bin counters. Specialized for privatized global-memory counters
- _CCCL_DEVICE _CCCL_FORCEINLINE void InitGmemBinCounters()
- {
- InitBinCounters(d_privatized_histograms);
- }
-
- //---------------------------------------------------------------------
- // Update final output histograms
- //---------------------------------------------------------------------
-
  // Update final output histograms from privatized histograms
- _CCCL_DEVICE _CCCL_FORCEINLINE void StoreOutput(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS])
+ _CCCL_DEVICE _CCCL_FORCEINLINE void StoreOutput(CounterT* privatized_histograms[NumActiveChannels])
  {
  // Barrier to make sure all threads are done updating counters
  __syncthreads();

  // Apply privatized bin counts to output bin counts
  _CCCL_PRAGMA_UNROLL_FULL()
- for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+ for (int ch = 0; ch < NumActiveChannels; ++ch)
  {
- int channel_bins = num_privatized_bins[CHANNEL];
- for (int privatized_bin = threadIdx.x; privatized_bin < channel_bins; privatized_bin += BLOCK_THREADS)
+ const int channel_bins = num_privatized_bins[ch];
+ for (int bin = threadIdx.x; bin < channel_bins; bin += block_threads)
  {
- int output_bin = -1;
- CounterT count = privatized_histograms[CHANNEL][privatized_bin];
- bool is_valid = count > 0;
-
- output_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>((SampleT) privatized_bin, output_bin, is_valid);
+ int output_bin = -1;
+ const CounterT count = privatized_histograms[ch][bin];
+ const bool is_valid = count > 0;
+ output_decode_op[ch].template BinSelect<load_modifier>(static_cast<SampleT>(bin), output_bin, is_valid);

  if (output_bin >= 0)
  {
- atomicAdd(&d_output_histograms[CHANNEL][output_bin], count);
+ atomicAdd(&d_output_histograms[ch][output_bin], count);
  }
  }
  }
  }

- // Update final output histograms from privatized histograms. Specialized for privatized shared-memory counters
- _CCCL_DEVICE _CCCL_FORCEINLINE void StoreSmemOutput()
- {
- CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];
- for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
- {
- privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];
- }
-
- StoreOutput(privatized_histograms);
- }
-
- // Update final output histograms from privatized histograms. Specialized for privatized global-memory counters
- _CCCL_DEVICE _CCCL_FORCEINLINE void StoreGmemOutput()
- {
- StoreOutput(d_privatized_histograms);
- }
-
- //---------------------------------------------------------------------
- // Tile accumulation
- //---------------------------------------------------------------------
-
  // Accumulate pixels. Specialized for RLE compression.
  _CCCL_DEVICE _CCCL_FORCEINLINE void AccumulatePixels(
- SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS],
- bool is_valid[PIXELS_PER_THREAD],
- CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS],
+ SampleT samples[pixels_per_thread][NumChannels],
+ bool is_valid[pixels_per_thread],
+ CounterT* privatized_histograms[NumActiveChannels],
  ::cuda::std::true_type is_rle_compress)
  {
  _CCCL_PRAGMA_UNROLL_FULL()
- for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+ for (int ch = 0; ch < NumActiveChannels; ++ch)
  {
  // Bin pixels
- int bins[PIXELS_PER_THREAD];
+ int bins[pixels_per_thread];

  _CCCL_PRAGMA_UNROLL_FULL()
- for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
+ for (int pixel = 0; pixel < pixels_per_thread; ++pixel)
  {
- bins[PIXEL] = -1;
- privatized_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>(
- samples[PIXEL][CHANNEL], bins[PIXEL], is_valid[PIXEL]);
+ bins[pixel] = -1;
+ privatized_decode_op[ch].template BinSelect<load_modifier>(samples[pixel][ch], bins[pixel], is_valid[pixel]);
  }

  CounterT accumulator = 1;

  _CCCL_PRAGMA_UNROLL_FULL()
- for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD - 1; ++PIXEL)
+ for (int pixel = 0; pixel < pixels_per_thread - 1; ++pixel)
  {
- if (bins[PIXEL] != bins[PIXEL + 1])
+ if (bins[pixel] != bins[pixel + 1])
  {
- if (bins[PIXEL] >= 0)
+ if (bins[pixel] >= 0)
  {
  NV_IF_TARGET(NV_PROVIDES_SM_60,
- (atomicAdd_block(privatized_histograms[CHANNEL] + bins[PIXEL], accumulator);),
- (atomicAdd(privatized_histograms[CHANNEL] + bins[PIXEL], accumulator);));
+ (atomicAdd_block(privatized_histograms[ch] + bins[pixel], accumulator);),
+ (atomicAdd(privatized_histograms[ch] + bins[pixel], accumulator);));
  }

  accumulator = 0;
@@ -437,234 +317,162 @@ struct AgentHistogram
  }

  // Last pixel
- if (bins[PIXELS_PER_THREAD - 1] >= 0)
+ if (bins[pixels_per_thread - 1] >= 0)
  {
  NV_IF_TARGET(NV_PROVIDES_SM_60,
- (atomicAdd_block(privatized_histograms[CHANNEL] + bins[PIXELS_PER_THREAD - 1], accumulator);),
- (atomicAdd(privatized_histograms[CHANNEL] + bins[PIXELS_PER_THREAD - 1], accumulator);));
+ (atomicAdd_block(privatized_histograms[ch] + bins[pixels_per_thread - 1], accumulator);),
+ (atomicAdd(privatized_histograms[ch] + bins[pixels_per_thread - 1], accumulator);));
  }
  }
  }

  // Accumulate pixels. Specialized for individual accumulation of each pixel.
  _CCCL_DEVICE _CCCL_FORCEINLINE void AccumulatePixels(
- SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS],
- bool is_valid[PIXELS_PER_THREAD],
- CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS],
+ SampleT samples[pixels_per_thread][NumChannels],
+ bool is_valid[pixels_per_thread],
+ CounterT* privatized_histograms[NumActiveChannels],
  ::cuda::std::false_type is_rle_compress)
  {
  _CCCL_PRAGMA_UNROLL_FULL()
- for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
+ for (int pixel = 0; pixel < pixels_per_thread; ++pixel)
  {
  _CCCL_PRAGMA_UNROLL_FULL()
- for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+ for (int ch = 0; ch < NumActiveChannels; ++ch)
  {
  int bin = -1;
- privatized_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>(samples[PIXEL][CHANNEL], bin, is_valid[PIXEL]);
+ privatized_decode_op[ch].template BinSelect<load_modifier>(samples[pixel][ch], bin, is_valid[pixel]);
  if (bin >= 0)
  {
  NV_IF_TARGET(NV_PROVIDES_SM_60,
- (atomicAdd_block(privatized_histograms[CHANNEL] + bin, 1);),
- (atomicAdd(privatized_histograms[CHANNEL] + bin, 1);));
+ (atomicAdd_block(privatized_histograms[ch] + bin, 1);),
+ (atomicAdd(privatized_histograms[ch] + bin, 1);));
  }
  }
  }
  }

- /**
- * Accumulate pixel, specialized for smem privatized histogram
- */
+ // Load full, aligned tile using pixel iterator
  _CCCL_DEVICE _CCCL_FORCEINLINE void
- AccumulateSmemPixels(SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], bool is_valid[PIXELS_PER_THREAD])
+ LoadFullAlignedTile(OffsetT block_offset, SampleT (&samples)[pixels_per_thread][NumChannels])
  {
- CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];
-
- for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+ if constexpr (NumActiveChannels == 1)
  {
- privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];
+ using AliasedVecs = VecT[vecs_per_thread];
+ WrappedVecsIteratorT d_wrapped_vecs(reinterpret_cast<VecT*>(d_native_samples + block_offset));
+ // Load using a wrapped vec iterator
+ BlockLoadVecT{temp_storage.vec_load}.Load(d_wrapped_vecs, reinterpret_cast<AliasedVecs&>(samples));
+ }
+ else
+ {
+ using AliasedPixels = PixelT[pixels_per_thread];
+ WrappedPixelIteratorT d_wrapped_pixels(reinterpret_cast<PixelT*>(d_native_samples + block_offset));
+ // Load using a wrapped pixel iterator
+ BlockLoadPixelT{temp_storage.pixel_load}.Load(d_wrapped_pixels, reinterpret_cast<AliasedPixels&>(samples));
  }
-
- AccumulatePixels(samples, is_valid, privatized_histograms, ::cuda::std::bool_constant<IS_RLE_COMPRESS>{});
- }
-
- /**
- * Accumulate pixel, specialized for gmem privatized histogram
- */
- _CCCL_DEVICE _CCCL_FORCEINLINE void
- AccumulateGmemPixels(SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], bool is_valid[PIXELS_PER_THREAD])
- {
- AccumulatePixels(samples, is_valid, d_privatized_histograms, ::cuda::std::bool_constant<IS_RLE_COMPRESS>{});
- }
-
- //---------------------------------------------------------------------
- // Tile loading
- //---------------------------------------------------------------------
-
- // Load full, aligned tile using pixel iterator (multi-channel)
- template <int _NUM_ACTIVE_CHANNELS>
- _CCCL_DEVICE _CCCL_FORCEINLINE void LoadFullAlignedTile(
- OffsetT block_offset,
- int valid_samples,
- SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
- constant_t<_NUM_ACTIVE_CHANNELS> num_active_channels)
- {
- using AliasedPixels = PixelT[PIXELS_PER_THREAD];
-
- WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset));
-
- // Load using a wrapped pixel iterator
- BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load(d_wrapped_pixels, reinterpret_cast<AliasedPixels&>(samples));
- }
-
- // Load full, aligned tile using vec iterator (single-channel)
- _CCCL_DEVICE _CCCL_FORCEINLINE void LoadFullAlignedTile(
- OffsetT block_offset,
- int valid_samples,
- SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
- constant_t<1> num_active_channels)
- {
- using AliasedVecs = VecT[VECS_PER_THREAD];
-
- WrappedVecsIteratorT d_wrapped_vecs((VecT*) (d_native_samples + block_offset));
-
- // Load using a wrapped vec iterator
- BlockLoadVecT(temp_storage.aliasable.vec_load).Load(d_wrapped_vecs, reinterpret_cast<AliasedVecs&>(samples));
- }
-
- // Load full, aligned tile
- _CCCL_DEVICE _CCCL_FORCEINLINE void LoadTile(
- OffsetT block_offset,
- int valid_samples,
- SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
- ::cuda::std::true_type is_full_tile,
- ::cuda::std::true_type is_aligned)
- {
- LoadFullAlignedTile(block_offset, valid_samples, samples, constant_v<NUM_ACTIVE_CHANNELS>);
- }
-
- // Load full, mis-aligned tile using sample iterator
- _CCCL_DEVICE _CCCL_FORCEINLINE void LoadTile(
- OffsetT block_offset,
- int valid_samples,
- SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
- ::cuda::std::true_type is_full_tile,
- ::cuda::std::false_type is_aligned)
- {
- using AliasedSamples = SampleT[SAMPLES_PER_THREAD];
-
- // Load using sample iterator
- BlockLoadSampleT(temp_storage.aliasable.sample_load)
- .Load(d_wrapped_samples + block_offset, reinterpret_cast<AliasedSamples&>(samples));
- }
-
- // Load partially-full, aligned tile using the pixel iterator
- _CCCL_DEVICE _CCCL_FORCEINLINE void LoadTile(
- OffsetT block_offset,
- int valid_samples,
- SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
- ::cuda::std::false_type is_full_tile,
- ::cuda::std::true_type is_aligned)
- {
- using AliasedPixels = PixelT[PIXELS_PER_THREAD];
-
- WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset));
-
- int valid_pixels = valid_samples / NUM_CHANNELS;
-
- // Load using a wrapped pixel iterator
- BlockLoadPixelT(temp_storage.aliasable.pixel_load)
- .Load(d_wrapped_pixels, reinterpret_cast<AliasedPixels&>(samples), valid_pixels);
- }
-
- // Load partially-full, mis-aligned tile using sample iterator
- _CCCL_DEVICE _CCCL_FORCEINLINE void LoadTile(
- OffsetT block_offset,
- int valid_samples,
- SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
- ::cuda::std::false_type is_full_tile,
- ::cuda::std::false_type is_aligned)
- {
- using AliasedSamples = SampleT[SAMPLES_PER_THREAD];
-
- BlockLoadSampleT(temp_storage.aliasable.sample_load)
- .Load(d_wrapped_samples + block_offset, reinterpret_cast<AliasedSamples&>(samples), valid_samples);
  }

- template <bool IS_FULL_TILE>
+ template <bool IsFullTile, bool IsAligned>
  _CCCL_DEVICE _CCCL_FORCEINLINE void
- MarkValid(bool (&is_valid)[PIXELS_PER_THREAD], int valid_samples, ::cuda::std::false_type /* is_striped = false */)
+ LoadTile(OffsetT block_offset, int valid_samples, SampleT (&samples)[pixels_per_thread][NumChannels])
  {
- _CCCL_PRAGMA_UNROLL_FULL()
- for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
+ if constexpr (IsFullTile)
+ {
+ if constexpr (IsAligned)
+ {
+ LoadFullAlignedTile(block_offset, samples);
+ }
+ else
+ {
+ // Load using sample iterator
+ using AliasedSamples = SampleT[samples_per_thread];
+ BlockLoadSampleT{temp_storage.sample_load}.Load(
+ d_wrapped_samples + block_offset, reinterpret_cast<AliasedSamples&>(samples));
+ }
+ }
+ else
  {
- is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) * NUM_CHANNELS) < valid_samples);
+ if constexpr (IsAligned)
+ {
+ // Load partially-full, aligned tile using the pixel iterator
+ using AliasedPixels = PixelT[pixels_per_thread];
+ WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset));
+ int valid_pixels = valid_samples / NumChannels;
+
+ // Load using a wrapped pixel iterator
+ BlockLoadPixelT{temp_storage.pixel_load}.Load(
+ d_wrapped_pixels, reinterpret_cast<AliasedPixels&>(samples), valid_pixels);
+ }
+ else
+ {
+ using AliasedSamples = SampleT[samples_per_thread];
+ BlockLoadSampleT{temp_storage.sample_load}.Load(
+ d_wrapped_samples + block_offset, reinterpret_cast<AliasedSamples&>(samples), valid_samples);
+ }
  }
  }

- template <bool IS_FULL_TILE>
- _CCCL_DEVICE _CCCL_FORCEINLINE void
- MarkValid(bool (&is_valid)[PIXELS_PER_THREAD], int valid_samples, ::cuda::std::true_type /* is_striped = true */)
+ template <bool IsFullTile, bool IsStriped>
+ _CCCL_DEVICE _CCCL_FORCEINLINE void MarkValid(bool (&is_valid)[pixels_per_thread], int valid_samples)
  {
  _CCCL_PRAGMA_UNROLL_FULL()
- for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
+ for (int pixel = 0; pixel < pixels_per_thread; ++pixel)
  {
- is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x + BLOCK_THREADS * PIXEL) * NUM_CHANNELS) < valid_samples);
+ if constexpr (IsStriped)
+ {
+ is_valid[pixel] = IsFullTile || (((threadIdx.x + block_threads * pixel) * NumChannels) < valid_samples);
+ }
+ else
+ {
+ is_valid[pixel] = IsFullTile || (((threadIdx.x * pixels_per_thread + pixel) * NumChannels) < valid_samples);
+ }
  }
  }

- //---------------------------------------------------------------------
- // Tile processing
- //---------------------------------------------------------------------
-
- /**
- * @brief Consume a tile of data samples
- *
- * @tparam IS_ALIGNED
- * Whether the tile offset is aligned (vec-aligned for single-channel, pixel-aligned for multi-channel)
- *
- * @tparam IS_FULL_TILE
- Whether the tile is full
- */
- template <bool IS_ALIGNED, bool IS_FULL_TILE>
+ //! @brief Consume a tile of data samples
+ //!
+ //! @tparam IsAligned
+ //! Whether the tile offset is aligned (vec-aligned for single-channel, pixel-aligned for multi-channel)
+ //!
+ //! @tparam IsFullTile
+ //! Whether the tile is full
+ template <bool IsAligned, bool IsFullTile>
  _CCCL_DEVICE _CCCL_FORCEINLINE void ConsumeTile(OffsetT block_offset, int valid_samples)
  {
- SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS];
- bool is_valid[PIXELS_PER_THREAD];
-
- // Load tile
- LoadTile(block_offset, valid_samples, samples, bool_constant_v<IS_FULL_TILE>, bool_constant_v<IS_ALIGNED>);
+ SampleT samples[pixels_per_thread][NumChannels];
+ bool is_valid[pixels_per_thread];

- // Set valid flags
- MarkValid<IS_FULL_TILE>(
- is_valid, valid_samples, bool_constant_v < AgentHistogramPolicyT::LOAD_ALGORITHM == BLOCK_LOAD_STRIPED >);
+ LoadTile<IsFullTile, IsAligned>(block_offset, valid_samples, samples);
+ MarkValid<IsFullTile, AgentHistogramPolicyT::LOAD_ALGORITHM == BLOCK_LOAD_STRIPED>(is_valid, valid_samples);

- // Accumulate samples
  if (prefer_smem)
  {
- AccumulateSmemPixels(samples, is_valid);
+ CounterT* privatized_histograms[NumActiveChannels];
+ for (int ch = 0; ch < NumActiveChannels; ++ch)
+ {
+ privatized_histograms[ch] = temp_storage.histograms[ch];
+ }
+ AccumulatePixels(samples, is_valid, privatized_histograms, ::cuda::std::bool_constant<is_rle_compress>{});
  }
  else
  {
- AccumulateGmemPixels(samples, is_valid);
+ AccumulatePixels(samples, is_valid, d_privatized_histograms, ::cuda::std::bool_constant<is_rle_compress>{});
  }
  }

- /**
- * @brief Consume row tiles. Specialized for work-stealing from queue
- *
- * @param num_row_pixels
- * The number of multi-channel pixels per row in the region of interest
- *
- * @param num_rows
- * The number of rows in the region of interest
- *
- * @param row_stride_samples
- * The number of samples between starts of consecutive rows in the region of interest
- *
- * @param tiles_per_row
- * Number of image tiles per row
- */
- template <bool IS_ALIGNED>
+ //! @brief Consume row tiles. Specialized for work-stealing from queue
+ //!
+ //! @param num_row_pixels
+ //! The number of multi-channel pixels per row in the region of interest
+ //!
+ //! @param num_rows
+ //! The number of rows in the region of interest
+ //!
+ //! @param row_stride_samples
+ //! The number of samples between starts of consecutive rows in the region of interest
+ //!
+ //! @param tiles_per_row
+ //! Number of image tiles per row
+ template <bool IsAligned>
  _CCCL_DEVICE _CCCL_FORCEINLINE void ConsumeTiles(
  OffsetT num_row_pixels,
@@ -682,19 +490,19 @@ struct AgentHistogram
  int row = tile_idx / tiles_per_row;
  int col = tile_idx - (row * tiles_per_row);
  OffsetT row_offset = row * row_stride_samples;
- OffsetT col_offset = (col * TILE_SAMPLES);
+ OffsetT col_offset = (col * tile_samples);
  OffsetT tile_offset = row_offset + col_offset;

  if (col == tiles_per_row - 1)
  {
  // Consume a partially-full tile at the end of the row
- OffsetT num_remaining = (num_row_pixels * NUM_CHANNELS) - col_offset;
- ConsumeTile<IS_ALIGNED, false>(tile_offset, num_remaining);
+ OffsetT num_remaining = (num_row_pixels * NumChannels) - col_offset;
+ ConsumeTile<IsAligned, false>(tile_offset, num_remaining);
  }
  else
  {
  // Consume full tile
- ConsumeTile<IS_ALIGNED, true>(tile_offset, TILE_SAMPLES);
+ ConsumeTile<IsAligned, true>(tile_offset, tile_samples);
  }

  __syncthreads();
@@ -711,50 +519,40 @@ struct AgentHistogram
  }
  }

- /**
- * @brief Consume row tiles. Specialized for even-share (striped across thread blocks)
- *
- * @param num_row_pixels
- * The number of multi-channel pixels per row in the region of interest
- *
- * @param num_rows
- * The number of rows in the region of interest
- *
- * @param row_stride_samples
- * The number of samples between starts of consecutive rows in the region of interest
- *
- * @param tiles_per_row
- * Number of image tiles per row
- */
- template <bool IS_ALIGNED>
+ //! @brief Consume row tiles. Specialized for even-share (striped across thread blocks)
+ //!
+ //! @param num_row_pixels
+ //! The number of multi-channel pixels per row in the region of interest
+ //!
+ //! @param num_rows
+ //! The number of rows in the region of interest
+ //!
+ //! @param row_stride_samples
+ //! The number of samples between starts of consecutive rows in the region of interest
+ template <bool IsAligned>
  _CCCL_DEVICE _CCCL_FORCEINLINE void ConsumeTiles(
- OffsetT num_row_pixels,
- OffsetT num_rows,
- OffsetT row_stride_samples,
- int tiles_per_row,
- GridQueue<int> tile_queue,
- ::cuda::std::false_type is_work_stealing)
+ OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, int, GridQueue<int>, ::cuda::std::false_type)
  {
  for (int row = blockIdx.y; row < num_rows; row += gridDim.y)
  {
  OffsetT row_begin = row * row_stride_samples;
- OffsetT row_end = row_begin + (num_row_pixels * NUM_CHANNELS);
- OffsetT tile_offset = row_begin + (blockIdx.x * TILE_SAMPLES);
+ OffsetT row_end = row_begin + (num_row_pixels * NumChannels);
+ OffsetT tile_offset = row_begin + (blockIdx.x * tile_samples);

  while (tile_offset < row_end)
  {
  OffsetT num_remaining = row_end - tile_offset;

- if (num_remaining < TILE_SAMPLES)
+ if (num_remaining < tile_samples)
  {
  // Consume partial tile
- ConsumeTile<IS_ALIGNED, false>(tile_offset, num_remaining);
+ ConsumeTile<IsAligned, false>(tile_offset, num_remaining);
  break;
  }

  // Consume full tile
- ConsumeTile<IS_ALIGNED, true>(tile_offset, TILE_SAMPLES);
- tile_offset += gridDim.x * TILE_SAMPLES;
+ ConsumeTile<IsAligned, true>(tile_offset, tile_samples);
+ tile_offset += gridDim.x * tile_samples;
  }
  }
  }
@@ -763,51 +561,31 @@ struct AgentHistogram
  // Parameter extraction
  //---------------------------------------------------------------------

- // Return a native pixel pointer (specialized for CacheModifiedInputIterator types)
- template <CacheLoadModifier _MODIFIER, typename _ValueT, typename _OffsetT>
- _CCCL_DEVICE _CCCL_FORCEINLINE SampleT* NativePointer(CacheModifiedInputIterator<_MODIFIER, _ValueT, _OffsetT> itr)
- {
- return itr.ptr;
- }
-
- // Return a native pixel pointer (specialized for other types)
- template <typename IteratorT>
- _CCCL_DEVICE _CCCL_FORCEINLINE SampleT* NativePointer(IteratorT itr)
- {
- return nullptr;
- }
-
- //---------------------------------------------------------------------
- // Interface
- //---------------------------------------------------------------------
-
- /**
- * @brief Constructor
- *
- * @param temp_storage
- * Reference to temp_storage
- *
- * @param d_samples
- * Input data to reduce
- *
- * @param num_output_bins
- * The number bins per final output histogram
- *
- * @param num_privatized_bins
- * The number bins per privatized histogram
- *
- * @param d_output_histograms
- * Reference to final output histograms
- *
- * @param d_privatized_histograms
- * Reference to privatized histograms
- *
- * @param output_decode_op
- * The transform operator for determining output bin-ids from privatized counter indices, one for each channel
- *
- * @param privatized_decode_op
- * The transform operator for determining privatized counter indices from samples, one for each channel
- */
+ //! @brief Constructor
+ //!
+ //! @param temp_storage
+ //! Reference to temp_storage
+ //!
+ //! @param d_samples
+ //! Input data to reduce
+ //!
+ //! @param num_output_bins
+ //! The number bins per final output histogram
+ //!
+ //! @param num_privatized_bins
+ //! The number bins per privatized histogram
+ //!
+ //! @param d_output_histograms
+ //! Reference to final output histograms
+ //!
+ //! @param d_privatized_histograms
+ //! Reference to privatized histograms
+ //!
+ //! @param output_decode_op
+ //! The transform operator for determining output bin-ids from privatized counter indices, one for each channel
+ //!
+ //! @param privatized_decode_op
+ //! The transform operator for determining privatized counter indices from samples, one for each channel
  _CCCL_DEVICE _CCCL_FORCEINLINE AgentHistogram(
  TempStorage& temp_storage,
  SampleIteratorT d_samples,
@@ -825,39 +603,37 @@ struct AgentHistogram
  , d_output_histograms(d_output_histograms)
  , output_decode_op(output_decode_op)
  , privatized_decode_op(privatized_decode_op)
- , prefer_smem((MEM_PREFERENCE == SMEM) ? true : // prefer smem privatized histograms
- (MEM_PREFERENCE == GMEM) ? false
+ , prefer_smem((mem_preference == SMEM) ? true : // prefer smem privatized histograms
+ (mem_preference == GMEM) ? false
  : // prefer gmem privatized histograms
  blockIdx.x & 1) // prefer blended privatized histograms
  {
- int blockId = (blockIdx.y * gridDim.x) + blockIdx.x;
+ const int blockId = (blockIdx.y * gridDim.x) + blockIdx.x;

+ // TODO(bgruber): d_privatized_histograms seems only used when !prefer_smem, can we skip it if prefer_smem?
  // Initialize the locations of this block's privatized histograms
- for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+ for (int ch = 0; ch < NumActiveChannels; ++ch)
  {
- this->d_privatized_histograms[CHANNEL] =
- d_privatized_histograms[CHANNEL] + (blockId * num_privatized_bins[CHANNEL]);
+ this->d_privatized_histograms[ch] = d_privatized_histograms[ch] + (blockId * num_privatized_bins[ch]);
  }
  }

- /**
- * @brief Consume image
- *
- * @param num_row_pixels
- * The number of multi-channel pixels per row in the region of interest
- *
- * @param num_rows
- * The number of rows in the region of interest
- *
- * @param row_stride_samples
- * The number of samples between starts of consecutive rows in the region of interest
- *
- * @param tiles_per_row
- * Number of image tiles per row
- *
- * @param tile_queue
- * Queue descriptor for assigning tiles of work to thread blocks
- */
+ //! @brief Consume image
+ //!
+ //! @param num_row_pixels
+ //! The number of multi-channel pixels per row in the region of interest
+ //!
+ //! @param num_rows
+ //! The number of rows in the region of interest
+ //!
+ //! @param row_stride_samples
+ //! The number of samples between starts of consecutive rows in the region of interest
+ //!
+ //! @param tiles_per_row
+ //! Number of image tiles per row
+ //!
+ //! @param tile_queue
+ //! Queue descriptor for assigning tiles of work to thread blocks
  _CCCL_DEVICE _CCCL_FORCEINLINE void ConsumeTiles(
  OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, int tiles_per_row, GridQueue<int> tile_queue)
  {
@@ -866,14 +642,16 @@ struct AgentHistogram
  int pixel_mask = AlignBytes<PixelT>::ALIGN_BYTES - 1;
  size_t row_bytes = sizeof(SampleT) * row_stride_samples;

- bool vec_aligned_rows =
- (NUM_CHANNELS == 1) && (SAMPLES_PER_THREAD % VecSize == 0) && // Single channel
+ // FIXME(bgruber): const changes SASS
+ /*const*/ bool vec_aligned_rows =
+ (NumChannels == 1) && (samples_per_thread % vec_size == 0) && // Single channel
  ((size_t(d_native_samples) & vec_mask) == 0) && // ptr is quad-aligned
  ((num_rows == 1) || ((row_bytes & vec_mask) == 0)); // number of row-samples is a multiple of the alignment of the
  // quad

- bool pixel_aligned_rows =
- (NUM_CHANNELS > 1) && // Multi channel
+ // FIXME(bgruber): const changes SASS
+ /*const*/ bool pixel_aligned_rows =
+ (NumChannels > 1) && // Multi channel
  ((size_t(d_native_samples) & pixel_mask) == 0) && // ptr is pixel-aligned
  ((row_bytes & pixel_mask) == 0); // number of row-samples is a multiple of the alignment of the pixel

@@ -881,47 +659,51 @@ struct AgentHistogram
  if ((d_native_samples != nullptr) && (vec_aligned_rows || pixel_aligned_rows))
  {
  ConsumeTiles<true>(
- num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, bool_constant_v<IS_WORK_STEALING>);
+ num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, bool_constant_v<is_work_stealing>);
  }
  else
  {
  ConsumeTiles<false>(
- num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, bool_constant_v<IS_WORK_STEALING>);
+ num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, bool_constant_v<is_work_stealing>);
  }
  }

- /**
- * Initialize privatized bin counters. Specialized for privatized shared-memory counters
- */
+ //! Initialize privatized bin counters. Specialized for privatized shared-memory counters
  _CCCL_DEVICE _CCCL_FORCEINLINE void InitBinCounters()
  {
  if (prefer_smem)
  {
- InitSmemBinCounters();
+ CounterT* privatized_histograms[NumActiveChannels];
+ for (int ch = 0; ch < NumActiveChannels; ++ch)
+ {
+ privatized_histograms[ch] = temp_storage.histograms[ch];
+ }
+ ZeroBinCounters(privatized_histograms);
  }
  else
  {
- InitGmemBinCounters();
+ ZeroBinCounters(d_privatized_histograms);
  }
  }

- /**
- * Store privatized histogram to device-accessible memory. Specialized for privatized shared-memory counters
- */
+ //! Store privatized histogram to device-accessible memory. Specialized for privatized shared-memory counters
  _CCCL_DEVICE _CCCL_FORCEINLINE void StoreOutput()
  {
  if (prefer_smem)
  {
- StoreSmemOutput();
+ CounterT* privatized_histograms[NumActiveChannels];
+ for (int ch = 0; ch < NumActiveChannels; ++ch)
+ {
+ privatized_histograms[ch] = temp_storage.histograms[ch];
+ }
+ StoreOutput(privatized_histograms);
  }
  else
  {
- StoreGmemOutput();
+ StoreOutput(d_privatized_histograms);
  }
  }
  };
-
- } // namespace histogram
- } // namespace detail
+ } // namespace detail::histogram

  CUB_NAMESPACE_END
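
Note (illustrative sketch, not part of the released files): the bulk of the agent_histogram.cuh change shown above renames AgentHistogramPolicy's template parameters from reserved-style identifiers (_BLOCK_THREADS, _PIXELS_PER_THREAD, ...) to CamelCase (BlockThreads, PixelsPerThread, ...) and converts the Doxygen blocks to //! comments, while the nested constants (BLOCK_THREADS, PIXELS_PER_THREAD, VEC_SIZE, ...) keep their names. Because template arguments are bound by position, existing instantiations of the policy should be unaffected. A minimal sketch of that claim, assuming the usual CUB enumerators BLOCK_LOAD_DIRECT, LOAD_DEFAULT, and SMEM; the alias name MyHistogramPolicy is made up for illustration:

#include <cub/agent/agent_histogram.cuh> // internal CUB detail header

// 256 threads/block, 8 pixels/thread, direct block loads, default cache modifier,
// RLE compression on, shared-memory privatized bins, no work stealing, vector size 4.
using MyHistogramPolicy =
  cub::AgentHistogramPolicy<256, 8, cub::BLOCK_LOAD_DIRECT, cub::LOAD_DEFAULT, true, cub::SMEM, false, 4>;

// The nested constants documented in the diff keep their pre-0.3.2 names and values.
static_assert(MyHistogramPolicy::BLOCK_THREADS == 256);
static_assert(MyHistogramPolicy::PIXELS_PER_THREAD == 8);
static_assert(MyHistogramPolicy::VEC_SIZE == 4);

The same positional reasoning applies to AgentHistogram itself, whose PRIVATIZED_SMEM_BINS/NUM_CHANNELS/NUM_ACTIVE_CHANNELS parameters become PrivatizedSmemBins/NumChannels/NumActiveChannels, and whose tag-dispatch overloads (LoadTile, MarkValid, the Accumulate*Pixels and Init*/Store* helpers) are folded into single functions selected with if constexpr, as shown in the hunks above.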