cuda-cccl 0.3.1__cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.2__cp312-cp312-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Advisory: this release of cuda-cccl has been flagged as potentially problematic. Consult the package registry's advisory page for details before upgrading.

Files changed (185) hide show
  1. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
  2. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
  3. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
  4. cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
  5. cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
  6. cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
  7. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1 -0
  8. cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
  9. cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
  10. cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
  11. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
  12. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
  13. cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
  14. cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
  15. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +12 -13
  16. cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
  17. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +2 -3
  18. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +4 -3
  19. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -1
  20. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
  21. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
  22. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
  23. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
  24. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
  25. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
  26. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
  27. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
  28. cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
  29. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
  30. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
  31. cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
  32. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
  33. cuda/cccl/headers/include/cuda/__cccl_config +1 -0
  34. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
  35. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
  36. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
  37. cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
  38. cuda/cccl/headers/include/cuda/__device/arch_traits.h +239 -317
  39. cuda/cccl/headers/include/cuda/__device/attributes.h +4 -3
  40. cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
  41. cuda/cccl/headers/include/cuda/__device/device_ref.h +0 -10
  42. cuda/cccl/headers/include/cuda/__device/physical_device.h +1 -26
  43. cuda/cccl/headers/include/cuda/__event/event.h +26 -26
  44. cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
  45. cuda/cccl/headers/include/cuda/__event/timed_event.h +9 -7
  46. cuda/cccl/headers/include/cuda/__fwd/devices.h +4 -4
  47. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
  48. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
  49. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
  50. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
  51. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
  52. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
  53. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +2 -12
  54. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +15 -19
  55. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +59 -60
  56. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
  57. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
  58. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
  59. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
  60. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
  61. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
  62. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
  63. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
  64. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
  65. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
  66. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +5 -4
  67. cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
  68. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +17 -16
  69. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  70. cuda/cccl/headers/include/cuda/cmath +1 -0
  71. cuda/cccl/headers/include/cuda/devices +3 -0
  72. cuda/cccl/headers/include/cuda/memory +1 -0
  73. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
  74. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
  75. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
  76. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
  77. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
  78. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
  79. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
  80. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
  81. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
  82. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  83. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
  84. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
  85. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
  86. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  87. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  88. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
  89. cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
  90. cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
  91. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
  92. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
  93. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
  94. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
  95. cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
  96. cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
  97. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
  98. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
  99. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
  100. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
  101. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
  102. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
  103. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
  104. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
  105. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
  106. cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
  107. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
  108. cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
  109. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
  110. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
  111. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
  112. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
  113. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
  114. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
  115. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
  116. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
  117. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
  118. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
  119. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
  120. cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
  121. cuda/cccl/headers/include/cuda/std/bitset +1 -1
  122. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
  123. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
  124. cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
  125. cuda/cccl/headers/include/cuda/std/numbers +5 -0
  126. cuda/cccl/headers/include/cuda/std/string_view +146 -11
  127. cuda/cccl/headers/include/cuda/stream_ref +5 -0
  128. cuda/cccl/headers/include/cuda/utility +1 -0
  129. cuda/cccl/headers/include/nv/target +7 -2
  130. cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
  131. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
  132. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
  133. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
  134. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
  135. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
  136. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
  137. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
  138. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
  139. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
  140. cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
  141. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
  142. cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
  143. cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
  144. cuda/cccl/headers/include/thrust/device_delete.h +18 -3
  145. cuda/cccl/headers/include/thrust/device_free.h +16 -3
  146. cuda/cccl/headers/include/thrust/device_new.h +29 -8
  147. cuda/cccl/headers/include/thrust/host_vector.h +1 -1
  148. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
  149. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
  150. cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
  151. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
  152. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
  153. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
  154. cuda/compute/__init__.py +2 -0
  155. cuda/compute/_bindings.pyi +43 -1
  156. cuda/compute/_bindings_impl.pyx +156 -7
  157. cuda/compute/algorithms/_scan.py +108 -36
  158. cuda/compute/algorithms/_transform.py +32 -11
  159. cuda/compute/cu12/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
  160. cuda/compute/cu12/cccl/libcccl.c.parallel.so +0 -0
  161. cuda/compute/cu13/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
  162. cuda/compute/cu13/cccl/libcccl.c.parallel.so +0 -0
  163. cuda/compute/iterators/__init__.py +2 -0
  164. cuda/compute/iterators/_factories.py +28 -0
  165. cuda/compute/iterators/_iterators.py +206 -1
  166. cuda/compute/numba_utils.py +2 -2
  167. cuda/compute/typing.py +2 -0
  168. {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
  169. {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +171 -175
  170. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
  171. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
  172. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
  173. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
  174. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
  175. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
  176. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
  177. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
  178. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
  179. cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
  180. cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
  181. cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
  182. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
  183. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
  184. {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
  185. {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
@@ -111,10 +111,9 @@ CUB_NAMESPACE_BEGIN
111
111
  //! // Collectively compute adjacent_difference
112
112
  //! int result[4];
113
113
  //!
114
- //! BlockAdjacentDifferenceT(temp_storage).SubtractLeft(
115
- //! thread_data,
116
- //! result,
117
- //! CustomDifference());
114
+ //! BlockAdjacentDifferenceT(temp_storage).SubtractLeft(thread_data, result,
115
+ //! CustomDifference());
116
+ //! }
118
117
  //!
119
118
  //! Suppose the set of input `thread_data` across the block of threads is
120
119
  //! ``{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }``.
@@ -283,10 +282,9 @@ public:
283
282
  //! ...
284
283
  //!
285
284
  //! // Collectively compute adjacent_difference
286
- //! BlockAdjacentDifferenceT(temp_storage).SubtractLeft(
287
- //! thread_data,
288
- //! thread_data,
289
- //! CustomDifference());
285
+ //! BlockAdjacentDifferenceT(temp_storage).SubtractLeft(thread_data, thread_data,
286
+ //! CustomDifference());
287
+ //! }
290
288
  //!
291
289
  //! Suppose the set of input ``thread_data`` across the block of threads is
292
290
  //! ``{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }``.
@@ -96,6 +96,7 @@ CUB_NAMESPACE_BEGIN
96
96
  //! // Collectively compute head flags for discontinuities in the segment
97
97
  //! int head_flags[4];
98
98
  //! BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
99
+ //! }
99
100
  //!
100
101
  //! Suppose the set of input ``thread_data`` across the block of threads is
101
102
  //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }``.
@@ -387,6 +388,7 @@ public:
387
388
  //! // Collectively compute head flags for discontinuities in the segment
388
389
  //! int head_flags[4];
389
390
  //! BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
391
+ //! }
390
392
  //!
391
393
  //! Suppose the set of input ``thread_data`` across the block of threads is
392
394
  //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }``.
@@ -463,8 +465,9 @@ public:
463
465
  //!
464
466
  //! // Collectively compute head flags for discontinuities in the segment
465
467
  //! int head_flags[4];
466
- //! BlockDiscontinuity(temp_storage).FlagHeads(
467
- //! head_flags, thread_data, cub::Inequality(), tile_predecessor_item);
468
+ //! BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data,
469
+ //! cub::Inequality(), tile_predecessor_item);
470
+ //! }
468
471
  //!
469
472
  //! Suppose the set of input ``thread_data`` across the block of threads is
470
473
  //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }``,
@@ -549,6 +552,7 @@ public:
549
552
  //! // Collectively compute tail flags for discontinuities in the segment
550
553
  //! int tail_flags[4];
551
554
  //! BlockDiscontinuity(temp_storage).FlagTails(tail_flags, thread_data, cub::Inequality());
555
+ //! }
552
556
  //!
553
557
  //! Suppose the set of input ``thread_data`` across the block of threads is
554
558
  //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``.
@@ -640,8 +644,9 @@ public:
640
644
  //!
641
645
  //! // Collectively compute tail flags for discontinuities in the segment
642
646
  //! int tail_flags[4];
643
- //! BlockDiscontinuity(temp_storage).FlagTails(
644
- //! tail_flags, thread_data, cub::Inequality(), tile_successor_item);
647
+ //! BlockDiscontinuity(temp_storage).FlagTails(tail_flags, thread_data,
648
+ //! cub::Inequality(), tile_successor_item);
649
+ //! }
645
650
  //!
646
651
  //! Suppose the set of input ``thread_data`` across the block of threads is
647
652
  //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``
@@ -742,8 +747,9 @@ public:
742
747
  //! // Collectively compute head and flags for discontinuities in the segment
743
748
  //! int head_flags[4];
744
749
  //! int tail_flags[4];
745
- //! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
746
- //! head_flags, tail_flags, thread_data, cub::Inequality());
750
+ //! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(head_flags, tail_flags, thread_data,
751
+ //! cub::Inequality());
752
+ //! }
747
753
  //!
748
754
  //! Suppose the set of input ``thread_data`` across the block of threads is
749
755
  //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``
@@ -864,8 +870,10 @@ public:
864
870
  //! // Collectively compute head and flags for discontinuities in the segment
865
871
  //! int head_flags[4];
866
872
  //! int tail_flags[4];
867
- //! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
868
- //! head_flags, tail_flags, tile_successor_item, thread_data, cub::Inequality());
873
+ //! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(head_flags, tail_flags,
874
+ //! tile_successor_item, thread_data,
875
+ //! cub::Inequality());
876
+ //! }
869
877
  //!
870
878
  //! Suppose the set of input ``thread_data`` across the block of threads is
871
879
  //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``
@@ -997,9 +1005,10 @@ public:
997
1005
  //! // Collectively compute head and flags for discontinuities in the segment
998
1006
  //! int head_flags[4];
999
1007
  //! int tail_flags[4];
1000
- //! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
1001
- //! head_flags, tile_predecessor_item, tail_flags, tile_successor_item,
1002
- //! thread_data, cub::Inequality());
1008
+ //! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(head_flags, tile_predecessor_item,
1009
+ //! tail_flags, tile_successor_item,
1010
+ //! thread_data, cub::Inequality());
1011
+ //! }
1003
1012
  //!
1004
1013
  //! Suppose the set of input ``thread_data`` across the block of threads is
1005
1014
  //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``,
@@ -1126,9 +1135,10 @@ public:
1126
1135
  //! // Collectively compute head and flags for discontinuities in the segment
1127
1136
  //! int head_flags[4];
1128
1137
  //! int tail_flags[4];
1129
- //! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
1130
- //! head_flags, tile_predecessor_item, tail_flags, tile_successor_item,
1131
- //! thread_data, cub::Inequality());
1138
+ //! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(head_flags, tile_predecessor_item,
1139
+ //! tail_flags, tile_successor_item,
1140
+ //! thread_data, cub::Inequality());
1141
+ //! }
1132
1142
  //!
1133
1143
  //! Suppose the set of input ``thread_data`` across the block of threads is
1134
1144
  //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``,
@@ -101,6 +101,7 @@ CUB_NAMESPACE_BEGIN
101
101
  //!
102
102
  //! // Collectively exchange data into a blocked arrangement across threads
103
103
  //! BlockExchange(temp_storage).StripedToBlocked(thread_data);
104
+ //! }
104
105
  //!
105
106
  //! Suppose the set of striped input ``thread_data`` across the block of threads is ``{ [0,128,256,384],
106
107
  //! [1,129,257,385], ..., [127,255,383,511] }``. The corresponding output ``thread_data`` in those threads will be
@@ -883,6 +884,7 @@ public:
883
884
  //!
884
885
  //! // Collectively exchange data into a blocked arrangement across threads
885
886
  //! BlockExchange(temp_storage).StripedToBlocked(thread_data, thread_data);
887
+ //! }
886
888
  //!
887
889
  //! Suppose the set of striped input ``thread_data`` across the block of threads is ``{ [0,128,256,384],
888
890
  //! [1,129,257,385], ..., [127,255,383,511] }`` after loading from device-accessible memory. The corresponding output
@@ -933,6 +935,7 @@ public:
933
935
  //!
934
936
  //! // Store data striped across block threads into an ordered tile
935
937
  //! cub::StoreDirectStriped<STORE_DEFAULT, 128>(threadIdx.x, d_data, thread_data);
938
+ //! }
936
939
  //!
937
940
  //! Suppose the set of blocked input ``thread_data`` across the block of threads is ``{ [0,1,2,3], [4,5,6,7],
938
941
  //! [8,9,10,11], ..., [508,509,510,511] }``. The corresponding output ``thread_data`` in those threads will be
@@ -983,6 +986,7 @@ public:
983
986
  //!
984
987
  //! // Collectively exchange data into a blocked arrangement across threads
985
988
  //! BlockExchange(temp_storage).WarpStripedToBlocked(thread_data);
989
+ //! }
986
990
  //!
987
991
  //! Suppose the set of warp-striped input ``thread_data`` across the block of threads is ``{ [0,32,64,96],
988
992
  //! [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }`` after loading from device-accessible memory. (The first 128
@@ -1037,6 +1041,7 @@ public:
1037
1041
  //!
1038
1042
  //! // Store data striped across warp threads into an ordered tile
1039
1043
  //! cub::StoreDirectStriped<STORE_DEFAULT, 128>(threadIdx.x, d_data, thread_data);
1044
+ //! }
1040
1045
  //!
1041
1046
  //! Suppose the set of blocked input ``thread_data`` across the block of threads is ``{ [0,1,2,3], [4,5,6,7],
1042
1047
  //! [8,9,10,11], ..., [508,509,510,511] }``. The corresponding output ``thread_data`` in those threads will be
@@ -140,6 +140,7 @@ enum BlockHistogramAlgorithm
140
140
  //!
141
141
  //! // Compute the block-wide histogram
142
142
  //! BlockHistogram(temp_storage).Histogram(data, smem_histogram);
143
+ //! }
143
144
  //!
144
145
  //! Performance and Usage Considerations
145
146
  //! +++++++++++++++++++++++++++++++++++++++++++++
@@ -281,6 +282,7 @@ public:
281
282
  //!
282
283
  //! // Update the block-wide histogram
283
284
  //! BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram);
285
+ //! }
284
286
  //!
285
287
  //! @endrst
286
288
  //!
@@ -338,6 +340,7 @@ public:
338
340
  //!
339
341
  //! // Compute the block-wide histogram
340
342
  //! BlockHistogram(temp_storage).Histogram(thread_samples, smem_histogram);
343
+ //! }
341
344
  //!
342
345
  //! @endrst
343
346
  //!
@@ -399,6 +402,7 @@ public:
399
402
  //!
400
403
  //! // Update the block-wide histogram
401
404
  //! BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram);
405
+ //! }
402
406
  //!
403
407
  //! @endrst
404
408
  //!
@@ -771,6 +771,7 @@ enum BlockLoadAlgorithm
771
771
  //! // Load a segment of consecutive items that are blocked across threads
772
772
  //! int thread_data[4];
773
773
  //! BlockLoad(temp_storage).Load(d_data, thread_data);
774
+ //! }
774
775
  //!
775
776
  //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``. The set of ``thread_data`` across the block of threads in
776
777
  //! those threads will be ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``.
@@ -1123,6 +1124,7 @@ public:
1123
1124
  //! // Load a segment of consecutive items that are blocked across threads
1124
1125
  //! int thread_data[4];
1125
1126
  //! BlockLoad(temp_storage).Load(d_data, thread_data);
1127
+ //! }
1126
1128
  //!
1127
1129
  //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``. The set of ``thread_data`` across the block of threads
1128
1130
  //! in those threads will be ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``.
@@ -1170,6 +1172,7 @@ public:
1170
1172
  //! // Load a segment of consecutive items that are blocked across threads
1171
1173
  //! int thread_data[4];
1172
1174
  //! BlockLoad(temp_storage).Load(d_data, thread_data, block_items_end);
1175
+ //! }
1173
1176
  //!
1174
1177
  //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, 6...`` and ``block_items_end`` is ``5``. The set of
1175
1178
  //! ``thread_data`` across the block of threads in those threads will be ``{ [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }``,
@@ -1222,6 +1225,7 @@ public:
1222
1225
  //! // Load a segment of consecutive items that are blocked across threads
1223
1226
  //! int thread_data[4];
1224
1227
  //! BlockLoad(temp_storage).Load(d_data, thread_data, block_items_end, -1);
1228
+ //! }
1225
1229
  //!
1226
1230
  //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, 6...``, ``block_items_end`` is ``5``, and the out-of-bounds
1227
1231
  //! default is ``-1``. The set of ``thread_data`` across the block of threads in those threads will be
@@ -169,6 +169,7 @@ struct warp_in_block_matcher_t<Bits, 0, PartialWarpId>
169
169
  //! block_radix_rank(temp_storage).RankKeys(keys, ranks, extractor);
170
170
  //!
171
171
  //! ...
172
+ //! }
172
173
  //!
173
174
  //! Suppose the set of input ``keys`` across the block of threads is ``{ [16,10], [9,11] }``.
174
175
  //! The corresponding output ``ranks`` in those threads will be ``{ [3,1], [0,2] }``.
@@ -425,6 +425,7 @@ public:
425
425
  //!
426
426
  //! // Compute the block-wide max for thread0
427
427
  //! int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cuda::maximum<>{});
428
+ //! }
428
429
  //!
429
430
  //! @endrst
430
431
  //!
@@ -190,6 +190,7 @@ enum BlockScanAlgorithm
190
190
  //!
191
191
  //! // Collectively compute the block-wide exclusive prefix sum
192
192
  //! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
193
+ //! }
193
194
  //!
194
195
  //! Suppose the set of input ``thread_data`` across the block of threads is
195
196
  //! ``{[1,1,1,1], [1,1,1,1], ..., [1,1,1,1]}``.
@@ -333,6 +334,7 @@ public:
333
334
  //!
334
335
  //! // Collectively compute the block-wide exclusive prefix sum
335
336
  //! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
337
+ //! }
336
338
  //!
337
339
  //! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``.
338
340
  //! The corresponding output ``thread_data`` in those threads will be ``0, 1, ..., 127``.
@@ -386,6 +388,7 @@ public:
386
388
  //! // Collectively compute the block-wide exclusive prefix sum
387
389
  //! int block_aggregate;
388
390
  //! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate);
391
+ //! }
389
392
  //!
390
393
  //! Suppose the set of input ``thread_data`` across the block of threads is ``1, 1, ..., 1``.
391
394
  //! The corresponding output ``thread_data`` in those threads will be ``0, 1, ..., 127``.
@@ -479,6 +482,7 @@ public:
479
482
  //! // Store scanned items to output segment
480
483
  //! d_data[block_offset + threadIdx.x] = thread_data;
481
484
  //! }
485
+ //! }
482
486
  //!
483
487
  //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``.
484
488
  //! The corresponding output for the first segment will be ``0, 1, ..., 127``.
@@ -545,6 +549,7 @@ public:
545
549
  //!
546
550
  //! // Collectively compute the block-wide exclusive prefix sum
547
551
  //! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
552
+ //! }
548
553
  //!
549
554
  //! Suppose the set of input ``thread_data`` across the block of threads is
550
555
  //! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``.
@@ -606,6 +611,7 @@ public:
606
611
  //! // Collectively compute the block-wide exclusive prefix sum
607
612
  //! int block_aggregate;
608
613
  //! BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate);
614
+ //! }
609
615
  //!
610
616
  //! Suppose the set of input ``thread_data`` across the block of threads is
611
617
  //! ``{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }``.
@@ -720,6 +726,7 @@ public:
720
726
  //! BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
721
727
  //! __syncthreads();
722
728
  //! }
729
+ //! }
723
730
  //!
724
731
  //! Suppose the input ``d_data`` is ``1, 1, 1, 1, 1, 1, 1, 1, ...``.
725
732
  //! The corresponding output for the first segment will be ``0, 1, 2, 3, ..., 510, 511``.
@@ -788,6 +795,7 @@ public:
788
795
  //!
789
796
  //! // Collectively compute the block-wide exclusive prefix max scan
790
797
  //! BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cuda::maximum<>{});
798
+ //! }
791
799
  //!
792
800
  //! Suppose the set of input ``thread_data`` across the block of threads is ``0, -1, 2, -3, ..., 126, -127``.
793
801
  //! The corresponding output ``thread_data`` in those threads will be ``INT_MIN, 0, 0, 2, ..., 124, 126``.
@@ -849,8 +857,9 @@ public:
849
857
  //!
850
858
  //! // Collectively compute the block-wide exclusive prefix max scan
851
859
  //! int block_aggregate;
852
- //! BlockScan(temp_storage).ExclusiveScan(
853
- //! thread_data, thread_data, INT_MIN, cuda::maximum<>{}, block_aggregate);
860
+ //! BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data,
861
+ //! INT_MIN, cuda::maximum<>{}, block_aggregate);
862
+ //! }
854
863
  //!
855
864
  //! Suppose the set of input ``thread_data`` across the block of threads is ``0, -1, 2, -3, ..., 126, -127``.
856
865
  //! The corresponding output ``thread_data`` in those threads will be ``INT_MIN, 0, 0, 2, ..., 124, 126``.
@@ -960,6 +969,7 @@ public:
960
969
  //! // Store scanned items to output segment
961
970
  //! d_data[block_offset + threadIdx.x] = thread_data;
962
971
  //! }
972
+ //! }
963
973
  //!
964
974
  //! Suppose the input ``d_data`` is ``0, -1, 2, -3, 4, -5, ...``.
965
975
  //! The corresponding output for the first segment will be ``INT_MIN, 0, 0, 2, ..., 124, 126``.
@@ -616,6 +616,7 @@ enum BlockStoreAlgorithm
616
616
  //!
617
617
  //! // Store items to linear memory
618
618
  //! BlockStore(temp_storage).Store(d_data, thread_data);
619
+ //! }
619
620
  //!
620
621
  //! Suppose the set of ``thread_data`` across the block of threads is
621
622
  //! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``.
@@ -1156,8 +1157,8 @@ public:
1156
1157
  //! ...
1157
1158
  //!
1158
1159
  //! // Store items to linear memory
1159
- //! int thread_data[4];
1160
1160
  //! BlockStore(temp_storage).Store(d_data, thread_data);
1161
+ //! }
1161
1162
  //!
1162
1163
  //! Suppose the set of ``thread_data`` across the block of threads is
1163
1164
  //! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``.
@@ -1208,8 +1209,8 @@ public:
1208
1209
  //! ...
1209
1210
  //!
1210
1211
  //! // Store items to linear memory
1211
- //! int thread_data[4];
1212
1212
  //! BlockStore(temp_storage).Store(d_data, thread_data, valid_items);
1213
+ //! }
1213
1214
  //!
1214
1215
  //! Suppose the set of ``thread_data`` across the block of threads is
1215
1216
  //! ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }`` and ``valid_items`` is ``5``.
@@ -15,71 +15,76 @@
15
15
 
16
16
  #include <cub/detail/fast_modulo_division.cuh> // fast_div_mod
17
17
 
18
+ #include <cuda/std/__mdspan/extents.h>
18
19
  #include <cuda/std/__type_traits/make_unsigned.h>
19
20
  #include <cuda/std/__utility/integer_sequence.h>
20
21
  #include <cuda/std/array>
21
22
  #include <cuda/std/cstddef>
22
- #include <cuda/std/mdspan>
23
23
 
24
24
  CUB_NAMESPACE_BEGIN
25
-
26
25
  namespace detail
27
26
  {
28
27
 
28
+ _CCCL_DIAG_PUSH
29
+ _CCCL_DIAG_SUPPRESS_MSVC(4702) // unreachable code (even if there are no branches!)
30
+
29
31
  // Compute the submdspan size of a given rank
30
- template <size_t Rank, typename IndexType, size_t Extent0, size_t... Extents>
31
- [[nodiscard]] _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr ::cuda::std::make_unsigned_t<IndexType>
32
- sub_size(const ::cuda::std::extents<IndexType, Extent0, Extents...>& ext)
32
+ template <typename IndexType, size_t... Extents>
33
+ [[nodiscard]] _CCCL_API constexpr ::cuda::std::make_unsigned_t<IndexType>
34
+ size_range(const ::cuda::std::extents<IndexType, Extents...>& ext, int start, int end)
33
35
  {
36
+ _CCCL_ASSERT(start >= 0 && end <= static_cast<int>(ext.rank()), "invalid start or end");
34
37
  ::cuda::std::make_unsigned_t<IndexType> s = 1;
35
- for (IndexType i = Rank; i < IndexType{1 + sizeof...(Extents)}; i++) // <- pointless comparison with zero-rank extent
38
+ for (auto i = start; i < end; i++)
36
39
  {
37
40
  s *= ext.extent(i);
38
41
  }
39
42
  return s;
40
43
  }
41
44
 
42
- // avoid pointless comparison of unsigned integer with zero (nvcc 11.x doesn't support nv_diag warning suppression)
43
- template <size_t Rank, typename IndexType>
44
- [[nodiscard]] _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr ::cuda::std::make_unsigned_t<IndexType>
45
- sub_size(const ::cuda::std::extents<IndexType>&)
45
+ _CCCL_DIAG_POP // MSVC(4702)
46
+
47
+ template <typename IndexType, size_t... Extents>
48
+ [[nodiscard]] _CCCL_API constexpr ::cuda::std::make_unsigned_t<IndexType>
49
+ size(const ::cuda::std::extents<IndexType, Extents...>& ext)
46
50
  {
47
- return ::cuda::std::make_unsigned_t<IndexType>{1};
51
+ return cub::detail::size_range(ext, 0, static_cast<int>(ext.rank()));
48
52
  }
49
53
 
50
- // TODO: move to cuda::std
51
- template <typename IndexType, size_t... Extents>
52
- [[nodiscard]] _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr ::cuda::std::make_unsigned_t<IndexType>
53
- size(const ::cuda::std::extents<IndexType, Extents...>& ext)
54
+ template <bool IsLayoutRight, int Position, typename IndexType, size_t... E>
55
+ [[nodiscard]] _CCCL_API auto sub_size_fast_div_mod_impl(const ::cuda::std::extents<IndexType, E...>& ext)
54
56
  {
55
- return cub::detail::sub_size<0>(ext);
57
+ using fast_mod_div_t = fast_div_mod<IndexType>;
58
+ constexpr auto start = IsLayoutRight ? Position + 1 : 0;
59
+ constexpr auto end = IsLayoutRight ? sizeof...(E) : Position;
60
+ return fast_mod_div_t(cub::detail::size_range(ext, start, end));
56
61
  }
57
62
 
58
63
  // precompute modulo/division for each submdspan size (by rank)
59
- template <typename IndexType, size_t... E, size_t... Ranks>
60
- [[nodiscard]] _CCCL_HOST_DEVICE _CCCL_FORCEINLINE auto
61
- sub_sizes_fast_div_mod(const ::cuda::std::extents<IndexType, E...>& ext, ::cuda::std::index_sequence<Ranks...> = {})
64
+ template <bool IsLayoutRight, typename IndexType, size_t... E, size_t... Positions>
65
+ [[nodiscard]] _CCCL_API auto
66
+ sub_sizes_fast_div_mod(const ::cuda::std::extents<IndexType, E...>& ext, ::cuda::std::index_sequence<Positions...> = {})
62
67
  {
63
- // deduction guides don't work with nvcc 11.x
64
68
  using fast_mod_div_t = fast_div_mod<IndexType>;
65
- return ::cuda::std::array<fast_mod_div_t, sizeof...(Ranks)>{fast_mod_div_t(sub_size<Ranks + 1>(ext))...};
69
+ using array_t = ::cuda::std::array<fast_mod_div_t, sizeof...(Positions)>;
70
+ return array_t{cub::detail::sub_size_fast_div_mod_impl<IsLayoutRight, Positions>(ext)...};
66
71
  }
67
72
 
68
73
  // precompute modulo/division for each mdspan extent
69
- template <typename IndexType, size_t... E, size_t... Ranks>
70
- [[nodiscard]] _CCCL_HOST_DEVICE _CCCL_FORCEINLINE auto
71
- extents_fast_div_mod(const ::cuda::std::extents<IndexType, E...>& ext, ::cuda::std::index_sequence<Ranks...> = {})
74
+ template <typename IndexType, size_t... E, size_t... Positions>
75
+ [[nodiscard]] _CCCL_API auto
76
+ extents_fast_div_mod(const ::cuda::std::extents<IndexType, E...>& ext, ::cuda::std::index_sequence<Positions...> = {})
72
77
  {
73
78
  using fast_mod_div_t = fast_div_mod<IndexType>;
74
- return ::cuda::std::array<fast_mod_div_t, sizeof...(Ranks)>{fast_mod_div_t(ext.extent(Ranks))...};
79
+ using array_t = ::cuda::std::array<fast_mod_div_t, sizeof...(Positions)>;
80
+ return array_t{fast_mod_div_t(ext.extent(Positions))...};
75
81
  }
76
82
 
77
83
  // GCC <= 9 constexpr workaround: Extent must be passed as type only, even const Extent& doesn't work
78
- template <int Rank, typename Extents>
79
- [[nodiscard]] _CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr bool is_sub_size_static()
84
+ template <typename Extents>
85
+ [[nodiscard]] _CCCL_API constexpr bool are_extents_in_range_static(int start, int end)
80
86
  {
81
- using index_type = typename Extents::index_type;
82
- for (index_type i = Rank; i < Extents::rank(); i++)
87
+ for (auto i = start; i < end; i++)
83
88
  {
84
89
  if (Extents::static_extent(i) == ::cuda::std::dynamic_extent)
85
90
  {
@@ -106,5 +111,4 @@ template <typename MappingTypeLhs, typename MappingTypeRhs>
106
111
  }
107
112
 
108
113
  } // namespace detail
109
-
110
114
  CUB_NAMESPACE_END
@@ -29,7 +29,7 @@
29
29
 
30
30
  #include <cub/config.cuh>
31
31
 
32
- #include <thrust/detail/algorithm_wrapper.h>
32
+ #include <cuda/std/__cccl/algorithm_wrapper.h>
33
33
 
34
34
  #include <format>
35
35
  #include <string_view>