cuda-cccl 0.3.1-cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.2-cp312-cp312-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of cuda-cccl might be problematic.

Files changed (185)
  1. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
  2. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
  3. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
  4. cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
  5. cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
  6. cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
  7. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1 -0
  8. cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
  9. cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
  10. cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
  11. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
  12. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
  13. cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
  14. cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
  15. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +12 -13
  16. cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
  17. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +2 -3
  18. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +4 -3
  19. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -1
  20. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
  21. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
  22. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
  23. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
  24. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
  25. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
  26. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
  27. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
  28. cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
  29. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
  30. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
  31. cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
  32. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
  33. cuda/cccl/headers/include/cuda/__cccl_config +1 -0
  34. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
  35. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
  36. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
  37. cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
  38. cuda/cccl/headers/include/cuda/__device/arch_traits.h +239 -317
  39. cuda/cccl/headers/include/cuda/__device/attributes.h +4 -3
  40. cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
  41. cuda/cccl/headers/include/cuda/__device/device_ref.h +0 -10
  42. cuda/cccl/headers/include/cuda/__device/physical_device.h +1 -26
  43. cuda/cccl/headers/include/cuda/__event/event.h +26 -26
  44. cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
  45. cuda/cccl/headers/include/cuda/__event/timed_event.h +9 -7
  46. cuda/cccl/headers/include/cuda/__fwd/devices.h +4 -4
  47. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
  48. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
  49. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
  50. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
  51. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
  52. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
  53. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +2 -12
  54. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +15 -19
  55. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +59 -60
  56. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
  57. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
  58. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
  59. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
  60. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
  61. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
  62. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
  63. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
  64. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
  65. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
  66. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +5 -4
  67. cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
  68. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +17 -16
  69. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  70. cuda/cccl/headers/include/cuda/cmath +1 -0
  71. cuda/cccl/headers/include/cuda/devices +3 -0
  72. cuda/cccl/headers/include/cuda/memory +1 -0
  73. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
  74. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
  75. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
  76. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
  77. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
  78. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
  79. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
  80. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
  81. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
  82. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  83. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
  84. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
  85. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
  86. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  87. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  88. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
  89. cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
  90. cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
  91. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
  92. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
  93. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
  94. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
  95. cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
  96. cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
  97. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
  98. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
  99. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
  100. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
  101. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
  102. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
  103. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
  104. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
  105. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
  106. cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
  107. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
  108. cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
  109. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
  110. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
  111. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
  112. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
  113. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
  114. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
  115. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
  116. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
  117. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
  118. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
  119. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
  120. cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
  121. cuda/cccl/headers/include/cuda/std/bitset +1 -1
  122. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
  123. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
  124. cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
  125. cuda/cccl/headers/include/cuda/std/numbers +5 -0
  126. cuda/cccl/headers/include/cuda/std/string_view +146 -11
  127. cuda/cccl/headers/include/cuda/stream_ref +5 -0
  128. cuda/cccl/headers/include/cuda/utility +1 -0
  129. cuda/cccl/headers/include/nv/target +7 -2
  130. cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
  131. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
  132. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
  133. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
  134. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
  135. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
  136. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
  137. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
  138. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
  139. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
  140. cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
  141. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
  142. cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
  143. cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
  144. cuda/cccl/headers/include/thrust/device_delete.h +18 -3
  145. cuda/cccl/headers/include/thrust/device_free.h +16 -3
  146. cuda/cccl/headers/include/thrust/device_new.h +29 -8
  147. cuda/cccl/headers/include/thrust/host_vector.h +1 -1
  148. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
  149. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
  150. cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
  151. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
  152. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
  153. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
  154. cuda/compute/__init__.py +2 -0
  155. cuda/compute/_bindings.pyi +43 -1
  156. cuda/compute/_bindings_impl.pyx +156 -7
  157. cuda/compute/algorithms/_scan.py +108 -36
  158. cuda/compute/algorithms/_transform.py +32 -11
  159. cuda/compute/cu12/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
  160. cuda/compute/cu12/cccl/libcccl.c.parallel.so +0 -0
  161. cuda/compute/cu13/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
  162. cuda/compute/cu13/cccl/libcccl.c.parallel.so +0 -0
  163. cuda/compute/iterators/__init__.py +2 -0
  164. cuda/compute/iterators/_factories.py +28 -0
  165. cuda/compute/iterators/_iterators.py +206 -1
  166. cuda/compute/numba_utils.py +2 -2
  167. cuda/compute/typing.py +2 -0
  168. {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
  169. {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +171 -175
  170. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
  171. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
  172. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
  173. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
  174. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
  175. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
  176. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
  177. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
  178. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
  179. cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
  180. cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
  181. cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
  182. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
  183. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
  184. {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
  185. {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
cuda/cccl/headers/include/cub/device/device_for.cuh

@@ -1,29 +1,5 @@
-/******************************************************************************
- * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
+// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: BSD-3-Clause
 
 #pragma once
 
@@ -41,24 +17,23 @@
 #include <cub/util_namespace.cuh>
 
 #include <thrust/detail/raw_reference_cast.h>
-#include <thrust/distance.h>
 #include <thrust/type_traits/is_contiguous_iterator.h>
 #include <thrust/type_traits/unwrap_contiguous_iterator.h>
 
 #include <cuda/__cmath/ceil_div.h>
+#include <cuda/std/__concepts/concept_macros.h>
+#include <cuda/std/__fwd/mdspan.h>
 #include <cuda/std/__iterator/distance.h>
 #include <cuda/std/__mdspan/extents.h>
+#include <cuda/std/__mdspan/layout_left.h>
+#include <cuda/std/__mdspan/layout_right.h>
 #include <cuda/std/__memory/is_sufficiently_aligned.h>
 #include <cuda/std/__type_traits/is_integral.h>
-#include <cuda/std/__utility/integer_sequence.h>
 #include <cuda/std/array>
 
 CUB_NAMESPACE_BEGIN
 
-namespace detail
-{
-
-namespace for_each
+namespace detail::for_each
 {
 
 /**
@@ -122,8 +97,7 @@ struct op_wrapper_vectorized_t
   }
 };
 
-} // namespace for_each
-} // namespace detail
+} // namespace detail::for_each
 
 struct DeviceFor
 {
@@ -568,6 +542,10 @@ public:
   {
     _CCCL_NVTX_RANGE_SCOPE("cub::DeviceFor::Bulk");
     static_assert(::cuda::std::is_integral_v<ShapeT>, "ShapeT must be an integral type");
+    if (shape == 0)
+    {
+      return cudaSuccess;
+    }
     using offset_t = ShapeT;
     return detail::for_each::dispatch_t<offset_t, OpT>::dispatch(static_cast<offset_t>(shape), op, stream);
   }
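
Note on the Bulk hunk above: a minimal usage sketch (not part of this diff; the functor name and sizes are illustrative) of the call path that, with this release, short-circuits when the shape is zero.

#include <cub/device/device_for.cuh>
#include <thrust/device_vector.h>

// Illustrative functor (not from the diff): receives each index in [0, shape).
struct increment_op
{
  int* d_data;
  __device__ void operator()(int i) const
  {
    d_data[i] += 1;
  }
};

int main()
{
  int n = 1024;
  thrust::device_vector<int> data(n, 0);
  // Launches one functor invocation per index; as of 0.3.2, n == 0 returns
  // cudaSuccess without dispatching a kernel.
  cudaError_t err = cub::DeviceFor::Bulk(n, increment_op{thrust::raw_pointer_cast(data.data())});
  cudaDeviceSynchronize();
  return err == cudaSuccess ? 0 : 1;
}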
@@ -833,7 +811,8 @@ public:
   //! Overview
   //! +++++++++++++++++++++++++++++++++++++++++++++
   //!
-  //! Iterate through a multi-dimensional extents into
+  //! Iterate through a multi-dimensional extents into a single linear index and a list of indices for each extent
+  //! dimension.
   //!
   //! - a single linear index that represents the current iteration
   //! - indices of each extent dimension
@@ -899,8 +878,6 @@ public:
     OpType op,
     cudaStream_t stream = {})
   {
-    // TODO: check dimensions overflows
-    // TODO: check tha arity of OpType is equal to sizeof...(ExtentsType)
     if (d_temp_storage == nullptr)
     {
       temp_storage_bytes = 1;
@@ -967,19 +944,120 @@ public:
   template <typename IndexType, size_t... Extents, typename OpType>
   CUB_RUNTIME_FUNCTION static cudaError_t
   ForEachInExtents(const ::cuda::std::extents<IndexType, Extents...>& extents, OpType op, cudaStream_t stream = {})
+  {
+    using extents_type = ::cuda::std::extents<IndexType, Extents...>;
+    return cub::DeviceFor::ForEachInLayout(::cuda::std::layout_right::mapping<extents_type>{extents}, op, stream);
+  }
+
+  /*********************************************************************************************************************
+   * ForEachInLayout
+   ********************************************************************************************************************/
+
+  //! @rst
+  //! Overview
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //!
+  //! Iterate through multi-dimensional extents using a specific mdspan layout, applying a function object for each
+  //! element, passing
+  //!
+  //! - a single linear index that represents the current iteration
+  //! - a list of indices containing the coordinates for each extent dimension
+  //!
+  //! The iteration order depends on the layout type:
+  //!
+  //! - ``layout_right``: Iterates in row-major order (rightmost index varies fastest)
+  //! - ``layout_left``: Iterates in column-major order (leftmost index varies fastest)
+  //!
+  //! - The return value of ``op``, if any, is ignored.
+  //!
+  //! A Simple Example
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //!
+  //! The following code snippet demonstrates how to use ``ForEachInLayout`` to iterate through a 2D matrix in
+  //! column-major order using ``layout_left``.
+  //!
+  //! .. literalinclude:: ../../../cub/test/catch2_test_device_for_each_in_layout_api.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin for-each-in-layout-op
+  //!     :end-before: example-end for-each-in-layout-op
+  //!
+  //! .. literalinclude:: ../../../cub/test/catch2_test_device_for_each_in_layout_api.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin for-each-in-layout-example
+  //!     :end-before: example-end for-each-in-layout-example
+  //!
+  //! @endrst
+  //!
+  //! @tparam Layout
+  //!   **[inferred]** The mdspan layout type, must be either ``cuda::std::layout_left`` or ``cuda::std::layout_right``
+  //!
+  //! @tparam IndexType
+  //!   **[inferred]** An integral type that represents the extent index space
+  //!
+  //! @tparam Extents
+  //!   **[inferred]** The extent sizes for each rank index
+  //!
+  //! @tparam OpType
+  //!   **[inferred]** A function object with arity equal to the number of extents + 1 for the linear index (iteration).
+  //!   The first parameter is the linear index, followed by one parameter for each dimension coordinate.
+  //!
+  //! @param[in] layout
+  //!   Layout object that determines the iteration order (layout_left for column-major, layout_right for row-major)
+  //!
+  //! @param[in] extents
+  //!   Extents object that represents a multi-dimensional index space
+  //!
+  //! @param[in] op
+  //!   Function object to apply to each linear index (iteration) and multi-dimensional coordinates.
+  //!   Called as ``op(linear_index, coord_0, coord_1, ..., coord_n)``
+  //!
+  //! @param[in] stream
+  //!   CUDA stream to launch kernels within. Default stream is `nullptr`
+  //!
+  //! @return cudaError_t
+  //!   error status
+  _CCCL_TEMPLATE(typename LayoutMapping, typename OpType)
+  _CCCL_REQUIRES(::cuda::std::__is_any_mdspan_layout_mapping_left_or_right_v<LayoutMapping>)
+  [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t
+  ForEachInLayout(const LayoutMapping& layout_mapping, OpType op, cudaStream_t stream = {})
   {
     using namespace cub::detail;
-    using extents_type = ::cuda::std::extents<IndexType, Extents...>;
+    using extents_type = typename LayoutMapping::extents_type;
     using extent_index_type = typename extents_type::index_type;
     using fast_mod_array_t = ::cuda::std::array<fast_div_mod<extent_index_type>, extents_type::rank()>;
     _CCCL_NVTX_RANGE_SCOPE("cub::DeviceFor::ForEachInExtents");
     static constexpr auto seq = ::cuda::std::make_index_sequence<extents_type::rank()>{};
-    fast_mod_array_t sub_sizes_div_array = cub::detail::sub_sizes_fast_div_mod(extents, seq);
+    constexpr bool is_layout_right = ::cuda::std::__is_any_mdspan_layout_mapping_right_v<LayoutMapping>;
+    auto extents = layout_mapping.extents();
+    fast_mod_array_t sub_sizes_div_array = cub::detail::sub_sizes_fast_div_mod<is_layout_right>(extents, seq);
     fast_mod_array_t extents_div_array = cub::detail::extents_fast_div_mod(extents, seq);
-    for_each::op_wrapper_extents_t<OpType, extents_type, fast_mod_array_t> op_wrapper{
+    for_each::op_wrapper_extents_t<OpType, extents_type, is_layout_right, fast_mod_array_t> op_wrapper{
       op, extents, sub_sizes_div_array, extents_div_array};
     return Bulk(static_cast<implicit_prom_t<extent_index_type>>(cub::detail::size(extents)), op_wrapper, stream);
   }
+
+#ifndef _CCCL_DOXYGEN_INVOKED
+
+  _CCCL_TEMPLATE(typename LayoutMapping, typename OpType)
+  _CCCL_REQUIRES(::cuda::std::__is_any_mdspan_layout_mapping_left_or_right_v<LayoutMapping>)
+  [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t ForEachInLayout(
+    void* d_temp_storage,
+    size_t& temp_storage_bytes,
+    const LayoutMapping& layout_mapping,
+    OpType op,
+    cudaStream_t stream = {})
+  {
+    if (d_temp_storage == nullptr)
+    {
+      temp_storage_bytes = 1;
+      return cudaSuccess;
+    }
+    return ForEachInLayout(layout_mapping, op, stream);
+  }
+
+#endif // !_CCCL_DOXYGEN_INVOKED
 };
 
 CUB_NAMESPACE_END
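
To make the new ForEachInLayout entry point concrete, here is a hypothetical usage sketch based only on the signature and doc comment added above (the functor and extents are illustrative, not taken from the CCCL test referenced by the literalinclude directives):

#include <cub/device/device_for.cuh>
#include <cuda/std/mdspan>
#include <cstdio>

// Receives the linear iteration index followed by one coordinate per dimension.
struct print_coords
{
  __device__ void operator()(int idx, int row, int col) const
  {
    printf("iteration %d -> (%d, %d)\n", idx, row, col);
  }
};

int main()
{
  using extents_t = cuda::std::extents<int, 2, 3>; // 2 x 3 index space, compile-time sizes
  // layout_left => column-major traversal (leftmost index varies fastest).
  cuda::std::layout_left::mapping<extents_t> mapping{extents_t{}};
  cudaError_t err = cub::DeviceFor::ForEachInLayout(mapping, print_coords{});
  cudaDeviceSynchronize();
  return err == cudaSuccess ? 0 : 1;
}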
cuda/cccl/headers/include/cub/device/device_reduce.cuh

@@ -52,15 +52,15 @@
 #include <cub/thread/thread_operators.cuh>
 #include <cub/util_type.cuh>
 
-#include <thrust/iterator/tabulate_output_iterator.h>
-
 #include <cuda/__execution/determinism.h>
 #include <cuda/__execution/require.h>
 #include <cuda/__execution/tune.h>
 #include <cuda/__functional/maximum.h>
 #include <cuda/__functional/minimum.h>
+#include <cuda/__iterator/tabulate_output_iterator.h>
 #include <cuda/__memory_resource/get_memory_resource.h>
 #include <cuda/__stream/get_stream.h>
+#include <cuda/__stream/stream_ref.h>
 #include <cuda/std/__execution/env.h>
 #include <cuda/std/__functional/identity.h>
 #include <cuda/std/__functional/invoke.h>
@@ -70,7 +70,6 @@
 #include <cuda/std/__type_traits/is_same.h>
 #include <cuda/std/cstdint>
 #include <cuda/std/limits>
-#include <cuda/stream_ref>
 
 CUB_NAMESPACE_BEGIN
 
@@ -1215,7 +1214,7 @@ public:
   OutputExtremumT initial_value{::cuda::std::numeric_limits<InputValueT>::max()};
 
   // Tabulate output iterator that unzips the result and writes it to the user-provided output iterators
-  auto out_it = THRUST_NS_QUALIFIER::make_tabulate_output_iterator(
+  auto out_it = ::cuda::make_tabulate_output_iterator(
     detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_min_out, d_index_out});
 
   return detail::reduce::dispatch_streaming_arg_reduce_t<
@@ -1341,7 +1340,7 @@ public:
   OutputExtremumT initial_value{::cuda::std::numeric_limits<InputValueT>::max()};
 
   // Tabulate output iterator that unzips the result and writes it to the user-provided output iterators
-  auto out_it = THRUST_NS_QUALIFIER::make_tabulate_output_iterator(
+  auto out_it = ::cuda::make_tabulate_output_iterator(
     detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_min_out, d_index_out});
 
   // Query the required temporary storage size
@@ -1883,7 +1882,7 @@ public:
   OutputExtremumT initial_value{::cuda::std::numeric_limits<InputValueT>::lowest()};
 
   // Tabulate output iterator that unzips the result and writes it to the user-provided output iterators
-  auto out_it = THRUST_NS_QUALIFIER::make_tabulate_output_iterator(
+  auto out_it = ::cuda::make_tabulate_output_iterator(
     detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_max_out, d_index_out});
 
   return detail::reduce::dispatch_streaming_arg_reduce_t<
@@ -2133,7 +2132,7 @@ public:
   OutputExtremumT initial_value{::cuda::std::numeric_limits<InputValueT>::max()};
 
   // Tabulate output iterator that unzips the result and writes it to the user-provided output iterators
-  auto out_it = THRUST_NS_QUALIFIER::make_tabulate_output_iterator(
+  auto out_it = ::cuda::make_tabulate_output_iterator(
     detail::reduce::unzip_and_write_arg_extremum_op<ExtremumOutIteratorT, IndexOutIteratorT>{d_max_out, d_index_out});
 
   // Query the required temporary storage size
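
The four call sites above switch from Thrust's tabulate output iterator factory to the libcu++ one. As a rough sketch of the semantics being relied on (assumed to match the Thrust iterator it replaces; the functor, kernel, and public header choice below are illustrative assumptions, not from this diff):

// Assigning through position i of a tabulate output iterator invokes the
// functor as op(i, assigned_value); this is how the ArgMin/ArgMax overloads
// split one {index, value} extremum into two separate output buffers.
#include <cuda/iterator> // assumed public header exposing cuda::make_tabulate_output_iterator

struct scatter_doubled
{
  int* out;
  __device__ void operator()(int index, int value) const
  {
    out[index] = 2 * value;
  }
};

__global__ void demo_kernel(int* out)
{
  auto it = cuda::make_tabulate_output_iterator(scatter_doubled{out});
  *it = 3;       // calls scatter_doubled(0, 3) -> out[0] = 6
  *(it + 1) = 5; // calls scatter_doubled(1, 5) -> out[1] = 10
}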
cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh

@@ -156,14 +156,14 @@ struct DeviceSegmentedReduce
   //! @rst
   //! Random-access input iterator to the sequence of beginning offsets of
   //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
-  //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
+  //! element of the *i*\ :sup:`th` data segment in ``d_in``
   //! @endrst
   //!
   //! @param[in] d_end_offsets
   //! @rst
   //! Random-access input iterator to the sequence of ending offsets of length
   //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
-  //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
+  //! the *i*\ :sup:`th` data segment in ``d_in``.
   //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty.
   //! @endrst
   //!
@@ -372,15 +372,14 @@ struct DeviceSegmentedReduce
   //! @rst
   //! Random-access input iterator to the sequence of beginning offsets of
   //! length ``num_segments`, such that ``d_begin_offsets[i]`` is the first
-  //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and
-  //! ``d_values_*``
+  //! element of the *i*\ :sup:`th` data segment in ``d_in``
   //! @endrst
   //!
   //! @param[in] d_end_offsets
   //! @rst
   //! Random-access input iterator to the sequence of ending offsets of length
   //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
-  //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
+  //! the *i*\ :sup:`th` data segment in ``d_in``.
   //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty.
   //! @endrst
   //!
@@ -578,14 +577,14 @@ struct DeviceSegmentedReduce
   //! @rst
   //! Random-access input iterator to the sequence of beginning offsets of
   //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
-  //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
+  //! element of the *i*\ :sup:`th` data segment in ``d_in``
   //! @endrst
   //!
   //! @param[in] d_end_offsets
   //! @rst
   //! Random-access input iterator to the sequence of ending offsets of length
   //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
-  //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
+  //! the *i*\ :sup:`th` data segment in ``d_in``.
   //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty.
   //! @endrst
   //!
@@ -792,14 +791,14 @@ struct DeviceSegmentedReduce
   //! @rst
   //! Random-access input iterator to the sequence of beginning offsets of
   //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
-  //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
+  //! element of the *i*\ :sup:`th` data segment in ``d_in``
   //! @endrst
   //!
   //! @param[in] d_end_offsets
   //! @rst
   //! Random-access input iterator to the sequence of ending offsets of length
   //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
-  //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
+  //! the *i*\ :sup:`th` data segment in ``d_in``.
   //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty.
   //! @endrst
   //!
@@ -1037,14 +1036,14 @@ struct DeviceSegmentedReduce
   //! @rst
   //! Random-access input iterator to the sequence of beginning offsets of
   //! length ``num_segments``, such that ``d_begin_offsets[i]`` is the first
-  //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
+  //! element of the *i*\ :sup:`th` data segment in ``d_in``
   //! @endrst
   //!
   //! @param[in] d_end_offsets
   //! @rst
   //! Random-access input iterator to the sequence of ending offsets of length
   //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
-  //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
+  //! the *i*\ :sup:`th` data segment in ``d_in``.
   //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty.
   //! @endrst
   //!
@@ -1249,14 +1248,14 @@ struct DeviceSegmentedReduce
   //! @rst
   //! Random-access input iterator to the sequence of beginning offsets of
   //! length `num_segments`, such that ``d_begin_offsets[i]`` is the first
-  //! element of the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``
+  //! element of the *i*\ :sup:`th` data segment in ``d_in``
   //! @endrst
   //!
   //! @param[in] d_end_offsets
   //! @rst
   //! Random-access input iterator to the sequence of ending offsets of length
   //! ``num_segments``, such that ``d_end_offsets[i] - 1`` is the last element of
-  //! the *i*\ :sup:`th` data segment in ``d_keys_*`` and ``d_values_*``.
+  //! the *i*\ :sup:`th` data segment in ``d_in``.
   //! If ``d_end_offsets[i] - 1 <= d_begin_offsets[i]``, the *i*\ :sup:`th` is considered empty.
   //! @endrst
   //!
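
For reference, a minimal sketch of the API these doc fixes describe: the offsets select segments of ``d_in`` (the ``d_keys_*``/``d_values_*`` wording belonged to the keyed variants), with segment i spanning d_in[d_begin_offsets[i]] .. d_in[d_end_offsets[i] - 1] and empty segments producing the initial value. Sample values and the wrapper function are illustrative; the two-phase temp-storage pattern is standard CUB usage.

#include <cub/device/device_segmented_reduce.cuh>

// d_in:            [8, 6, 7, 5, 3, 0, 9]
// d_begin_offsets: [0, 3, 3]
// d_end_offsets:   [3, 3, 7]   // segment 1 is empty
// d_out after Sum: [21, 0, 17]
cudaError_t segmented_sum(const int* d_in, int* d_out, int num_segments,
                          const int* d_begin_offsets, const int* d_end_offsets,
                          cudaStream_t stream = nullptr)
{
  // First call: query the required temporary storage size.
  void* d_temp_storage      = nullptr;
  size_t temp_storage_bytes = 0;
  cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out,
                                  num_segments, d_begin_offsets, d_end_offsets, stream);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  // Second call: run the segmented reduction.
  cudaError_t err = cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out,
                                                    num_segments, d_begin_offsets, d_end_offsets, stream);
  cudaFree(d_temp_storage);
  return err;
}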