cuda-cccl 0.1.3.2.0.dev438__cp310-cp310-manylinux_2_24_aarch64.whl → 0.3.1__cp310-cp310-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. See the registry's advisory page for more details.

Files changed (177)
  1. cuda/cccl/cooperative/__init__.py +7 -1
  2. cuda/cccl/cooperative/experimental/__init__.py +21 -5
  3. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
  4. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
  5. cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
  6. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
  7. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
  8. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +25 -5
  9. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
  10. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
  11. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
  12. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
  13. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
  14. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
  15. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
  16. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
  17. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +24 -19
  18. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
  19. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
  20. cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
  21. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +3 -2
  22. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
  23. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
  24. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +18 -26
  25. cuda/cccl/headers/include/cub/device/device_copy.cuh +116 -27
  26. cuda/cccl/headers/include/cub/device/device_partition.cuh +5 -1
  27. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +158 -247
  28. cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
  29. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +4 -4
  30. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
  31. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +8 -26
  32. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -6
  33. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
  34. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +320 -262
  35. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +10 -5
  36. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
  37. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
  38. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +57 -10
  39. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +37 -13
  40. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
  41. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
  42. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
  43. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
  44. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
  45. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
  46. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
  47. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
  48. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
  49. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
  50. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
  51. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +204 -55
  52. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
  53. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +55 -19
  54. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
  55. cuda/cccl/headers/include/cub/util_device.cuh +51 -35
  56. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
  57. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
  58. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
  59. cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
  60. cuda/cccl/headers/include/cuda/__algorithm/copy.h +4 -4
  61. cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
  62. cuda/cccl/headers/include/cuda/__device/all_devices.h +47 -147
  63. cuda/cccl/headers/include/cuda/__device/arch_traits.h +51 -49
  64. cuda/cccl/headers/include/cuda/__device/attributes.h +177 -127
  65. cuda/cccl/headers/include/cuda/__device/device_ref.h +32 -51
  66. cuda/cccl/headers/include/cuda/__device/physical_device.h +120 -91
  67. cuda/cccl/headers/include/cuda/__driver/driver_api.h +330 -36
  68. cuda/cccl/headers/include/cuda/__event/event.h +8 -8
  69. cuda/cccl/headers/include/cuda/__event/event_ref.h +4 -5
  70. cuda/cccl/headers/include/cuda/__event/timed_event.h +4 -4
  71. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  72. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
  73. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +3 -3
  74. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +3 -3
  75. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +3 -3
  76. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +158 -0
  77. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +8 -120
  78. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +593 -0
  79. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +14 -10
  80. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +4 -3
  81. cuda/cccl/headers/include/cuda/__runtime/types.h +1 -1
  82. cuda/cccl/headers/include/cuda/__stream/stream.h +2 -3
  83. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -12
  84. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +2 -2
  85. cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
  86. cuda/cccl/headers/include/cuda/algorithm +1 -1
  87. cuda/cccl/headers/include/cuda/devices +10 -0
  88. cuda/cccl/headers/include/cuda/iterator +1 -0
  89. cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
  90. cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
  91. cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
  92. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +2 -2
  93. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +2 -0
  94. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
  95. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
  96. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
  97. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +10 -5
  98. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +44 -17
  99. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +1 -1
  100. cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +12 -12
  101. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +1 -8
  102. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +15 -12
  103. cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +3 -0
  104. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
  105. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +2 -1
  106. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
  107. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
  108. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
  109. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +3 -2
  110. cuda/cccl/headers/include/cuda/std/string_view +12 -5
  111. cuda/cccl/headers/include/cuda/std/version +1 -4
  112. cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
  113. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
  114. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
  115. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +8 -22
  116. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +15 -48
  117. cuda/cccl/parallel/experimental/__init__.py +21 -70
  118. cuda/compute/__init__.py +77 -0
  119. cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +28 -0
  120. cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +141 -1
  121. cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +4 -0
  122. cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
  123. cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
  124. cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
  125. cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -4
  126. cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +4 -6
  127. cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
  128. cuda/compute/algorithms/_three_way_partition.py +261 -0
  129. cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +4 -4
  130. cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
  131. cuda/compute/cu12/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  132. cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
  133. cuda/compute/cu13/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  134. cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
  135. cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +8 -8
  136. cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
  137. cuda/coop/__init__.py +8 -0
  138. cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
  139. cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
  140. cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
  141. cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
  142. cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
  143. cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
  144. cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
  145. cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
  146. cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
  147. cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
  148. cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
  149. cuda/coop/warp/__init__.py +9 -0
  150. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
  151. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
  152. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
  153. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/METADATA +1 -1
  154. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/RECORD +171 -166
  155. cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
  156. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
  157. cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +0 -72
  158. cuda/cccl/parallel/experimental/.gitignore +0 -4
  159. cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  160. cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  161. /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
  162. /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
  163. /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
  164. /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
  165. /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
  166. /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
  167. /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
  168. /cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +0 -0
  169. /cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +0 -0
  170. /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
  171. /cuda/{cccl/parallel/experimental → compute}/numba_utils.py +0 -0
  172. /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
  173. /cuda/{cccl/parallel/experimental → compute}/typing.py +0 -0
  174. /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
  175. /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
  176. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/WHEEL +0 -0
  177. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/licenses/LICENSE +0 -0
@@ -50,8 +50,8 @@
50
50
 
51
51
  #include <cuda/__ptx/instructions/get_sreg.h>
52
52
  #include <cuda/std/__algorithm/clamp.h>
53
- #include <cuda/std/__algorithm/max.h>
54
53
  #include <cuda/std/__bit/has_single_bit.h>
54
+ #include <cuda/std/__bit/integral.h>
55
55
  #include <cuda/std/__functional/operations.h>
56
56
  #include <cuda/std/__type_traits/integral_constant.h>
57
57
  #include <cuda/std/__type_traits/is_integral.h>
@@ -630,7 +630,7 @@ struct WarpScanShfl
630
630
  ballot = ballot & ::cuda::ptx::get_sreg_lanemask_le();
631
631
 
632
632
  // Find index of first set bit
633
- int segment_first_lane = ::cuda::std::max(0, 31 - __clz(ballot));
633
+ int segment_first_lane = ::cuda::std::__bit_log2(ballot);
634
634
 
635
635
  // Iterate scan steps
636
636
  _CCCL_PRAGMA_UNROLL_FULL()
@@ -11,7 +11,7 @@
11
11
  #ifndef __CUDA___ALGORITHM_COMMON
12
12
  #define __CUDA___ALGORITHM_COMMON
13
13
 
14
- #include <cuda/__cccl_config>
14
+ #include <cuda/std/detail/__config>
15
15
 
16
16
  #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17
17
  # pragma GCC system_header
@@ -11,7 +11,7 @@
11
11
  #ifndef __CUDA___ALGORITHM_COPY_H
12
12
  #define __CUDA___ALGORITHM_COPY_H
13
13
 
14
- #include <cuda/__cccl_config>
14
+ #include <cuda/std/detail/__config>
15
15
 
16
16
  #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17
17
  # pragma GCC system_header
@@ -38,11 +38,11 @@ enum class source_access_order
38
38
  {
39
39
  # if _CCCL_CTK_AT_LEAST(13, 0)
40
40
  //! @brief Access source in stream order
41
- stream = cudaMemcpySrcAccessOrderStream,
41
+ stream = ::cudaMemcpySrcAccessOrderStream,
42
42
  //! @brief Access source during the copy call, source can be destroyed after the API returns
43
- during_api_call = cudaMemcpySrcAccessOrderDuringApiCall,
43
+ during_api_call = ::cudaMemcpySrcAccessOrderDuringApiCall,
44
44
  //! @brief Access source in any order, the order can change across CUDA releases
45
- any = cudaMemcpySrcAccessOrderAny,
45
+ any = ::cudaMemcpySrcAccessOrderAny,
46
46
  # else
47
47
  any = 0x3,
48
48
  # endif // _CCCL_CTK_BELOW(13, 0)
@@ -11,7 +11,7 @@
11
11
  #ifndef __CUDA___ALGORITHM_FILL
12
12
  #define __CUDA___ALGORITHM_FILL
13
13
 
14
- #include <cuda/__cccl_config>
14
+ #include <cuda/std/detail/__config>
15
15
 
16
16
  #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17
17
  # pragma GCC system_header
@@ -11,7 +11,7 @@
11
11
  #ifndef _CUDA___DEVICE_ALL_DEVICES_H
12
12
  #define _CUDA___DEVICE_ALL_DEVICES_H
13
13
 
14
- #include <cuda/__cccl_config>
14
+ #include <cuda/std/detail/__config>
15
15
 
16
16
  #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17
17
  # pragma GCC system_header
@@ -22,10 +22,12 @@
22
22
  #endif // no system header
23
23
 
24
24
  #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
25
+
26
+ # include <cuda/__device/device_ref.h>
25
27
  # include <cuda/__device/physical_device.h>
26
- # include <cuda/std/__cuda/api_wrapper.h>
27
- # include <cuda/std/cassert>
28
- # include <cuda/std/detail/libcxx/include/stdexcept>
28
+ # include <cuda/__driver/driver_api.h>
29
+ # include <cuda/__fwd/devices.h>
30
+ # include <cuda/std/__cstddef/types.h>
29
31
  # include <cuda/std/span>
30
32
 
31
33
  # include <vector>
@@ -33,132 +35,62 @@
33
35
  # include <cuda/std/__cccl/prologue.h>
34
36
 
35
37
  _CCCL_BEGIN_NAMESPACE_CUDA
36
- namespace __detail
37
- {
38
- //! @brief A random-access range of all available CUDA devices
39
- class all_devices
40
- {
41
- public:
42
- using size_type = ::std::vector<physical_device>::size_type;
43
- using iterator = ::std::vector<physical_device>::const_iterator;
44
- using const_iterator = ::std::vector<physical_device>::const_iterator;
45
-
46
- all_devices() = default;
47
-
48
- [[nodiscard]] const physical_device& operator[](size_type __i) const;
49
-
50
- [[nodiscard]] size_type size() const;
51
38
 
52
- [[nodiscard]] iterator begin() const noexcept;
53
-
54
- [[nodiscard]] iterator end() const noexcept;
55
-
56
- operator ::cuda::std::span<const device_ref>() const;
57
-
58
- private:
59
- struct __initializer_iterator;
60
-
61
- static const ::std::vector<physical_device>& __devices();
62
- };
63
-
64
- //! @brief An iterator used to in-place construct `device` objects in a
65
- //! std::vector.
66
- //!
67
- //! Since `device` objects are not movable or copyable, we need to construct them
68
- //! in-place with a proxy object that can be implicitly converted to a `device`
69
- //! object.
70
- struct all_devices::__initializer_iterator
39
+ [[nodiscard]] _CCCL_HOST_API inline ::std::vector<device_ref> __make_devices()
71
40
  {
72
- using value_type = __emplace_device;
73
- using reference = __emplace_device;
74
- using iterator_category = ::std::forward_iterator_tag;
75
- using difference_type = int;
76
- using pointer = __emplace_device;
77
-
78
- int __id_;
79
-
80
- __emplace_device operator*() const noexcept
41
+ ::std::vector<device_ref> __ret{};
42
+ __ret.reserve(::cuda::__physical_devices().size());
43
+ for (::cuda::std::size_t __i = 0; __i < ::cuda::__physical_devices().size(); ++__i)
81
44
  {
82
- return __emplace_device{__id_};
45
+ __ret.emplace_back(static_cast<int>(__i));
83
46
  }
47
+ return __ret;
48
+ }
84
49
 
85
- __emplace_device operator->() const noexcept
86
- {
87
- return __emplace_device{__id_};
88
- }
50
+ [[nodiscard]] inline ::cuda::std::span<const device_ref> __devices()
51
+ {
52
+ static const auto __devices = ::cuda::__make_devices();
53
+ return ::cuda::std::span<const device_ref>{__devices.data(), __devices.size()};
54
+ }
89
55
 
90
- __initializer_iterator& operator++() noexcept
91
- {
92
- ++__id_;
93
- return *this;
94
- }
56
+ //! @brief A random-access range of all available CUDA devices
57
+ class __all_devices
58
+ {
59
+ public:
60
+ using value_type = ::cuda::std::span<const device_ref>::value_type;
61
+ using size_type = ::cuda::std::span<const device_ref>::size_type;
62
+ using iterator = ::cuda::std::span<const device_ref>::iterator;
63
+
64
+ _CCCL_HIDE_FROM_ABI __all_devices() = default;
65
+ __all_devices(const __all_devices&) = delete;
66
+ __all_devices(__all_devices&&) = delete;
67
+ __all_devices& operator=(const __all_devices&) = delete;
68
+ __all_devices& operator=(__all_devices&&) = delete;
95
69
 
96
- __initializer_iterator operator++(int) noexcept
70
+ [[nodiscard]] _CCCL_HOST_API device_ref operator[](size_type __i) const
97
71
  {
98
- auto __tmp = *this;
99
- ++__id_;
100
- return __tmp;
72
+ if (__i >= size())
73
+ {
74
+ ::cuda::std::__throw_out_of_range("device index out of range");
75
+ }
76
+ return ::cuda::__devices()[__i];
101
77
  }
102
78
 
103
- bool operator==(const __initializer_iterator& __other) const noexcept
79
+ [[nodiscard]] _CCCL_HOST_API size_type size() const
104
80
  {
105
- return __id_ == __other.__id_;
81
+ return ::cuda::__devices().size();
106
82
  }
107
83
 
108
- bool operator!=(const __initializer_iterator& __other) const noexcept
84
+ [[nodiscard]] _CCCL_HOST_API iterator begin() const
109
85
  {
110
- return __id_ != __other.__id_;
86
+ return ::cuda::__devices().begin();
111
87
  }
112
- };
113
88
 
114
- [[nodiscard]] inline const physical_device& all_devices::operator[](size_type __id_) const
115
- {
116
- if (__id_ >= size())
89
+ [[nodiscard]] _CCCL_HOST_API iterator end() const
117
90
  {
118
- if (size() == 0)
119
- {
120
- ::cuda::std::__throw_out_of_range("device was requested but no CUDA devices found");
121
- }
122
- else
123
- {
124
- ::cuda::std::__throw_out_of_range(
125
- (::std::string("device index out of range: ") + ::std::to_string(__id_)).c_str());
126
- }
91
+ return ::cuda::__devices().end();
127
92
  }
128
- return __devices()[__id_];
129
- }
130
-
131
- [[nodiscard]] inline all_devices::size_type all_devices::size() const
132
- {
133
- return __devices().size();
134
- }
135
-
136
- [[nodiscard]] inline all_devices::iterator all_devices::begin() const noexcept
137
- {
138
- return __devices().begin();
139
- }
140
-
141
- [[nodiscard]] inline all_devices::iterator all_devices::end() const noexcept
142
- {
143
- return __devices().end();
144
- }
145
-
146
- inline all_devices::operator ::cuda::std::span<const device_ref>() const
147
- {
148
- static const ::std::vector<device_ref> __refs(begin(), end());
149
- return ::cuda::std::span<const device_ref>(__refs);
150
- }
151
-
152
- inline const ::std::vector<physical_device>& all_devices::__devices()
153
- {
154
- static const ::std::vector<physical_device> __devices = [] {
155
- int __count = 0;
156
- _CCCL_TRY_CUDA_API(::cudaGetDeviceCount, "failed to get the count of CUDA devices", &__count);
157
- return ::std::vector<physical_device>{__initializer_iterator{0}, __initializer_iterator{__count}};
158
- }();
159
- return __devices;
160
- }
161
- } // namespace __detail
93
+ };
162
94
 
163
95
  //! @brief A range of all available CUDA devices
164
96
  //!
@@ -174,7 +106,7 @@ inline const ::std::vector<physical_device>& all_devices::__devices()
174
106
  //! struct iterator;
175
107
  //! using const_iterator = iterator;
176
108
  //!
177
- //! [[nodiscard]] constexpr const physical_device& operator[](size_type i) const noexcept;
109
+ //! [[nodiscard]] device_ref operator[](size_type i) const noexcept;
178
110
  //!
179
111
  //! [[nodiscard]] size_type size() const;
180
112
  //!
@@ -186,7 +118,7 @@ inline const ::std::vector<physical_device>& all_devices::__devices()
186
118
  //!
187
119
  //! @par
188
120
  //! `__all_devices::iterator` is a random access iterator with a `reference`
189
- //! type of `const physical_device&`.
121
+ //! type of `const device_ref&`.
190
122
  //!
191
123
  //! @par Example
192
124
  //! @code
@@ -197,39 +129,7 @@ inline const ::std::vector<physical_device>& all_devices::__devices()
197
129
  //! @sa
198
130
  //! * device
199
131
  //! * device_ref
200
- inline constexpr __detail::all_devices devices{};
201
-
202
- inline const arch::traits_t& device_ref::arch_traits() const
203
- {
204
- return devices[get()].arch_traits();
205
- }
206
-
207
- [[nodiscard]] inline ::std::vector<device_ref> device_ref::peer_devices() const
208
- {
209
- ::std::vector<device_ref> __result;
210
- __result.reserve(devices.size());
211
-
212
- for (const physical_device& __other_dev : devices)
213
- {
214
- // Exclude the device this API is called on. The main use case for this API
215
- // is enable/disable peer access. While enable peer access can be called on
216
- // device on which memory resides, disable peer access will error-out.
217
- // Usage of the peer access control is smoother when *this is excluded,
218
- // while it can be easily added with .push_back() on the vector if a full
219
- // group of peers is needed (for cases other than peer access control)
220
- if (__other_dev != *this)
221
- {
222
- // While in almost all practical applications peer access should be symmetrical,
223
- // it is possible to build a system with one directional peer access, check
224
- // both ways here just to be safe
225
- if (has_peer_access_to(__other_dev) && __other_dev.has_peer_access_to(*this))
226
- {
227
- __result.push_back(__other_dev);
228
- }
229
- }
230
- }
231
- return __result;
232
- }
132
+ inline constexpr __all_devices devices{};
233
133
 
234
134
  _CCCL_END_NAMESPACE_CUDA
235
135
 
@@ -11,7 +11,7 @@
11
11
  #ifndef _CUDA___DEVICE_ARCH_TRAITS_H
12
12
  #define _CUDA___DEVICE_ARCH_TRAITS_H
13
13
 
14
- #include <cuda/__cccl_config>
14
+ #include <cuda/std/detail/__config>
15
15
 
16
16
  #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17
17
  # pragma GCC system_header
@@ -22,7 +22,9 @@
22
22
  #endif // no system header
23
23
 
24
24
  #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
25
+
25
26
  # include <cuda/__device/attributes.h>
27
+ # include <cuda/__fwd/devices.h>
26
28
  # include <cuda/std/__exception/cuda_error.h>
27
29
  # include <cuda/std/limits>
28
30
 
@@ -58,76 +60,76 @@ enum class id : int
58
60
  sm_120a = 120 * __arch_specific_id_multiplier,
59
61
  };
60
62
 
61
- // @brief Architecture traits
62
- // This type contains information about an architecture that is constant across devices of that architecture.
63
+ //! @brief Architecture traits
64
+ //! This type contains information about an architecture that is constant across devices of that architecture.
63
65
  struct traits_t
64
66
  {
65
67
  // Maximum number of threads per block
66
- const int max_threads_per_block = 1024;
68
+ int max_threads_per_block = 1024;
67
69
 
68
70
  // Maximum x-dimension of a block
69
- const int max_block_dim_x = 1024;
71
+ int max_block_dim_x = 1024;
70
72
 
71
73
  // Maximum y-dimension of a block
72
- const int max_block_dim_y = 1024;
74
+ int max_block_dim_y = 1024;
73
75
 
74
76
  // Maximum z-dimension of a block
75
- const int max_block_dim_z = 64;
77
+ int max_block_dim_z = 64;
76
78
 
77
79
  // Maximum x-dimension of a grid
78
- const int max_grid_dim_x = ::cuda::std::numeric_limits<int32_t>::max();
80
+ int max_grid_dim_x = ::cuda::std::numeric_limits<int32_t>::max();
79
81
 
80
82
  // Maximum y-dimension of a grid
81
- const int max_grid_dim_y = 64 * 1024 - 1;
83
+ int max_grid_dim_y = 64 * 1024 - 1;
82
84
 
83
85
  // Maximum z-dimension of a grid
84
- const int max_grid_dim_z = 64 * 1024 - 1;
86
+ int max_grid_dim_z = 64 * 1024 - 1;
85
87
 
86
88
  // Maximum amount of shared memory available to a thread block in bytes
87
- const int max_shared_memory_per_block = 48 * 1024;
89
+ ::cuda::std::size_t max_shared_memory_per_block = 48 * 1024;
88
90
 
89
91
  // Memory available on device for __constant__ variables in a CUDA C kernel in bytes
90
- const int total_constant_memory = 64 * 1024;
92
+ ::cuda::std::size_t total_constant_memory = 64 * 1024;
91
93
 
92
94
  // Warp size in threads
93
- const int warp_size = 32;
95
+ int warp_size = 32;
94
96
 
95
97
  // Maximum number of concurrent grids on the device
96
- const int max_resident_grids = 128;
98
+ int max_resident_grids = 128;
97
99
 
98
100
  // true if the device can concurrently copy memory between host and device
99
101
  // while executing a kernel, or false if not
100
- const bool gpu_overlap = true;
102
+ bool gpu_overlap = true;
101
103
 
102
104
  // true if the device can map host memory into CUDA address space
103
- const bool can_map_host_memory = true;
105
+ bool can_map_host_memory = true;
104
106
 
105
107
  // true if the device supports executing multiple kernels within the same
106
108
  // context simultaneously, or false if not. It is not guaranteed that multiple
107
109
  // kernels will be resident on the device concurrently so this feature should
108
110
  // not be relied upon for correctness.
109
- const bool concurrent_kernels = true;
111
+ bool concurrent_kernels = true;
110
112
 
111
113
  // true if the device supports stream priorities, or false if not
112
- const bool stream_priorities_supported = true;
114
+ bool stream_priorities_supported = true;
113
115
 
114
116
  // true if device supports caching globals in L1 cache, false if not
115
- const bool global_l1_cache_supported = true;
117
+ bool global_l1_cache_supported = true;
116
118
 
117
119
  // true if device supports caching locals in L1 cache, false if not
118
- const bool local_l1_cache_supported = true;
120
+ bool local_l1_cache_supported = true;
119
121
 
120
122
  // TODO: We might want to have these per-arch
121
123
  // Maximum number of 32-bit registers available to a thread block
122
- const int max_registers_per_block = 64 * 1024;
124
+ int max_registers_per_block = 64 * 1024;
123
125
 
124
126
  // Maximum number of 32-bit registers available to a multiprocessor; this
125
127
  // number is shared by all thread blocks simultaneously resident on a
126
128
  // multiprocessor
127
- const int max_registers_per_multiprocessor = 64 * 1024;
129
+ int max_registers_per_multiprocessor = 64 * 1024;
128
130
 
129
131
  // Maximum number of 32-bit registers available to a thread
130
- const int max_registers_per_thread = 255;
132
+ int max_registers_per_thread = 255;
131
133
 
132
134
  // Identifier for the architecture
133
135
  id arch_id;
@@ -144,7 +146,7 @@ struct traits_t
144
146
  // Maximum amount of shared memory available to a multiprocessor in bytes;
145
147
  // this amount is shared by all thread blocks simultaneously resident on a
146
148
  // multiprocessor
147
- int max_shared_memory_per_multiprocessor;
149
+ ::cuda::std::size_t max_shared_memory_per_multiprocessor;
148
150
 
149
151
  // Maximum number of thread blocks that can reside on a multiprocessor
150
152
  int max_blocks_per_multiprocessor;
@@ -156,11 +158,11 @@ struct traits_t
156
158
  int max_warps_per_multiprocessor;
157
159
 
158
160
  // Shared memory reserved by CUDA driver per block in bytes
159
- int reserved_shared_memory_per_block;
161
+ ::cuda::std::size_t reserved_shared_memory_per_block;
160
162
 
161
163
  // Maximum per block shared memory size on the device. This value can be opted
162
164
  // into when using dynamic_shared_memory with NonPortableSize set to true
163
- int max_shared_memory_per_block_optin;
165
+ ::cuda::std::size_t max_shared_memory_per_block_optin;
164
166
 
165
167
  // TODO: Do we want these?:
166
168
  // true if architecture supports clusters
@@ -182,10 +184,10 @@ struct traits_t
182
184
  // @brief Architecture traits
183
185
  // Template function that returns the traits for an architecture with a given id.
184
186
  template <id _Id>
185
- [[nodiscard]] _CCCL_HOST_DEVICE constexpr traits_t traits();
187
+ [[nodiscard]] _CCCL_API constexpr traits_t traits();
186
188
 
187
189
  template <>
188
- [[nodiscard]] _CCCL_HOST_DEVICE inline constexpr traits_t traits<id::sm_60>()
190
+ [[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_60>()
189
191
  {
190
192
  traits_t __traits{};
191
193
  __traits.arch_id = id::sm_60;
@@ -208,7 +210,7 @@ template <>
208
210
  };
209
211
 
210
212
  template <>
211
- [[nodiscard]] _CCCL_HOST_DEVICE inline constexpr traits_t traits<id::sm_61>()
213
+ [[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_61>()
212
214
  {
213
215
  traits_t __traits{};
214
216
  __traits.arch_id = id::sm_61;
@@ -231,7 +233,7 @@ template <>
231
233
  };
232
234
 
233
235
  template <>
234
- [[nodiscard]] _CCCL_HOST_DEVICE inline constexpr traits_t traits<id::sm_70>()
236
+ [[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_70>()
235
237
  {
236
238
  traits_t __traits{};
237
239
  __traits.arch_id = id::sm_70;
@@ -255,7 +257,7 @@ template <>
255
257
  };
256
258
 
257
259
  template <>
258
- [[nodiscard]] _CCCL_HOST_DEVICE inline constexpr traits_t traits<id::sm_75>()
260
+ [[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_75>()
259
261
  {
260
262
  traits_t __traits{};
261
263
  __traits.arch_id = id::sm_75;
@@ -279,7 +281,7 @@ template <>
279
281
  };
280
282
 
281
283
  template <>
282
- [[nodiscard]] _CCCL_HOST_DEVICE inline constexpr traits_t traits<id::sm_80>()
284
+ [[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_80>()
283
285
  {
284
286
  traits_t __traits{};
285
287
  __traits.arch_id = id::sm_80;
@@ -303,7 +305,7 @@ template <>
303
305
  };
304
306
 
305
307
  template <>
306
- [[nodiscard]] _CCCL_HOST_DEVICE inline constexpr traits_t traits<id::sm_86>()
308
+ [[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_86>()
307
309
  {
308
310
  traits_t __traits{};
309
311
  __traits.arch_id = id::sm_86;
@@ -327,7 +329,7 @@ template <>
327
329
  };
328
330
 
329
331
  template <>
330
- [[nodiscard]] _CCCL_HOST_DEVICE inline constexpr traits_t traits<id::sm_89>()
332
+ [[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_89>()
331
333
  {
332
334
  traits_t __traits{};
333
335
  __traits.arch_id = id::sm_89;
@@ -351,7 +353,7 @@ template <>
351
353
  };
352
354
 
353
355
  template <>
354
- [[nodiscard]] _CCCL_HOST_DEVICE inline constexpr traits_t traits<id::sm_90>()
356
+ [[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_90>()
355
357
  {
356
358
  traits_t __traits{};
357
359
  __traits.arch_id = id::sm_90;
@@ -376,13 +378,13 @@ template <>
376
378
 
377
379
  // No sm_90a specific fields for now.
378
380
  template <>
379
- [[nodiscard]] _CCCL_HOST_DEVICE inline constexpr traits_t traits<id::sm_90a>()
381
+ [[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_90a>()
380
382
  {
381
383
  return ::cuda::arch::traits<id::sm_90>();
382
384
  };
383
385
 
384
386
  template <>
385
- [[nodiscard]] _CCCL_HOST_DEVICE inline constexpr traits_t traits<id::sm_100>()
387
+ [[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_100>()
386
388
  {
387
389
  traits_t __traits{};
388
390
  __traits.arch_id = id::sm_100;
@@ -406,13 +408,13 @@ template <>
406
408
  };
407
409
 
408
410
  template <>
409
- [[nodiscard]] _CCCL_HOST_DEVICE inline constexpr traits_t traits<id::sm_100a>()
411
+ [[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_100a>()
410
412
  {
411
413
  return ::cuda::arch::traits<id::sm_100>();
412
414
  };
413
415
 
414
416
  template <>
415
- [[nodiscard]] _CCCL_HOST_DEVICE inline constexpr traits_t traits<id::sm_103>()
417
+ [[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_103>()
416
418
  {
417
419
  traits_t __traits = ::cuda::arch::traits<id::sm_100>();
418
420
  __traits.arch_id = id::sm_103;
@@ -423,13 +425,13 @@ template <>
423
425
  };
424
426
 
425
427
  template <>
426
- [[nodiscard]] _CCCL_HOST_DEVICE inline constexpr traits_t traits<id::sm_103a>()
428
+ [[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_103a>()
427
429
  {
428
430
  return ::cuda::arch::traits<id::sm_103>();
429
431
  };
430
432
 
431
433
  template <>
432
- [[nodiscard]] _CCCL_HOST_DEVICE inline constexpr traits_t traits<id::sm_110>()
434
+ [[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_110>()
433
435
  {
434
436
  traits_t __traits = ::cuda::arch::traits<id::sm_100>();
435
437
  __traits.arch_id = id::sm_110;
@@ -440,7 +442,7 @@ template <>
440
442
  };
441
443
 
442
444
  template <>
443
- [[nodiscard]] _CCCL_HOST_DEVICE inline constexpr traits_t traits<id::sm_110a>()
445
+ [[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_110a>()
444
446
  {
445
447
  return ::cuda::arch::traits<id::sm_110>();
446
448
  };
@@ -470,7 +472,7 @@ template <>
470
472
  };
471
473
 
472
474
  template <>
473
- [[nodiscard]] _CCCL_HOST_DEVICE inline constexpr traits_t traits<id::sm_120a>()
475
+ [[nodiscard]] _CCCL_API inline constexpr traits_t traits<id::sm_120a>()
474
476
  {
475
477
  return ::cuda::arch::traits<id::sm_120>();
476
478
  };
@@ -516,7 +518,7 @@ inline constexpr int __highest_known_arch = 120;
516
518
  case id::sm_120a:
517
519
  return ::cuda::arch::traits<id::sm_120a>();
518
520
  default:
519
- ::cuda::__throw_cuda_error(cudaErrorInvalidValue, "Traits requested for an unknown architecture");
521
+ ::cuda::__throw_cuda_error(::cudaErrorInvalidValue, "Traits requested for an unknown architecture");
520
522
  break;
521
523
  }
522
524
  }
@@ -525,7 +527,7 @@ inline constexpr int __highest_known_arch = 120;
525
527
  {
526
528
  if (compute_capability < 60 || compute_capability > __highest_known_arch)
527
529
  {
528
- ::cuda::__throw_cuda_error(cudaErrorInvalidValue, "Compute capability out of range");
530
+ ::cuda::__throw_cuda_error(::cudaErrorInvalidValue, "Compute capability out of range");
529
531
  }
530
532
  return static_cast<id>(compute_capability);
531
533
  }
@@ -535,7 +537,7 @@ inline constexpr int __highest_known_arch = 120;
535
537
  return ::cuda::arch::traits_for_id(::cuda::arch::id_for_compute_capability(compute_capability));
536
538
  }
537
539
 
538
- _CCCL_API inline constexpr id __special_id_for_compute_capability(int value)
540
+ [[nodiscard]] _CCCL_API inline constexpr id __special_id_for_compute_capability(int value)
539
541
  {
540
542
  switch (value)
541
543
  {
@@ -550,13 +552,13 @@ _CCCL_API inline constexpr id __special_id_for_compute_capability(int value)
550
552
  case 120:
551
553
  return id::sm_120a;
552
554
  default:
553
- ::cuda::__throw_cuda_error(cudaErrorInvalidValue, "Compute capability out of range");
555
+ ::cuda::__throw_cuda_error(::cudaErrorInvalidValue, "Compute capability out of range");
554
556
  break;
555
557
  }
556
558
  }
557
559
 
558
560
  //! @brief Provides architecture traits of the architecture matching __CUDA_ARCH__ macro
559
- [[nodiscard]] _CCCL_DEVICE inline constexpr arch::traits_t current_traits()
561
+ [[nodiscard]] _CCCL_DEVICE_API inline constexpr arch::traits_t current_traits()
560
562
  {
561
563
  // fixme: this doesn't work with nvc++ -cuda
562
564
  # ifdef __CUDA_ARCH__
@@ -571,7 +573,7 @@ _CCCL_API inline constexpr id __special_id_for_compute_capability(int value)
571
573
  # endif // __CUDA_ARCH__
572
574
  }
573
575
 
574
- [[nodiscard]] inline constexpr arch::traits_t
576
+ [[nodiscard]] _CCCL_HOST_API inline constexpr arch::traits_t
575
577
  __arch_traits_might_be_unknown(int __device, unsigned int __compute_capability)
576
578
  {
577
579
  if (__compute_capability <= arch::__highest_known_arch)