cuda-cccl 0.1.3.2.0.dev438__cp313-cp313-manylinux_2_24_aarch64.whl → 0.3.1__cp313-cp313-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. See the package registry page for more details.

Files changed (177)
  1. cuda/cccl/cooperative/__init__.py +7 -1
  2. cuda/cccl/cooperative/experimental/__init__.py +21 -5
  3. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
  4. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
  5. cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
  6. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
  7. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
  8. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +25 -5
  9. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
  10. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
  11. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
  12. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
  13. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
  14. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
  15. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
  16. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
  17. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +24 -19
  18. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
  19. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
  20. cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
  21. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +3 -2
  22. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
  23. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
  24. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +18 -26
  25. cuda/cccl/headers/include/cub/device/device_copy.cuh +116 -27
  26. cuda/cccl/headers/include/cub/device/device_partition.cuh +5 -1
  27. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +158 -247
  28. cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
  29. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +4 -4
  30. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
  31. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +8 -26
  32. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -6
  33. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
  34. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +320 -262
  35. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +10 -5
  36. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
  37. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
  38. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +57 -10
  39. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +37 -13
  40. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
  41. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
  42. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
  43. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
  44. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
  45. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
  46. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
  47. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
  48. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
  49. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
  50. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
  51. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +204 -55
  52. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
  53. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +55 -19
  54. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
  55. cuda/cccl/headers/include/cub/util_device.cuh +51 -35
  56. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
  57. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
  58. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
  59. cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
  60. cuda/cccl/headers/include/cuda/__algorithm/copy.h +4 -4
  61. cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
  62. cuda/cccl/headers/include/cuda/__device/all_devices.h +47 -147
  63. cuda/cccl/headers/include/cuda/__device/arch_traits.h +51 -49
  64. cuda/cccl/headers/include/cuda/__device/attributes.h +177 -127
  65. cuda/cccl/headers/include/cuda/__device/device_ref.h +32 -51
  66. cuda/cccl/headers/include/cuda/__device/physical_device.h +120 -91
  67. cuda/cccl/headers/include/cuda/__driver/driver_api.h +330 -36
  68. cuda/cccl/headers/include/cuda/__event/event.h +8 -8
  69. cuda/cccl/headers/include/cuda/__event/event_ref.h +4 -5
  70. cuda/cccl/headers/include/cuda/__event/timed_event.h +4 -4
  71. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  72. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
  73. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +3 -3
  74. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +3 -3
  75. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +3 -3
  76. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +158 -0
  77. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +8 -120
  78. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +593 -0
  79. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +14 -10
  80. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +4 -3
  81. cuda/cccl/headers/include/cuda/__runtime/types.h +1 -1
  82. cuda/cccl/headers/include/cuda/__stream/stream.h +2 -3
  83. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -12
  84. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +2 -2
  85. cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
  86. cuda/cccl/headers/include/cuda/algorithm +1 -1
  87. cuda/cccl/headers/include/cuda/devices +10 -0
  88. cuda/cccl/headers/include/cuda/iterator +1 -0
  89. cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
  90. cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
  91. cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
  92. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +2 -2
  93. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +2 -0
  94. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
  95. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
  96. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
  97. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +10 -5
  98. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +44 -17
  99. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +1 -1
  100. cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +12 -12
  101. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +1 -8
  102. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +15 -12
  103. cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +3 -0
  104. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
  105. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +2 -1
  106. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
  107. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
  108. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
  109. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +3 -2
  110. cuda/cccl/headers/include/cuda/std/string_view +12 -5
  111. cuda/cccl/headers/include/cuda/std/version +1 -4
  112. cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
  113. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
  114. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
  115. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +8 -22
  116. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +15 -48
  117. cuda/cccl/parallel/experimental/__init__.py +21 -70
  118. cuda/compute/__init__.py +77 -0
  119. cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +28 -0
  120. cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +141 -1
  121. cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +4 -0
  122. cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
  123. cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
  124. cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
  125. cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -4
  126. cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +4 -6
  127. cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
  128. cuda/compute/algorithms/_three_way_partition.py +261 -0
  129. cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +4 -4
  130. cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
  131. cuda/compute/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  132. cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
  133. cuda/compute/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  134. cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
  135. cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +8 -8
  136. cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
  137. cuda/coop/__init__.py +8 -0
  138. cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
  139. cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
  140. cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
  141. cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
  142. cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
  143. cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
  144. cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
  145. cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
  146. cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
  147. cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
  148. cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
  149. cuda/coop/warp/__init__.py +9 -0
  150. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
  151. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
  152. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
  153. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/METADATA +1 -1
  154. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/RECORD +171 -166
  155. cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
  156. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
  157. cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +0 -72
  158. cuda/cccl/parallel/experimental/.gitignore +0 -4
  159. cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  160. cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
  161. /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
  162. /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
  163. /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
  164. /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
  165. /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
  166. /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
  167. /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
  168. /cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +0 -0
  169. /cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +0 -0
  170. /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
  171. /cuda/{cccl/parallel/experimental → compute}/numba_utils.py +0 -0
  172. /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
  173. /cuda/{cccl/parallel/experimental → compute}/typing.py +0 -0
  174. /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
  175. /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
  176. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/WHEEL +0 -0
  177. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/licenses/LICENSE +0 -0
@@ -11,7 +11,7 @@
11
11
  #ifndef _CUDA___DEVICE_DEVICE_REF_H
12
12
  #define _CUDA___DEVICE_DEVICE_REF_H
13
13
 
14
- #include <cuda/__cccl_config>
14
+ #include <cuda/std/detail/__config>
15
15
 
16
16
  #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17
17
  # pragma GCC system_header
@@ -22,45 +22,32 @@
22
22
  #endif // no system header
23
23
 
24
24
  #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
25
+
25
26
  # include <cuda/__driver/driver_api.h>
27
+ # include <cuda/__fwd/devices.h>
26
28
  # include <cuda/__runtime/types.h>
27
- # include <cuda/std/__cuda/api_wrapper.h>
28
-
29
- # include <string>
30
- # include <vector>
29
+ # include <cuda/std/span>
30
+ # include <cuda/std/string_view>
31
31
 
32
32
  # include <cuda/std/__cccl/prologue.h>
33
33
 
34
34
  _CCCL_BEGIN_NAMESPACE_CUDA
35
- class physical_device;
36
- namespace arch
37
- {
38
- struct traits_t;
39
- } // namespace arch
40
-
41
- namespace __detail
42
- {
43
- template <::cudaDeviceAttr _Attr>
44
- struct __dev_attr;
45
- } // namespace __detail
46
35
 
47
36
  //! @brief A non-owning representation of a CUDA device
48
37
  class device_ref
49
38
  {
50
- friend class physical_device;
51
-
52
39
  int __id_ = 0;
53
40
 
54
41
  public:
55
42
  //! @brief Create a `device_ref` object from a native device ordinal.
56
- /*implicit*/ constexpr device_ref(int __id) noexcept
43
+ /*implicit*/ _CCCL_HOST_API constexpr device_ref(int __id) noexcept
57
44
  : __id_(__id)
58
45
  {}
59
46
 
60
47
  //! @brief Retrieve the native ordinal of the `device_ref`
61
48
  //!
62
49
  //! @return int The native device ordinal held by the `device_ref` object
63
- [[nodiscard]] constexpr int get() const noexcept
50
+ [[nodiscard]] _CCCL_HOST_API constexpr int get() const noexcept
64
51
  {
65
52
  return __id_;
66
53
  }
@@ -73,7 +60,7 @@ public:
73
60
  //! @param __lhs The first `device_ref` to compare
74
61
  //! @param __rhs The second `device_ref` to compare
75
62
  //! @return true if `lhs` and `rhs` refer to the same device ordinal
76
- [[nodiscard]] friend constexpr bool operator==(device_ref __lhs, device_ref __rhs) noexcept
63
+ [[nodiscard]] friend _CCCL_HOST_API constexpr bool operator==(device_ref __lhs, device_ref __rhs) noexcept
77
64
  {
78
65
  return __lhs.__id_ == __rhs.__id_;
79
66
  }
@@ -87,7 +74,7 @@ public:
87
74
  //! @param __lhs The first `device_ref` to compare
88
75
  //! @param __rhs The second `device_ref` to compare
89
76
  //! @return true if `lhs` and `rhs` refer to different device ordinal
90
- [[nodiscard]] constexpr friend bool operator!=(device_ref __lhs, device_ref __rhs) noexcept
77
+ [[nodiscard]] friend _CCCL_HOST_API constexpr bool operator!=(device_ref __lhs, device_ref __rhs) noexcept
91
78
  {
92
79
  return __lhs.__id_ != __rhs.__id_;
93
80
  }
@@ -102,38 +89,35 @@ public:
102
89
  //!
103
90
  //! @sa device::attrs
104
91
  template <typename _Attr>
105
- [[nodiscard]] auto attribute(_Attr __attr) const
92
+ [[nodiscard]] _CCCL_HOST_API auto attribute(_Attr __attr) const
106
93
  {
107
94
  return __attr(*this);
108
95
  }
109
96
 
110
97
  //! @overload
111
98
  template <::cudaDeviceAttr _Attr>
112
- [[nodiscard]] auto attribute() const
99
+ [[nodiscard]] _CCCL_HOST_API auto attribute() const
113
100
  {
114
- return attribute(__detail::__dev_attr<_Attr>());
101
+ return attribute(__dev_attr<_Attr>());
115
102
  }
116
103
 
117
104
  //! @brief Retrieve the memory location of this device
118
105
  //!
119
106
  //! @return The memory location of this device
120
- [[nodiscard]] operator memory_location() const noexcept
107
+ [[nodiscard]] _CCCL_HOST_API operator memory_location() const noexcept
121
108
  {
122
109
  return memory_location{::cudaMemLocationTypeDevice, get()};
123
110
  }
124
111
 
125
- //! @brief Retrieve string with the name of this device.
126
- //!
127
- //! @return String containing the name of this device.
128
- [[nodiscard]] ::std::string name() const
129
- {
130
- constexpr int __max_name_length = 256;
131
- ::std::string __name(256, 0);
112
+ //! @brief Initializes the primary context of the device.
113
+ _CCCL_HOST_API void init() const; // implemented in <cuda/__device/physical_device.h> to avoid circular dependency
132
114
 
133
- // For some reason there is no separate name query in CUDA runtime
134
- ::cuda::__driver::__deviceGetName(__name.data(), __max_name_length, get());
135
- return __name;
136
- }
115
+ //! @brief Retrieve the name of this device.
116
+ //!
117
+ //! @return String view containing the name of this device.
118
+ [[nodiscard]] _CCCL_HOST_API ::cuda::std::string_view name() const; // implemented in
119
+ // <cuda/__device/physical_device.h> to avoid
120
+ // circular dependency
137
121
 
138
122
  //! @brief Queries if its possible for this device to directly access specified device's memory.
139
123
  //!
@@ -143,16 +127,10 @@ public:
143
127
  //!
144
128
  //! @param __other_dev Device to query the peer access
145
129
  //! @return true if its possible for this device to access the specified device's memory
146
- bool has_peer_access_to(device_ref __other_dev) const
130
+ [[nodiscard]] _CCCL_HOST_API bool has_peer_access_to(device_ref __other_dev) const
147
131
  {
148
- int __can_access;
149
- _CCCL_TRY_CUDA_API(
150
- ::cudaDeviceCanAccessPeer,
151
- "Could not query if device can be peer accessed",
152
- &__can_access,
153
- get(),
154
- __other_dev.get());
155
- return __can_access;
132
+ return ::cuda::__driver::__deviceCanAccessPeer(
133
+ ::cuda::__driver::__deviceGet(get()), ::cuda::__driver::__deviceGet(__other_dev.get()));
156
134
  }
157
135
 
158
136
  //! @brief Retrieve architecture traits of this device.
@@ -161,19 +139,22 @@ public:
161
139
  //! that are shared by all devices belonging to given architecture.
162
140
  //!
163
141
  //! @return A reference to `arch_traits_t` object containing architecture traits of this device
164
- const arch::traits_t& arch_traits() const;
142
+ [[nodiscard]] _CCCL_HOST_API const arch::traits_t& arch_traits() const; // implemented in
143
+ // <cuda/__device/physical_device.h> to avoid
144
+ // circular dependency
165
145
 
166
146
  // TODO this might return some more complex type in the future
167
147
  // TODO we might want to include the calling device, depends on what we decide
168
148
  // peer access APIs
169
149
 
170
- //! @brief Retrieve a vector of `device_ref`s that are peers of this device
150
+ //! @brief Retrieve `device_ref`s that are peers of this device
171
151
  //!
172
- //! The device on which this API is called is not included in the vector,
173
- //! if a full group of peer devices is needed, it needs to be pushed_back separately.
152
+ //! The device on which this API is called is not included in the vector.
174
153
  //!
175
154
  //! @throws cuda_error if any peer access query fails
176
- ::std::vector<device_ref> peer_devices() const;
155
+ [[nodiscard]] _CCCL_HOST_API ::cuda::std::span<const device_ref> peers() const; // implemented in
156
+ // <cuda/__device/physical_device.h>
157
+ // to avoid circular dependency
177
158
  };
178
159
 
179
160
  _CCCL_END_NAMESPACE_CUDA
@@ -11,7 +11,7 @@
11
11
  #ifndef _CUDA___DEVICE_PHYSICAL_DEVICE_H
12
12
  #define _CUDA___DEVICE_PHYSICAL_DEVICE_H
13
13
 
14
- #include <cuda/__cccl_config>
14
+ #include <cuda/std/detail/__config>
15
15
 
16
16
  #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17
17
  # pragma GCC system_header
@@ -24,58 +24,71 @@
24
24
  #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
25
25
 
26
26
  # include <cuda/__device/arch_traits.h>
27
- # include <cuda/__device/attributes.h>
28
27
  # include <cuda/__device/device_ref.h>
29
28
  # include <cuda/__driver/driver_api.h>
29
+ # include <cuda/__fwd/devices.h>
30
+ # include <cuda/std/__cstddef/types.h>
31
+ # include <cuda/std/span>
32
+ # include <cuda/std/string_view>
30
33
 
31
34
  # include <cassert>
35
+ # include <memory>
32
36
  # include <mutex>
37
+ # include <vector>
33
38
 
34
39
  # include <cuda/std/__cccl/prologue.h>
35
40
 
36
41
  _CCCL_BEGIN_NAMESPACE_CUDA
37
- namespace __detail
38
- {
39
- //! @brief A proxy object used to in-place construct a `device` object from an
40
- //! integer ID. Used in __detail/all_devices.cuh.
41
- struct __emplace_device
42
- {
43
- int __id_;
44
-
45
- [[nodiscard]] operator physical_device() const;
46
42
 
47
- [[nodiscard]] constexpr const __emplace_device* operator->() const;
48
- };
49
- } // namespace __detail
50
-
51
- //! @brief For a given attribute, type of the attribute value.
52
- //!
53
- //! @par Example
54
- //! @code
55
- //! using threads_per_block_t = device::attr_result_t<device_attributes::max_threads_per_block>;
56
- //! static_assert(std::is_same_v<threads_per_block_t, int>);
57
- //! @endcode
58
- //!
59
- //! @sa device_attributes
60
- template <::cudaDeviceAttr _Attr>
61
- using device_attribute_result_t = typename __detail::__dev_attr<_Attr>::type;
43
+ [[nodiscard]] inline ::cuda::std::span<__physical_device> __physical_devices();
62
44
 
63
45
  // This is the element type of the global `devices` array. In the future, we
64
46
  // can cache device properties here.
65
47
  //
66
48
  //! @brief An immovable "owning" representation of a CUDA device.
67
- class physical_device : public device_ref
49
+ class __physical_device
68
50
  {
51
+ friend _CCCL_HOST_API inline ::std::unique_ptr<__physical_device[]>
52
+ __make_physical_devices(::cuda::std::size_t __device_count);
53
+
54
+ ::CUdevice __device_{};
55
+
56
+ // TODO We should have some of the attributes just return from the arch traits
57
+ ::std::once_flag __traits_once_flag_{};
58
+ arch::traits_t __traits_{};
59
+
60
+ ::std::once_flag __primary_ctx_once_flag_{};
61
+ ::CUcontext __primary_ctx_{};
62
+
63
+ static constexpr ::cuda::std::size_t __max_name_length{256};
64
+ ::std::once_flag __name_once_flag_{};
65
+ char __name_[__max_name_length]{};
66
+ ::cuda::std::size_t __name_length_{};
67
+
68
+ ::std::once_flag __peers_once_flag_{};
69
+ ::std::vector<device_ref> __peers_{};
70
+
69
71
  public:
70
- # ifndef _CCCL_DOXYGEN_INVOKED // Do not document
71
- # if _CCCL_COMPILER(MSVC)
72
- // When __EDG__ is defined, std::construct_at will not permit constructing
73
- // a device object from an __emplace_device object. This is a workaround.
74
- physical_device(__detail::__emplace_device __ed)
75
- : physical_device(__ed.__id_)
76
- {}
77
- # endif // _CCCL_COMPILER(MSVC)
78
- # endif // _CCCL_COMPILER(MSVC)
72
+ _CCCL_HIDE_FROM_ABI __physical_device() = default;
73
+
74
+ _CCCL_HOST_API ~__physical_device()
75
+ {
76
+ if (__primary_ctx_ != nullptr)
77
+ {
78
+ [[maybe_unused]] const auto __ignore = ::cuda::__driver::__primaryCtxReleaseNoThrow(__device_);
79
+ }
80
+ }
81
+
82
+ //! @brief Retrieve the primary context for this device.
83
+ //!
84
+ //! @return A reference to the primary context for this device.
85
+ [[nodiscard]] _CCCL_HOST_API ::CUcontext __primary_context()
86
+ {
87
+ ::std::call_once(__primary_ctx_once_flag_, [this]() {
88
+ __primary_ctx_ = ::cuda::__driver::__primaryCtxRetain(__device_);
89
+ });
90
+ return __primary_ctx_;
91
+ }
79
92
 
80
93
  //! @brief Retrieve architecture traits of this device.
81
94
  //!
@@ -83,81 +96,97 @@ public:
83
96
  //! that are shared by all devices belonging to given architecture.
84
97
  //!
85
98
  //! @return A reference to `arch_traits_t` object containing architecture traits of this device
86
- const arch::traits_t& arch_traits() const noexcept
99
+ [[nodiscard]] _CCCL_HOST_API const arch::traits_t& __arch_traits()
87
100
  {
88
- return __traits;
101
+ ::std::call_once(__traits_once_flag_, [this]() {
102
+ const auto __id = ::cuda::__driver::__cudevice_to_ordinal(__device_);
103
+ __traits_ = ::cuda::arch::__arch_traits_might_be_unknown(__id, device_attributes::compute_capability(__id));
104
+ });
105
+ return __traits_;
89
106
  }
90
107
 
91
- //! @brief Retrieve the primary context for this device.
92
- //!
93
- //! @return A reference to the primary context for this device.
94
- ::CUcontext primary_context() const
108
+ [[nodiscard]] _CCCL_HOST_API ::cuda::std::string_view __name()
95
109
  {
96
- ::std::call_once(__init_once, [this]() {
97
- __device = ::cuda::__driver::__deviceGet(__id_);
98
- __primary_ctx = ::cuda::__driver::__primaryCtxRetain(__device);
110
+ ::std::call_once(__name_once_flag_, [this]() {
111
+ const auto __id = ::cuda::__driver::__cudevice_to_ordinal(__device_);
112
+ ::cuda::__driver::__deviceGetName(__name_, __max_name_length, __id);
113
+ __name_length_ = ::cuda::std::char_traits<char>::length(__name_);
99
114
  });
100
- _CCCL_ASSERT(__primary_ctx != nullptr, "cuda::primary_context failed to get context");
101
-
102
- return __primary_ctx;
115
+ return ::cuda::std::string_view{__name_, __name_length_};
103
116
  }
104
117
 
105
- ~physical_device()
118
+ [[nodiscard]] _CCCL_HOST_API ::cuda::std::span<const device_ref> __peers()
106
119
  {
107
- if (__primary_ctx)
108
- {
109
- ::cuda::__driver::__primaryCtxRelease(__device);
110
- }
120
+ ::std::call_once(__peers_once_flag_, [this]() {
121
+ const auto __count = static_cast<int>(::cuda::__physical_devices().size());
122
+ const auto __id = ::cuda::__driver::__cudevice_to_ordinal(__device_);
123
+ __peers_.reserve(__count);
124
+ for (int __other_id = 0; __other_id < __count; ++__other_id)
125
+ {
126
+ // Exclude the device this API is called on. The main use case for this API
127
+ // is enable/disable peer access. While enable peer access can be called on
128
+ // device on which memory resides, disable peer access will error-out.
129
+ // Usage of the peer access control is smoother when *this is excluded,
130
+ // while it can be easily added with .push_back() on the vector if a full
131
+ // group of peers is needed (for cases other than peer access control)
132
+ if (__other_id != __id)
133
+ {
134
+ device_ref __dev{__id};
135
+ device_ref __other_dev{__other_id};
136
+
137
+ // While in almost all practical applications peer access should be symmetrical,
138
+ // it is possible to build a system with one directional peer access, check
139
+ // both ways here just to be safe
140
+ if (__dev.has_peer_access_to(__other_dev) && __other_dev.has_peer_access_to(__dev))
141
+ {
142
+ __peers_.push_back(__other_dev);
143
+ }
144
+ }
145
+ }
146
+ });
147
+ return ::cuda::std::span<const device_ref>{__peers_};
111
148
  }
149
+ };
112
150
 
113
- private:
114
- // TODO: put a mutable thread-safe (or thread_local) cache of device
115
- // properties here.
116
-
117
- friend class device_ref;
118
- friend struct __detail::__emplace_device;
119
-
120
- mutable ::CUcontext __primary_ctx = nullptr;
121
- mutable ::CUdevice __device{};
122
- mutable ::std::once_flag __init_once;
123
-
124
- // TODO should this be a reference/pointer to the constexpr traits instances?
125
- // Do we care about lazy init?
126
- // We should have some of the attributes just return from the arch traits
127
- arch::traits_t __traits;
128
-
129
- explicit physical_device(int __id)
130
- : device_ref(__id)
131
- , __traits(arch::__arch_traits_might_be_unknown(__id, device_attributes::compute_capability(__id)))
132
- {}
151
+ [[nodiscard]] _CCCL_HOST_API inline ::std::unique_ptr<__physical_device[]>
152
+ __make_physical_devices(::cuda::std::size_t __device_count)
153
+ {
154
+ ::std::unique_ptr<__physical_device[]> __devices{::new __physical_device[__device_count]};
155
+ for (::cuda::std::size_t __i = 0; __i < __device_count; ++__i)
156
+ {
157
+ __devices[__i].__device_ = static_cast<int>(__i);
158
+ }
159
+ return __devices;
160
+ }
133
161
 
134
- // `device` objects are not movable or copyable.
135
- physical_device(physical_device&&) = delete;
136
- physical_device(const physical_device&) = delete;
137
- physical_device& operator=(physical_device&&) = delete;
138
- physical_device& operator=(const physical_device&) = delete;
162
+ [[nodiscard]] inline ::cuda::std::span<__physical_device> __physical_devices()
163
+ {
164
+ static const auto __device_count = static_cast<::cuda::std::size_t>(::cuda::__driver::__deviceGetCount());
165
+ static const auto __devices = ::cuda::__make_physical_devices(__device_count);
166
+ return ::cuda::std::span<__physical_device>{__devices.get(), __device_count};
167
+ }
139
168
 
140
- friend bool operator==(const physical_device& __lhs, int __rhs) = delete;
141
- friend bool operator==(int __lhs, const physical_device& __rhs) = delete;
169
+ // device_ref methods dependent on __physical_device
142
170
 
143
- # if _CCCL_STD_VER <= 2017
144
- friend bool operator!=(const physical_device& __lhs, int __rhs) = delete;
145
- friend bool operator!=(int __lhs, const physical_device& __rhs) = delete;
146
- # endif // _CCCL_STD_VER <= 2017
147
- };
171
+ _CCCL_HOST_API inline void device_ref::init() const
172
+ {
173
+ (void) ::cuda::__physical_devices()[__id_].__primary_context();
174
+ }
148
175
 
149
- namespace __detail
176
+ [[nodiscard]] _CCCL_HOST_API inline ::cuda::std::string_view device_ref::name() const
150
177
  {
151
- [[nodiscard]] inline __emplace_device::operator physical_device() const
178
+ return ::cuda::__physical_devices()[__id_].__name();
179
+ }
180
+
181
+ [[nodiscard]] _CCCL_HOST_API inline const arch::traits_t& device_ref::arch_traits() const
152
182
  {
153
- return physical_device(__id_);
183
+ return ::cuda::__physical_devices()[__id_].__arch_traits();
154
184
  }
155
185
 
156
- [[nodiscard]] inline constexpr const __emplace_device* __emplace_device::operator->() const
186
+ [[nodiscard]] _CCCL_HOST_API inline ::cuda::std::span<const device_ref> device_ref::peers() const
157
187
  {
158
- return this;
188
+ return ::cuda::__physical_devices()[__id_].__peers();
159
189
  }
160
- } // namespace __detail
161
190
 
162
191
  _CCCL_END_NAMESPACE_CUDA
163
192