cuda-cccl 0.3.0-cp310-cp310-manylinux_2_24_aarch64.whl → 0.3.1-cp310-cp310-manylinux_2_24_aarch64.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.

This version of cuda-cccl might be problematic.

Files changed (144)
  1. cuda/cccl/cooperative/__init__.py +7 -1
  2. cuda/cccl/cooperative/experimental/__init__.py +21 -5
  3. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
  4. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
  5. cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
  6. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
  7. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
  8. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
  9. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
  10. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
  11. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
  12. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
  13. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
  14. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
  15. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
  16. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
  17. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
  18. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
  19. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
  20. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +3 -2
  21. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
  22. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
  23. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +158 -247
  24. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +4 -4
  25. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
  26. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +8 -26
  27. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -6
  28. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
  29. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
  30. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
  31. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
  32. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
  33. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
  34. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
  35. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
  36. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
  37. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
  38. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
  39. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
  40. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
  41. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
  42. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
  43. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
  44. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
  45. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
  46. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
  47. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
  48. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
  49. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
  50. cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
  51. cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
  52. cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
  53. cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
  54. cuda/cccl/headers/include/cuda/__device/arch_traits.h +48 -46
  55. cuda/cccl/headers/include/cuda/__device/attributes.h +171 -121
  56. cuda/cccl/headers/include/cuda/__device/device_ref.h +30 -42
  57. cuda/cccl/headers/include/cuda/__device/physical_device.h +120 -91
  58. cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
  59. cuda/cccl/headers/include/cuda/__event/event.h +1 -0
  60. cuda/cccl/headers/include/cuda/__event/timed_event.h +1 -0
  61. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  62. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
  63. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +158 -0
  64. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +8 -120
  65. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +593 -0
  66. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +4 -3
  67. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +1 -0
  68. cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
  69. cuda/cccl/headers/include/cuda/algorithm +1 -1
  70. cuda/cccl/headers/include/cuda/devices +10 -0
  71. cuda/cccl/headers/include/cuda/iterator +1 -0
  72. cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
  73. cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
  74. cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
  75. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
  76. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
  77. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
  78. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
  79. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
  80. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
  81. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
  82. cuda/cccl/headers/include/cuda/std/string_view +12 -5
  83. cuda/cccl/headers/include/cuda/std/version +1 -4
  84. cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
  85. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
  86. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
  87. cuda/cccl/parallel/experimental/__init__.py +21 -74
  88. cuda/compute/__init__.py +77 -0
  89. cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +1 -1
  90. cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
  91. cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
  92. cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
  93. cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
  94. cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +4 -4
  95. cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
  96. cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
  97. cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +4 -4
  98. cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
  99. cuda/{cccl/parallel/experimental → compute}/cu12/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  100. cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
  101. cuda/{cccl/parallel/experimental → compute}/cu13/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  102. cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
  103. cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +8 -8
  104. cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
  105. cuda/coop/__init__.py +8 -0
  106. cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
  107. cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
  108. cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
  109. cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
  110. cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
  111. cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
  112. cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
  113. cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
  114. cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
  115. cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
  116. cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
  117. cuda/coop/warp/__init__.py +9 -0
  118. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
  119. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
  120. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
  121. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/METADATA +1 -1
  122. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/RECORD +141 -138
  123. cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
  124. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
  125. cuda/cccl/parallel/experimental/.gitignore +0 -4
  126. /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
  127. /cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +0 -0
  128. /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
  129. /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
  130. /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
  131. /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
  132. /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
  133. /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
  134. /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
  135. /cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +0 -0
  136. /cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +0 -0
  137. /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
  138. /cuda/{cccl/parallel/experimental → compute}/numba_utils.py +0 -0
  139. /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
  140. /cuda/{cccl/parallel/experimental → compute}/typing.py +0 -0
  141. /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
  142. /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
  143. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/WHEEL +0 -0
  144. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/licenses/LICENSE +0 -0
@@ -11,7 +11,7 @@
 #ifndef _CUDA___DEVICE_ATTRIBUTES_H
 #define _CUDA___DEVICE_ATTRIBUTES_H

-#include <cuda/__cccl_config>
+#include <cuda/std/detail/__config>

 #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
 #  pragma GCC system_header
@@ -25,27 +25,24 @@

 #  include <cuda/__device/device_ref.h>
 #  include <cuda/__driver/driver_api.h>
-#  include <cuda/std/__cccl/attributes.h>
-#  include <cuda/std/__cuda/api_wrapper.h>
+#  include <cuda/__fwd/devices.h>
+#  include <cuda/std/__cstddef/types.h>

 #  include <cuda/std/__cccl/prologue.h>

 _CCCL_BEGIN_NAMESPACE_CUDA

-namespace __detail
-{
-
 template <::cudaDeviceAttr _Attr, typename _Type>
 struct __dev_attr_impl
 {
   using type = _Type;

-  [[nodiscard]] constexpr operator ::cudaDeviceAttr() const noexcept
+  [[nodiscard]] _CCCL_HOST_API constexpr operator ::cudaDeviceAttr() const noexcept
   {
     return _Attr;
   }

-  [[nodiscard]] type operator()(device_ref __dev) const
+  [[nodiscard]] _CCCL_HOST_API type operator()(device_ref __dev) const
   {
     return static_cast<type>(::cuda::__driver::__deviceGetAttribute(
       static_cast<::CUdevice_attribute>(_Attr), ::cuda::__driver::__deviceGet(__dev.get())));
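The pattern above makes each attribute a small tag type that is both an enumerator and a query functor. A minimal usage sketch, not part of the diff, assuming the new <cuda/devices> umbrella header added in this release, at least one visible device, and an initialized CUDA driver:

#include <cuda/devices>

int main()
{
  cuda::device_ref dev{0}; // assumption: device 0 exists
  // Callable form: queries the driver and casts to the declared result type.
  int threads = cuda::device_attributes::max_threads_per_block(dev);
  // Conversion form: usable wherever a raw ::cudaDeviceAttr is expected.
  ::cudaDeviceAttr raw = cuda::device_attributes::max_threads_per_block;
  return (threads > 0 && raw == ::cudaDevAttrMaxThreadsPerBlock) ? 0 : 1;
}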
@@ -55,13 +52,36 @@ struct __dev_attr_impl
 template <::cudaDeviceAttr _Attr>
 struct __dev_attr : __dev_attr_impl<_Attr, int>
 {};
-
+template <>
+struct __dev_attr<::cudaDevAttrMaxSharedMemoryPerBlock> //
+    : __dev_attr_impl<::cudaDevAttrMaxSharedMemoryPerBlock, ::cuda::std::size_t>
+{};
+template <>
+struct __dev_attr<::cudaDevAttrTotalConstantMemory> //
+    : __dev_attr_impl<::cudaDevAttrTotalConstantMemory, ::cuda::std::size_t>
+{};
+template <>
+struct __dev_attr<::cudaDevAttrMaxPitch> //
+    : __dev_attr_impl<::cudaDevAttrMaxPitch, ::cuda::std::size_t>
+{};
+template <>
+struct __dev_attr<::cudaDevAttrMaxTexture2DLinearPitch> //
+    : __dev_attr_impl<::cudaDevAttrMaxTexture2DLinearPitch, ::cuda::std::size_t>
+{};
 // TODO: give this a strong type for kilohertz
 template <>
 struct __dev_attr<::cudaDevAttrClockRate> //
     : __dev_attr_impl<::cudaDevAttrClockRate, int>
 {};
 template <>
+struct __dev_attr<::cudaDevAttrTextureAlignment> //
+    : __dev_attr_impl<::cudaDevAttrTextureAlignment, ::cuda::std::size_t>
+{};
+template <>
+struct __dev_attr<::cudaDevAttrTexturePitchAlignment> //
+    : __dev_attr_impl<::cudaDevAttrTexturePitchAlignment, ::cuda::std::size_t>
+{};
+template <>
 struct __dev_attr<::cudaDevAttrGpuOverlap> //
     : __dev_attr_impl<::cudaDevAttrGpuOverlap, bool>
 {};
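The net effect of the specializations added above: attributes that report byte counts now yield ::cuda::std::size_t instead of int. A hedged sketch, not part of the diff, under the same assumptions as the previous example:

#include <cuda/devices>
#include <cuda/std/type_traits>

void check_smem(cuda::device_ref dev)
{
  // In 0.3.0 this was int; with the size_t specialization above, large byte
  // counts no longer risk narrowing.
  auto smem = cuda::device_attributes::max_shared_memory_per_block(dev);
  static_assert(cuda::std::is_same_v<decltype(smem), cuda::std::size_t>);
  (void) smem;
}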
@@ -107,10 +127,9 @@ template <>
 struct __dev_attr<::cudaDevAttrGlobalMemoryBusWidth> //
     : __dev_attr_impl<::cudaDevAttrGlobalMemoryBusWidth, int>
 {};
-// TODO: give this a strong type for bytes
 template <>
 struct __dev_attr<::cudaDevAttrL2CacheSize> //
-    : __dev_attr_impl<::cudaDevAttrL2CacheSize, int>
+    : __dev_attr_impl<::cudaDevAttrL2CacheSize, ::cuda::std::size_t>
 {};
 template <>
 struct __dev_attr<::cudaDevAttrUnifiedAddressing> //
@@ -129,6 +148,10 @@ struct __dev_attr<::cudaDevAttrLocalL1CacheSupported> //
     : __dev_attr_impl<::cudaDevAttrLocalL1CacheSupported, bool>
 {};
 template <>
+struct __dev_attr<::cudaDevAttrMaxSharedMemoryPerMultiprocessor> //
+    : __dev_attr_impl<::cudaDevAttrMaxSharedMemoryPerMultiprocessor, ::cuda::std::size_t>
+{};
+template <>
 struct __dev_attr<::cudaDevAttrManagedMemory> //
     : __dev_attr_impl<::cudaDevAttrManagedMemory, bool>
 {};
@@ -173,6 +196,22 @@ struct __dev_attr<::cudaDevAttrDirectManagedMemAccessFromHost> //
     : __dev_attr_impl<::cudaDevAttrDirectManagedMemAccessFromHost, bool>
 {};
 template <>
+struct __dev_attr<::cudaDevAttrMaxSharedMemoryPerBlockOptin> //
+    : __dev_attr_impl<::cudaDevAttrMaxSharedMemoryPerBlockOptin, ::cuda::std::size_t>
+{};
+template <>
+struct __dev_attr<::cudaDevAttrMaxPersistingL2CacheSize> //
+    : __dev_attr_impl<::cudaDevAttrMaxPersistingL2CacheSize, ::cuda::std::size_t>
+{};
+template <>
+struct __dev_attr<::cudaDevAttrMaxAccessPolicyWindowSize> //
+    : __dev_attr_impl<::cudaDevAttrMaxAccessPolicyWindowSize, ::cuda::std::size_t>
+{};
+template <>
+struct __dev_attr<::cudaDevAttrReservedSharedMemoryPerBlock> //
+    : __dev_attr_impl<::cudaDevAttrReservedSharedMemoryPerBlock, ::cuda::std::size_t>
+{};
+template <>
 struct __dev_attr<::cudaDevAttrSparseCudaArraySupported> //
     : __dev_attr_impl<::cudaDevAttrSparseCudaArraySupported, bool>
 {};
@@ -239,463 +278,460 @@ struct __dev_attr<::cudaDevAttrNumaConfig> //
 };
 # endif // _CCCL_CTK_AT_LEAST(12, 2)

-} // namespace __detail
-
 namespace device_attributes
 {
 // Maximum number of threads per block
-using max_threads_per_block_t = __detail::__dev_attr<::cudaDevAttrMaxThreadsPerBlock>;
+using max_threads_per_block_t = __dev_attr<::cudaDevAttrMaxThreadsPerBlock>;
 static constexpr max_threads_per_block_t max_threads_per_block{};

 // Maximum x-dimension of a block
-using max_block_dim_x_t = __detail::__dev_attr<::cudaDevAttrMaxBlockDimX>;
+using max_block_dim_x_t = __dev_attr<::cudaDevAttrMaxBlockDimX>;
 static constexpr max_block_dim_x_t max_block_dim_x{};

 // Maximum y-dimension of a block
-using max_block_dim_y_t = __detail::__dev_attr<::cudaDevAttrMaxBlockDimY>;
+using max_block_dim_y_t = __dev_attr<::cudaDevAttrMaxBlockDimY>;
 static constexpr max_block_dim_y_t max_block_dim_y{};

 // Maximum z-dimension of a block
-using max_block_dim_z_t = __detail::__dev_attr<::cudaDevAttrMaxBlockDimZ>;
+using max_block_dim_z_t = __dev_attr<::cudaDevAttrMaxBlockDimZ>;
 static constexpr max_block_dim_z_t max_block_dim_z{};

 // Maximum x-dimension of a grid
-using max_grid_dim_x_t = __detail::__dev_attr<::cudaDevAttrMaxGridDimX>;
+using max_grid_dim_x_t = __dev_attr<::cudaDevAttrMaxGridDimX>;
 static constexpr max_grid_dim_x_t max_grid_dim_x{};

 // Maximum y-dimension of a grid
-using max_grid_dim_y_t = __detail::__dev_attr<::cudaDevAttrMaxGridDimY>;
+using max_grid_dim_y_t = __dev_attr<::cudaDevAttrMaxGridDimY>;
 static constexpr max_grid_dim_y_t max_grid_dim_y{};

 // Maximum z-dimension of a grid
-using max_grid_dim_z_t = __detail::__dev_attr<::cudaDevAttrMaxGridDimZ>;
+using max_grid_dim_z_t = __dev_attr<::cudaDevAttrMaxGridDimZ>;
 static constexpr max_grid_dim_z_t max_grid_dim_z{};

 // Maximum amount of shared memory available to a thread block in bytes
-using max_shared_memory_per_block_t = __detail::__dev_attr<::cudaDevAttrMaxSharedMemoryPerBlock>;
+using max_shared_memory_per_block_t = __dev_attr<::cudaDevAttrMaxSharedMemoryPerBlock>;
 static constexpr max_shared_memory_per_block_t max_shared_memory_per_block{};

 // Memory available on device for __constant__ variables in a CUDA C kernel in bytes
-using total_constant_memory_t = __detail::__dev_attr<::cudaDevAttrTotalConstantMemory>;
+using total_constant_memory_t = __dev_attr<::cudaDevAttrTotalConstantMemory>;
 static constexpr total_constant_memory_t total_constant_memory{};

 // Warp size in threads
-using warp_size_t = __detail::__dev_attr<::cudaDevAttrWarpSize>;
+using warp_size_t = __dev_attr<::cudaDevAttrWarpSize>;
 static constexpr warp_size_t warp_size{};

 // Maximum pitch in bytes allowed by the memory copy functions that involve
 // memory regions allocated through cudaMallocPitch()
-using max_pitch_t = __detail::__dev_attr<::cudaDevAttrMaxPitch>;
+using max_pitch_t = __dev_attr<::cudaDevAttrMaxPitch>;
 static constexpr max_pitch_t max_pitch{};

 // Maximum 1D texture width
-using max_texture_1d_width_t = __detail::__dev_attr<::cudaDevAttrMaxTexture1DWidth>;
+using max_texture_1d_width_t = __dev_attr<::cudaDevAttrMaxTexture1DWidth>;
 static constexpr max_texture_1d_width_t max_texture_1d_width{};

 // Maximum width for a 1D texture bound to linear memory
-using max_texture_1d_linear_width_t = __detail::__dev_attr<::cudaDevAttrMaxTexture1DLinearWidth>;
+using max_texture_1d_linear_width_t = __dev_attr<::cudaDevAttrMaxTexture1DLinearWidth>;
 static constexpr max_texture_1d_linear_width_t max_texture_1d_linear_width{};

 // Maximum mipmapped 1D texture width
-using max_texture_1d_mipmapped_width_t = __detail::__dev_attr<::cudaDevAttrMaxTexture1DMipmappedWidth>;
+using max_texture_1d_mipmapped_width_t = __dev_attr<::cudaDevAttrMaxTexture1DMipmappedWidth>;
 static constexpr max_texture_1d_mipmapped_width_t max_texture_1d_mipmapped_width{};

 // Maximum 2D texture width
-using max_texture_2d_width_t = __detail::__dev_attr<::cudaDevAttrMaxTexture2DWidth>;
+using max_texture_2d_width_t = __dev_attr<::cudaDevAttrMaxTexture2DWidth>;
 static constexpr max_texture_2d_width_t max_texture_2d_width{};

 // Maximum 2D texture height
-using max_texture_2d_height_t = __detail::__dev_attr<::cudaDevAttrMaxTexture2DHeight>;
+using max_texture_2d_height_t = __dev_attr<::cudaDevAttrMaxTexture2DHeight>;
 static constexpr max_texture_2d_height_t max_texture_2d_height{};

 // Maximum width for a 2D texture bound to linear memory
-using max_texture_2d_linear_width_t = __detail::__dev_attr<::cudaDevAttrMaxTexture2DLinearWidth>;
+using max_texture_2d_linear_width_t = __dev_attr<::cudaDevAttrMaxTexture2DLinearWidth>;
 static constexpr max_texture_2d_linear_width_t max_texture_2d_linear_width{};

 // Maximum height for a 2D texture bound to linear memory
-using max_texture_2d_linear_height_t = __detail::__dev_attr<::cudaDevAttrMaxTexture2DLinearHeight>;
+using max_texture_2d_linear_height_t = __dev_attr<::cudaDevAttrMaxTexture2DLinearHeight>;
 static constexpr max_texture_2d_linear_height_t max_texture_2d_linear_height{};

 // Maximum pitch in bytes for a 2D texture bound to linear memory
-using max_texture_2d_linear_pitch_t = __detail::__dev_attr<::cudaDevAttrMaxTexture2DLinearPitch>;
+using max_texture_2d_linear_pitch_t = __dev_attr<::cudaDevAttrMaxTexture2DLinearPitch>;
 static constexpr max_texture_2d_linear_pitch_t max_texture_2d_linear_pitch{};

 // Maximum mipmapped 2D texture width
-using max_texture_2d_mipmapped_width_t = __detail::__dev_attr<::cudaDevAttrMaxTexture2DMipmappedWidth>;
+using max_texture_2d_mipmapped_width_t = __dev_attr<::cudaDevAttrMaxTexture2DMipmappedWidth>;
 static constexpr max_texture_2d_mipmapped_width_t max_texture_2d_mipmapped_width{};

 // Maximum mipmapped 2D texture height
-using max_texture_2d_mipmapped_height_t = __detail::__dev_attr<::cudaDevAttrMaxTexture2DMipmappedHeight>;
+using max_texture_2d_mipmapped_height_t = __dev_attr<::cudaDevAttrMaxTexture2DMipmappedHeight>;
 static constexpr max_texture_2d_mipmapped_height_t max_texture_2d_mipmapped_height{};

 // Maximum 3D texture width
-using max_texture_3d_width_t = __detail::__dev_attr<::cudaDevAttrMaxTexture3DWidth>;
+using max_texture_3d_width_t = __dev_attr<::cudaDevAttrMaxTexture3DWidth>;
 static constexpr max_texture_3d_width_t max_texture_3d_width{};

 // Maximum 3D texture height
-using max_texture_3d_height_t = __detail::__dev_attr<::cudaDevAttrMaxTexture3DHeight>;
+using max_texture_3d_height_t = __dev_attr<::cudaDevAttrMaxTexture3DHeight>;
 static constexpr max_texture_3d_height_t max_texture_3d_height{};

 // Maximum 3D texture depth
-using max_texture_3d_depth_t = __detail::__dev_attr<::cudaDevAttrMaxTexture3DDepth>;
+using max_texture_3d_depth_t = __dev_attr<::cudaDevAttrMaxTexture3DDepth>;
 static constexpr max_texture_3d_depth_t max_texture_3d_depth{};

 // Alternate maximum 3D texture width, 0 if no alternate maximum 3D texture size is supported
-using max_texture_3d_width_alt_t = __detail::__dev_attr<::cudaDevAttrMaxTexture3DWidthAlt>;
+using max_texture_3d_width_alt_t = __dev_attr<::cudaDevAttrMaxTexture3DWidthAlt>;
 static constexpr max_texture_3d_width_alt_t max_texture_3d_width_alt{};

 // Alternate maximum 3D texture height, 0 if no alternate maximum 3D texture size is supported
-using max_texture_3d_height_alt_t = __detail::__dev_attr<::cudaDevAttrMaxTexture3DHeightAlt>;
+using max_texture_3d_height_alt_t = __dev_attr<::cudaDevAttrMaxTexture3DHeightAlt>;
 static constexpr max_texture_3d_height_alt_t max_texture_3d_height_alt{};

 // Alternate maximum 3D texture depth, 0 if no alternate maximum 3D texture size is supported
-using max_texture_3d_depth_alt_t = __detail::__dev_attr<::cudaDevAttrMaxTexture3DDepthAlt>;
+using max_texture_3d_depth_alt_t = __dev_attr<::cudaDevAttrMaxTexture3DDepthAlt>;
 static constexpr max_texture_3d_depth_alt_t max_texture_3d_depth_alt{};

 // Maximum cubemap texture width or height
-using max_texture_cubemap_width_t = __detail::__dev_attr<::cudaDevAttrMaxTextureCubemapWidth>;
+using max_texture_cubemap_width_t = __dev_attr<::cudaDevAttrMaxTextureCubemapWidth>;
 static constexpr max_texture_cubemap_width_t max_texture_cubemap_width{};

 // Maximum 1D layered texture width
-using max_texture_1d_layered_width_t = __detail::__dev_attr<::cudaDevAttrMaxTexture1DLayeredWidth>;
+using max_texture_1d_layered_width_t = __dev_attr<::cudaDevAttrMaxTexture1DLayeredWidth>;
 static constexpr max_texture_1d_layered_width_t max_texture_1d_layered_width{};

 // Maximum layers in a 1D layered texture
-using max_texture_1d_layered_layers_t = __detail::__dev_attr<::cudaDevAttrMaxTexture1DLayeredLayers>;
+using max_texture_1d_layered_layers_t = __dev_attr<::cudaDevAttrMaxTexture1DLayeredLayers>;
 static constexpr max_texture_1d_layered_layers_t max_texture_1d_layered_layers{};

 // Maximum 2D layered texture width
-using max_texture_2d_layered_width_t = __detail::__dev_attr<::cudaDevAttrMaxTexture2DLayeredWidth>;
+using max_texture_2d_layered_width_t = __dev_attr<::cudaDevAttrMaxTexture2DLayeredWidth>;
 static constexpr max_texture_2d_layered_width_t max_texture_2d_layered_width{};

 // Maximum 2D layered texture height
-using max_texture_2d_layered_height_t = __detail::__dev_attr<::cudaDevAttrMaxTexture2DLayeredHeight>;
+using max_texture_2d_layered_height_t = __dev_attr<::cudaDevAttrMaxTexture2DLayeredHeight>;
 static constexpr max_texture_2d_layered_height_t max_texture_2d_layered_height{};

 // Maximum layers in a 2D layered texture
-using max_texture_2d_layered_layers_t = __detail::__dev_attr<::cudaDevAttrMaxTexture2DLayeredLayers>;
+using max_texture_2d_layered_layers_t = __dev_attr<::cudaDevAttrMaxTexture2DLayeredLayers>;
 static constexpr max_texture_2d_layered_layers_t max_texture_2d_layered_layers{};

 // Maximum cubemap layered texture width or height
-using max_texture_cubemap_layered_width_t = __detail::__dev_attr<::cudaDevAttrMaxTextureCubemapLayeredWidth>;
+using max_texture_cubemap_layered_width_t = __dev_attr<::cudaDevAttrMaxTextureCubemapLayeredWidth>;
 static constexpr max_texture_cubemap_layered_width_t max_texture_cubemap_layered_width{};

 // Maximum layers in a cubemap layered texture
-using max_texture_cubemap_layered_layers_t = __detail::__dev_attr<::cudaDevAttrMaxTextureCubemapLayeredLayers>;
+using max_texture_cubemap_layered_layers_t = __dev_attr<::cudaDevAttrMaxTextureCubemapLayeredLayers>;
 static constexpr max_texture_cubemap_layered_layers_t max_texture_cubemap_layered_layers{};

 // Maximum 1D surface width
-using max_surface_1d_width_t = __detail::__dev_attr<::cudaDevAttrMaxSurface1DWidth>;
+using max_surface_1d_width_t = __dev_attr<::cudaDevAttrMaxSurface1DWidth>;
 static constexpr max_surface_1d_width_t max_surface_1d_width{};

 // Maximum 2D surface width
-using max_surface_2d_width_t = __detail::__dev_attr<::cudaDevAttrMaxSurface2DWidth>;
+using max_surface_2d_width_t = __dev_attr<::cudaDevAttrMaxSurface2DWidth>;
 static constexpr max_surface_2d_width_t max_surface_2d_width{};

 // Maximum 2D surface height
-using max_surface_2d_height_t = __detail::__dev_attr<::cudaDevAttrMaxSurface2DHeight>;
+using max_surface_2d_height_t = __dev_attr<::cudaDevAttrMaxSurface2DHeight>;
 static constexpr max_surface_2d_height_t max_surface_2d_height{};

 // Maximum 3D surface width
-using max_surface_3d_width_t = __detail::__dev_attr<::cudaDevAttrMaxSurface3DWidth>;
+using max_surface_3d_width_t = __dev_attr<::cudaDevAttrMaxSurface3DWidth>;
 static constexpr max_surface_3d_width_t max_surface_3d_width{};

 // Maximum 3D surface height
-using max_surface_3d_height_t = __detail::__dev_attr<::cudaDevAttrMaxSurface3DHeight>;
+using max_surface_3d_height_t = __dev_attr<::cudaDevAttrMaxSurface3DHeight>;
 static constexpr max_surface_3d_height_t max_surface_3d_height{};

 // Maximum 3D surface depth
-using max_surface_3d_depth_t = __detail::__dev_attr<::cudaDevAttrMaxSurface3DDepth>;
+using max_surface_3d_depth_t = __dev_attr<::cudaDevAttrMaxSurface3DDepth>;
 static constexpr max_surface_3d_depth_t max_surface_3d_depth{};

 // Maximum 1D layered surface width
-using max_surface_1d_layered_width_t = __detail::__dev_attr<::cudaDevAttrMaxSurface1DLayeredWidth>;
+using max_surface_1d_layered_width_t = __dev_attr<::cudaDevAttrMaxSurface1DLayeredWidth>;
 static constexpr max_surface_1d_layered_width_t max_surface_1d_layered_width{};

 // Maximum layers in a 1D layered surface
-using max_surface_1d_layered_layers_t = __detail::__dev_attr<::cudaDevAttrMaxSurface1DLayeredLayers>;
+using max_surface_1d_layered_layers_t = __dev_attr<::cudaDevAttrMaxSurface1DLayeredLayers>;
 static constexpr max_surface_1d_layered_layers_t max_surface_1d_layered_layers{};

 // Maximum 2D layered surface width
-using max_surface_2d_layered_width_t = __detail::__dev_attr<::cudaDevAttrMaxSurface2DLayeredWidth>;
+using max_surface_2d_layered_width_t = __dev_attr<::cudaDevAttrMaxSurface2DLayeredWidth>;
 static constexpr max_surface_2d_layered_width_t max_surface_2d_layered_width{};

 // Maximum 2D layered surface height
-using max_surface_2d_layered_height_t = __detail::__dev_attr<::cudaDevAttrMaxSurface2DLayeredHeight>;
+using max_surface_2d_layered_height_t = __dev_attr<::cudaDevAttrMaxSurface2DLayeredHeight>;
 static constexpr max_surface_2d_layered_height_t max_surface_2d_layered_height{};

 // Maximum layers in a 2D layered surface
-using max_surface_2d_layered_layers_t = __detail::__dev_attr<::cudaDevAttrMaxSurface2DLayeredLayers>;
+using max_surface_2d_layered_layers_t = __dev_attr<::cudaDevAttrMaxSurface2DLayeredLayers>;
 static constexpr max_surface_2d_layered_layers_t max_surface_2d_layered_layers{};

 // Maximum cubemap surface width
-using max_surface_cubemap_width_t = __detail::__dev_attr<::cudaDevAttrMaxSurfaceCubemapWidth>;
+using max_surface_cubemap_width_t = __dev_attr<::cudaDevAttrMaxSurfaceCubemapWidth>;
 static constexpr max_surface_cubemap_width_t max_surface_cubemap_width{};

 // Maximum cubemap layered surface width
-using max_surface_cubemap_layered_width_t = __detail::__dev_attr<::cudaDevAttrMaxSurfaceCubemapLayeredWidth>;
+using max_surface_cubemap_layered_width_t = __dev_attr<::cudaDevAttrMaxSurfaceCubemapLayeredWidth>;
 static constexpr max_surface_cubemap_layered_width_t max_surface_cubemap_layered_width{};

 // Maximum layers in a cubemap layered surface
-using max_surface_cubemap_layered_layers_t = __detail::__dev_attr<::cudaDevAttrMaxSurfaceCubemapLayeredLayers>;
+using max_surface_cubemap_layered_layers_t = __dev_attr<::cudaDevAttrMaxSurfaceCubemapLayeredLayers>;
 static constexpr max_surface_cubemap_layered_layers_t max_surface_cubemap_layered_layers{};

 // Maximum number of 32-bit registers available to a thread block
-using max_registers_per_block_t = __detail::__dev_attr<::cudaDevAttrMaxRegistersPerBlock>;
+using max_registers_per_block_t = __dev_attr<::cudaDevAttrMaxRegistersPerBlock>;
 static constexpr max_registers_per_block_t max_registers_per_block{};

 // Peak clock frequency in kilohertz
-using clock_rate_t = __detail::__dev_attr<::cudaDevAttrClockRate>;
+using clock_rate_t = __dev_attr<::cudaDevAttrClockRate>;
 static constexpr clock_rate_t clock_rate{};

 // Alignment requirement; texture base addresses aligned to textureAlign bytes
 // do not need an offset applied to texture fetches
-using texture_alignment_t = __detail::__dev_attr<::cudaDevAttrTextureAlignment>;
+using texture_alignment_t = __dev_attr<::cudaDevAttrTextureAlignment>;
 static constexpr texture_alignment_t texture_alignment{};

 // Pitch alignment requirement for 2D texture references bound to pitched memory
-using texture_pitch_alignment_t = __detail::__dev_attr<::cudaDevAttrTexturePitchAlignment>;
+using texture_pitch_alignment_t = __dev_attr<::cudaDevAttrTexturePitchAlignment>;
 static constexpr texture_pitch_alignment_t texture_pitch_alignment{};

 // true if the device can concurrently copy memory between host and device
 // while executing a kernel, or false if not
-using gpu_overlap_t = __detail::__dev_attr<::cudaDevAttrGpuOverlap>;
+using gpu_overlap_t = __dev_attr<::cudaDevAttrGpuOverlap>;
 static constexpr gpu_overlap_t gpu_overlap{};

 // Number of multiprocessors on the device
-using multiprocessor_count_t = __detail::__dev_attr<::cudaDevAttrMultiProcessorCount>;
+using multiprocessor_count_t = __dev_attr<::cudaDevAttrMultiProcessorCount>;
 static constexpr multiprocessor_count_t multiprocessor_count{};

 // true if there is a run time limit for kernels executed on the device, or
 // false if not
-using kernel_exec_timeout_t = __detail::__dev_attr<::cudaDevAttrKernelExecTimeout>;
+using kernel_exec_timeout_t = __dev_attr<::cudaDevAttrKernelExecTimeout>;
 static constexpr kernel_exec_timeout_t kernel_exec_timeout{};

 // true if the device is integrated with the memory subsystem, or false if not
-using integrated_t = __detail::__dev_attr<::cudaDevAttrIntegrated>;
+using integrated_t = __dev_attr<::cudaDevAttrIntegrated>;
 static constexpr integrated_t integrated{};

 // true if the device can map host memory into CUDA address space
-using can_map_host_memory_t = __detail::__dev_attr<::cudaDevAttrCanMapHostMemory>;
+using can_map_host_memory_t = __dev_attr<::cudaDevAttrCanMapHostMemory>;
 static constexpr can_map_host_memory_t can_map_host_memory{};

 // Compute mode is the compute mode that the device is currently in.
-using compute_mode_t = __detail::__dev_attr<::cudaDevAttrComputeMode>;
+using compute_mode_t = __dev_attr<::cudaDevAttrComputeMode>;
 static constexpr compute_mode_t compute_mode{};

 // true if the device supports executing multiple kernels within the same
 // context simultaneously, or false if not. It is not guaranteed that multiple
 // kernels will be resident on the device concurrently so this feature should
 // not be relied upon for correctness.
-using concurrent_kernels_t = __detail::__dev_attr<::cudaDevAttrConcurrentKernels>;
+using concurrent_kernels_t = __dev_attr<::cudaDevAttrConcurrentKernels>;
 static constexpr concurrent_kernels_t concurrent_kernels{};

 // true if error correction is enabled on the device, 0 if error correction is
 // disabled or not supported by the device
-using ecc_enabled_t = __detail::__dev_attr<::cudaDevAttrEccEnabled>;
+using ecc_enabled_t = __dev_attr<::cudaDevAttrEccEnabled>;
 static constexpr ecc_enabled_t ecc_enabled{};

 // PCI bus identifier of the device
-using pci_bus_id_t = __detail::__dev_attr<::cudaDevAttrPciBusId>;
+using pci_bus_id_t = __dev_attr<::cudaDevAttrPciBusId>;
 static constexpr pci_bus_id_t pci_bus_id{};

 // PCI device (also known as slot) identifier of the device
-using pci_device_id_t = __detail::__dev_attr<::cudaDevAttrPciDeviceId>;
+using pci_device_id_t = __dev_attr<::cudaDevAttrPciDeviceId>;
 static constexpr pci_device_id_t pci_device_id{};

 // true if the device is using a TCC driver. TCC is only available on Tesla
 // hardware running Windows Vista or later.
-using tcc_driver_t = __detail::__dev_attr<::cudaDevAttrTccDriver>;
+using tcc_driver_t = __dev_attr<::cudaDevAttrTccDriver>;
 static constexpr tcc_driver_t tcc_driver{};

 // Peak memory clock frequency in kilohertz
-using memory_clock_rate_t = __detail::__dev_attr<::cudaDevAttrMemoryClockRate>;
+using memory_clock_rate_t = __dev_attr<::cudaDevAttrMemoryClockRate>;
 static constexpr memory_clock_rate_t memory_clock_rate{};

 // Global memory bus width in bits
-using global_memory_bus_width_t = __detail::__dev_attr<::cudaDevAttrGlobalMemoryBusWidth>;
+using global_memory_bus_width_t = __dev_attr<::cudaDevAttrGlobalMemoryBusWidth>;
 static constexpr global_memory_bus_width_t global_memory_bus_width{};

 // Size of L2 cache in bytes. 0 if the device doesn't have L2 cache.
-using l2_cache_size_t = __detail::__dev_attr<::cudaDevAttrL2CacheSize>;
+using l2_cache_size_t = __dev_attr<::cudaDevAttrL2CacheSize>;
 static constexpr l2_cache_size_t l2_cache_size{};

 // Maximum resident threads per multiprocessor
-using max_threads_per_multiprocessor_t = __detail::__dev_attr<::cudaDevAttrMaxThreadsPerMultiProcessor>;
+using max_threads_per_multiprocessor_t = __dev_attr<::cudaDevAttrMaxThreadsPerMultiProcessor>;
 static constexpr max_threads_per_multiprocessor_t max_threads_per_multiprocessor{};

 // true if the device shares a unified address space with the host, or false
 // if not
-using unified_addressing_t = __detail::__dev_attr<::cudaDevAttrUnifiedAddressing>;
+using unified_addressing_t = __dev_attr<::cudaDevAttrUnifiedAddressing>;
 static constexpr unified_addressing_t unified_addressing{};

 // Major compute capability version number
-using compute_capability_major_t = __detail::__dev_attr<::cudaDevAttrComputeCapabilityMajor>;
+using compute_capability_major_t = __dev_attr<::cudaDevAttrComputeCapabilityMajor>;
 static constexpr compute_capability_major_t compute_capability_major{};

 // Minor compute capability version number
-using compute_capability_minor_t = __detail::__dev_attr<::cudaDevAttrComputeCapabilityMinor>;
+using compute_capability_minor_t = __dev_attr<::cudaDevAttrComputeCapabilityMinor>;
 static constexpr compute_capability_minor_t compute_capability_minor{};

 // true if the device supports stream priorities, or false if not
-using stream_priorities_supported_t = __detail::__dev_attr<::cudaDevAttrStreamPrioritiesSupported>;
+using stream_priorities_supported_t = __dev_attr<::cudaDevAttrStreamPrioritiesSupported>;
 static constexpr stream_priorities_supported_t stream_priorities_supported{};

 // true if device supports caching globals in L1 cache, false if not
-using global_l1_cache_supported_t = __detail::__dev_attr<::cudaDevAttrGlobalL1CacheSupported>;
+using global_l1_cache_supported_t = __dev_attr<::cudaDevAttrGlobalL1CacheSupported>;
 static constexpr global_l1_cache_supported_t global_l1_cache_supported{};

 // true if device supports caching locals in L1 cache, false if not
-using local_l1_cache_supported_t = __detail::__dev_attr<::cudaDevAttrLocalL1CacheSupported>;
+using local_l1_cache_supported_t = __dev_attr<::cudaDevAttrLocalL1CacheSupported>;
 static constexpr local_l1_cache_supported_t local_l1_cache_supported{};

 // Maximum amount of shared memory available to a multiprocessor in bytes;
 // this amount is shared by all thread blocks simultaneously resident on a
 // multiprocessor
-using max_shared_memory_per_multiprocessor_t = __detail::__dev_attr<::cudaDevAttrMaxSharedMemoryPerMultiprocessor>;
+using max_shared_memory_per_multiprocessor_t = __dev_attr<::cudaDevAttrMaxSharedMemoryPerMultiprocessor>;
 static constexpr max_shared_memory_per_multiprocessor_t max_shared_memory_per_multiprocessor{};

 // Maximum number of 32-bit registers available to a multiprocessor; this
 // number is shared by all thread blocks simultaneously resident on a
 // multiprocessor
-using max_registers_per_multiprocessor_t = __detail::__dev_attr<::cudaDevAttrMaxRegistersPerMultiprocessor>;
+using max_registers_per_multiprocessor_t = __dev_attr<::cudaDevAttrMaxRegistersPerMultiprocessor>;
 static constexpr max_registers_per_multiprocessor_t max_registers_per_multiprocessor{};

 // true if device supports allocating managed memory, false if not
-using managed_memory_t = __detail::__dev_attr<::cudaDevAttrManagedMemory>;
+using managed_memory_t = __dev_attr<::cudaDevAttrManagedMemory>;
 static constexpr managed_memory_t managed_memory{};

 // true if device is on a multi-GPU board, false if not
-using is_multi_gpu_board_t = __detail::__dev_attr<::cudaDevAttrIsMultiGpuBoard>;
+using is_multi_gpu_board_t = __dev_attr<::cudaDevAttrIsMultiGpuBoard>;
 static constexpr is_multi_gpu_board_t is_multi_gpu_board{};

 // Unique identifier for a group of devices on the same multi-GPU board
-using multi_gpu_board_group_id_t = __detail::__dev_attr<::cudaDevAttrMultiGpuBoardGroupID>;
+using multi_gpu_board_group_id_t = __dev_attr<::cudaDevAttrMultiGpuBoardGroupID>;
 static constexpr multi_gpu_board_group_id_t multi_gpu_board_group_id{};

 // true if the link between the device and the host supports native atomic
 // operations
-using host_native_atomic_supported_t = __detail::__dev_attr<::cudaDevAttrHostNativeAtomicSupported>;
+using host_native_atomic_supported_t = __dev_attr<::cudaDevAttrHostNativeAtomicSupported>;
 static constexpr host_native_atomic_supported_t host_native_atomic_supported{};

 // Ratio of single precision performance (in floating-point operations per
 // second) to double precision performance
-using single_to_double_precision_perf_ratio_t = __detail::__dev_attr<::cudaDevAttrSingleToDoublePrecisionPerfRatio>;
+using single_to_double_precision_perf_ratio_t = __dev_attr<::cudaDevAttrSingleToDoublePrecisionPerfRatio>;
 static constexpr single_to_double_precision_perf_ratio_t single_to_double_precision_perf_ratio{};

 // true if the device supports coherently accessing pageable memory without
 // calling cudaHostRegister on it, and false otherwise
-using pageable_memory_access_t = __detail::__dev_attr<::cudaDevAttrPageableMemoryAccess>;
+using pageable_memory_access_t = __dev_attr<::cudaDevAttrPageableMemoryAccess>;
 static constexpr pageable_memory_access_t pageable_memory_access{};

 // true if the device can coherently access managed memory concurrently with
 // the CPU, and false otherwise
-using concurrent_managed_access_t = __detail::__dev_attr<::cudaDevAttrConcurrentManagedAccess>;
+using concurrent_managed_access_t = __dev_attr<::cudaDevAttrConcurrentManagedAccess>;
 static constexpr concurrent_managed_access_t concurrent_managed_access{};

 // true if the device supports Compute Preemption, false if not
-using compute_preemption_supported_t = __detail::__dev_attr<::cudaDevAttrComputePreemptionSupported>;
+using compute_preemption_supported_t = __dev_attr<::cudaDevAttrComputePreemptionSupported>;
 static constexpr compute_preemption_supported_t compute_preemption_supported{};

 // true if the device can access host registered memory at the same virtual
 // address as the CPU, and false otherwise
-using can_use_host_pointer_for_registered_mem_t = __detail::__dev_attr<::cudaDevAttrCanUseHostPointerForRegisteredMem>;
+using can_use_host_pointer_for_registered_mem_t = __dev_attr<::cudaDevAttrCanUseHostPointerForRegisteredMem>;
 static constexpr can_use_host_pointer_for_registered_mem_t can_use_host_pointer_for_registered_mem{};

 // true if the device supports launching cooperative kernels via
 // cudaLaunchCooperativeKernel, and false otherwise
-using cooperative_launch_t = __detail::__dev_attr<::cudaDevAttrCooperativeLaunch>;
+using cooperative_launch_t = __dev_attr<::cudaDevAttrCooperativeLaunch>;
 static constexpr cooperative_launch_t cooperative_launch{};

 // true if the device supports flushing of outstanding remote writes, and
 // false otherwise
-using can_flush_remote_writes_t = __detail::__dev_attr<::cudaDevAttrCanFlushRemoteWrites>;
+using can_flush_remote_writes_t = __dev_attr<::cudaDevAttrCanFlushRemoteWrites>;
 static constexpr can_flush_remote_writes_t can_flush_remote_writes{};

 // true if the device supports host memory registration via cudaHostRegister,
 // and false otherwise
-using host_register_supported_t = __detail::__dev_attr<::cudaDevAttrHostRegisterSupported>;
+using host_register_supported_t = __dev_attr<::cudaDevAttrHostRegisterSupported>;
 static constexpr host_register_supported_t host_register_supported{};

 // true if the device accesses pageable memory via the host's page tables, and
 // false otherwise
-using pageable_memory_access_uses_host_page_tables_t =
-  __detail::__dev_attr<::cudaDevAttrPageableMemoryAccessUsesHostPageTables>;
+using pageable_memory_access_uses_host_page_tables_t = __dev_attr<::cudaDevAttrPageableMemoryAccessUsesHostPageTables>;
 static constexpr pageable_memory_access_uses_host_page_tables_t pageable_memory_access_uses_host_page_tables{};

 // true if the host can directly access managed memory on the device without
 // migration, and false otherwise
-using direct_managed_mem_access_from_host_t = __detail::__dev_attr<::cudaDevAttrDirectManagedMemAccessFromHost>;
+using direct_managed_mem_access_from_host_t = __dev_attr<::cudaDevAttrDirectManagedMemAccessFromHost>;
 static constexpr direct_managed_mem_access_from_host_t direct_managed_mem_access_from_host{};

 // Maximum per block shared memory size on the device. This value can be opted
 // into when using dynamic_shared_memory with NonPortableSize set to true
-using max_shared_memory_per_block_optin_t = __detail::__dev_attr<::cudaDevAttrMaxSharedMemoryPerBlockOptin>;
+using max_shared_memory_per_block_optin_t = __dev_attr<::cudaDevAttrMaxSharedMemoryPerBlockOptin>;
 static constexpr max_shared_memory_per_block_optin_t max_shared_memory_per_block_optin{};

 // Maximum number of thread blocks that can reside on a multiprocessor
-using max_blocks_per_multiprocessor_t = __detail::__dev_attr<::cudaDevAttrMaxBlocksPerMultiprocessor>;
+using max_blocks_per_multiprocessor_t = __dev_attr<::cudaDevAttrMaxBlocksPerMultiprocessor>;
 static constexpr max_blocks_per_multiprocessor_t max_blocks_per_multiprocessor{};

 // Maximum L2 persisting lines capacity setting in bytes
-using max_persisting_l2_cache_size_t = __detail::__dev_attr<::cudaDevAttrMaxPersistingL2CacheSize>;
+using max_persisting_l2_cache_size_t = __dev_attr<::cudaDevAttrMaxPersistingL2CacheSize>;
 static constexpr max_persisting_l2_cache_size_t max_persisting_l2_cache_size{};

 // Maximum value of cudaAccessPolicyWindow::num_bytes
-using max_access_policy_window_size_t = __detail::__dev_attr<::cudaDevAttrMaxAccessPolicyWindowSize>;
+using max_access_policy_window_size_t = __dev_attr<::cudaDevAttrMaxAccessPolicyWindowSize>;
 static constexpr max_access_policy_window_size_t max_access_policy_window_size{};

 // Shared memory reserved by CUDA driver per block in bytes
-using reserved_shared_memory_per_block_t = __detail::__dev_attr<::cudaDevAttrReservedSharedMemoryPerBlock>;
+using reserved_shared_memory_per_block_t = __dev_attr<::cudaDevAttrReservedSharedMemoryPerBlock>;
 static constexpr reserved_shared_memory_per_block_t reserved_shared_memory_per_block{};

 // true if the device supports sparse CUDA arrays and sparse CUDA mipmapped arrays.
-using sparse_cuda_array_supported_t = __detail::__dev_attr<::cudaDevAttrSparseCudaArraySupported>;
+using sparse_cuda_array_supported_t = __dev_attr<::cudaDevAttrSparseCudaArraySupported>;
 static constexpr sparse_cuda_array_supported_t sparse_cuda_array_supported{};

 // Device supports using the cudaHostRegister flag cudaHostRegisterReadOnly to
 // register memory that must be mapped as read-only to the GPU
-using host_register_read_only_supported_t = __detail::__dev_attr<::cudaDevAttrHostRegisterReadOnlySupported>;
+using host_register_read_only_supported_t = __dev_attr<::cudaDevAttrHostRegisterReadOnlySupported>;
 static constexpr host_register_read_only_supported_t host_register_read_only_supported{};

 // true if the device supports using the cudaMallocAsync and cudaMemPool
 // family of APIs, and false otherwise
-using memory_pools_supported_t = __detail::__dev_attr<::cudaDevAttrMemoryPoolsSupported>;
+using memory_pools_supported_t = __dev_attr<::cudaDevAttrMemoryPoolsSupported>;
 static constexpr memory_pools_supported_t memory_pools_supported{};

 // true if the device supports GPUDirect RDMA APIs, and false otherwise
-using gpu_direct_rdma_supported_t = __detail::__dev_attr<::cudaDevAttrGPUDirectRDMASupported>;
+using gpu_direct_rdma_supported_t = __dev_attr<::cudaDevAttrGPUDirectRDMASupported>;
 static constexpr gpu_direct_rdma_supported_t gpu_direct_rdma_supported{};

 // bitmask to be interpreted according to the
 // cudaFlushGPUDirectRDMAWritesOptions enum
-using gpu_direct_rdma_flush_writes_options_t = __detail::__dev_attr<::cudaDevAttrGPUDirectRDMAFlushWritesOptions>;
+using gpu_direct_rdma_flush_writes_options_t = __dev_attr<::cudaDevAttrGPUDirectRDMAFlushWritesOptions>;
 static constexpr gpu_direct_rdma_flush_writes_options_t gpu_direct_rdma_flush_writes_options{};

 // see the cudaGPUDirectRDMAWritesOrdering enum for numerical values
-using gpu_direct_rdma_writes_ordering_t = __detail::__dev_attr<::cudaDevAttrGPUDirectRDMAWritesOrdering>;
+using gpu_direct_rdma_writes_ordering_t = __dev_attr<::cudaDevAttrGPUDirectRDMAWritesOrdering>;
 static constexpr gpu_direct_rdma_writes_ordering_t gpu_direct_rdma_writes_ordering{};

 // Bitmask of handle types supported with mempool based IPC
-using memory_pool_supported_handle_types_t = __detail::__dev_attr<::cudaDevAttrMemoryPoolSupportedHandleTypes>;
+using memory_pool_supported_handle_types_t = __dev_attr<::cudaDevAttrMemoryPoolSupportedHandleTypes>;
 static constexpr memory_pool_supported_handle_types_t memory_pool_supported_handle_types{};

 // true if the device supports deferred mapping CUDA arrays and CUDA mipmapped
 // arrays.
-using deferred_mapping_cuda_array_supported_t = __detail::__dev_attr<::cudaDevAttrDeferredMappingCudaArraySupported>;
+using deferred_mapping_cuda_array_supported_t = __dev_attr<::cudaDevAttrDeferredMappingCudaArraySupported>;
 static constexpr deferred_mapping_cuda_array_supported_t deferred_mapping_cuda_array_supported{};

 // true if the device supports IPC Events, false otherwise.
-using ipc_event_support_t = __detail::__dev_attr<::cudaDevAttrIpcEventSupport>;
+using ipc_event_support_t = __dev_attr<::cudaDevAttrIpcEventSupport>;
 static constexpr ipc_event_support_t ipc_event_support{};

 # if _CCCL_CTK_AT_LEAST(12, 2)
 // NUMA configuration of a device: value is of type cudaDeviceNumaConfig enum
-using numa_config_t = __detail::__dev_attr<::cudaDevAttrNumaConfig>;
+using numa_config_t = __dev_attr<::cudaDevAttrNumaConfig>;
 static constexpr numa_config_t numa_config{};

 // NUMA node ID of the GPU memory
-using numa_id_t = __detail::__dev_attr<::cudaDevAttrNumaId>;
+using numa_id_t = __dev_attr<::cudaDevAttrNumaId>;
 static constexpr numa_id_t numa_id{};
 # endif // _CCCL_CTK_AT_LEAST(12, 2)

@@ -703,7 +739,9 @@ static constexpr numa_id_t numa_id{};
 // capability in a single query
 struct compute_capability_t
 {
-  [[nodiscard]] int operator()(device_ref __dev_id) const
+  using type = int;
+
+  [[nodiscard]] _CCCL_HOST_API type operator()(device_ref __dev_id) const
   {
     return 10 * ::cuda::device_attributes::compute_capability_major(__dev_id)
          + ::cuda::device_attributes::compute_capability_minor(__dev_id);
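For reference, the combined query returns 10 * major + minor. A hedged one-liner sketch, not part of the diff, assuming device 0 exists and the attribute objects are reachable via <cuda/devices>:

#include <cuda/devices>

int cc_of_device0()
{
  // e.g. 86 for a compute capability 8.6 device.
  return cuda::device_attributes::compute_capability(cuda::device_ref{0});
}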
@@ -712,6 +750,18 @@ struct compute_capability_t
 static constexpr compute_capability_t compute_capability{};
 } // namespace device_attributes

+//! @brief For a given attribute, type of the attribute value.
+//!
+//! @par Example
+//! @code
+//! using threads_per_block_t = device::attr_result_t<device_attributes::max_threads_per_block>;
+//! static_assert(std::is_same_v<threads_per_block_t, int>);
+//! @endcode
+//!
+//! @sa device_attributes
+template <::cudaDeviceAttr _Attr>
+using device_attribute_result_t = typename __dev_attr<_Attr>::type;
+
 _CCCL_END_NAMESPACE_CUDA

 # include <cuda/std/__cccl/epilogue.h>
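The new alias maps an attribute enumerator to its result type at compile time. Note that the Doxygen example in the hunk refers to it as device::attr_result_t, while the alias actually added is cuda::device_attribute_result_t. A hedged sketch of how it composes with the size_t specializations earlier in this diff, not part of the diff itself and assuming the alias is reachable via <cuda/devices>:

#include <cuda/devices>
#include <cuda/std/cstddef>
#include <cuda/std/type_traits>

// Byte-counted attribute: widened from int to size_t in this release.
using smem_t = cuda::device_attribute_result_t<::cudaDevAttrMaxSharedMemoryPerBlock>;
static_assert(cuda::std::is_same_v<smem_t, cuda::std::size_t>);

// Unspecialized attributes still report int.
using tpb_t = cuda::device_attribute_result_t<::cudaDevAttrMaxThreadsPerBlock>;
static_assert(cuda::std::is_same_v<tpb_t, int>);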