cuda-cccl 0.3.1__cp310-cp310-manylinux_2_24_aarch64.whl → 0.3.2__cp310-cp310-manylinux_2_24_aarch64.whl

This diff shows the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of cuda-cccl has been flagged as potentially problematic.

Files changed (185)
  1. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
  2. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
  3. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
  4. cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
  5. cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
  6. cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
  7. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1 -0
  8. cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
  9. cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
  10. cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
  11. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
  12. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
  13. cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
  14. cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
  15. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +12 -13
  16. cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
  17. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +2 -3
  18. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +4 -3
  19. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -1
  20. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
  21. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
  22. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
  23. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
  24. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
  25. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
  26. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
  27. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
  28. cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
  29. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
  30. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
  31. cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
  32. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
  33. cuda/cccl/headers/include/cuda/__cccl_config +1 -0
  34. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
  35. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
  36. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
  37. cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
  38. cuda/cccl/headers/include/cuda/__device/arch_traits.h +239 -317
  39. cuda/cccl/headers/include/cuda/__device/attributes.h +4 -3
  40. cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
  41. cuda/cccl/headers/include/cuda/__device/device_ref.h +0 -10
  42. cuda/cccl/headers/include/cuda/__device/physical_device.h +1 -26
  43. cuda/cccl/headers/include/cuda/__event/event.h +26 -26
  44. cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
  45. cuda/cccl/headers/include/cuda/__event/timed_event.h +9 -7
  46. cuda/cccl/headers/include/cuda/__fwd/devices.h +4 -4
  47. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
  48. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
  49. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
  50. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
  51. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
  52. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
  53. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +2 -12
  54. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +15 -19
  55. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +59 -60
  56. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
  57. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
  58. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
  59. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
  60. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
  61. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
  62. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
  63. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
  64. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
  65. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
  66. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +5 -4
  67. cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
  68. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +17 -16
  69. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  70. cuda/cccl/headers/include/cuda/cmath +1 -0
  71. cuda/cccl/headers/include/cuda/devices +3 -0
  72. cuda/cccl/headers/include/cuda/memory +1 -0
  73. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
  74. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
  75. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
  76. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
  77. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
  78. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
  79. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
  80. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
  81. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
  82. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  83. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
  84. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
  85. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
  86. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  87. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  88. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
  89. cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
  90. cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
  91. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
  92. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
  93. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
  94. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
  95. cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
  96. cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
  97. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
  98. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
  99. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
  100. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
  101. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
  102. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
  103. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
  104. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
  105. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
  106. cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
  107. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
  108. cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
  109. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
  110. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
  111. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
  112. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
  113. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
  114. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
  115. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
  116. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
  117. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
  118. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
  119. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
  120. cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
  121. cuda/cccl/headers/include/cuda/std/bitset +1 -1
  122. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
  123. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
  124. cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
  125. cuda/cccl/headers/include/cuda/std/numbers +5 -0
  126. cuda/cccl/headers/include/cuda/std/string_view +146 -11
  127. cuda/cccl/headers/include/cuda/stream_ref +5 -0
  128. cuda/cccl/headers/include/cuda/utility +1 -0
  129. cuda/cccl/headers/include/nv/target +7 -2
  130. cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
  131. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
  132. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
  133. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
  134. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
  135. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
  136. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
  137. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
  138. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
  139. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
  140. cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
  141. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
  142. cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
  143. cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
  144. cuda/cccl/headers/include/thrust/device_delete.h +18 -3
  145. cuda/cccl/headers/include/thrust/device_free.h +16 -3
  146. cuda/cccl/headers/include/thrust/device_new.h +29 -8
  147. cuda/cccl/headers/include/thrust/host_vector.h +1 -1
  148. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
  149. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
  150. cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
  151. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
  152. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
  153. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
  154. cuda/compute/__init__.py +2 -0
  155. cuda/compute/_bindings.pyi +43 -1
  156. cuda/compute/_bindings_impl.pyx +156 -7
  157. cuda/compute/algorithms/_scan.py +108 -36
  158. cuda/compute/algorithms/_transform.py +32 -11
  159. cuda/compute/cu12/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  160. cuda/compute/cu12/cccl/libcccl.c.parallel.so +0 -0
  161. cuda/compute/cu13/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  162. cuda/compute/cu13/cccl/libcccl.c.parallel.so +0 -0
  163. cuda/compute/iterators/__init__.py +2 -0
  164. cuda/compute/iterators/_factories.py +28 -0
  165. cuda/compute/iterators/_iterators.py +206 -1
  166. cuda/compute/numba_utils.py +2 -2
  167. cuda/compute/typing.py +2 -0
  168. {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
  169. {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +171 -175
  170. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
  171. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
  172. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
  173. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
  174. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
  175. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
  176. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
  177. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
  178. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
  179. cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
  180. cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
  181. cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
  182. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
  183. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
  184. {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
  185. {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0

cuda/cccl/headers/include/cuda/__device/attributes.h

@@ -23,6 +23,7 @@
 
  #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
 
+ #  include <cuda/__device/compute_capability.h>
  #  include <cuda/__device/device_ref.h>
  #  include <cuda/__driver/driver_api.h>
  #  include <cuda/__fwd/devices.h>
@@ -739,12 +740,12 @@ static constexpr numa_id_t numa_id{};
  // capability in a single query
  struct compute_capability_t
  {
-   using type = int;
+   using type = ::cuda::compute_capability;
 
    [[nodiscard]] _CCCL_HOST_API type operator()(device_ref __dev_id) const
    {
-     return 10 * ::cuda::device_attributes::compute_capability_major(__dev_id)
-          + ::cuda::device_attributes::compute_capability_minor(__dev_id);
+     return type{::cuda::device_attributes::compute_capability_major(__dev_id),
+                 ::cuda::device_attributes::compute_capability_minor(__dev_id)};
    }
  };
  static constexpr compute_capability_t compute_capability{};
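
With this change, the `compute_capability` device attribute query returns the new `cuda::compute_capability` type instead of a raw `int`. Below is an illustrative host-side sketch only (not part of the diff), assuming CUDA Toolkit headers are available, that `<cuda/devices>` exposes the attribute queries, and that `device_ref` is constructible from a device ordinal:

    // Callers that previously stored the attribute in an int now receive a
    // cuda::compute_capability and can use .get() for the old
    // 10 * major + minor encoding.
    #include <cuda/devices>

    #include <cstdio>

    int main()
    {
      cuda::device_ref dev{0};
      const cuda::compute_capability cc = cuda::device_attributes::compute_capability(dev);
      std::printf("SM %d.%d (encoded %d)\n", cc.major(), cc.minor(), cc.get());
      return 0;
    }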

cuda/cccl/headers/include/cuda/__device/compute_capability.h (new file)

@@ -0,0 +1,171 @@
+ //===----------------------------------------------------------------------===//
+ //
+ // Part of libcu++, the C++ Standard Library for your entire system,
+ // under the Apache License v2.0 with LLVM Exceptions.
+ // See https://llvm.org/LICENSE.txt for license information.
+ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ // SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+ //
+ //===----------------------------------------------------------------------===//
+
+ #ifndef _CUDA___DEVICE_COMPUTE_CAPABILITY_H
+ #define _CUDA___DEVICE_COMPUTE_CAPABILITY_H
+
+ #include <cuda/std/detail/__config>
+
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+ #  pragma GCC system_header
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+ #  pragma clang system_header
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+ #  pragma system_header
+ #endif // no system header
+
+ #include <cuda/__fwd/devices.h>
+ #include <cuda/std/__utility/to_underlying.h>
+
+ #include <cuda/std/__cccl/prologue.h>
+
+ _CCCL_BEGIN_NAMESPACE_CUDA
+
+ //! @brief Type representing the CUDA compute capability.
+ class compute_capability
+ {
+   int __cc_{}; //!< The stored compute capability in format 10 * major + minor.
+
+ public:
+   _CCCL_HIDE_FROM_ABI constexpr compute_capability() noexcept = default;
+
+   //! @brief Constructs the object from compute capability \c __cc. The expected format is 10 * major + minor.
+   //!
+   //! @param __cc Compute capability.
+   _CCCL_API explicit constexpr compute_capability(int __cc) noexcept
+       : __cc_{__cc}
+   {}
+
+   //! @brief Constructs the object by combining the \c __major and \c __minor compute capability.
+   //!
+   //! @param __major The major compute capability.
+   //! @param __minor The minor compute capability. Must be less than 10.
+   _CCCL_API constexpr compute_capability(int __major, int __minor) noexcept
+       : __cc_{10 * __major + __minor}
+   {
+     _CCCL_ASSERT(__minor < 10, "invalid minor compute capability");
+   }
+
+   //! @brief Constructs the object from the architecture id.
+   //!
+   //! @param __arch_id The architecture id.
+   _CCCL_API explicit constexpr compute_capability(arch_id __arch_id) noexcept
+   {
+     const auto __val = ::cuda::std::to_underlying(__arch_id);
+     if (__val > __arch_specific_id_multiplier)
+     {
+       __cc_ = __val / __arch_specific_id_multiplier;
+     }
+     else
+     {
+       __cc_ = __val;
+     }
+   }
+
+   _CCCL_HIDE_FROM_ABI constexpr compute_capability(const compute_capability&) noexcept = default;
+
+   _CCCL_HIDE_FROM_ABI constexpr compute_capability& operator=(const compute_capability& __other) noexcept = default;
+
+   //! @brief Gets the stored compute capability.
+   //!
+   //! @return The stored compute capability in format 10 * major + minor.
+   [[nodiscard]] _CCCL_API constexpr int get() const noexcept
+   {
+     return __cc_;
+   }
+
+   //! @brief Gets the major compute capability.
+   //!
+   //! @return Major compute capability.
+   [[nodiscard]] _CCCL_API constexpr int major() const noexcept
+   {
+     return __cc_ / 10;
+   }
+
+   //! @brief Gets the minor compute capability.
+   //!
+   //! @return Minor compute capability. The value is always less than 10.
+   [[nodiscard]] _CCCL_API constexpr int minor() const noexcept
+   {
+     return __cc_ % 10;
+   }
+
+   //! @brief Conversion operator to \c int.
+   //!
+   //! @return The stored compute capability in format 10 * major + minor.
+   _CCCL_API explicit constexpr operator int() const noexcept
+   {
+     return __cc_;
+   }
+
+   //! @brief Equality operator.
+   [[nodiscard]] friend _CCCL_API constexpr bool operator==(compute_capability __lhs, compute_capability __rhs) noexcept
+   {
+     return __lhs.__cc_ == __rhs.__cc_;
+   }
+
+   //! @brief Inequality operator.
+   [[nodiscard]] friend _CCCL_API constexpr bool operator!=(compute_capability __lhs, compute_capability __rhs) noexcept
+   {
+     return __lhs.__cc_ != __rhs.__cc_;
+   }
+
+   //! @brief Less than operator.
+   [[nodiscard]] friend _CCCL_API constexpr bool operator<(compute_capability __lhs, compute_capability __rhs) noexcept
+   {
+     return __lhs.__cc_ < __rhs.__cc_;
+   }
+
+   //! @brief Less than or equal to operator.
+   [[nodiscard]] friend _CCCL_API constexpr bool operator<=(compute_capability __lhs, compute_capability __rhs) noexcept
+   {
+     return __lhs.__cc_ <= __rhs.__cc_;
+   }
+
+   //! @brief Greater than operator.
+   [[nodiscard]] friend _CCCL_API constexpr bool operator>(compute_capability __lhs, compute_capability __rhs) noexcept
+   {
+     return __lhs.__cc_ > __rhs.__cc_;
+   }
+
+   //! @brief Greater than or equal to operator.
+   [[nodiscard]] friend _CCCL_API constexpr bool operator>=(compute_capability __lhs, compute_capability __rhs) noexcept
+   {
+     return __lhs.__cc_ >= __rhs.__cc_;
+   }
+ };
+
+ _CCCL_END_NAMESPACE_CUDA
+
+ #if _CCCL_CUDA_COMPILATION()
+
+ _CCCL_BEGIN_NAMESPACE_CUDA_DEVICE
+
+ //! @brief Returns the \c cuda::compute_capability that is currently being compiled.
+ //!
+ //! @note This API cannot be used in constexpr context when compiling with nvc++ in CUDA mode.
+ [[nodiscard]] _CCCL_DEVICE_API _CCCL_TARGET_CONSTEXPR ::cuda::compute_capability current_compute_capability() noexcept
+ {
+ #  if _CCCL_CUDA_COMPILER(NVHPC)
+   return ::cuda::compute_capability{__builtin_current_device_sm()};
+ #  elif _CCCL_DEVICE_COMPILATION()
+   return ::cuda::compute_capability{__CUDA_ARCH__ / 10};
+ #  else // ^^^ _CCCL_DEVICE_COMPILATION() ^^^ / vvv !_CCCL_DEVICE_COMPILATION() vvv
+   return {};
+ #  endif // ^^^ !_CCCL_DEVICE_COMPILATION() ^^^
+ }
+
+ _CCCL_END_NAMESPACE_CUDA_DEVICE
+
+ #endif // _CCCL_CUDA_COMPILATION()
+
+ #include <cuda/std/__cccl/epilogue.h>
+
+ #endif // _CUDA___DEVICE_COMPUTE_CAPABILITY_H
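
For reference, here is a hedged host-side sketch of how the new `cuda::compute_capability` type can be used, based only on the members shown above; the public include path `<cuda/devices>` is an assumption:

    // Illustrative sketch only. Construction from major/minor or from the
    // 10 * major + minor encoding, plus constexpr comparisons and accessors.
    #include <cuda/devices>

    #include <cstdio>

    int main()
    {
      constexpr cuda::compute_capability sm90{9, 0}; // major/minor constructor
      constexpr cuda::compute_capability sm86{86};   // encoded-int constructor

      static_assert(sm90 > sm86);
      static_assert(sm90.major() == 9 && sm90.minor() == 0);

      std::printf("encoded: %d and %d\n", sm90.get(), sm86.get());
      return 0;
    }

In device code, the `current_compute_capability()` function added in the device namespace above returns the capability currently being compiled for; it is not available from host-only translation units.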

cuda/cccl/headers/include/cuda/__device/device_ref.h

@@ -133,16 +133,6 @@ public:
      ::cuda::__driver::__deviceGet(get()), ::cuda::__driver::__deviceGet(__other_dev.get()));
   }
 
-   //! @brief Retrieve architecture traits of this device.
-   //!
-   //! Architecture traits object contains information about certain traits
-   //! that are shared by all devices belonging to given architecture.
-   //!
-   //! @return A reference to `arch_traits_t` object containing architecture traits of this device
-   [[nodiscard]] _CCCL_HOST_API const arch::traits_t& arch_traits() const; // implemented in
-   // <cuda/__device/physical_device.h> to avoid
-   // circular dependency
-
   // TODO this might return some more complex type in the future
   // TODO we might want to include the calling device, depends on what we decide
   // peer access APIs

cuda/cccl/headers/include/cuda/__device/physical_device.h

@@ -23,16 +23,15 @@
 
  #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
 
- #  include <cuda/__device/arch_traits.h>
  #  include <cuda/__device/device_ref.h>
  #  include <cuda/__driver/driver_api.h>
  #  include <cuda/__fwd/devices.h>
+ #  include <cuda/std/__cccl/memory_wrapper.h>
  #  include <cuda/std/__cstddef/types.h>
  #  include <cuda/std/span>
  #  include <cuda/std/string_view>
 
  #  include <cassert>
- #  include <memory>
  #  include <mutex>
  #  include <vector>
 
@@ -53,10 +52,6 @@ class __physical_device
 
   ::CUdevice __device_{};
 
-   // TODO We should have some of the attributes just return from the arch traits
-   ::std::once_flag __traits_once_flag_{};
-   arch::traits_t __traits_{};
-
   ::std::once_flag __primary_ctx_once_flag_{};
   ::CUcontext __primary_ctx_{};
 
@@ -90,21 +85,6 @@ public:
     return __primary_ctx_;
   }
 
-   //! @brief Retrieve architecture traits of this device.
-   //!
-   //! Architecture traits object contains information about certain traits
-   //! that are shared by all devices belonging to given architecture.
-   //!
-   //! @return A reference to `arch_traits_t` object containing architecture traits of this device
-   [[nodiscard]] _CCCL_HOST_API const arch::traits_t& __arch_traits()
-   {
-     ::std::call_once(__traits_once_flag_, [this]() {
-       const auto __id = ::cuda::__driver::__cudevice_to_ordinal(__device_);
-       __traits_ = ::cuda::arch::__arch_traits_might_be_unknown(__id, device_attributes::compute_capability(__id));
-     });
-     return __traits_;
-   }
-
   [[nodiscard]] _CCCL_HOST_API ::cuda::std::string_view __name()
   {
     ::std::call_once(__name_once_flag_, [this]() {
@@ -178,11 +158,6 @@ _CCCL_HOST_API inline void device_ref::init() const
   return ::cuda::__physical_devices()[__id_].__name();
 }
 
- [[nodiscard]] _CCCL_HOST_API inline const arch::traits_t& device_ref::arch_traits() const
- {
-   return ::cuda::__physical_devices()[__id_].__arch_traits();
- }
-
 [[nodiscard]] _CCCL_HOST_API inline ::cuda::std::span<const device_ref> device_ref::peers() const
 {
   return ::cuda::__physical_devices()[__id_].__peers();

cuda/cccl/headers/include/cuda/__event/event.h

@@ -28,8 +28,8 @@
  #  include <cuda/__event/event_ref.h>
  #  include <cuda/__runtime/ensure_current_context.h>
  #  include <cuda/__utility/no_init.h>
+ #  include <cuda/std/__utility/to_underlying.h>
  #  include <cuda/std/cstddef>
- #  include <cuda/std/utility>
 
  #  include <cuda/std/__cccl/prologue.h>
 
@@ -37,38 +37,43 @@ _CCCL_BEGIN_NAMESPACE_CUDA
  class timed_event;
 
+ //! @brief Flags to use when creating the event.
+ enum class event_flags : unsigned
+ {
+   none = cudaEventDefault,
+   blocking_sync = cudaEventBlockingSync,
+   interprocess = cudaEventInterprocess,
+ };
+
+ [[nodiscard]] _CCCL_HOST_API constexpr event_flags operator|(event_flags __lhs, event_flags __rhs) noexcept
+ {
+   return static_cast<event_flags>(::cuda::std::to_underlying(__lhs) | ::cuda::std::to_underlying(__rhs));
+ }
+
  //! @brief An owning wrapper for an untimed `cudaEvent_t`.
  class event : public event_ref
  {
    friend class timed_event;
 
  public:
-   //! @brief Flags to use when creating the event.
-   enum class flags : unsigned
-   {
-     none = cudaEventDefault,
-     blocking_sync = cudaEventBlockingSync,
-     interprocess = cudaEventInterprocess,
-   };
-
    //! @brief Construct a new `event` object with timing disabled, and record
    //! the event in the specified stream.
    //!
    //! @throws cuda_error if the event creation fails.
-   explicit event(stream_ref __stream, flags __flags = flags::none);
+   _CCCL_HOST_API explicit event(stream_ref __stream, event_flags __flags = event_flags::none);
 
    //! @brief Construct a new `event` object with timing disabled. The event can only be recorded on streams from the
    //! specified device.
    //!
    //! @throws cuda_error if the event creation fails.
-   explicit event(device_ref __device, flags __flags = flags::none)
-       : event(__device, static_cast<unsigned int>(__flags) | cudaEventDisableTiming)
+   _CCCL_HOST_API explicit event(device_ref __device, event_flags __flags = event_flags::none)
+       : event(__device, ::cuda::std::to_underlying(__flags) | cudaEventDisableTiming)
    {}
 
    //! @brief Construct a new `event` object into the moved-from state.
    //!
    //! @post `get()` returns `cudaEvent_t()`.
-   explicit constexpr event(no_init_t) noexcept
+   _CCCL_HOST_API explicit constexpr event(no_init_t) noexcept
       : event_ref(::cudaEvent_t{})
    {}
 
@@ -77,7 +82,7 @@ public:
   //! @param __other
   //!
   //! @post `__other` is in a moved-from state.
-   constexpr event(event&& __other) noexcept
+   _CCCL_HOST_API constexpr event(event&& __other) noexcept
       : event_ref(::cuda::std::exchange(__other.__event_, {}))
   {}
 
@@ -87,7 +92,7 @@ public:
   //! @brief Destroy the `event` object
   //!
   //! @note If the event fails to be destroyed, the error is silently ignored.
-   ~event()
+   _CCCL_HOST_API ~event()
   {
     if (__event_ != nullptr)
     {
@@ -102,7 +107,7 @@ public:
   //! @param __other
   //!
   //! @post `__other` is in a moved-from state.
-   event& operator=(event&& __other) noexcept
+   _CCCL_HOST_API event& operator=(event&& __other) noexcept
   {
     event __tmp(::cuda::std::move(__other));
     ::cuda::std::swap(__event_, __tmp.__event_);
@@ -119,7 +124,7 @@ public:
   //! @return event The constructed `event` object
   //!
   //! @note The constructed `event` object takes ownership of the native handle.
-   [[nodiscard]] static event from_native_handle(::cudaEvent_t __evnt) noexcept
+   [[nodiscard]] static _CCCL_HOST_API event from_native_handle(::cudaEvent_t __evnt) noexcept
   {
     return event(__evnt);
   }
@@ -135,26 +140,21 @@ public:
   //! @return cudaEvent_t The native handle being held by the `event` object.
   //!
   //! @post The event object is in a moved-from state.
-   [[nodiscard]] constexpr ::cudaEvent_t release() noexcept
+   [[nodiscard]] _CCCL_HOST_API constexpr ::cudaEvent_t release() noexcept
   {
     return ::cuda::std::exchange(__event_, {});
   }
 
-   [[nodiscard]] friend constexpr flags operator|(flags __lhs, flags __rhs) noexcept
-   {
-     return static_cast<flags>(static_cast<unsigned>(__lhs) | static_cast<unsigned>(__rhs));
-   }
-
 private:
   // Use `event::from_native_handle(e)` to construct an owning `event`
   // object from a `cudaEvent_t` handle.
-   explicit constexpr event(::cudaEvent_t __evnt) noexcept
+   _CCCL_HOST_API explicit constexpr event(::cudaEvent_t __evnt) noexcept
       : event_ref(__evnt)
   {}
 
-   explicit event(stream_ref __stream, unsigned __flags);
+   _CCCL_HOST_API explicit event(stream_ref __stream, unsigned __flags);
 
-   explicit event(device_ref __device, unsigned __flags)
+   _CCCL_HOST_API explicit event(device_ref __device, unsigned __flags)
       : event_ref(::cudaEvent_t{})
   {
     [[maybe_unused]] __ensure_current_context __ctx_setter(__device);
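
The flags enumeration moves from the nested `event::flags` to the namespace-scope `cuda::event_flags`, and `operator|` moves with it. A hedged migration sketch follows; the detail header path is taken from the diff above, and the public include point may differ:

    // Illustrative sketch only. Pre-0.3.2 spelling was
    // cuda::event::flags::blocking_sync.
    #include <cuda/__event/event.h>

    cuda::event make_blocking_event(cuda::stream_ref stream)
    {
      // The constructor records the event in `stream`; timing stays disabled.
      return cuda::event{stream, cuda::event_flags::blocking_sync};
    }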

cuda/cccl/headers/include/cuda/__event/event_ref.h

@@ -56,7 +56,7 @@ public:
  //!
  //! @note: It is the callers responsibility to ensure the `event_ref` does not
  //! outlive the event denoted by the `cudaEvent_t` handle.
-  constexpr event_ref(::cudaEvent_t __evnt) noexcept
+  _CCCL_HOST_API constexpr event_ref(::cudaEvent_t __evnt) noexcept
      : __event_(__evnt)
  {}
 
@@ -108,7 +108,7 @@ public:
  //! @brief Retrieve the native `cudaEvent_t` handle.
  //!
  //! @return cudaEvent_t The native handle being held by the event_ref object.
-  [[nodiscard]] constexpr ::cudaEvent_t get() const noexcept
+  [[nodiscard]] _CCCL_HOST_API constexpr ::cudaEvent_t get() const noexcept
  {
    return __event_;
  }
@@ -116,7 +116,7 @@ public:
  //! @brief Checks if the `event_ref` is valid
  //!
  //! @return true if the `event_ref` is valid, false otherwise.
-  [[nodiscard]] explicit constexpr operator bool() const noexcept
+  [[nodiscard]] _CCCL_HOST_API explicit constexpr operator bool() const noexcept
  {
    return __event_ != nullptr;
  }
@@ -129,7 +129,7 @@ public:
  //! @param __lhs The first `event_ref` to compare
  //! @param __rhs The second `event_ref` to compare
  //! @return true if `lhs` and `rhs` refer to the same `cudaEvent_t` object.
-  [[nodiscard]] friend constexpr bool operator==(event_ref __lhs, event_ref __rhs) noexcept
+  [[nodiscard]] friend _CCCL_HOST_API constexpr bool operator==(event_ref __lhs, event_ref __rhs) noexcept
  {
    return __lhs.__event_ == __rhs.__event_;
  }
@@ -142,7 +142,7 @@ public:
  //! @param __lhs The first `event_ref` to compare
  //! @param __rhs The second `event_ref` to compare
  //! @return true if `lhs` and `rhs` refer to different `cudaEvent_t` objects.
-  [[nodiscard]] friend constexpr bool operator!=(event_ref __lhs, event_ref __rhs) noexcept
+  [[nodiscard]] friend _CCCL_HOST_API constexpr bool operator!=(event_ref __lhs, event_ref __rhs) noexcept
  {
    return __lhs.__event_ != __rhs.__event_;
  }

cuda/cccl/headers/include/cuda/__event/timed_event.h

@@ -31,6 +31,7 @@
  #  include <cuda/__event/event.h>
  #  include <cuda/__utility/no_init.h>
  #  include <cuda/std/__chrono/duration.h>
+ #  include <cuda/std/__utility/to_underlying.h>
  #  include <cuda/std/cstddef>
 
  #  include <cuda/std/__cccl/prologue.h>
@@ -45,20 +46,20 @@ public:
  //! and record the event on the specified stream.
  //!
  //! @throws cuda_error if the event creation fails.
-  explicit timed_event(stream_ref __stream, flags __flags = flags::none);
+  _CCCL_HOST_API explicit timed_event(stream_ref __stream, event_flags __flags = event_flags::none);
 
  //! @brief Construct a new `timed_event` object with the specified flags. The event can only be recorded on streams
  //! from the specified device.
  //!
  //! @throws cuda_error if the event creation fails.
-  explicit timed_event(device_ref __device, flags __flags = flags::none)
-      : event(__device, static_cast<unsigned>(__flags))
+  _CCCL_HOST_API explicit timed_event(device_ref __device, event_flags __flags = event_flags::none)
+      : event(__device, ::cuda::std::to_underlying(__flags))
  {}
 
  //! @brief Construct a new `timed_event` object into the moved-from state.
  //!
  //! @post `get()` returns `cudaEvent_t()`.
-  explicit constexpr timed_event(no_init_t) noexcept
+  _CCCL_HOST_API explicit constexpr timed_event(no_init_t) noexcept
      : event(no_init)
  {}
 
@@ -74,7 +75,7 @@ public:
  //! @return timed_event The constructed `timed_event` object
  //!
  //! @note The constructed `timed_event` object takes ownership of the native handle.
-  [[nodiscard]] static timed_event from_native_handle(::cudaEvent_t __evnt) noexcept
+  [[nodiscard]] static _CCCL_HOST_API timed_event from_native_handle(::cudaEvent_t __evnt) noexcept
  {
    return timed_event(__evnt);
  }
@@ -95,7 +96,8 @@ public:
  //! @return cuda::std::chrono::nanoseconds The elapsed time in nanoseconds.
  //!
  //! @note The elapsed time has a resolution of approximately 0.5 microseconds.
-  [[nodiscard]] friend ::cuda::std::chrono::nanoseconds operator-(const timed_event& __end, const timed_event& __start)
+  [[nodiscard]] friend _CCCL_HOST_API ::cuda::std::chrono::nanoseconds
+  operator-(const timed_event& __end, const timed_event& __start)
  {
    const auto __ms = ::cuda::__driver::__eventElapsedTime(__start.get(), __end.get());
    return ::cuda::std::chrono::nanoseconds(static_cast<::cuda::std::chrono::nanoseconds::rep>(__ms * 1'000'000.0));
@@ -104,7 +106,7 @@ public:
 private:
  // Use `timed_event::from_native_handle(e)` to construct an owning `timed_event`
  // object from a `cudaEvent_t` handle.
-  explicit constexpr timed_event(::cudaEvent_t __evnt) noexcept
+  _CCCL_HOST_API explicit constexpr timed_event(::cudaEvent_t __evnt) noexcept
      : event(__evnt)
  {}
 };
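
Since `operator-` on two `timed_event`s (shown above) yields `cuda::std::chrono::nanoseconds`, timing a stream region might look like the hedged sketch below. The detail header path is taken from the diff, and synchronization goes through the raw runtime API rather than assuming extra wrapper members:

    // Illustrative sketch only. Assumes `stream` wraps a valid CUDA stream and
    // that some work is enqueued between the two event recordings.
    #include <cuda/__event/timed_event.h>

    #include <cuda_runtime_api.h>

    #include <cstdio>

    void time_region(cuda::stream_ref stream)
    {
      cuda::timed_event start{stream}; // records immediately
      // ... launch kernels / enqueue work on `stream` here ...
      cuda::timed_event stop{stream};  // records after the work

      ::cudaStreamSynchronize(stream.get()); // ensure both events have completed
      const auto elapsed = stop - start;     // cuda::std::chrono::nanoseconds
      std::printf("elapsed: %lld ns\n", static_cast<long long>(elapsed.count()));
    }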

cuda/cccl/headers/include/cuda/__fwd/devices.h

@@ -31,11 +31,11 @@ class __physical_device;
 class device_ref;
 template <::cudaDeviceAttr _Attr>
 struct __dev_attr;
+ struct arch_traits_t;
+ class compute_capability;
+ enum class arch_id : int;
 
- namespace arch
- {
- struct traits_t;
- } // namespace arch
+ inline constexpr int __arch_specific_id_multiplier = 100000;
 
 _CCCL_END_NAMESPACE_CUDA