cuda-cccl 0.3.1__cp310-cp310-manylinux_2_24_aarch64.whl → 0.3.2__cp310-cp310-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (185)
  1. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
  2. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
  3. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
  4. cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
  5. cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
  6. cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
  7. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1 -0
  8. cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
  9. cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
  10. cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
  11. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
  12. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
  13. cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
  14. cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
  15. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +12 -13
  16. cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
  17. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +2 -3
  18. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +4 -3
  19. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -1
  20. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
  21. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
  22. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
  23. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
  24. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
  25. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
  26. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
  27. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
  28. cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
  29. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
  30. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
  31. cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
  32. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
  33. cuda/cccl/headers/include/cuda/__cccl_config +1 -0
  34. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
  35. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
  36. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
  37. cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
  38. cuda/cccl/headers/include/cuda/__device/arch_traits.h +239 -317
  39. cuda/cccl/headers/include/cuda/__device/attributes.h +4 -3
  40. cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
  41. cuda/cccl/headers/include/cuda/__device/device_ref.h +0 -10
  42. cuda/cccl/headers/include/cuda/__device/physical_device.h +1 -26
  43. cuda/cccl/headers/include/cuda/__event/event.h +26 -26
  44. cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
  45. cuda/cccl/headers/include/cuda/__event/timed_event.h +9 -7
  46. cuda/cccl/headers/include/cuda/__fwd/devices.h +4 -4
  47. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
  48. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
  49. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
  50. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
  51. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
  52. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
  53. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +2 -12
  54. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +15 -19
  55. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +59 -60
  56. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
  57. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
  58. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
  59. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
  60. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
  61. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
  62. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
  63. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
  64. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
  65. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
  66. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +5 -4
  67. cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
  68. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +17 -16
  69. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  70. cuda/cccl/headers/include/cuda/cmath +1 -0
  71. cuda/cccl/headers/include/cuda/devices +3 -0
  72. cuda/cccl/headers/include/cuda/memory +1 -0
  73. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
  74. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
  75. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
  76. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
  77. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
  78. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
  79. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
  80. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
  81. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
  82. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  83. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
  84. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
  85. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
  86. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  87. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  88. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
  89. cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
  90. cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
  91. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
  92. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
  93. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
  94. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
  95. cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
  96. cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
  97. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
  98. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
  99. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
  100. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
  101. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
  102. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
  103. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
  104. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
  105. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
  106. cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
  107. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
  108. cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
  109. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
  110. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
  111. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
  112. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
  113. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
  114. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
  115. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
  116. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
  117. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
  118. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
  119. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
  120. cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
  121. cuda/cccl/headers/include/cuda/std/bitset +1 -1
  122. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
  123. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
  124. cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
  125. cuda/cccl/headers/include/cuda/std/numbers +5 -0
  126. cuda/cccl/headers/include/cuda/std/string_view +146 -11
  127. cuda/cccl/headers/include/cuda/stream_ref +5 -0
  128. cuda/cccl/headers/include/cuda/utility +1 -0
  129. cuda/cccl/headers/include/nv/target +7 -2
  130. cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
  131. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
  132. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
  133. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
  134. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
  135. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
  136. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
  137. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
  138. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
  139. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
  140. cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
  141. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
  142. cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
  143. cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
  144. cuda/cccl/headers/include/thrust/device_delete.h +18 -3
  145. cuda/cccl/headers/include/thrust/device_free.h +16 -3
  146. cuda/cccl/headers/include/thrust/device_new.h +29 -8
  147. cuda/cccl/headers/include/thrust/host_vector.h +1 -1
  148. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
  149. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
  150. cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
  151. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
  152. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
  153. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
  154. cuda/compute/__init__.py +2 -0
  155. cuda/compute/_bindings.pyi +43 -1
  156. cuda/compute/_bindings_impl.pyx +156 -7
  157. cuda/compute/algorithms/_scan.py +108 -36
  158. cuda/compute/algorithms/_transform.py +32 -11
  159. cuda/compute/cu12/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  160. cuda/compute/cu12/cccl/libcccl.c.parallel.so +0 -0
  161. cuda/compute/cu13/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  162. cuda/compute/cu13/cccl/libcccl.c.parallel.so +0 -0
  163. cuda/compute/iterators/__init__.py +2 -0
  164. cuda/compute/iterators/_factories.py +28 -0
  165. cuda/compute/iterators/_iterators.py +206 -1
  166. cuda/compute/numba_utils.py +2 -2
  167. cuda/compute/typing.py +2 -0
  168. {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
  169. {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +171 -175
  170. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
  171. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
  172. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
  173. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
  174. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
  175. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
  176. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
  177. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
  178. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
  179. cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
  180. cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
  181. cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
  182. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
  183. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
  184. {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
  185. {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
@@ -22,6 +22,16 @@
22
22
  #endif // no system header
23
23
 
24
24
  #include <cuda/__mdspan/restrict_accessor.h>
25
+ #include <cuda/std/__concepts/concept_macros.h>
26
+ #include <cuda/std/__fwd/array.h>
27
+ #include <cuda/std/__fwd/span.h>
28
+ #include <cuda/std/__type_traits/extent.h>
29
+ #include <cuda/std/__type_traits/is_convertible.h>
30
+ #include <cuda/std/__type_traits/is_pointer.h>
31
+ #include <cuda/std/__type_traits/rank.h>
32
+ #include <cuda/std/__type_traits/remove_all_extents.h>
33
+ #include <cuda/std/__type_traits/remove_pointer.h>
34
+ #include <cuda/std/__type_traits/remove_reference.h>
25
35
  #include <cuda/std/mdspan>
26
36
 
27
37
  #include <cuda/std/__cccl/prologue.h>
@@ -32,7 +42,63 @@ template <typename _ElementType,
32
42
  typename _Extents,
33
43
  typename _LayoutPolicy = ::cuda::std::layout_right,
34
44
  typename _AccessorPolicy = ::cuda::std::default_accessor<_ElementType>>
35
- using restrict_mdspan = ::cuda::std::mdspan<_ElementType, _Extents, _LayoutPolicy, restrict_accessor<_AccessorPolicy>>;
45
+ class restrict_mdspan
46
+ : public ::cuda::std::mdspan<_ElementType, _Extents, _LayoutPolicy, restrict_accessor<_AccessorPolicy>>
47
+ {
48
+ public:
49
+ _LIBCUDACXX_DELEGATE_CONSTRUCTORS(
50
+ restrict_mdspan, ::cuda::std::mdspan, _ElementType, _Extents, _LayoutPolicy, restrict_accessor<_AccessorPolicy>);
51
+
52
+ _CCCL_API friend constexpr void swap(restrict_mdspan& __x, restrict_mdspan& __y) noexcept
53
+ {
54
+ swap(static_cast<__base&>(__x), static_cast<__base&>(__y));
55
+ }
56
+ };
57
+
58
+ _CCCL_TEMPLATE(class _ElementType, class... _OtherIndexTypes)
59
+ _CCCL_REQUIRES((sizeof...(_OtherIndexTypes) > 0)
60
+ _CCCL_AND(::cuda::std::is_convertible_v<_OtherIndexTypes, size_t>&&... && true))
61
+ _CCCL_HOST_DEVICE explicit restrict_mdspan(_ElementType*, _OtherIndexTypes...)
62
+ -> restrict_mdspan<_ElementType, ::cuda::std::extents<size_t, ::cuda::std::__maybe_static_ext<_OtherIndexTypes>...>>;
63
+
64
+ _CCCL_TEMPLATE(class _Pointer)
65
+ _CCCL_REQUIRES(::cuda::std::is_pointer_v<::cuda::std::remove_reference_t<_Pointer>>)
66
+ _CCCL_HOST_DEVICE restrict_mdspan(_Pointer&&)
67
+ -> restrict_mdspan<::cuda::std::remove_pointer_t<::cuda::std::remove_reference_t<_Pointer>>,
68
+ ::cuda::std::extents<size_t>>;
69
+
70
+ _CCCL_TEMPLATE(class _CArray)
71
+ _CCCL_REQUIRES(::cuda::std::is_array_v<_CArray> _CCCL_AND(::cuda::std::rank_v<_CArray> == 1))
72
+ _CCCL_HOST_DEVICE restrict_mdspan(_CArray&)
73
+ -> restrict_mdspan<::cuda::std::remove_all_extents_t<_CArray>,
74
+ ::cuda::std::extents<size_t, ::cuda::std::extent_v<_CArray, 0>>>;
75
+
76
+ template <class _ElementType, class _OtherIndexType, size_t _Size>
77
+ _CCCL_HOST_DEVICE restrict_mdspan(_ElementType*, const ::cuda::std::array<_OtherIndexType, _Size>&)
78
+ -> restrict_mdspan<_ElementType, ::cuda::std::dextents<size_t, _Size>>;
79
+
80
+ template <class _ElementType, class _OtherIndexType, size_t _Size>
81
+ _CCCL_HOST_DEVICE restrict_mdspan(_ElementType*, ::cuda::std::span<_OtherIndexType, _Size>)
82
+ -> restrict_mdspan<_ElementType, ::cuda::std::dextents<size_t, _Size>>;
83
+
84
+ // This one is necessary because all the constructors take `data_handle_type`s, not
85
+ // `_ElementType*`s, and `data_handle_type` is taken from `accessor_type::data_handle_type`, which
86
+ // seems to throw off automatic deduction guides.
87
+ template <class _ElementType, class _OtherIndexType, size_t... _ExtentsPack>
88
+ _CCCL_HOST_DEVICE restrict_mdspan(_ElementType*, const ::cuda::std::extents<_OtherIndexType, _ExtentsPack...>&)
89
+ -> restrict_mdspan<_ElementType, ::cuda::std::extents<_OtherIndexType, _ExtentsPack...>>;
90
+
91
+ template <class _ElementType, class _MappingType>
92
+ _CCCL_HOST_DEVICE restrict_mdspan(_ElementType*, const _MappingType&)
93
+ -> restrict_mdspan<_ElementType, typename _MappingType::extents_type, typename _MappingType::layout_type>;
94
+
95
+ template <class _MappingType, class _AccessorType>
96
+ _CCCL_HOST_DEVICE
97
+ restrict_mdspan(const typename _AccessorType::data_handle_type, const _MappingType&, const _AccessorType&)
98
+ -> restrict_mdspan<typename _AccessorType::element_type,
99
+ typename _MappingType::extents_type,
100
+ typename _MappingType::layout_type,
101
+ _AccessorType>;
36
102
 
37
103
  /***********************************************************************************************************************
38
104
  * Accessibility Traits
@@ -0,0 +1,93 @@
1
+ //===----------------------------------------------------------------------===//
2
+ //
3
+ // Part of libcu++, the C++ Standard Library for your entire system,
4
+ // under the Apache License v2.0 with LLVM Exceptions.
5
+ // See https://llvm.org/LICENSE.txt for license information.
6
+ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7
+ // SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
8
+ //
9
+ //===----------------------------------------------------------------------===//
10
+
11
+ #ifndef _CUDA___MEMORY_POINTER_IN_RANGE_H
12
+ #define _CUDA___MEMORY_POINTER_IN_RANGE_H
13
+
14
+ #include <cuda/std/detail/__config>
15
+
16
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17
+ # pragma GCC system_header
18
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
19
+ # pragma clang system_header
20
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
21
+ # pragma system_header
22
+ #endif // no system header
23
+
24
+ #include <cuda/std/__type_traits/is_constant_evaluated.h>
25
+ #include <cuda/std/cstdint>
26
+ #if _CCCL_HOST_COMPILATION()
27
+ # include <functional>
28
+ #endif // _CCCL_HOST_COMPILATION()
29
+
30
+ #include <cuda/std/__cccl/prologue.h>
31
+
32
+ _CCCL_BEGIN_NAMESPACE_CUDA
33
+
34
+ // Comparing pointers with <, <=, >=, > is undefined behavior in C++ (https://eel.is/c++draft/expr.rel#4) when pointers
35
+ // don't belong to the same object or array.
36
+ // - Even when a platform guarantees flat address space, the compiler can leverage UB for optimization purposes.
37
+ // - However, the compiler treats ::std::less<> and the other functional comparison operators in a special way, ensuring a total ordering.
38
+ // - For device code, we can convert pointers to uintptr_t and compare them.
39
+ //
40
+ // References:
41
+ // - https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2024/p3234r0.html
42
+ // - https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2023/p2865r2.pdf
43
+ // - https://www.boost.org/doc/libs/develop/libs/core/doc/html/core/pointer_in_range.html
44
+ // - https://pvs-studio.com/en/blog/posts/cpp/1199/
45
+ // - https://releases.llvm.org/20.1.0/tools/clang/docs/ReleaseNotes.html#resolutions-to-c-defect-reports
46
+
47
+ #if _CCCL_HOST_COMPILATION()
48
+
49
+ template <typename _Tp>
50
+ [[nodiscard]] _CCCL_API bool __ptr_in_range_host(_Tp* __ptr, _Tp* __start, _Tp* __end) noexcept
51
+ {
52
+ _CCCL_ASSERT(::std::greater_equal<>{}(__end, __start), "__ptr_in_range_host: __end must be greater than __start");
53
+ return ::std::greater_equal<>{}(__ptr, __start) && ::std::less<>{}(__ptr, __end);
54
+ }
55
+
56
+ #endif // _CCCL_HOST_COMPILATION()
57
+
58
+ #if _CCCL_DEVICE_COMPILATION()
59
+
60
+ template <typename _Tp>
61
+ [[nodiscard]] _CCCL_API bool __ptr_in_range_device(_Tp* __ptr, _Tp* __start, _Tp* __end) noexcept
62
+ {
63
+ using uintptr_t = ::cuda::std::uintptr_t;
64
+ auto __end_ptr = reinterpret_cast<uintptr_t>(__end);
65
+ auto __start_ptr = reinterpret_cast<uintptr_t>(__start);
66
+ auto __ptr_ptr = reinterpret_cast<uintptr_t>(__ptr);
67
+ _CCCL_ASSERT(__end_ptr >= __start_ptr, "__ptr_in_range_device: __end must be greater than __start");
68
+ return __ptr_ptr >= __start_ptr && __ptr_ptr < __end_ptr;
69
+ }
70
+
71
+ #endif // _CCCL_DEVICE_COMPILATION()
72
+
73
+ template <typename _Tp>
74
+ [[nodiscard]] _CCCL_API constexpr bool ptr_in_range(_Tp* __ptr, _Tp* __start, _Tp* __end) noexcept
75
+ {
76
+ if (::cuda::std::__cccl_default_is_constant_evaluated())
77
+ {
78
+ _CCCL_ASSERT(__end >= __start, "ptr_in_range: __end must be greater than __start");
79
+ return __ptr >= __start && __ptr < __end; // UB is not possible in a constant expression
80
+ }
81
+ else
82
+ {
83
+ NV_IF_ELSE_TARGET(NV_IS_HOST,
84
+ (return ::cuda::__ptr_in_range_host(__ptr, __start, __end);),
85
+ (return ::cuda::__ptr_in_range_device(__ptr, __start, __end);));
86
+ }
87
+ }
88
+
89
+ _CCCL_END_NAMESPACE_CUDA
90
+
91
+ #include <cuda/std/__cccl/epilogue.h>
92
+
93
+ #endif // _CUDA___MEMORY_POINTER_IN_RANGE_H
@@ -8,8 +8,8 @@
8
8
  //
9
9
  //===----------------------------------------------------------------------===//
10
10
 
11
- #ifndef _CUDA___MEMORY_RESOURCE_GET_MEMORY_RESOURCE_CUH
12
- #define _CUDA___MEMORY_RESOURCE_GET_MEMORY_RESOURCE_CUH
11
+ #ifndef _CUDA___MEMORY_RESOURCE_GET_MEMORY_RESOURCE_H
12
+ #define _CUDA___MEMORY_RESOURCE_GET_MEMORY_RESOURCE_H
13
13
 
14
14
  #include <cuda/std/detail/__config>
15
15
 
@@ -23,11 +23,11 @@
23
23
 
24
24
  #include <cuda/__memory_resource/properties.h>
25
25
  #include <cuda/__memory_resource/resource.h>
26
+ #include <cuda/__stream/stream_ref.h>
26
27
  #include <cuda/std/__concepts/equality_comparable.h>
27
28
  #include <cuda/std/__execution/env.h>
28
29
  #include <cuda/std/__type_traits/is_same.h>
29
30
  #include <cuda/std/__type_traits/remove_cvref.h>
30
- #include <cuda/stream_ref>
31
31
 
32
32
  #include <cuda/std/__cccl/prologue.h>
33
33
 
@@ -79,4 +79,4 @@ _CCCL_END_NAMESPACE_CUDA_MR
79
79
 
80
80
  #include <cuda/std/__cccl/epilogue.h>
81
81
 
82
- #endif //_CUDAX__MEMORY_RESOURCE_GET_MEMORY_RESOURCE_CUH
82
+ #endif // _CUDA___MEMORY_RESOURCE_GET_MEMORY_RESOURCE_H
@@ -21,6 +21,7 @@
21
21
  # pragma system_header
22
22
  #endif // no system header
23
23
 
24
+ #include <cuda/std/__type_traits/decay.h>
24
25
  #include <cuda/std/__type_traits/type_set.h>
25
26
  #include <cuda/std/cstddef>
26
27
 
@@ -62,6 +63,49 @@ template <class... _Properties>
62
63
  inline constexpr bool __contains_execution_space_property =
63
64
  __is_host_accessible<_Properties...> || __is_device_accessible<_Properties...>;
64
65
 
66
+ //! @brief A type representing a list of memory resource properties
67
+ //! @tparam _Properties The properties to be included in the list
68
+ //! It has a member template `rebind` that allows constructing a type by combining
69
+ //! a template and type arguments with the properties from this list. The properties
70
+ //! are appended after the type arguments in the resulting type.
71
+ template <class... _Properties>
72
+ struct properties_list
73
+ {
74
+ //! @brief A type alias for a type template instantiated with the properties
75
+ //! from this list appended to the type arguments.
76
+ template <template <class...> class _Fn, class... _ExtraArgs>
77
+ using rebind = _Fn<_ExtraArgs..., _Properties...>;
78
+
79
+ template <class _QueryProperty>
80
+ _CCCL_HOST_API static constexpr bool has_property([[maybe_unused]] _QueryProperty)
81
+ {
82
+ return ::cuda::std::__type_set_contains_v<::cuda::std::__make_type_set<_Properties...>, _QueryProperty>;
83
+ }
84
+ };
85
+
86
+ template <class _Tp>
87
+ inline constexpr bool __is_queries_list = false;
88
+
89
+ template <class... _Tp>
90
+ inline constexpr bool __is_queries_list<properties_list<_Tp...>> = true;
91
+
92
+ template <typename _Tp>
93
+ _CCCL_CONCEPT __has_default_queries =
94
+ _CCCL_REQUIRES_EXPR((_Tp))(requires(__is_queries_list<typename ::cuda::std::decay_t<_Tp>::default_queries>));
95
+
96
+ template <typename _Resource, bool _HasDefaultQueries = __has_default_queries<_Resource>>
97
+ struct __copy_default_queries;
98
+
99
+ template <typename _Resource>
100
+ struct __copy_default_queries<_Resource, true>
101
+ {
102
+ using default_queries = typename _Resource::default_queries;
103
+ };
104
+
105
+ template <typename _Resource>
106
+ struct __copy_default_queries<_Resource, false>
107
+ {};
108
+
65
109
  _CCCL_END_NAMESPACE_CUDA_MR
66
110
 
67
111
  #include <cuda/std/__cccl/epilogue.h>
@@ -22,6 +22,7 @@
22
22
  #endif // no system header
23
23
 
24
24
  #include <cuda/__memory_resource/get_property.h>
25
+ #include <cuda/__stream/stream_ref.h>
25
26
  #include <cuda/std/__concepts/concept_macros.h>
26
27
  #include <cuda/std/__concepts/convertible_to.h>
27
28
  #include <cuda/std/__concepts/equality_comparable.h>
@@ -29,7 +30,6 @@
29
30
  #include <cuda/std/__tuple_dir/sfinae_helpers.h>
30
31
  #include <cuda/std/__type_traits/decay.h>
31
32
  #include <cuda/std/__type_traits/fold.h>
32
- #include <cuda/stream_ref>
33
33
 
34
34
  #include <cuda/std/__cccl/prologue.h>
35
35
 
@@ -26,6 +26,7 @@
26
26
  # include <cuda/__memory_resource/get_property.h>
27
27
  # include <cuda/__memory_resource/properties.h>
28
28
  # include <cuda/__memory_resource/resource.h>
29
+ # include <cuda/__stream/stream_ref.h>
29
30
  # include <cuda/std/__concepts/concept_macros.h>
30
31
  # include <cuda/std/__memory/addressof.h>
31
32
  # include <cuda/std/__type_traits/is_base_of.h>
@@ -34,7 +35,6 @@
34
35
  # include <cuda/std/__utility/exchange.h>
35
36
  # include <cuda/std/__utility/move.h>
36
37
  # include <cuda/std/cstddef>
37
- # include <cuda/stream_ref>
38
38
 
39
39
  # include <cuda/std/__cccl/prologue.h>
40
40
 
@@ -161,10 +161,7 @@ struct _Resource_vtable_builder
161
161
  template <class _Resource>
162
162
  static void _Dealloc(void* __object, void* __ptr, size_t __bytes, size_t __alignment) noexcept
163
163
  {
164
- // TODO: this breaks RMM because their memory resources do not declare their
165
- // deallocate_sync functions to be noexcept. Comment out the check for now until
166
- // we can fix RMM.
167
- // static_assert(noexcept(static_cast<_Resource*>(__object)->deallocate(__ptr, __bytes, __alignment)));
164
+ static_assert(noexcept(static_cast<_Resource*>(__object)->deallocate_sync(__ptr, __bytes, __alignment)));
168
165
  return static_cast<_Resource*>(__object)->deallocate_sync(__ptr, __bytes, __alignment);
169
166
  }
170
167
 
@@ -176,8 +173,9 @@ struct _Resource_vtable_builder
176
173
 
177
174
  template <class _Resource>
178
175
  static void
179
- _Dealloc_async(void* __object, void* __ptr, size_t __bytes, size_t __alignment, ::cuda::stream_ref __stream)
176
+ _Dealloc_async(void* __object, void* __ptr, size_t __bytes, size_t __alignment, ::cuda::stream_ref __stream) noexcept
180
177
  {
178
+ static_assert(noexcept(static_cast<_Resource*>(__object)->deallocate(__stream, __ptr, __bytes, __alignment)));
181
179
  return static_cast<_Resource*>(__object)->deallocate(__stream, __ptr, __bytes, __alignment);
182
180
  }
183
181
 
@@ -653,8 +653,9 @@
653
653
  #ifndef NVTX3_CPP_DEFINITIONS_V1_0
654
654
  # define NVTX3_CPP_DEFINITIONS_V1_0
655
655
 
656
+ # include <cuda/std/__cccl/memory_wrapper.h>
657
+
656
658
  # include <cstddef>
657
- # include <memory>
658
659
  # include <string>
659
660
  # include <type_traits>
660
661
  # include <utility>
@@ -32,6 +32,7 @@
32
32
  # ifndef _CCCL_DOXYGEN_INVOKED // Do not document
33
33
 
34
34
  _CCCL_BEGIN_NAMESPACE_CUDA
35
+
35
36
  class stream_ref;
36
37
 
37
38
  //! @brief RAII helper which on construction sets the current context to the specified one.
@@ -45,7 +46,7 @@ struct [[maybe_unused]] __ensure_current_context
45
46
  //! @param new_device The device to switch the context to
46
47
  //!
47
48
  //! @throws cuda_error if the context switch fails
48
- explicit __ensure_current_context(device_ref __new_device)
49
+ _CCCL_HOST_API explicit __ensure_current_context(device_ref __new_device)
49
50
  {
50
51
  auto __ctx = ::cuda::__physical_devices()[__new_device.get()].__primary_context();
51
52
  ::cuda::__driver::__ctxPush(__ctx);
@@ -57,7 +58,7 @@ struct [[maybe_unused]] __ensure_current_context
57
58
  //! @param ctx The context to switch to
58
59
  //!
59
60
  //! @throws cuda_error if the context switch fails
60
- explicit __ensure_current_context(::CUcontext __ctx)
61
+ _CCCL_HOST_API explicit __ensure_current_context(::CUcontext __ctx)
61
62
  {
62
63
  ::cuda::__driver::__ctxPush(__ctx);
63
64
  }
@@ -68,7 +69,7 @@ struct [[maybe_unused]] __ensure_current_context
68
69
  //! @param stream Stream indicating the context to switch to
69
70
  //!
70
71
  //! @throws cuda_error if the context switch fails
71
- explicit __ensure_current_context(stream_ref __stream);
72
+ _CCCL_HOST_API explicit __ensure_current_context(stream_ref __stream);
72
73
 
73
74
  __ensure_current_context(__ensure_current_context&&) = delete;
74
75
  __ensure_current_context(__ensure_current_context const&) = delete;
@@ -80,7 +81,7 @@ struct [[maybe_unused]] __ensure_current_context
80
81
  //!
81
82
  //! @throws cuda_error if the device switch fails. If the destructor is called
82
83
  //! during stack unwinding, the program is automatically terminated.
83
- ~__ensure_current_context() noexcept(false)
84
+ _CCCL_HOST_API ~__ensure_current_context() noexcept(false)
84
85
  {
85
86
  // TODO would it make sense to assert here that we pushed and popped the same thing?
86
87
  ::cuda::__driver::__ctxPop();
@@ -43,7 +43,7 @@ struct stream : stream_ref
43
43
  //! Priority is defaulted to stream::default_priority
44
44
  //!
45
45
  //! @throws cuda_error if stream creation fails
46
- explicit stream(device_ref __dev, int __priority = default_priority)
46
+ _CCCL_HOST_API explicit stream(device_ref __dev, int __priority = default_priority)
47
47
  : stream_ref(__detail::__invalid_stream)
48
48
  {
49
49
  [[maybe_unused]] __ensure_current_context __ctx_setter(__dev);
@@ -54,7 +54,7 @@ struct stream : stream_ref
54
54
  //!
55
55
  //! @post `stream()` returns an invalid stream handle
56
56
  // Can't be constexpr because __invalid_stream isn't
57
- explicit stream(no_init_t) noexcept
57
+ _CCCL_HOST_API explicit stream(no_init_t) noexcept
58
58
  : stream_ref(__detail::__invalid_stream)
59
59
  {}
60
60
 
@@ -63,7 +63,7 @@ struct stream : stream_ref
63
63
  //! @param __other
64
64
  //!
65
65
  //! @post `__other` is in moved-from state.
66
- stream(stream&& __other) noexcept
66
+ _CCCL_HOST_API stream(stream&& __other) noexcept
67
67
  : stream(::cuda::std::exchange(__other.__stream, __detail::__invalid_stream))
68
68
  {}
69
69
 
@@ -72,7 +72,7 @@ struct stream : stream_ref
72
72
  //! Destroy the `stream` object
73
73
  //!
74
74
  //! @note If the stream fails to be destroyed, the error is silently ignored.
75
- ~stream()
75
+ _CCCL_HOST_API ~stream()
76
76
  {
77
77
  if (__stream != __detail::__invalid_stream)
78
78
  {
@@ -87,7 +87,7 @@ struct stream : stream_ref
87
87
  //! @param __other
88
88
  //!
89
89
  //! @post `__other` is in a moved-from state.
90
- stream& operator=(stream&& __other) noexcept
90
+ _CCCL_HOST_API stream& operator=(stream&& __other) noexcept
91
91
  {
92
92
  stream __tmp(::cuda::std::move(__other));
93
93
  ::cuda::std::swap(__stream, __tmp.__stream);
@@ -103,7 +103,7 @@ struct stream : stream_ref
103
103
  //! @return stream The constructed `stream` object
104
104
  //!
105
105
  //! @note The constructed `stream` object takes ownership of the native handle.
106
- [[nodiscard]] static stream from_native_handle(::cudaStream_t __handle)
106
+ [[nodiscard]] static _CCCL_HOST_API stream from_native_handle(::cudaStream_t __handle)
107
107
  {
108
108
  return stream(__handle);
109
109
  }
@@ -119,7 +119,7 @@ struct stream : stream_ref
119
119
  //! @return cudaStream_t The native handle being held by the `stream` object.
120
120
  //!
121
121
  //! @post The stream object is in a moved-from state.
122
- [[nodiscard]] ::cudaStream_t release()
122
+ [[nodiscard]] _CCCL_HOST_API ::cudaStream_t release()
123
123
  {
124
124
  return ::cuda::std::exchange(__stream, __detail::__invalid_stream);
125
125
  }
@@ -127,7 +127,7 @@ struct stream : stream_ref
127
127
  private:
128
128
  // Use `stream::from_native_handle(s)` to construct an owning `stream`
129
129
  // object from a `cudaStream_t` handle.
130
- explicit stream(::cudaStream_t __handle)
130
+ _CCCL_HOST_API explicit stream(::cudaStream_t __handle)
131
131
  : stream_ref(__handle)
132
132
  {}
133
133
  };
@@ -30,6 +30,7 @@
30
30
  # include <cuda/__runtime/ensure_current_context.h>
31
31
  # include <cuda/__utility/no_init.h>
32
32
  # include <cuda/std/__exception/cuda_error.h>
33
+ # include <cuda/std/__utility/to_underlying.h>
33
34
  # include <cuda/std/cstddef>
34
35
 
35
36
  # include <cuda/std/__cccl/prologue.h>
@@ -61,9 +62,10 @@ public:
61
62
  //!
62
63
  //! For behavior of the default stream,
63
64
  //! @see https://docs.nvidia.com/cuda/cuda-runtime-api/stream-sync-behavior.html
64
- [[deprecated("Using the default/null stream is generally discouraged. If you need to use it, please construct a "
65
- "stream_ref from cudaStream_t{nullptr}")]]
66
- _CCCL_HIDE_FROM_ABI stream_ref() = default;
65
+ CCCL_DEPRECATED_BECAUSE("Using the default/null stream is generally discouraged. If you need to use it, please "
66
+ "construct a "
67
+ "stream_ref from cudaStream_t{nullptr}") _CCCL_HIDE_FROM_ABI
68
+ stream_ref() = default;
67
69
 
68
70
  //! @brief Constructs a `stream_ref` from a `cudaStream_t` handle.
69
71
  //!
@@ -124,8 +126,7 @@ public:
124
126
  //! @brief Deprecated. Use sync() instead.
125
127
  //!
126
128
  //! @deprecated Use sync() instead.
127
- [[deprecated("Use sync() instead.")]]
128
- void wait() const
129
+ CCCL_DEPRECATED_BECAUSE("Use sync() instead.") _CCCL_HOST_API void wait() const
129
130
  {
130
131
  sync();
131
132
  }
@@ -184,7 +185,7 @@ public:
184
185
  //! @throws cuda::cuda_error if the query fails.
185
186
  //!
186
187
  //! @return `true` if all operations have completed, or `false` if not.
187
- [[deprecated("Use is_done() instead.")]] [[nodiscard]] bool ready() const
188
+ [[nodiscard]] CCCL_DEPRECATED_BECAUSE("Use is_done() instead.") _CCCL_HOST_API bool ready() const
188
189
  {
189
190
  return is_done();
190
191
  }
@@ -216,7 +217,7 @@ public:
216
217
  //! @return A new event that was recorded into this stream
217
218
  //!
218
219
  //! @throws cuda_error if event creation or record failed
219
- [[nodiscard]] _CCCL_HOST_API event record_event(event::flags __flags = event::flags::none) const
220
+ [[nodiscard]] _CCCL_HOST_API event record_event(event_flags __flags = event_flags::none) const
220
221
  {
221
222
  return event(*this, __flags);
222
223
  }
@@ -226,7 +227,7 @@ public:
226
227
  //! @return A new timed event that was recorded into this stream
227
228
  //!
228
229
  //! @throws cuda_error if event creation or record failed
229
- [[nodiscard]] _CCCL_HOST_API timed_event record_timed_event(event::flags __flags = event::flags::none) const
230
+ [[nodiscard]] _CCCL_HOST_API timed_event record_timed_event(event_flags __flags = event_flags::none) const
230
231
  {
231
232
  return timed_event(*this, __flags);
232
233
  }
@@ -237,7 +238,7 @@ public:
237
238
  //! returned
238
239
  //!
239
240
  //! @throws cuda_error if device check fails
240
- _CCCL_HOST_API device_ref device() const
241
+ [[nodiscard]] _CCCL_HOST_API device_ref device() const
241
242
  {
242
243
  ::CUdevice __device{};
243
244
  # if _CCCL_CTK_AT_LEAST(13, 0)
@@ -260,7 +261,7 @@ public:
260
261
  }
261
262
  };
262
263
 
263
- inline void event_ref::record(stream_ref __stream) const
264
+ _CCCL_HOST_API inline void event_ref::record(stream_ref __stream) const
264
265
  {
265
266
  _CCCL_ASSERT(__event_ != nullptr, "cuda::event_ref::record no event set");
266
267
  _CCCL_ASSERT(__stream.get() != nullptr, "cuda::event_ref::record invalid stream passed");
@@ -268,26 +269,26 @@ inline void event_ref::record(stream_ref __stream) const
268
269
  ::cuda::__driver::__eventRecord(__event_, __stream.get());
269
270
  }
270
271
 
271
- inline event::event(stream_ref __stream, event::flags __flags)
272
- : event(__stream, static_cast<unsigned>(__flags) | cudaEventDisableTiming)
272
+ _CCCL_HOST_API inline event::event(stream_ref __stream, event_flags __flags)
273
+ : event(__stream, ::cuda::std::to_underlying(__flags) | cudaEventDisableTiming)
273
274
  {
274
275
  record(__stream);
275
276
  }
276
277
 
277
- inline event::event(stream_ref __stream, unsigned __flags)
278
+ _CCCL_HOST_API inline event::event(stream_ref __stream, unsigned __flags)
278
279
  : event_ref(::cudaEvent_t{})
279
280
  {
280
281
  [[maybe_unused]] __ensure_current_context __ctx_setter(__stream);
281
282
  __event_ = ::cuda::__driver::__eventCreate(static_cast<unsigned>(__flags));
282
283
  }
283
284
 
284
- inline timed_event::timed_event(stream_ref __stream, event::flags __flags)
285
- : event(__stream, static_cast<unsigned>(__flags))
285
+ _CCCL_HOST_API inline timed_event::timed_event(stream_ref __stream, event_flags __flags)
286
+ : event(__stream, ::cuda::std::to_underlying(__flags))
286
287
  {
287
288
  record(__stream);
288
289
  }
289
290
 
290
- inline __ensure_current_context::__ensure_current_context(stream_ref __stream)
291
+ _CCCL_HOST_API inline __ensure_current_context::__ensure_current_context(stream_ref __stream)
291
292
  {
292
293
  auto __ctx = __driver::__streamGetCtx(__stream.get());
293
294
  ::cuda::__driver::__ctxPush(__ctx);
@@ -0,0 +1,65 @@
1
+ //===----------------------------------------------------------------------===//
2
+ //
3
+ // Part of libcu++, the C++ Standard Library for your entire system,
4
+ // under the Apache License v2.0 with LLVM Exceptions.
5
+ // See https://llvm.org/LICENSE.txt for license information.
6
+ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7
+ // SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
8
+ //
9
+ //===----------------------------------------------------------------------===//
10
+
11
+ #ifndef _CUDA___UTILITY_IN_RANGE_H
12
+ #define _CUDA___UTILITY_IN_RANGE_H
13
+
14
+ #include <cuda/std/detail/__config>
15
+
16
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17
+ # pragma GCC system_header
18
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
19
+ # pragma clang system_header
20
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
21
+ # pragma system_header
22
+ #endif // no system header
23
+
24
+ #include <cuda/__type_traits/is_floating_point.h>
25
+ #include <cuda/std/__cmath/isnan.h>
26
+ #include <cuda/std/__concepts/concept_macros.h>
27
+ #include <cuda/std/__type_traits/conditional.h>
28
+ #include <cuda/std/__type_traits/is_extended_floating_point.h>
29
+ #include <cuda/std/__type_traits/is_integer.h>
30
+ #include <cuda/std/__type_traits/is_unsigned_integer.h>
31
+
32
+ #include <cuda/std/__cccl/prologue.h>
33
+
34
+ _CCCL_BEGIN_NAMESPACE_CUDA
35
+
36
+ _CCCL_TEMPLATE(typename _Tp)
37
+ _CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp> || ::cuda::std::is_floating_point_v<_Tp>
38
+ || ::cuda::std::__is_extended_floating_point_v<_Tp>)
39
+ [[nodiscard]] _CCCL_API constexpr bool in_range(_Tp __v, _Tp __start, _Tp __end) noexcept
40
+ {
41
+ _CCCL_ASSERT(::cuda::std::isnan(__start) || ::cuda::std::isnan(__end) || __end >= __start,
42
+ "in_range: __end must be greater than or equal to __start");
43
+ if constexpr (::cuda::std::__cccl_is_unsigned_integer_v<_Tp>)
44
+ {
45
+ // The precondition asserted above is __end >= __start, so __end - __start cannot wrap. If __v < __start, the
46
+ // unsigned difference __v - __start wraps to a value above that range, so one comparison checks both bounds.
47
+ // This is useful when __start and __end are compile-time constants, or when in_range is reused with the same range.
48
+ using _Up = ::cuda::std::conditional_t<(sizeof(_Tp) <= sizeof(unsigned)), unsigned, _Tp>; // at least 32-bit
49
+ const auto __start1 = static_cast<_Up>(__start);
50
+ const auto __end1 = static_cast<_Up>(__end);
51
+ const auto __v1 = static_cast<_Up>(__v);
52
+ const auto __range = __end1 - __start1;
53
+ return (__v1 - __start1) <= __range;
54
+ }
55
+ else
56
+ {
57
+ return __v >= __start && __v <= __end;
58
+ }
59
+ }
60
+
61
+ _CCCL_END_NAMESPACE_CUDA
62
+
63
+ #include <cuda/std/__cccl/epilogue.h>
64
+
65
+ #endif // _CUDA___UTILITY_IN_RANGE_H
@@ -26,6 +26,7 @@
26
26
  #include <cuda/__cmath/ilog.h>
27
27
  #include <cuda/__cmath/ipow.h>
28
28
  #include <cuda/__cmath/isqrt.h>
29
+ #include <cuda/__cmath/mul_hi.h>
29
30
  #include <cuda/__cmath/neg.h>
30
31
  #include <cuda/__cmath/pow2.h>
31
32
  #include <cuda/__cmath/round_down.h>
@@ -22,9 +22,12 @@
22
22
  #endif // no system header
23
23
 
24
24
  #include <cuda/__device/all_devices.h>
25
+ #include <cuda/__device/arch_id.h>
25
26
  #include <cuda/__device/arch_traits.h>
26
27
  #include <cuda/__device/attributes.h>
28
+ #include <cuda/__device/compute_capability.h>
27
29
  #include <cuda/__device/device_ref.h>
28
30
  #include <cuda/__device/physical_device.h>
31
+ #include <cuda/version>
29
32
 
30
33
  #endif // _CUDA_DEVICES
@@ -28,6 +28,7 @@
28
28
  #include <cuda/__memory/discard_memory.h>
29
29
  #include <cuda/__memory/get_device_address.h>
30
30
  #include <cuda/__memory/is_aligned.h>
31
+ #include <cuda/__memory/ptr_in_range.h>
31
32
  #include <cuda/__memory/ptr_rebind.h>
32
33
  #include <cuda/std/memory>
33
34
 
@@ -52,12 +52,12 @@ __equal_range(_Iter __first, _Sent __last, const _Tp& __value, _Compare&& __comp
52
52
  {
53
53
  auto __half_len = ::cuda::std::__half_positive(__len);
54
54
  _Iter __mid = _IterOps<_AlgPolicy>::next(__first, __half_len);
55
- if (::cuda::std::__invoke(__comp, ::cuda::std::__invoke(__proj, *__mid), __value))
55
+ if (::cuda::std::invoke(__comp, ::cuda::std::invoke(__proj, *__mid), __value))
56
56
  {
57
57
  __first = ++__mid;
58
58
  __len -= __half_len + 1;
59
59
  }
60
- else if (::cuda::std::__invoke(__comp, __value, ::cuda::std::__invoke(__proj, *__mid)))
60
+ else if (::cuda::std::invoke(__comp, __value, ::cuda::std::invoke(__proj, *__mid)))
61
61
  {
62
62
  __end = __mid;
63
63
  __len = __half_len;