cuda-cccl 0.3.0__cp311-cp311-manylinux_2_24_aarch64.whl → 0.3.1__cp311-cp311-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (144) hide show
  1. cuda/cccl/cooperative/__init__.py +7 -1
  2. cuda/cccl/cooperative/experimental/__init__.py +21 -5
  3. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
  4. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
  5. cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
  6. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
  7. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
  8. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
  9. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
  10. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
  11. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
  12. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
  13. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
  14. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
  15. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
  16. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
  17. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
  18. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
  19. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
  20. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +3 -2
  21. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
  22. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
  23. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +158 -247
  24. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +4 -4
  25. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
  26. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +8 -26
  27. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -6
  28. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
  29. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
  30. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
  31. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
  32. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
  33. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
  34. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
  35. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
  36. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
  37. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
  38. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
  39. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
  40. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
  41. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
  42. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
  43. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
  44. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
  45. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
  46. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
  47. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
  48. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
  49. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
  50. cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
  51. cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
  52. cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
  53. cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
  54. cuda/cccl/headers/include/cuda/__device/arch_traits.h +48 -46
  55. cuda/cccl/headers/include/cuda/__device/attributes.h +171 -121
  56. cuda/cccl/headers/include/cuda/__device/device_ref.h +30 -42
  57. cuda/cccl/headers/include/cuda/__device/physical_device.h +120 -91
  58. cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
  59. cuda/cccl/headers/include/cuda/__event/event.h +1 -0
  60. cuda/cccl/headers/include/cuda/__event/timed_event.h +1 -0
  61. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  62. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
  63. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +158 -0
  64. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +8 -120
  65. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +593 -0
  66. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +4 -3
  67. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +1 -0
  68. cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
  69. cuda/cccl/headers/include/cuda/algorithm +1 -1
  70. cuda/cccl/headers/include/cuda/devices +10 -0
  71. cuda/cccl/headers/include/cuda/iterator +1 -0
  72. cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
  73. cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
  74. cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
  75. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
  76. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
  77. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
  78. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
  79. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
  80. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
  81. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
  82. cuda/cccl/headers/include/cuda/std/string_view +12 -5
  83. cuda/cccl/headers/include/cuda/std/version +1 -4
  84. cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
  85. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
  86. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
  87. cuda/cccl/parallel/experimental/__init__.py +21 -74
  88. cuda/compute/__init__.py +77 -0
  89. cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +1 -1
  90. cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
  91. cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
  92. cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
  93. cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
  94. cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +4 -4
  95. cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
  96. cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
  97. cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +4 -4
  98. cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
  99. cuda/{cccl/parallel/experimental → compute}/cu12/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
  100. cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
  101. cuda/{cccl/parallel/experimental → compute}/cu13/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
  102. cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
  103. cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +8 -8
  104. cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
  105. cuda/coop/__init__.py +8 -0
  106. cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
  107. cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
  108. cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
  109. cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
  110. cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
  111. cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
  112. cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
  113. cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
  114. cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
  115. cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
  116. cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
  117. cuda/coop/warp/__init__.py +9 -0
  118. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
  119. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
  120. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
  121. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/METADATA +1 -1
  122. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/RECORD +141 -138
  123. cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
  124. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
  125. cuda/cccl/parallel/experimental/.gitignore +0 -4
  126. /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
  127. /cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +0 -0
  128. /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
  129. /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
  130. /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
  131. /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
  132. /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
  133. /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
  134. /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
  135. /cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +0 -0
  136. /cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +0 -0
  137. /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
  138. /cuda/{cccl/parallel/experimental → compute}/numba_utils.py +0 -0
  139. /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
  140. /cuda/{cccl/parallel/experimental → compute}/typing.py +0 -0
  141. /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
  142. /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
  143. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/WHEEL +0 -0
  144. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,593 @@
1
+ // -*- C++ -*-
2
+ //===----------------------------------------------------------------------===//
3
+ //
4
+ // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5
+ // See https://llvm.org/LICENSE.txt for license information.
6
+ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7
+ // SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES
8
+ //
9
+ //===----------------------------------------------------------------------===//
10
+ #ifndef _CUDA___ITERATOR_ZIP_TRANSFORM_ITERATOR_H
11
+ #define _CUDA___ITERATOR_ZIP_TRANSFORM_ITERATOR_H
12
+
13
+ #include <cuda/std/detail/__config>
14
+
15
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
16
+ # pragma GCC system_header
17
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
18
+ # pragma clang system_header
19
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
20
+ # pragma system_header
21
+ #endif // no system header
22
+
23
+ #include <cuda/__fwd/zip_iterator.h>
24
+ #include <cuda/std/__algorithm/ranges_min_element.h>
25
+ #if _LIBCUDACXX_HAS_SPACESHIP_OPERATOR()
26
+ # include <cuda/std/__compare/three_way_comparable.h>
27
+ #endif // _LIBCUDACXX_HAS_SPACESHIP_OPERATOR()
28
+ #include <cuda/__iterator/zip_common.h>
29
+ #include <cuda/std/__concepts/convertible_to.h>
30
+ #include <cuda/std/__concepts/equality_comparable.h>
31
+ #include <cuda/std/__functional/invoke.h>
32
+ #include <cuda/std/__functional/operations.h>
33
+ #include <cuda/std/__iterator/concepts.h>
34
+ #include <cuda/std/__iterator/incrementable_traits.h>
35
+ #include <cuda/std/__iterator/iterator_traits.h>
36
+ #include <cuda/std/__ranges/concepts.h>
37
+ #include <cuda/std/__ranges/movable_box.h>
38
+ #include <cuda/std/__type_traits/common_type.h>
39
+ #include <cuda/std/__type_traits/make_unsigned.h>
40
+ #include <cuda/std/__type_traits/remove_cvref.h>
41
+ #include <cuda/std/__utility/forward.h>
42
+ #include <cuda/std/__utility/integer_sequence.h>
43
+ #include <cuda/std/__utility/move.h>
44
+ #include <cuda/std/__utility/pair.h>
45
+ #include <cuda/std/tuple>
46
+
47
+ #include <cuda/std/__cccl/prologue.h>
48
+
49
+ _CCCL_BEGIN_NAMESPACE_CUDA
50
+
51
+ //! @addtogroup iterators
52
+ //! @{
53
+
54
+ template <class _Fn, class... _Iterators>
55
+ [[nodiscard]] _CCCL_API _CCCL_CONSTEVAL auto __get_zip_transform_iterator_category()
56
+ {
57
+ using _Constraints = __zip_iter_constraints<_Iterators...>;
58
+ if constexpr (!::cuda::std::is_reference_v<
59
+ ::cuda::std::invoke_result_t<_Fn&, ::cuda::std::iter_reference_t<_Iterators>...>>)
60
+ {
61
+ return ::cuda::std::input_iterator_tag{};
62
+ }
63
+ else if constexpr (_Constraints::__all_random_access)
64
+ {
65
+ return ::cuda::std::random_access_iterator_tag{};
66
+ }
67
+ else if constexpr (_Constraints::__all_bidirectional)
68
+ {
69
+ return ::cuda::std::bidirectional_iterator_tag{};
70
+ }
71
+ else if constexpr (_Constraints::__all_forward)
72
+ {
73
+ return ::cuda::std::forward_iterator_tag{};
74
+ }
75
+ else
76
+ {
77
+ return ::cuda::std::input_iterator_tag{};
78
+ }
79
+ _CCCL_UNREACHABLE();
80
+ }
81
+
82
+ //! @brief @c zip_transform_iterator is an iterator which represents the result of a transformation of a set of
83
+ //! sequences with a given function. This iterator is useful for creating a range filled with the result of applying an
84
+ //! operation to another range without either explicitly storing it in memory, or explicitly executing the
85
+ //! transformation. Using @c zip_transform_iterator facilitates kernel fusion by deferring the execution of a
86
+ //! transformation until the value is needed while saving both memory capacity and bandwidth.
87
+ //!
88
+ //! @c zip_transform_iterator is morally equivalent to a combination of @c transform_iterator and @c zip_iterator:
89
+ //!
90
+ //! @code{.cpp}
91
+ //! template <class Fn, class... Iterators>
92
+ //! using zip_transform_iterator = cuda::transform_iterator<cuda::zip_iterator<Iterators...>, cuda::zip_function<Fn>>;
93
+ //! @endcode
94
+ //!
95
+ //! @c zip_transform_iterator has the additional benefit that it does not require an artificial @c zip_function to work
96
+ //! and more importantly does not need to materialize the result of dereferencing the stored iterators when passing them
97
+ //! to the stored function.
98
+ //!
99
+ //! The following code snippet demonstrates how to create a @c zip_transform_iterator which represents the result of
100
+ //! "zipping" multiple ranges together.
101
+ //!
102
+ //! @code
103
+ //! #include <cuda/iterator>
104
+ //! #include <thrust/device_vector.h>
105
+ //!
106
+ //! struct SumArgs {
107
+ //! __host__ __device__ float operator()(float a, float b, float c) const noexcept {
108
+ //! return a + b + c;
109
+ //! }
110
+ //! };
111
+ //!
112
+ //! thrust::device_vector<float> A{0.f, 1.f, 2.f};
113
+ //! thrust::device_vector<float> B{1.f, 2.f, 3.f};
114
+ //! thrust::device_vector<float> C{2.f, 3.f, 4.f};
115
+ //!
116
+ //! cuda::zip_transform_iterator iter{SumArgs{}, A.begin(), B.begin(), C.begin()};
117
+ //!
118
+ //! *iter; // returns (3.f)
119
+ //! iter[0]; // returns (3.f)
120
+ //! iter[1]; // returns (6.f)
121
+ //! iter[2]; // returns (9.f)
122
+ //! // iter[3] is an out-of-bounds error
123
+ //! @endcode
124
+ //!
125
+ //! This example shows how to use @c zip_transform_iterator to apply a function to multiple ranges and store the
126
+ //! results with a single call to @c thrust::copy.
127
+ //!
128
+ //! @code
129
+ //! #include <cuda/iterator>
130
+ //! #include <thrust/device_vector.h>
131
+ //!
132
+ //! int main()
133
+ //! {
134
+ //! struct SumArgs {
135
+ //! __host__ __device__ float operator()(float a, float b, float c) const noexcept {
136
+ //! return a + b + c;
137
+ //! }
138
+ //! };
139
+ //!
140
+ //! thrust::device_vector<float> A{0.f, 1.f, 2.f};
141
+ //! thrust::device_vector<float> B{1.f, 2.f, 3.f};
142
+ //! thrust::device_vector<float> C{2.f, 3.f, 4.f};
143
+ //! thrust::device_vector<float> out(3);
144
+ //!
145
+ //! cuda::zip_transform_iterator iter{SumArgs{}, A.begin(), B.begin(), C.begin()};
146
+ //! thrust::copy(iter, iter + 3, out.begin());
147
+ //!
148
+ //! // out is now [3.0f, 6.0f, 9.0f]
149
+ //!
150
+ //! return 0;
151
+ //! }
152
+ //! @endcode
153
+ template <class _Fn, class... _Iterators>
154
+ class zip_transform_iterator
155
+ {
156
+ ::cuda::std::ranges::__movable_box<_Fn> __func_;
157
+ __tuple_or_pair<_Iterators...> __current_;
158
+
159
+ template <class, class...>
160
+ friend class zip_transform_iterator;
161
+
162
+ template <class _Op>
163
+ _CCCL_API static constexpr auto
164
+ __zip_apply(const _Op& __op,
165
+ const __tuple_or_pair<_Iterators...>& __tuple1,
166
+ const __tuple_or_pair<_Iterators...>& __tuple2) //
167
+ noexcept(noexcept(__op(__tuple1, __tuple2, ::cuda::std::make_index_sequence<sizeof...(_Iterators)>())))
168
+ {
169
+ return __op(__tuple1, __tuple2, ::cuda::std::make_index_sequence<sizeof...(_Iterators)>());
170
+ }
171
+
172
+ public:
173
+ //! @brief Default-constructs a @c zip_transform_iterator by value-initializing the functor and all stored iterators
174
+ #if _CCCL_HAS_CONCEPTS()
175
+ _CCCL_EXEC_CHECK_DISABLE
176
+ _CCCL_HIDE_FROM_ABI zip_transform_iterator()
177
+ requires ::cuda::std::default_initializable<_Fn>
178
+ && __zip_iter_constraints<_Iterators...>::__all_default_initializable
179
+ = default;
180
+ #else // ^^^ _CCCL_HAS_CONCEPTS() ^^^ / vvv !_CCCL_HAS_CONCEPTS() vvv
181
+ _CCCL_EXEC_CHECK_DISABLE
182
+ _CCCL_TEMPLATE(class _Fn2 = _Fn)
183
+ _CCCL_REQUIRES(
184
+ ::cuda::std::default_initializable<_Fn2>&& __zip_iter_constraints<_Iterators...>::__all_default_initializable)
185
+ _CCCL_API constexpr zip_transform_iterator() noexcept(
186
+ ::cuda::std::is_nothrow_default_constructible_v<_Fn2>
187
+ && __zip_iter_constraints<_Iterators...>::__all_nothrow_default_constructible)
188
+ : __func_(::cuda::std::in_place)
189
+ , __current_()
190
+ {}
191
+ #endif // ^^^ !_CCCL_HAS_CONCEPTS() ^^^
192
+
193
+ //! @brief Constructs a @c zip_transform_iterator from a function object @p __fun and a tuple or pair of iterators
194
+ //! @param __iters A tuple or pair of iterators
195
+ _CCCL_API constexpr explicit zip_transform_iterator(_Fn __fun, __tuple_or_pair<_Iterators...> __iters)
196
+ : __func_(::cuda::std::in_place, ::cuda::std::move(__fun))
197
+ , __current_(::cuda::std::move(__iters))
198
+ {}
199
+
200
+ //! @brief Constructs a @c zip_transform_iterator from a function object @p __fun and a tuple of two iterators
201
+ //! @param __iters A tuple of iterators
202
+ _CCCL_TEMPLATE(size_t _NumIterators = sizeof...(_Iterators))
203
+ _CCCL_REQUIRES((_NumIterators == 2))
204
+ _CCCL_API constexpr explicit zip_transform_iterator(_Fn __fun, ::cuda::std::tuple<_Iterators...> __iters)
205
+ : __func_(::cuda::std::in_place, ::cuda::std::move(__fun))
206
+ , __current_(::cuda::std::get<0>(::cuda::std::move(__iters)), ::cuda::std::get<1>(::cuda::std::move(__iters)))
207
+ {}
208
+
209
+ //! @brief Constructs a @c zip_transform_iterator from a function object @p __fun and a variadic set of iterators
210
+ //! @param __iters The input iterators
211
+ _CCCL_API constexpr explicit zip_transform_iterator(_Fn __fun, _Iterators... __iters)
212
+ : __func_(::cuda::std::in_place, ::cuda::std::move(__fun))
213
+ , __current_(::cuda::std::move(__iters)...)
214
+ {}
215
+
216
+ using iterator_concept = decltype(::cuda::__get_zip_iterator_concept<_Iterators...>());
217
+ using iterator_category = decltype(::cuda::__get_zip_transform_iterator_category<_Fn, _Iterators...>());
218
+ using difference_type = ::cuda::std::common_type_t<::cuda::std::iter_difference_t<_Iterators>...>;
219
+ using value_type =
220
+ ::cuda::std::remove_cvref_t<::cuda::std::invoke_result_t<_Fn&, ::cuda::std::iter_reference_t<_Iterators>...>>;
221
+
222
+ // Those are technically not to spec, but pre-ranges iterator_traits do not work properly with iterators that do not
223
+ // define all 5 aliases, see https://en.cppreference.com/w/cpp/iterator/iterator_traits.html
224
+ using reference = ::cuda::std::invoke_result_t<_Fn&, ::cuda::std::iter_reference_t<_Iterators>...>;
225
+ using pointer = void;
226
+
227
+ // Internal helper functions to extract internals for device dispatch, must be a tuple for cub_transform_many
228
+ [[nodiscard]] _CCCL_API constexpr ::cuda::std::tuple<_Iterators...>
229
+ __base() && noexcept(::cuda::std::is_nothrow_move_constructible_v<__tuple_or_pair<_Iterators...>>)
230
+ {
231
+ return ::cuda::std::move(__current_);
232
+ }
233
+
234
+ [[nodiscard]] _CCCL_API constexpr _Fn __pred() && noexcept(::cuda::std::is_nothrow_move_constructible_v<_Fn>)
235
+ {
236
+ return ::cuda::std::move(*__func_);
237
+ }
238
+
239
+ struct __zip_transform_op_star
240
+ {
241
+ _Fn& __func_;
242
+
243
+ _CCCL_EXEC_CHECK_DISABLE
244
+ [[nodiscard]] _CCCL_API constexpr reference operator()(const _Iterators&... __iters) const
245
+ noexcept(::cuda::std::is_nothrow_invocable_v<_Fn&, ::cuda::std::iter_reference_t<const _Iterators>...>)
246
+ {
247
+ return ::cuda::std::invoke(const_cast<_Fn&>(__func_), *__iters...);
248
+ }
249
+ };
250
+
251
+ //! @brief Invokes the stored function with the result of dereferencing the stored iterators
252
+ [[nodiscard]] _CCCL_API constexpr reference operator*() const
253
+ noexcept(::cuda::std::is_nothrow_invocable_v<_Fn&, ::cuda::std::iter_reference_t<const _Iterators>...>)
254
+ {
255
+ return ::cuda::std::apply(__zip_transform_op_star{const_cast<_Fn&>(*__func_)}, __current_);
256
+ }
257
+
258
+ struct __zip_transform_op_subscript
259
+ {
260
+ difference_type __n_;
261
+ _Fn& __func_;
262
+
263
+ _CCCL_EXEC_CHECK_DISABLE
264
+ [[nodiscard]] _CCCL_API constexpr reference operator()(const _Iterators&... __iters) const noexcept(noexcept(
265
+ ::cuda::std::invoke(const_cast<_Fn&>(__func_), __iters[::cuda::std::iter_difference_t<_Iterators>(__n_)]...)))
266
+ {
267
+ return ::cuda::std::invoke(
268
+ const_cast<_Fn&>(__func_), __iters[::cuda::std::iter_difference_t<_Iterators>(__n_)]...);
269
+ }
270
+ };
271
+
272
+ //! @brief Invokes the stored function with the result of dereferencing the stored iterators advanced by an offset
273
+ //! @param __n The additional offset
274
+ _CCCL_TEMPLATE(class _Constraints = __zip_iter_constraints<_Iterators...>)
275
+ _CCCL_REQUIRES(_Constraints::__all_random_access)
276
+ _CCCL_API constexpr reference operator[](difference_type __n) const
277
+ noexcept(noexcept(::cuda::std::apply(__zip_transform_op_subscript{__n, const_cast<_Fn&>(*__func_)}, __current_)))
278
+ {
279
+ return ::cuda::std::apply(__zip_transform_op_subscript{__n, const_cast<_Fn&>(*__func_)}, __current_);
280
+ }
281
+
282
+ //! @brief Increments all stored iterators
283
+ _CCCL_API constexpr zip_transform_iterator&
284
+ operator++() noexcept(noexcept(::cuda::std::apply(__zip_op_increment{}, __current_)))
285
+ {
286
+ ::cuda::std::apply(__zip_op_increment{}, __current_);
287
+ return *this;
288
+ }
289
+
290
+ //! @brief Increments all stored iterators
291
+ //! @returns A copy of the original @c zip_transform_iterator if possible
292
+ _CCCL_API constexpr auto operator++(int)
293
+ {
294
+ if constexpr (__zip_iter_constraints<_Iterators...>::__all_forward)
295
+ {
296
+ auto __tmp = *this;
297
+ ++*this;
298
+ return __tmp;
299
+ }
300
+ else
301
+ {
302
+ ++*this;
303
+ }
304
+ }
305
+
306
+ //! @brief Decrements all stored iterators
307
+ _CCCL_TEMPLATE(class _Constraints = __zip_iter_constraints<_Iterators...>)
308
+ _CCCL_REQUIRES(_Constraints::__all_bidirectional)
309
+ _CCCL_API constexpr zip_transform_iterator&
310
+ operator--() noexcept(noexcept(::cuda::std::apply(__zip_op_decrement{}, __current_)))
311
+ {
312
+ ::cuda::std::apply(__zip_op_decrement{}, __current_);
313
+ return *this;
314
+ }
315
+
316
+ //! @brief Decrements all stored iterators
317
+ _CCCL_TEMPLATE(class _Constraints = __zip_iter_constraints<_Iterators...>)
318
+ _CCCL_REQUIRES(_Constraints::__all_bidirectional)
319
+ _CCCL_API constexpr zip_transform_iterator operator--(int)
320
+ {
321
+ auto __tmp = *this;
322
+ --*this;
323
+ return __tmp;
324
+ }
325
+
326
+ struct __zip_op_pe
327
+ {
328
+ difference_type __n;
329
+
330
+ _CCCL_EXEC_CHECK_DISABLE
331
+ _CCCL_API constexpr void operator()(_Iterators&... __iters) const
332
+ noexcept(noexcept(((void) (__iters += ::cuda::std::iter_difference_t<_Iterators>(__n)), ...)))
333
+ {
334
+ ((void) (__iters += ::cuda::std::iter_difference_t<_Iterators>(__n)), ...);
335
+ }
336
+ };
337
+
338
+ //! @brief Increments all stored iterators by a given number of elements
339
+ //! @param __n The number of elements to increment
340
+ _CCCL_TEMPLATE(class _Constraints = __zip_iter_constraints<_Iterators...>)
341
+ _CCCL_REQUIRES(_Constraints::__all_random_access)
342
+ _CCCL_API constexpr zip_transform_iterator&
343
+ operator+=(difference_type __n) noexcept(noexcept(::cuda::std::apply(__zip_op_pe{__n}, __current_)))
344
+ {
345
+ ::cuda::std::apply(__zip_op_pe{__n}, __current_);
346
+ return *this;
347
+ }
348
+
349
+ struct __zip_op_me
350
+ {
351
+ difference_type __n;
352
+
353
+ _CCCL_EXEC_CHECK_DISABLE
354
+ _CCCL_API constexpr void operator()(_Iterators&... __iters) const
355
+ noexcept(noexcept(((void) (__iters -= ::cuda::std::iter_difference_t<_Iterators>(__n)), ...)))
356
+ {
357
+ ((void) (__iters -= ::cuda::std::iter_difference_t<_Iterators>(__n)), ...);
358
+ }
359
+ };
360
+
361
+ //! @brief Decrements all stored iterators by a given number of elements
362
+ //! @param __n The number of elements to decrement
363
+ _CCCL_TEMPLATE(class _Constraints = __zip_iter_constraints<_Iterators...>)
364
+ _CCCL_REQUIRES(_Constraints::__all_random_access)
365
+ _CCCL_API constexpr zip_transform_iterator& operator-=(difference_type __n)
366
+ {
367
+ ::cuda::std::apply(__zip_op_me{__n}, __current_);
368
+ return *this;
369
+ }
370
+
371
+ //! @brief Returns a copy of a @c zip_transform_iterator incremented by a given number of elements
372
+ //! @param __iter The @c zip_transform_iterator to increment
373
+ //! @param __n The number of elements to increment
374
+ template <class _Constraints = __zip_iter_constraints<_Iterators...>>
375
+ _CCCL_API friend constexpr auto operator+(const zip_transform_iterator& __iter, difference_type __n)
376
+ _CCCL_TRAILING_REQUIRES(zip_transform_iterator)(_Constraints::__all_random_access)
377
+ {
378
+ auto __rhs = __iter;
379
+ __rhs += __n;
380
+ return __rhs;
381
+ }
382
+
383
+ //! @brief Returns a copy of a @c zip_transform_iterator incremented by a given number of elements
384
+ //! @param __n The number of elements to increment
385
+ //! @param __iter The @c zip_transform_iterator to increment
386
+ template <class _Constraints = __zip_iter_constraints<_Iterators...>>
387
+ _CCCL_API friend constexpr auto operator+(difference_type __n, const zip_transform_iterator& __iter)
388
+ _CCCL_TRAILING_REQUIRES(zip_transform_iterator)(_Constraints::__all_random_access)
389
+ {
390
+ return __iter + __n;
391
+ }
392
+
393
+ //! @brief Returns a copy of a @c zip_transform_iterator decremented by a given number of elements
394
+ //! @param __iter The @c zip_transform_iterator to decrement
395
+ //! @param __n The number of elements to decrement
396
+ template <class _Constraints = __zip_iter_constraints<_Iterators...>>
397
+ _CCCL_API friend constexpr auto operator-(const zip_transform_iterator& __iter, difference_type __n)
398
+ _CCCL_TRAILING_REQUIRES(zip_transform_iterator)(_Constraints::__all_random_access)
399
+ {
400
+ auto __rhs = __iter;
401
+ __rhs -= __n;
402
+ return __rhs;
403
+ }
404
+
405
+ struct __zip_op_minus
406
+ {
407
+ struct __less_abs
408
+ {
409
+ // abs in cstdlib is not constexpr
410
+ _CCCL_EXEC_CHECK_DISABLE
411
+ [[nodiscard]] _CCCL_API static constexpr difference_type
412
+ __abs(difference_type __t) noexcept(noexcept(__t < 0 ? -__t : __t))
413
+ {
414
+ return __t < 0 ? -__t : __t;
415
+ }
416
+
417
+ _CCCL_EXEC_CHECK_DISABLE
418
+ [[nodiscard]] _CCCL_API constexpr bool operator()(difference_type __n, difference_type __y) const
419
+ noexcept(noexcept(__abs(__n) < __abs(__y)))
420
+ {
421
+ return __abs(__n) < __abs(__y);
422
+ }
423
+ };
424
+
425
+ _CCCL_EXEC_CHECK_DISABLE
426
+ template <size_t _Zero, size_t... _Indices>
427
+ [[nodiscard]] _CCCL_API constexpr difference_type
428
+ operator()(const __tuple_or_pair<_Iterators...>& __iters1,
429
+ const __tuple_or_pair<_Iterators...>& __iters2,
430
+ ::cuda::std::index_sequence<_Zero, _Indices...>) const //
431
+ noexcept(noexcept(((::cuda::std::get<_Indices>(__iters1) - ::cuda::std::get<_Indices>(__iters2)) && ...)))
432
+ {
433
+ const auto __first = static_cast<difference_type>(::cuda::std::get<0>(__iters1) - ::cuda::std::get<0>(__iters2));
434
+ if (__first == 0)
435
+ {
436
+ return __first;
437
+ }
438
+
439
+ const difference_type __temp[] = {
440
+ __first,
441
+ static_cast<difference_type>(::cuda::std::get<_Indices>(__iters1) - ::cuda::std::get<_Indices>(__iters2))...};
442
+ return *::cuda::std::ranges::min_element(__temp, __zip_op_minus::__less_abs{});
443
+ }
444
+ };
445
+
446
+ //! @brief Returns the distance between two @c zip_transform_iterators
447
+ //! @returns The distance with the smallest absolute value among the distances of the stored iterators
448
+ template <class _Constraints = __zip_iter_constraints<_Iterators...>>
449
+ _CCCL_API friend constexpr auto operator-(const zip_transform_iterator& __n, const zip_transform_iterator& __y)
450
+ _CCCL_TRAILING_REQUIRES(difference_type)(_Constraints::__all_sized_sentinel)
451
+ {
452
+ return __zip_apply(__zip_op_minus{}, __n.__current_, __y.__current_);
453
+ }
454
+
455
+ struct __zip_op_eq
456
+ {
457
+ _CCCL_EXEC_CHECK_DISABLE
458
+ template <size_t... _Indices>
459
+ _CCCL_API constexpr bool operator()(const __tuple_or_pair<_Iterators...>& __iters1,
460
+ const __tuple_or_pair<_Iterators...>& __iters2,
461
+ ::cuda::std::index_sequence<_Indices...>) const
462
+ noexcept(noexcept(((::cuda::std::get<_Indices>(__iters1) == ::cuda::std::get<_Indices>(__iters2)) || ...)))
463
+ {
464
+ return ((::cuda::std::get<_Indices>(__iters1) == ::cuda::std::get<_Indices>(__iters2)) || ...);
465
+ }
466
+ };
467
+
468
+ //! @brief Compares two @c zip_transform_iterator for equality: the stored iterator tuples are compared directly when all iterators are bidirectional, otherwise the result is true if any pair of stored iterators is equal
469
+ template <class _Constraints = __zip_iter_constraints<_Iterators...>>
470
+ _CCCL_API friend constexpr auto operator==(const zip_transform_iterator& __n, const zip_transform_iterator& __y)
471
+ _CCCL_TRAILING_REQUIRES(bool)(_Constraints::__all_equality_comparable)
472
+ {
473
+ if constexpr (_Constraints::__all_bidirectional)
474
+ {
475
+ return __n.__current_ == __y.__current_; // direct comparison of the stored iterator tuples
476
+ }
477
+ else
478
+ {
479
+ return __zip_apply(__zip_op_eq{}, __n.__current_, __y.__current_); // any-pair-equal, see __zip_op_eq
480
+ }
481
+ _CCCL_UNREACHABLE(); // NOTE(review): both branches return; presumably here to silence missing-return diagnostics - confirm
482
+ }
483
+
484
+ #if _CCCL_STD_VER <= 2017
485
+ //! @brief Compares two @c zip_transform_iterator for inequality; each branch is the negation of the matching branch of @c operator== (only declared when _CCCL_STD_VER <= 2017)
486
+ template <class _Constraints = __zip_iter_constraints<_Iterators...>>
487
+ _CCCL_API friend constexpr auto operator!=(const zip_transform_iterator& __n, const zip_transform_iterator& __y)
488
+ _CCCL_TRAILING_REQUIRES(bool)(_Constraints::__all_equality_comparable)
489
+ {
490
+ if constexpr (_Constraints::__all_bidirectional)
491
+ {
492
+ return __n.__current_ != __y.__current_; // direct comparison of the stored iterator tuples
493
+ }
494
+ else
495
+ {
496
+ return !__zip_apply(__zip_op_eq{}, __n.__current_, __y.__current_); // unequal only if no pair of stored iterators is equal
497
+ }
498
+ _CCCL_UNREACHABLE(); // NOTE(review): both branches return; presumably here to silence missing-return diagnostics - confirm
499
+ }
500
+ #endif // _CCCL_STD_VER <= 2017
501
+
502
+ #if _LIBCUDACXX_HAS_SPACESHIP_OPERATOR()
503
+ //! @brief Three-way compares two @c zip_transform_iterator by three-way comparing the tuples of stored iterators
504
+ template <class _Constraints = __zip_iter_constraints<_Iterators...>>
505
+ _CCCL_API friend constexpr auto operator<=>(const zip_transform_iterator& __n, const zip_transform_iterator& __y)
506
+ _CCCL_TRAILING_REQUIRES(bool)(_Constraints::__all_random_access&& _Constraints::__all_three_way_comparable)
507
+ {
508
+ return __n.__current_ <=> __y.__current_; // NOTE(review): declared return type above is bool, yet this yields the result of <=> - confirm the intended return type
509
+ }
510
+
511
+ #else // ^^^ _LIBCUDACXX_HAS_SPACESHIP_OPERATOR() ^^^ / vvv !_LIBCUDACXX_HAS_SPACESHIP_OPERATOR() vvv
512
+
513
+ //! @brief Compares two @c zip_transform_iterator for less than by comparing the tuples of stored iterators with @c operator<
514
+ template <class _Constraints = __zip_iter_constraints<_Iterators...>>
515
+ _CCCL_API friend constexpr auto operator<(const zip_transform_iterator& __n, const zip_transform_iterator& __y)
516
+ _CCCL_TRAILING_REQUIRES(bool)(_Constraints::__all_random_access)
517
+ {
518
+ return __n.__current_ < __y.__current_; // the other relational operators are expressed through this one
519
+ }
520
+
521
+ //! @brief Compares two @c zip_transform_iterator for greater than; implemented as @c operator< with the operands swapped
522
+ template <class _Constraints = __zip_iter_constraints<_Iterators...>>
523
+ _CCCL_API friend constexpr auto operator>(const zip_transform_iterator& __n, const zip_transform_iterator& __y)
524
+ _CCCL_TRAILING_REQUIRES(bool)(_Constraints::__all_random_access)
525
+ {
526
+ return __y < __n; // a > b  <=>  b < a
527
+ }
528
+
529
+ //! @brief Compares two @c zip_transform_iterator for less equal; implemented as the negation of @c operator< with the operands swapped
530
+ template <class _Constraints = __zip_iter_constraints<_Iterators...>>
531
+ _CCCL_API friend constexpr auto operator<=(const zip_transform_iterator& __n, const zip_transform_iterator& __y)
532
+ _CCCL_TRAILING_REQUIRES(bool)(_Constraints::__all_random_access)
533
+ {
534
+ return !(__y < __n); // a <= b  <=>  !(b < a)
535
+ }
536
+
537
+ //! @brief Compares two @c zip_transform_iterator for greater equal; implemented as the negation of @c operator<
538
+ template <class _Constraints = __zip_iter_constraints<_Iterators...>>
539
+ _CCCL_API friend constexpr auto operator>=(const zip_transform_iterator& __n, const zip_transform_iterator& __y)
540
+ _CCCL_TRAILING_REQUIRES(bool)(_Constraints::__all_random_access)
541
+ {
542
+ return !(__n < __y); // a >= b  <=>  !(a < b)
543
+ }
544
+ #endif // !_LIBCUDACXX_HAS_SPACESHIP_OPERATOR()
545
+ };
546
+
547
+ template <class _Fn, class... _Iterators> // deduction guide: functor + tuple of iterators
548
+ _CCCL_HOST_DEVICE zip_transform_iterator(_Fn, ::cuda::std::tuple<_Iterators...>)
549
+ -> zip_transform_iterator<_Fn, _Iterators...>; // the tuple's element types become the iterator pack
550
+
551
+ template <class _Fn, class _Iterator1, class _Iterator2> // deduction guide: functor + pair of iterators
552
+ _CCCL_HOST_DEVICE zip_transform_iterator(_Fn, ::cuda::std::pair<_Iterator1, _Iterator2>)
553
+ -> zip_transform_iterator<_Fn, _Iterator1, _Iterator2>; // the pair's two member types become the iterator pack
554
+
555
+ template <class _Fn, class... _Iterators> // deduction guide: functor + iterators passed as separate arguments
556
+ _CCCL_HOST_DEVICE zip_transform_iterator(_Fn, _Iterators...) -> zip_transform_iterator<_Fn, _Iterators...>;
557
+
558
+ //! @brief Creates a @c zip_transform_iterator from a functor and a tuple of iterators.
559
+ //! @param __t The tuple of iterators to wrap (moved into the result); @c __fun is the functor, likewise moved into the result
560
+ template <class _Fn, class... _Iterators>
561
+ [[nodiscard]] _CCCL_API constexpr auto
562
+ make_zip_transform_iterator(_Fn __fun, ::cuda::std::tuple<_Iterators...> __t) noexcept(
563
+ ::cuda::std::is_nothrow_move_constructible_v<_Fn>
564
+ && __zip_iter_constraints<_Iterators...>::__all_nothrow_move_constructible)
565
+ {
566
+ return zip_transform_iterator<_Fn, _Iterators...>{::cuda::std::move(__fun), ::cuda::std::move(__t)}; // both by-value arguments are moved, matching the noexcept condition above
567
+ }
568
+
569
+ //! @brief Creates a @c zip_transform_iterator from a functor and a variadic number of iterators.
570
+ //! @param __iters The iterators to wrap (each moved into the result); @c __fun is the functor, likewise moved into the result
571
+ template <class _Fn, class... _Iterators>
572
+ [[nodiscard]] _CCCL_API constexpr auto make_zip_transform_iterator(_Fn __fun, _Iterators... __iters) noexcept(
573
+ ::cuda::std::is_nothrow_move_constructible_v<_Fn>
574
+ && __zip_iter_constraints<_Iterators...>::__all_nothrow_move_constructible)
575
+ {
576
+ return zip_transform_iterator<_Fn, _Iterators...>{::cuda::std::move(__fun), ::cuda::std::move(__iters)...}; // both the functor and every iterator are moved, matching the noexcept condition above
577
+ }
578
+
579
+ //! @}
580
+
581
+ _CCCL_END_NAMESPACE_CUDA
582
+
583
+ // GCC and MSVC2019 have issues determining __is_fancy_pointer in C++17 because they fail to instantiate pointer_traits
584
+ #if (_CCCL_COMPILER(GCC) || _CCCL_COMPILER(MSVC)) && _CCCL_STD_VER <= 2017
585
+ _CCCL_BEGIN_NAMESPACE_CUDA_STD
586
+ template <class _Fn, class... _Iterators> // applies to every zip_transform_iterator instantiation
587
+ inline constexpr bool __is_fancy_pointer<::cuda::zip_transform_iterator<_Fn, _Iterators...>> = false; // hard-codes the trait to false for this iterator type
588
+ _CCCL_END_NAMESPACE_CUDA_STD
589
+ #endif // (_CCCL_COMPILER(GCC) || _CCCL_COMPILER(MSVC)) && _CCCL_STD_VER <= 2017
590
+
591
+ #include <cuda/std/__cccl/epilogue.h>
592
+
593
+ #endif // _CUDA___ITERATOR_ZIP_TRANSFORM_ITERATOR_H
@@ -11,7 +11,7 @@
11
11
  #ifndef _CUDA___RUNTIME_ENSURE_CURRENT_CONTEXT_H
12
12
  #define _CUDA___RUNTIME_ENSURE_CURRENT_CONTEXT_H
13
13
 
14
- #include <cuda/__cccl_config>
14
+ #include <cuda/std/detail/__config>
15
15
 
16
16
  #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17
17
  # pragma GCC system_header
@@ -23,7 +23,8 @@
23
23
 
24
24
  #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
25
25
 
26
- # include <cuda/__device/all_devices.h>
26
+ # include <cuda/__device/device_ref.h>
27
+ # include <cuda/__device/physical_device.h>
27
28
  # include <cuda/__driver/driver_api.h>
28
29
 
29
30
  # include <cuda/std/__cccl/prologue.h>
@@ -46,7 +47,7 @@ struct [[maybe_unused]] __ensure_current_context
46
47
  //! @throws cuda_error if the context switch fails
47
48
  explicit __ensure_current_context(device_ref __new_device)
48
49
  {
49
- auto __ctx = devices[__new_device.get()].primary_context();
50
+ auto __ctx = ::cuda::__physical_devices()[__new_device.get()].__primary_context();
50
51
  ::cuda::__driver::__ctxPush(__ctx);
51
52
  }
52
53
 
@@ -23,6 +23,7 @@
23
23
 
24
24
  #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
25
25
 
26
+ # include <cuda/__device/device_ref.h>
26
27
  # include <cuda/__driver/driver_api.h>
27
28
  # include <cuda/__event/timed_event.h>
28
29
  # include <cuda/__fwd/get_stream.h>
@@ -11,7 +11,7 @@
11
11
  #ifndef _CUDA___UTILITY_BASIC_ANY_H
12
12
  #define _CUDA___UTILITY_BASIC_ANY_H
13
13
 
14
- #include <cuda/__cccl_config>
14
+ #include <cuda/std/detail/__config>
15
15
 
16
16
  #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17
17
  # pragma GCC system_header
@@ -11,7 +11,7 @@
11
11
  #ifndef _CUDA_ALGORITHM
12
12
  #define _CUDA_ALGORITHM
13
13
 
14
- #include <cuda/__cccl_config>
14
+ #include <cuda/std/detail/__config>
15
15
 
16
16
  #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17
17
  # pragma GCC system_header
@@ -11,6 +11,16 @@
11
11
  #ifndef _CUDA_DEVICES
12
12
  #define _CUDA_DEVICES
13
13
 
14
+ #include <cuda/std/detail/__config>
15
+
16
+ #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17
+ # pragma GCC system_header
18
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
19
+ # pragma clang system_header
20
+ #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
21
+ # pragma system_header
22
+ #endif // no system header
23
+
14
24
  #include <cuda/__device/all_devices.h>
15
25
  #include <cuda/__device/arch_traits.h>
16
26
  #include <cuda/__device/attributes.h>
@@ -33,6 +33,7 @@
33
33
  #include <cuda/__iterator/transform_output_iterator.h>
34
34
  #include <cuda/__iterator/zip_function.h>
35
35
  #include <cuda/__iterator/zip_iterator.h>
36
+ #include <cuda/__iterator/zip_transform_iterator.h>
36
37
  #include <cuda/std/iterator>
37
38
 
38
39
  #endif // _CUDA_ITERATOR