cuda-cccl 0.3.1__cp310-cp310-manylinux_2_24_aarch64.whl → 0.3.2__cp310-cp310-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (185)
  1. cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
  2. cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
  3. cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
  4. cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
  5. cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
  6. cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
  7. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1 -0
  8. cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
  9. cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
  10. cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
  11. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
  12. cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
  13. cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
  14. cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
  15. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +12 -13
  16. cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
  17. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +2 -3
  18. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +4 -3
  19. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -1
  20. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
  21. cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
  22. cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
  23. cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
  24. cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
  25. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
  26. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
  27. cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
  28. cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
  29. cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
  30. cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
  31. cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
  32. cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
  33. cuda/cccl/headers/include/cuda/__cccl_config +1 -0
  34. cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
  35. cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
  36. cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
  37. cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
  38. cuda/cccl/headers/include/cuda/__device/arch_traits.h +239 -317
  39. cuda/cccl/headers/include/cuda/__device/attributes.h +4 -3
  40. cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
  41. cuda/cccl/headers/include/cuda/__device/device_ref.h +0 -10
  42. cuda/cccl/headers/include/cuda/__device/physical_device.h +1 -26
  43. cuda/cccl/headers/include/cuda/__event/event.h +26 -26
  44. cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
  45. cuda/cccl/headers/include/cuda/__event/timed_event.h +9 -7
  46. cuda/cccl/headers/include/cuda/__fwd/devices.h +4 -4
  47. cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
  48. cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
  49. cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
  50. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
  51. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
  52. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
  53. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +2 -12
  54. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +15 -19
  55. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +59 -60
  56. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
  57. cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
  58. cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
  59. cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
  60. cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
  61. cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
  62. cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
  63. cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
  64. cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
  65. cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
  66. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +5 -4
  67. cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
  68. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +17 -16
  69. cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
  70. cuda/cccl/headers/include/cuda/cmath +1 -0
  71. cuda/cccl/headers/include/cuda/devices +3 -0
  72. cuda/cccl/headers/include/cuda/memory +1 -0
  73. cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
  74. cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
  75. cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
  76. cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
  77. cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
  78. cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
  79. cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
  80. cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
  81. cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
  82. cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
  83. cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
  84. cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
  85. cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
  86. cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
  87. cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
  88. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
  89. cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
  90. cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
  91. cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
  92. cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
  93. cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
  94. cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
  95. cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
  96. cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
  97. cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
  98. cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
  99. cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
  100. cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
  101. cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
  102. cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
  103. cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
  104. cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
  105. cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
  106. cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
  107. cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
  108. cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
  109. cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
  110. cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
  111. cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
  112. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
  113. cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
  114. cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
  115. cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
  116. cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
  117. cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
  118. cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
  119. cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
  120. cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
  121. cuda/cccl/headers/include/cuda/std/bitset +1 -1
  122. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
  123. cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
  124. cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
  125. cuda/cccl/headers/include/cuda/std/numbers +5 -0
  126. cuda/cccl/headers/include/cuda/std/string_view +146 -11
  127. cuda/cccl/headers/include/cuda/stream_ref +5 -0
  128. cuda/cccl/headers/include/cuda/utility +1 -0
  129. cuda/cccl/headers/include/nv/target +7 -2
  130. cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
  131. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
  132. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
  133. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
  134. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
  135. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
  136. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
  137. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
  138. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
  139. cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
  140. cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
  141. cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
  142. cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
  143. cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
  144. cuda/cccl/headers/include/thrust/device_delete.h +18 -3
  145. cuda/cccl/headers/include/thrust/device_free.h +16 -3
  146. cuda/cccl/headers/include/thrust/device_new.h +29 -8
  147. cuda/cccl/headers/include/thrust/host_vector.h +1 -1
  148. cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
  149. cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
  150. cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
  151. cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
  152. cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
  153. cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
  154. cuda/compute/__init__.py +2 -0
  155. cuda/compute/_bindings.pyi +43 -1
  156. cuda/compute/_bindings_impl.pyx +156 -7
  157. cuda/compute/algorithms/_scan.py +108 -36
  158. cuda/compute/algorithms/_transform.py +32 -11
  159. cuda/compute/cu12/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  160. cuda/compute/cu12/cccl/libcccl.c.parallel.so +0 -0
  161. cuda/compute/cu13/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  162. cuda/compute/cu13/cccl/libcccl.c.parallel.so +0 -0
  163. cuda/compute/iterators/__init__.py +2 -0
  164. cuda/compute/iterators/_factories.py +28 -0
  165. cuda/compute/iterators/_iterators.py +206 -1
  166. cuda/compute/numba_utils.py +2 -2
  167. cuda/compute/typing.py +2 -0
  168. {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
  169. {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +171 -175
  170. cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
  171. cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
  172. cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
  173. cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
  174. cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
  175. cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
  176. cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
  177. cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
  178. cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
  179. cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
  180. cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
  181. cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
  182. cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
  183. cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
  184. {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
  185. {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
@@ -39,11 +39,13 @@
39
39
  #if _CCCL_HAS_CUDA_COMPILER()
40
40
  # include <thrust/system/cuda/config.h>
41
41
 
42
- # include <thrust/distance.h>
43
- # include <thrust/iterator/counting_iterator.h>
44
- # include <thrust/iterator/transform_iterator.h>
45
42
  # include <thrust/system/cuda/detail/execution_policy.h>
46
43
 
44
+ # include <cuda/__iterator/counting_iterator.h>
45
+ # include <cuda/__iterator/transform_iterator.h>
46
+ # include <cuda/__iterator/zip_iterator.h>
47
+ # include <cuda/std/__iterator/distance.h>
48
+
47
49
  THRUST_NAMESPACE_BEGIN
48
50
  namespace cuda_cub
49
51
  {
@@ -62,7 +64,6 @@ InputIt _CCCL_HOST_DEVICE find(execution_policy<Derived>& policy, InputIt first,
62
64
  }; // namespace cuda_cub
63
65
  THRUST_NAMESPACE_END
64
66
 
65
- # include <thrust/iterator/zip_iterator.h>
66
67
  # include <thrust/system/cuda/detail/reduce.h>
67
68
 
68
69
  THRUST_NAMESPACE_BEGIN
@@ -92,109 +93,13 @@ struct functor
92
93
  }
93
94
  }
94
95
  };
95
-
96
- template <class ValueType, class InputIt, class UnaryOp>
97
- struct transform_input_iterator_t
98
- {
99
- using self_t = transform_input_iterator_t;
100
- using difference_type = thrust::detail::it_difference_t<InputIt>;
101
- using value_type = ValueType;
102
- using pointer = void;
103
- using reference = value_type;
104
- using iterator_category = ::cuda::std::random_access_iterator_tag;
105
-
106
- InputIt input;
107
- mutable UnaryOp op;
108
-
109
- _CCCL_HOST_DEVICE _CCCL_FORCEINLINE transform_input_iterator_t(InputIt input, UnaryOp op)
110
- : input(input)
111
- , op(op)
112
- {}
113
-
114
- transform_input_iterator_t(const self_t&) = default;
115
-
116
- // UnaryOp might not be copy assignable, such as when it is a lambda. Define
117
- // an explicit copy assignment operator that doesn't try to assign it.
118
- _CCCL_HOST_DEVICE self_t& operator=(const self_t& o)
119
- {
120
- input = o.input;
121
- return *this;
122
- }
123
-
124
- _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_t operator++(int)
125
- {
126
- self_t retval = *this;
127
- ++input;
128
- return retval;
129
- }
130
-
131
- _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_t operator++()
132
- {
133
- ++input;
134
- return *this;
135
- }
136
-
137
- _CCCL_HOST_DEVICE _CCCL_FORCEINLINE reference operator*() const
138
- {
139
- thrust::detail::it_value_t<InputIt> x = *input;
140
- return op(x);
141
- }
142
-
143
- _CCCL_HOST_DEVICE _CCCL_FORCEINLINE reference operator*()
144
- {
145
- thrust::detail::it_value_t<InputIt> x = *input;
146
- return op(x);
147
- }
148
-
149
- _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_t operator+(difference_type n) const
150
- {
151
- return self_t(input + n, op);
152
- }
153
-
154
- _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_t& operator+=(difference_type n)
155
- {
156
- input += n;
157
- return *this;
158
- }
159
-
160
- _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_t operator-(difference_type n) const
161
- {
162
- return self_t(input - n, op);
163
- }
164
-
165
- _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_t& operator-=(difference_type n)
166
- {
167
- input -= n;
168
- return *this;
169
- }
170
-
171
- _CCCL_HOST_DEVICE _CCCL_FORCEINLINE difference_type operator-(self_t other) const
172
- {
173
- return input - other.input;
174
- }
175
-
176
- _CCCL_HOST_DEVICE _CCCL_FORCEINLINE reference operator[](difference_type n) const
177
- {
178
- return op(input[n]);
179
- }
180
-
181
- _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator==(const self_t& rhs) const
182
- {
183
- return (input == rhs.input);
184
- }
185
-
186
- _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator!=(const self_t& rhs) const
187
- {
188
- return (input != rhs.input);
189
- }
190
- };
191
96
  } // namespace __find_if
192
97
 
193
98
  template <class Derived, class InputIt, class Size, class Predicate>
194
99
  InputIt _CCCL_HOST_DEVICE
195
100
  find_if_n(execution_policy<Derived>& policy, InputIt first, Size num_items, Predicate predicate)
196
101
  {
197
- using result_type = typename thrust::tuple<bool, Size>;
102
+ using result_type = ::cuda::std::tuple<bool, Size>;
198
103
 
199
104
  // empty sequence
200
105
  if (num_items == 0)
@@ -212,27 +117,20 @@ find_if_n(execution_policy<Derived>& policy, InputIt first, Size num_items, Pred
212
117
  const Size interval_threshold = 1 << 20;
213
118
  const Size interval_size = (::cuda::std::min) (interval_threshold, num_items);
214
119
 
215
- // FIXME(bgruber): we should also be able to use transform_iterator here, but it makes nvc++ hang. See:
216
- // https://github.com/NVIDIA/cccl/issues/3594. The problem does not occur with nvcc, so we could not add a test :/
217
- using XfrmIterator = __find_if::transform_input_iterator_t<bool, InputIt, Predicate>;
218
- // using XfrmIterator = transform_iterator<Predicate, InputIt>;
219
- using IteratorTuple = thrust::tuple<XfrmIterator, counting_iterator<Size>>;
220
- using ZipIterator = thrust::zip_iterator<IteratorTuple>;
221
-
222
- IteratorTuple iter_tuple = thrust::make_tuple(XfrmIterator(first, predicate), counting_iterator<Size>(0));
223
-
224
- ZipIterator begin = thrust::make_zip_iterator(iter_tuple);
225
- ZipIterator end = begin + num_items;
120
+ const auto begin = ::cuda::make_zip_iterator(
121
+ ::cuda::make_transform_iterator(try_unwrap_contiguous_iterator(first), predicate),
122
+ ::cuda::counting_iterator<Size>(0));
123
+ const auto end = begin + num_items;
226
124
 
227
- for (ZipIterator interval_begin = begin; interval_begin < end; interval_begin += interval_size)
125
+ for (auto interval_begin = begin; interval_begin < end; interval_begin += interval_size)
228
126
  {
229
- ZipIterator interval_end = interval_begin + interval_size;
127
+ auto interval_end = interval_begin + interval_size;
230
128
  if (end < interval_end)
231
129
  {
232
130
  interval_end = end;
233
131
  } // end if
234
132
 
235
- result_type result = reduce(
133
+ const result_type result = reduce(
236
134
  policy, interval_begin, interval_end, result_type(false, interval_end - begin), __find_if::functor<result_type>());
237
135
 
238
136
  // see if we found something
@@ -73,12 +73,14 @@ struct transform_pair_of_input_iterators_t
73
73
  using value_type = ValueType;
74
74
  using pointer = void;
75
75
  using reference = value_type;
76
- using iterator_category = std::random_access_iterator_tag;
76
+ using iterator_category = ::cuda::std::random_access_iterator_tag;
77
77
 
78
78
  InputIt1 input1;
79
79
  InputIt2 input2;
80
80
  mutable BinaryOp op;
81
81
 
82
+ transform_pair_of_input_iterators_t() = default;
83
+
82
84
  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE
83
85
  transform_pair_of_input_iterators_t(InputIt1 input1_, InputIt2 input2_, BinaryOp op_)
84
86
  : input1(input1_)
@@ -107,7 +109,7 @@ struct transform_pair_of_input_iterators_t
107
109
  }
108
110
 
109
111
  /// Prefix increment
110
- _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_t operator++()
112
+ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_t& operator++()
111
113
  {
112
114
  ++input1;
113
115
  ++input2;
@@ -177,6 +179,10 @@ struct transform_pair_of_input_iterators_t
177
179
  return (input1 != rhs.input1) || (input2 != rhs.input2);
178
180
  }
179
181
 
182
+ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator<(const self_t& rhs) const
183
+ {
184
+ return input1 < rhs.input1;
185
+ }
180
186
  }; // struct transform_pair_of_input_iterators_t
181
187
  } // namespace detail
182
188
 
@@ -79,7 +79,7 @@ namespace detail
79
79
  template <typename Iterator>
80
80
  inline constexpr bool is_libcxx_wrap_iter_v = false;
81
81
 
82
- #if defined(_LIBCPP_VERSION)
82
+ #if _CCCL_HOST_STD_LIB(LIBCXX)
83
83
  template <typename Iterator>
84
84
  inline constexpr bool is_libcxx_wrap_iter_v<
85
85
  # if _LIBCPP_VERSION < 14000
@@ -88,23 +88,23 @@ inline constexpr bool is_libcxx_wrap_iter_v<
88
88
  std::__wrap_iter<Iterator>
89
89
  # endif
90
90
  > = true;
91
- #endif
91
+ #endif // _CCCL_HOST_STD_LIB(LIBCXX)
92
92
 
93
93
  template <typename Iterator>
94
94
  inline constexpr bool is_libstdcxx_normal_iterator_v = false;
95
95
 
96
- #if defined(__GLIBCXX__)
96
+ #if _CCCL_HOST_STD_LIB(LIBSTDCXX)
97
97
  template <typename Iterator, typename Container>
98
98
  inline constexpr bool is_libstdcxx_normal_iterator_v<::__gnu_cxx::__normal_iterator<Iterator, Container>> = true;
99
- #endif
99
+ #endif // _CCCL_HOST_STD_LIB(LIBSTDCXX)
100
100
 
101
- #if _CCCL_COMPILER(MSVC)
101
+ #if _CCCL_HOST_STD_LIB(STL)
102
102
  template <typename Iterator>
103
103
  inline constexpr bool is_msvc_contiguous_iterator_v = ::cuda::std::is_pointer_v<::std::_Unwrapped_t<Iterator>>;
104
- #else
104
+ #else // ^^^ _CCCL_HOST_STD_LIB(STL) ^^^ / vvv !_CCCL_HOST_STD_LIB(STL) vvv
105
105
  template <typename Iterator>
106
106
  inline constexpr bool is_msvc_contiguous_iterator_v = false;
107
- #endif
107
+ #endif // ^^^ !_CCCL_HOST_STD_LIB(STL) ^^^
108
108
 
109
109
  template <typename Iterator>
110
110
  inline constexpr bool is_contiguous_iterator_impl_v =
cuda/compute/__init__.py CHANGED
@@ -32,6 +32,7 @@ from .iterators import (
32
32
  CacheModifiedInputIterator,
33
33
  ConstantIterator,
34
34
  CountingIterator,
35
+ PermutationIterator,
35
36
  ReverseIterator,
36
37
  TransformIterator,
37
38
  TransformOutputIterator,
@@ -63,6 +64,7 @@ __all__ = [
63
64
  "make_unique_by_key",
64
65
  "merge_sort",
65
66
  "OpKind",
67
+ "PermutationIterator",
66
68
  "radix_sort",
67
69
  "reduce_into",
68
70
  "ReverseIterator",
@@ -57,6 +57,12 @@ class SortOrder(IntEnum):
57
57
  ASCENDING = ...
58
58
  DESCENDING = ...
59
59
 
60
+ class InitKind(IntEnum):
61
+ _value_: int
62
+ NO_INIT = ...
63
+ FUTURE_VALUE_INIT = ...
64
+ VALUE_INIT = ...
65
+
60
66
  class Op:
61
67
  def __init__(
62
68
  self,
@@ -133,6 +139,8 @@ class Iterator:
133
139
  def state(self, value) -> None: ...
134
140
  @property
135
141
  def type(self) -> IteratorKind: ...
142
+ @property
143
+ def value_type(self) -> TypeInfo: ...
136
144
  def as_bytes(self) -> bytes: ...
137
145
  def is_kind_pointer(self) -> bool: ...
138
146
  def is_kind_iterator(self) -> bool: ...
@@ -197,8 +205,9 @@ class DeviceScanBuildResult:
197
205
  d_in: Iterator,
198
206
  d_out: Iterator,
199
207
  binary_op: Op,
200
- h_init: Value,
208
+ init_type: TypeInfo,
201
209
  force_inclusive: bool,
210
+ init_kind: InitKind,
202
211
  info: CommonData,
203
212
  ): ...
204
213
  def compute_inclusive(
@@ -223,6 +232,39 @@ class DeviceScanBuildResult:
223
232
  h_init: Value,
224
233
  stream,
225
234
  ) -> int: ...
235
+ def compute_inclusive_future_value(
236
+ self,
237
+ temp_storage_ptr: int | None,
238
+ temp_storage_nbytes: int,
239
+ d_in: Iterator,
240
+ d_out: Iterator,
241
+ num_items: int,
242
+ binary_op: Op,
243
+ h_init: Iterator,
244
+ stream,
245
+ ) -> int: ...
246
+ def compute_exclusive_future_value(
247
+ self,
248
+ temp_storage_ptr: int | None,
249
+ temp_storage_nbytes: int,
250
+ d_in: Iterator,
251
+ d_out: Iterator,
252
+ num_items: int,
253
+ binary_op: Op,
254
+ h_init: Iterator,
255
+ stream,
256
+ ) -> int: ...
257
+ def compute_inclusive_no_init(
258
+ self,
259
+ temp_storage_ptr: int | None,
260
+ temp_storage_nbytes: int,
261
+ d_in: Iterator,
262
+ d_out: Iterator,
263
+ num_items: int,
264
+ binary_op: Op,
265
+ h_init: None,
266
+ stream,
267
+ ) -> int: ...
226
268
 
227
269
  # ---------------------
228
270
  # DeviceSegmentedReduce
@@ -120,6 +120,10 @@ cdef extern from "cccl/c/types.h":
120
120
  ASCENDING "CCCL_ASCENDING"
121
121
  DESCENDING "CCCL_DESCENDING"
122
122
 
123
+ cpdef enum cccl_init_kind_t:
124
+ VALUE_INIT "CCCL_VALUE_INIT"
125
+ FUTURE_VALUE_INIT "CCCL_FUTURE_VALUE_INIT"
126
+ NO_INIT "CCCL_NO_INIT"
123
127
 
124
128
  cdef void arg_type_check(
125
129
  str arg_name,
@@ -136,6 +140,7 @@ OpKind = cccl_op_kind_t
136
140
  TypeEnum = cccl_type_enum
137
141
  IteratorKind = cccl_iterator_kind_t
138
142
  SortOrder = cccl_sort_order_t
143
+ InitKind = cccl_init_kind_t
139
144
 
140
145
  cdef void _validate_alignment(int alignment) except *:
141
146
  """
@@ -724,6 +729,11 @@ cdef class Iterator:
724
729
  else:
725
730
  return IteratorKind.ITERATOR
726
731
 
732
+ @property
733
+ def value_type(self):
734
+ cdef cccl_type_info type_info = self.iter_data.value_type
735
+ return TypeInfo(type_info.size, type_info.alignment, type_info.type)
736
+
727
737
  def is_kind_pointer(self):
728
738
  cdef cccl_iterator_kind_t it_kind = self.iter_data.type
729
739
  return (it_kind == cccl_iterator_kind_t.POINTER)
@@ -947,8 +957,9 @@ cdef extern from "cccl/c/scan.h":
947
957
  cccl_iterator_t,
948
958
  cccl_iterator_t,
949
959
  cccl_op_t,
950
- cccl_value_t,
960
+ cccl_type_info,
951
961
  _Bool,
962
+ cccl_init_kind_t,
952
963
  int, int, const char*, const char*, const char*, const char*
953
964
  ) nogil
954
965
 
@@ -976,6 +987,41 @@ cdef extern from "cccl/c/scan.h":
976
987
  CUstream
977
988
  ) nogil
978
989
 
990
+ cdef CUresult cccl_device_exclusive_scan_future_value(
991
+ cccl_device_scan_build_result_t,
992
+ void *,
993
+ size_t *,
994
+ cccl_iterator_t,
995
+ cccl_iterator_t,
996
+ uint64_t,
997
+ cccl_op_t,
998
+ cccl_iterator_t,
999
+ CUstream
1000
+ ) nogil
1001
+
1002
+ cdef CUresult cccl_device_inclusive_scan_future_value(
1003
+ cccl_device_scan_build_result_t,
1004
+ void *,
1005
+ size_t *,
1006
+ cccl_iterator_t,
1007
+ cccl_iterator_t,
1008
+ uint64_t,
1009
+ cccl_op_t,
1010
+ cccl_iterator_t,
1011
+ CUstream
1012
+ ) nogil
1013
+
1014
+ cdef CUresult cccl_device_inclusive_scan_no_init(
1015
+ cccl_device_scan_build_result_t,
1016
+ void *,
1017
+ size_t *,
1018
+ cccl_iterator_t,
1019
+ cccl_iterator_t,
1020
+ uint64_t,
1021
+ cccl_op_t,
1022
+ CUstream
1023
+ ) nogil
1024
+
979
1025
  cdef CUresult cccl_device_scan_cleanup(
980
1026
  cccl_device_scan_build_result_t*
981
1027
  ) nogil
@@ -989,8 +1035,9 @@ cdef class DeviceScanBuildResult:
989
1035
  Iterator d_in,
990
1036
  Iterator d_out,
991
1037
  Op op,
992
- Value h_init,
1038
+ TypeInfo init_type,
993
1039
  bint force_inclusive,
1040
+ cccl_init_kind_t init_kind,
994
1041
  CommonData common_data
995
1042
  ):
996
1043
  cdef CUresult status = -1
@@ -1008,8 +1055,9 @@ cdef class DeviceScanBuildResult:
1008
1055
  d_in.iter_data,
1009
1056
  d_out.iter_data,
1010
1057
  op.op_data,
1011
- h_init.value_data,
1058
+ init_type.type_info,
1012
1059
  force_inclusive,
1060
+ init_kind,
1013
1061
  cc_major,
1014
1062
  cc_minor,
1015
1063
  cub_path,
@@ -1035,7 +1083,7 @@ cdef class DeviceScanBuildResult:
1035
1083
  Iterator d_out,
1036
1084
  size_t num_items,
1037
1085
  Op op,
1038
- Value h_init,
1086
+ Value init_value,
1039
1087
  stream
1040
1088
  ):
1041
1089
  cdef CUresult status = -1
@@ -1052,7 +1100,7 @@ cdef class DeviceScanBuildResult:
1052
1100
  d_out.iter_data,
1053
1101
  <uint64_t>num_items,
1054
1102
  op.op_data,
1055
- h_init.value_data,
1103
+ init_value.value_data,
1056
1104
  c_stream
1057
1105
  )
1058
1106
  if status != 0:
@@ -1069,7 +1117,7 @@ cdef class DeviceScanBuildResult:
1069
1117
  Iterator d_out,
1070
1118
  size_t num_items,
1071
1119
  Op op,
1072
- Value h_init,
1120
+ Value init_value,
1073
1121
  stream
1074
1122
  ):
1075
1123
  cdef CUresult status = -1
@@ -1086,7 +1134,7 @@ cdef class DeviceScanBuildResult:
1086
1134
  d_out.iter_data,
1087
1135
  <uint64_t>num_items,
1088
1136
  op.op_data,
1089
- h_init.value_data,
1137
+ init_value.value_data,
1090
1138
  c_stream
1091
1139
  )
1092
1140
  if status != 0:
@@ -1095,6 +1143,107 @@ cdef class DeviceScanBuildResult:
1095
1143
  )
1096
1144
  return storage_sz
1097
1145
 
1146
+ cpdef int compute_inclusive_future_value(
1147
+ DeviceScanBuildResult self,
1148
+ temp_storage_ptr,
1149
+ temp_storage_bytes,
1150
+ Iterator d_in,
1151
+ Iterator d_out,
1152
+ size_t num_items,
1153
+ Op op,
1154
+ Iterator init_value,
1155
+ stream
1156
+ ):
1157
+ cdef CUresult status = -1
1158
+ cdef void *storage_ptr = (<void *><uintptr_t>temp_storage_ptr) if temp_storage_ptr else NULL
1159
+ cdef size_t storage_sz = <size_t>temp_storage_bytes
1160
+ cdef CUstream c_stream = <CUstream><uintptr_t>(stream) if stream else NULL
1161
+
1162
+ with nogil:
1163
+ status = cccl_device_inclusive_scan_future_value(
1164
+ self.build_data,
1165
+ storage_ptr,
1166
+ &storage_sz,
1167
+ d_in.iter_data,
1168
+ d_out.iter_data,
1169
+ <uint64_t>num_items,
1170
+ op.op_data,
1171
+ init_value.iter_data,
1172
+ c_stream
1173
+ )
1174
+ if status != 0:
1175
+ raise RuntimeError(
1176
+ f"Failed executing inclusive scan, error code: {status}"
1177
+ )
1178
+ return storage_sz
1179
+
1180
+ cpdef int compute_exclusive_future_value(
1181
+ DeviceScanBuildResult self,
1182
+ temp_storage_ptr,
1183
+ temp_storage_bytes,
1184
+ Iterator d_in,
1185
+ Iterator d_out,
1186
+ size_t num_items,
1187
+ Op op,
1188
+ Iterator init_value,
1189
+ stream
1190
+ ):
1191
+ cdef CUresult status = -1
1192
+ cdef void *storage_ptr = (<void *><uintptr_t>temp_storage_ptr) if temp_storage_ptr else NULL
1193
+ cdef size_t storage_sz = <size_t>temp_storage_bytes
1194
+ cdef CUstream c_stream = <CUstream><uintptr_t>(stream) if stream else NULL
1195
+
1196
+ with nogil:
1197
+ status = cccl_device_exclusive_scan_future_value(
1198
+ self.build_data,
1199
+ storage_ptr,
1200
+ &storage_sz,
1201
+ d_in.iter_data,
1202
+ d_out.iter_data,
1203
+ <uint64_t>num_items,
1204
+ op.op_data,
1205
+ init_value.iter_data,
1206
+ c_stream
1207
+ )
1208
+ if status != 0:
1209
+ raise RuntimeError(
1210
+ f"Failed executing exclusive scan, error code: {status}"
1211
+ )
1212
+ return storage_sz
1213
+
1214
+ cpdef int compute_inclusive_no_init(
1215
+ DeviceScanBuildResult self,
1216
+ temp_storage_ptr,
1217
+ temp_storage_bytes,
1218
+ Iterator d_in,
1219
+ Iterator d_out,
1220
+ size_t num_items,
1221
+ Op op,
1222
+ object init_value,
1223
+ stream
1224
+ ):
1225
+ cdef CUresult status = -1
1226
+ cdef void *storage_ptr = (<void *><uintptr_t>temp_storage_ptr) if temp_storage_ptr else NULL
1227
+ cdef size_t storage_sz = <size_t>temp_storage_bytes
1228
+ cdef CUstream c_stream = <CUstream><uintptr_t>(stream) if stream else NULL
1229
+
1230
+ with nogil:
1231
+ status = cccl_device_inclusive_scan_no_init(
1232
+ self.build_data,
1233
+ storage_ptr,
1234
+ &storage_sz,
1235
+ d_in.iter_data,
1236
+ d_out.iter_data,
1237
+ <uint64_t>num_items,
1238
+ op.op_data,
1239
+ c_stream
1240
+ )
1241
+ if status != 0:
1242
+ raise RuntimeError(
1243
+ f"Failed executing inclusive scan, error code: {status}"
1244
+ )
1245
+ return storage_sz
1246
+
1098
1247
  def _get_cubin(self):
1099
1248
  return PyBytes_FromStringAndSize(
1100
1249
  <const char*>self.build_data.cubin,