cuda-cccl 0.3.1__cp310-cp310-manylinux_2_24_aarch64.whl → 0.3.2__cp310-cp310-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
- cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
- cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
- cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
- cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1 -0
- cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
- cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
- cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
- cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
- cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +12 -13
- cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +2 -3
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +4 -3
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
- cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
- cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
- cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
- cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
- cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
- cuda/cccl/headers/include/cuda/__cccl_config +1 -0
- cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
- cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
- cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
- cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +239 -317
- cuda/cccl/headers/include/cuda/__device/attributes.h +4 -3
- cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
- cuda/cccl/headers/include/cuda/__device/device_ref.h +0 -10
- cuda/cccl/headers/include/cuda/__device/physical_device.h +1 -26
- cuda/cccl/headers/include/cuda/__event/event.h +26 -26
- cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
- cuda/cccl/headers/include/cuda/__event/timed_event.h +9 -7
- cuda/cccl/headers/include/cuda/__fwd/devices.h +4 -4
- cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
- cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
- cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +2 -12
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +15 -19
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +59 -60
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
- cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
- cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
- cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
- cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
- cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
- cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
- cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +5 -4
- cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +17 -16
- cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
- cuda/cccl/headers/include/cuda/cmath +1 -0
- cuda/cccl/headers/include/cuda/devices +3 -0
- cuda/cccl/headers/include/cuda/memory +1 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
- cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
- cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
- cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
- cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
- cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
- cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
- cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
- cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
- cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
- cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
- cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
- cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
- cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
- cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
- cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
- cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
- cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
- cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
- cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
- cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
- cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
- cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
- cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
- cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
- cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
- cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
- cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
- cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
- cuda/cccl/headers/include/cuda/std/bitset +1 -1
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
- cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
- cuda/cccl/headers/include/cuda/std/numbers +5 -0
- cuda/cccl/headers/include/cuda/std/string_view +146 -11
- cuda/cccl/headers/include/cuda/stream_ref +5 -0
- cuda/cccl/headers/include/cuda/utility +1 -0
- cuda/cccl/headers/include/nv/target +7 -2
- cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
- cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
- cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
- cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
- cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
- cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
- cuda/cccl/headers/include/thrust/device_delete.h +18 -3
- cuda/cccl/headers/include/thrust/device_free.h +16 -3
- cuda/cccl/headers/include/thrust/device_new.h +29 -8
- cuda/cccl/headers/include/thrust/host_vector.h +1 -1
- cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
- cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
- cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
- cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
- cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
- cuda/compute/__init__.py +2 -0
- cuda/compute/_bindings.pyi +43 -1
- cuda/compute/_bindings_impl.pyx +156 -7
- cuda/compute/algorithms/_scan.py +108 -36
- cuda/compute/algorithms/_transform.py +32 -11
- cuda/compute/cu12/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
- cuda/compute/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/cu13/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
- cuda/compute/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/iterators/__init__.py +2 -0
- cuda/compute/iterators/_factories.py +28 -0
- cuda/compute/iterators/_iterators.py +206 -1
- cuda/compute/numba_utils.py +2 -2
- cuda/compute/typing.py +2 -0
- {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
- {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +171 -175
- cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
- cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
- cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
- cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
- cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
- cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
- {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.3.1.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -39,11 +39,13 @@
|
|
|
39
39
|
#if _CCCL_HAS_CUDA_COMPILER()
|
|
40
40
|
# include <thrust/system/cuda/config.h>
|
|
41
41
|
|
|
42
|
-
# include <thrust/distance.h>
|
|
43
|
-
# include <thrust/iterator/counting_iterator.h>
|
|
44
|
-
# include <thrust/iterator/transform_iterator.h>
|
|
45
42
|
# include <thrust/system/cuda/detail/execution_policy.h>
|
|
46
43
|
|
|
44
|
+
# include <cuda/__iterator/counting_iterator.h>
|
|
45
|
+
# include <cuda/__iterator/transform_iterator.h>
|
|
46
|
+
# include <cuda/__iterator/zip_iterator.h>
|
|
47
|
+
# include <cuda/std/__iterator/distance.h>
|
|
48
|
+
|
|
47
49
|
THRUST_NAMESPACE_BEGIN
|
|
48
50
|
namespace cuda_cub
|
|
49
51
|
{
|
|
@@ -62,7 +64,6 @@ InputIt _CCCL_HOST_DEVICE find(execution_policy<Derived>& policy, InputIt first,
|
|
|
62
64
|
}; // namespace cuda_cub
|
|
63
65
|
THRUST_NAMESPACE_END
|
|
64
66
|
|
|
65
|
-
# include <thrust/iterator/zip_iterator.h>
|
|
66
67
|
# include <thrust/system/cuda/detail/reduce.h>
|
|
67
68
|
|
|
68
69
|
THRUST_NAMESPACE_BEGIN
|
|
@@ -92,109 +93,13 @@ struct functor
|
|
|
92
93
|
}
|
|
93
94
|
}
|
|
94
95
|
};
|
|
95
|
-
|
|
96
|
-
template <class ValueType, class InputIt, class UnaryOp>
|
|
97
|
-
struct transform_input_iterator_t
|
|
98
|
-
{
|
|
99
|
-
using self_t = transform_input_iterator_t;
|
|
100
|
-
using difference_type = thrust::detail::it_difference_t<InputIt>;
|
|
101
|
-
using value_type = ValueType;
|
|
102
|
-
using pointer = void;
|
|
103
|
-
using reference = value_type;
|
|
104
|
-
using iterator_category = ::cuda::std::random_access_iterator_tag;
|
|
105
|
-
|
|
106
|
-
InputIt input;
|
|
107
|
-
mutable UnaryOp op;
|
|
108
|
-
|
|
109
|
-
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE transform_input_iterator_t(InputIt input, UnaryOp op)
|
|
110
|
-
: input(input)
|
|
111
|
-
, op(op)
|
|
112
|
-
{}
|
|
113
|
-
|
|
114
|
-
transform_input_iterator_t(const self_t&) = default;
|
|
115
|
-
|
|
116
|
-
// UnaryOp might not be copy assignable, such as when it is a lambda. Define
|
|
117
|
-
// an explicit copy assignment operator that doesn't try to assign it.
|
|
118
|
-
_CCCL_HOST_DEVICE self_t& operator=(const self_t& o)
|
|
119
|
-
{
|
|
120
|
-
input = o.input;
|
|
121
|
-
return *this;
|
|
122
|
-
}
|
|
123
|
-
|
|
124
|
-
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_t operator++(int)
|
|
125
|
-
{
|
|
126
|
-
self_t retval = *this;
|
|
127
|
-
++input;
|
|
128
|
-
return retval;
|
|
129
|
-
}
|
|
130
|
-
|
|
131
|
-
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_t operator++()
|
|
132
|
-
{
|
|
133
|
-
++input;
|
|
134
|
-
return *this;
|
|
135
|
-
}
|
|
136
|
-
|
|
137
|
-
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE reference operator*() const
|
|
138
|
-
{
|
|
139
|
-
thrust::detail::it_value_t<InputIt> x = *input;
|
|
140
|
-
return op(x);
|
|
141
|
-
}
|
|
142
|
-
|
|
143
|
-
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE reference operator*()
|
|
144
|
-
{
|
|
145
|
-
thrust::detail::it_value_t<InputIt> x = *input;
|
|
146
|
-
return op(x);
|
|
147
|
-
}
|
|
148
|
-
|
|
149
|
-
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_t operator+(difference_type n) const
|
|
150
|
-
{
|
|
151
|
-
return self_t(input + n, op);
|
|
152
|
-
}
|
|
153
|
-
|
|
154
|
-
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_t& operator+=(difference_type n)
|
|
155
|
-
{
|
|
156
|
-
input += n;
|
|
157
|
-
return *this;
|
|
158
|
-
}
|
|
159
|
-
|
|
160
|
-
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_t operator-(difference_type n) const
|
|
161
|
-
{
|
|
162
|
-
return self_t(input - n, op);
|
|
163
|
-
}
|
|
164
|
-
|
|
165
|
-
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_t& operator-=(difference_type n)
|
|
166
|
-
{
|
|
167
|
-
input -= n;
|
|
168
|
-
return *this;
|
|
169
|
-
}
|
|
170
|
-
|
|
171
|
-
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE difference_type operator-(self_t other) const
|
|
172
|
-
{
|
|
173
|
-
return input - other.input;
|
|
174
|
-
}
|
|
175
|
-
|
|
176
|
-
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE reference operator[](difference_type n) const
|
|
177
|
-
{
|
|
178
|
-
return op(input[n]);
|
|
179
|
-
}
|
|
180
|
-
|
|
181
|
-
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator==(const self_t& rhs) const
|
|
182
|
-
{
|
|
183
|
-
return (input == rhs.input);
|
|
184
|
-
}
|
|
185
|
-
|
|
186
|
-
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator!=(const self_t& rhs) const
|
|
187
|
-
{
|
|
188
|
-
return (input != rhs.input);
|
|
189
|
-
}
|
|
190
|
-
};
|
|
191
96
|
} // namespace __find_if
|
|
192
97
|
|
|
193
98
|
template <class Derived, class InputIt, class Size, class Predicate>
|
|
194
99
|
InputIt _CCCL_HOST_DEVICE
|
|
195
100
|
find_if_n(execution_policy<Derived>& policy, InputIt first, Size num_items, Predicate predicate)
|
|
196
101
|
{
|
|
197
|
-
using result_type =
|
|
102
|
+
using result_type = ::cuda::std::tuple<bool, Size>;
|
|
198
103
|
|
|
199
104
|
// empty sequence
|
|
200
105
|
if (num_items == 0)
|
|
@@ -212,27 +117,20 @@ find_if_n(execution_policy<Derived>& policy, InputIt first, Size num_items, Pred
|
|
|
212
117
|
const Size interval_threshold = 1 << 20;
|
|
213
118
|
const Size interval_size = (::cuda::std::min) (interval_threshold, num_items);
|
|
214
119
|
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
using IteratorTuple = thrust::tuple<XfrmIterator, counting_iterator<Size>>;
|
|
220
|
-
using ZipIterator = thrust::zip_iterator<IteratorTuple>;
|
|
221
|
-
|
|
222
|
-
IteratorTuple iter_tuple = thrust::make_tuple(XfrmIterator(first, predicate), counting_iterator<Size>(0));
|
|
223
|
-
|
|
224
|
-
ZipIterator begin = thrust::make_zip_iterator(iter_tuple);
|
|
225
|
-
ZipIterator end = begin + num_items;
|
|
120
|
+
const auto begin = ::cuda::make_zip_iterator(
|
|
121
|
+
::cuda::make_transform_iterator(try_unwrap_contiguous_iterator(first), predicate),
|
|
122
|
+
::cuda::counting_iterator<Size>(0));
|
|
123
|
+
const auto end = begin + num_items;
|
|
226
124
|
|
|
227
|
-
for (
|
|
125
|
+
for (auto interval_begin = begin; interval_begin < end; interval_begin += interval_size)
|
|
228
126
|
{
|
|
229
|
-
|
|
127
|
+
auto interval_end = interval_begin + interval_size;
|
|
230
128
|
if (end < interval_end)
|
|
231
129
|
{
|
|
232
130
|
interval_end = end;
|
|
233
131
|
} // end if
|
|
234
132
|
|
|
235
|
-
result_type result = reduce(
|
|
133
|
+
const result_type result = reduce(
|
|
236
134
|
policy, interval_begin, interval_end, result_type(false, interval_end - begin), __find_if::functor<result_type>());
|
|
237
135
|
|
|
238
136
|
// see if we found something
|
|
@@ -73,12 +73,14 @@ struct transform_pair_of_input_iterators_t
|
|
|
73
73
|
using value_type = ValueType;
|
|
74
74
|
using pointer = void;
|
|
75
75
|
using reference = value_type;
|
|
76
|
-
using iterator_category = std::random_access_iterator_tag;
|
|
76
|
+
using iterator_category = ::cuda::std::random_access_iterator_tag;
|
|
77
77
|
|
|
78
78
|
InputIt1 input1;
|
|
79
79
|
InputIt2 input2;
|
|
80
80
|
mutable BinaryOp op;
|
|
81
81
|
|
|
82
|
+
transform_pair_of_input_iterators_t() = default;
|
|
83
|
+
|
|
82
84
|
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE
|
|
83
85
|
transform_pair_of_input_iterators_t(InputIt1 input1_, InputIt2 input2_, BinaryOp op_)
|
|
84
86
|
: input1(input1_)
|
|
@@ -107,7 +109,7 @@ struct transform_pair_of_input_iterators_t
|
|
|
107
109
|
}
|
|
108
110
|
|
|
109
111
|
/// Prefix increment
|
|
110
|
-
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_t operator++()
|
|
112
|
+
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_t& operator++()
|
|
111
113
|
{
|
|
112
114
|
++input1;
|
|
113
115
|
++input2;
|
|
@@ -177,6 +179,10 @@ struct transform_pair_of_input_iterators_t
|
|
|
177
179
|
return (input1 != rhs.input1) || (input2 != rhs.input2);
|
|
178
180
|
}
|
|
179
181
|
|
|
182
|
+
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator<(const self_t& rhs) const
|
|
183
|
+
{
|
|
184
|
+
return input1 < rhs.input1;
|
|
185
|
+
}
|
|
180
186
|
}; // struct transform_pair_of_input_iterators_t
|
|
181
187
|
} // namespace detail
|
|
182
188
|
|
|
@@ -79,7 +79,7 @@ namespace detail
|
|
|
79
79
|
template <typename Iterator>
|
|
80
80
|
inline constexpr bool is_libcxx_wrap_iter_v = false;
|
|
81
81
|
|
|
82
|
-
#if
|
|
82
|
+
#if _CCCL_HOST_STD_LIB(LIBCXX)
|
|
83
83
|
template <typename Iterator>
|
|
84
84
|
inline constexpr bool is_libcxx_wrap_iter_v<
|
|
85
85
|
# if _LIBCPP_VERSION < 14000
|
|
@@ -88,23 +88,23 @@ inline constexpr bool is_libcxx_wrap_iter_v<
|
|
|
88
88
|
std::__wrap_iter<Iterator>
|
|
89
89
|
# endif
|
|
90
90
|
> = true;
|
|
91
|
-
#endif
|
|
91
|
+
#endif // _CCCL_HOST_STD_LIB(LIBCXX)
|
|
92
92
|
|
|
93
93
|
template <typename Iterator>
|
|
94
94
|
inline constexpr bool is_libstdcxx_normal_iterator_v = false;
|
|
95
95
|
|
|
96
|
-
#if
|
|
96
|
+
#if _CCCL_HOST_STD_LIB(LIBSTDCXX)
|
|
97
97
|
template <typename Iterator, typename Container>
|
|
98
98
|
inline constexpr bool is_libstdcxx_normal_iterator_v<::__gnu_cxx::__normal_iterator<Iterator, Container>> = true;
|
|
99
|
-
#endif
|
|
99
|
+
#endif // _CCCL_HOST_STD_LIB(LIBSTDCXX)
|
|
100
100
|
|
|
101
|
-
#if
|
|
101
|
+
#if _CCCL_HOST_STD_LIB(STL)
|
|
102
102
|
template <typename Iterator>
|
|
103
103
|
inline constexpr bool is_msvc_contiguous_iterator_v = ::cuda::std::is_pointer_v<::std::_Unwrapped_t<Iterator>>;
|
|
104
|
-
#else
|
|
104
|
+
#else // ^^^ _CCCL_HOST_STD_LIB(STL) ^^^ / vvv !_CCCL_HOST_STD_LIB(STL) vvv
|
|
105
105
|
template <typename Iterator>
|
|
106
106
|
inline constexpr bool is_msvc_contiguous_iterator_v = false;
|
|
107
|
-
#endif
|
|
107
|
+
#endif // ^^^ !_CCCL_HOST_STD_LIB(STL) ^^^
|
|
108
108
|
|
|
109
109
|
template <typename Iterator>
|
|
110
110
|
inline constexpr bool is_contiguous_iterator_impl_v =
|
cuda/compute/__init__.py
CHANGED
|
@@ -32,6 +32,7 @@ from .iterators import (
|
|
|
32
32
|
CacheModifiedInputIterator,
|
|
33
33
|
ConstantIterator,
|
|
34
34
|
CountingIterator,
|
|
35
|
+
PermutationIterator,
|
|
35
36
|
ReverseIterator,
|
|
36
37
|
TransformIterator,
|
|
37
38
|
TransformOutputIterator,
|
|
@@ -63,6 +64,7 @@ __all__ = [
|
|
|
63
64
|
"make_unique_by_key",
|
|
64
65
|
"merge_sort",
|
|
65
66
|
"OpKind",
|
|
67
|
+
"PermutationIterator",
|
|
66
68
|
"radix_sort",
|
|
67
69
|
"reduce_into",
|
|
68
70
|
"ReverseIterator",
|
cuda/compute/_bindings.pyi
CHANGED
|
@@ -57,6 +57,12 @@ class SortOrder(IntEnum):
|
|
|
57
57
|
ASCENDING = ...
|
|
58
58
|
DESCENDING = ...
|
|
59
59
|
|
|
60
|
+
class InitKind(IntEnum):
|
|
61
|
+
_value_: int
|
|
62
|
+
NO_INIT = ...
|
|
63
|
+
FUTURE_VALUE_INIT = ...
|
|
64
|
+
VALUE_INIT = ...
|
|
65
|
+
|
|
60
66
|
class Op:
|
|
61
67
|
def __init__(
|
|
62
68
|
self,
|
|
@@ -133,6 +139,8 @@ class Iterator:
|
|
|
133
139
|
def state(self, value) -> None: ...
|
|
134
140
|
@property
|
|
135
141
|
def type(self) -> IteratorKind: ...
|
|
142
|
+
@property
|
|
143
|
+
def value_type(self) -> TypeInfo: ...
|
|
136
144
|
def as_bytes(self) -> bytes: ...
|
|
137
145
|
def is_kind_pointer(self) -> bool: ...
|
|
138
146
|
def is_kind_iterator(self) -> bool: ...
|
|
@@ -197,8 +205,9 @@ class DeviceScanBuildResult:
|
|
|
197
205
|
d_in: Iterator,
|
|
198
206
|
d_out: Iterator,
|
|
199
207
|
binary_op: Op,
|
|
200
|
-
|
|
208
|
+
init_type: TypeInfo,
|
|
201
209
|
force_inclusive: bool,
|
|
210
|
+
init_kind: InitKind,
|
|
202
211
|
info: CommonData,
|
|
203
212
|
): ...
|
|
204
213
|
def compute_inclusive(
|
|
@@ -223,6 +232,39 @@ class DeviceScanBuildResult:
|
|
|
223
232
|
h_init: Value,
|
|
224
233
|
stream,
|
|
225
234
|
) -> int: ...
|
|
235
|
+
def compute_inclusive_future_value(
|
|
236
|
+
self,
|
|
237
|
+
temp_storage_ptr: int | None,
|
|
238
|
+
temp_storage_nbytes: int,
|
|
239
|
+
d_in: Iterator,
|
|
240
|
+
d_out: Iterator,
|
|
241
|
+
num_items: int,
|
|
242
|
+
binary_op: Op,
|
|
243
|
+
h_init: Iterator,
|
|
244
|
+
stream,
|
|
245
|
+
) -> int: ...
|
|
246
|
+
def compute_exclusive_future_value(
|
|
247
|
+
self,
|
|
248
|
+
temp_storage_ptr: int | None,
|
|
249
|
+
temp_storage_nbytes: int,
|
|
250
|
+
d_in: Iterator,
|
|
251
|
+
d_out: Iterator,
|
|
252
|
+
num_items: int,
|
|
253
|
+
binary_op: Op,
|
|
254
|
+
h_init: Iterator,
|
|
255
|
+
stream,
|
|
256
|
+
) -> int: ...
|
|
257
|
+
def compute_inclusive_no_init(
|
|
258
|
+
self,
|
|
259
|
+
temp_storage_ptr: int | None,
|
|
260
|
+
temp_storage_nbytes: int,
|
|
261
|
+
d_in: Iterator,
|
|
262
|
+
d_out: Iterator,
|
|
263
|
+
num_items: int,
|
|
264
|
+
binary_op: Op,
|
|
265
|
+
h_init: None,
|
|
266
|
+
stream,
|
|
267
|
+
) -> int: ...
|
|
226
268
|
|
|
227
269
|
# ---------------------
|
|
228
270
|
# DeviceSegmentedReduce
|
cuda/compute/_bindings_impl.pyx
CHANGED
|
@@ -120,6 +120,10 @@ cdef extern from "cccl/c/types.h":
|
|
|
120
120
|
ASCENDING "CCCL_ASCENDING"
|
|
121
121
|
DESCENDING "CCCL_DESCENDING"
|
|
122
122
|
|
|
123
|
+
cpdef enum cccl_init_kind_t:
|
|
124
|
+
VALUE_INIT "CCCL_VALUE_INIT"
|
|
125
|
+
FUTURE_VALUE_INIT "CCCL_FUTURE_VALUE_INIT"
|
|
126
|
+
NO_INIT "CCCL_NO_INIT"
|
|
123
127
|
|
|
124
128
|
cdef void arg_type_check(
|
|
125
129
|
str arg_name,
|
|
@@ -136,6 +140,7 @@ OpKind = cccl_op_kind_t
|
|
|
136
140
|
TypeEnum = cccl_type_enum
|
|
137
141
|
IteratorKind = cccl_iterator_kind_t
|
|
138
142
|
SortOrder = cccl_sort_order_t
|
|
143
|
+
InitKind = cccl_init_kind_t
|
|
139
144
|
|
|
140
145
|
cdef void _validate_alignment(int alignment) except *:
|
|
141
146
|
"""
|
|
@@ -724,6 +729,11 @@ cdef class Iterator:
|
|
|
724
729
|
else:
|
|
725
730
|
return IteratorKind.ITERATOR
|
|
726
731
|
|
|
732
|
+
@property
|
|
733
|
+
def value_type(self):
|
|
734
|
+
cdef cccl_type_info type_info = self.iter_data.value_type
|
|
735
|
+
return TypeInfo(type_info.size, type_info.alignment, type_info.type)
|
|
736
|
+
|
|
727
737
|
def is_kind_pointer(self):
|
|
728
738
|
cdef cccl_iterator_kind_t it_kind = self.iter_data.type
|
|
729
739
|
return (it_kind == cccl_iterator_kind_t.POINTER)
|
|
@@ -947,8 +957,9 @@ cdef extern from "cccl/c/scan.h":
|
|
|
947
957
|
cccl_iterator_t,
|
|
948
958
|
cccl_iterator_t,
|
|
949
959
|
cccl_op_t,
|
|
950
|
-
|
|
960
|
+
cccl_type_info,
|
|
951
961
|
_Bool,
|
|
962
|
+
cccl_init_kind_t,
|
|
952
963
|
int, int, const char*, const char*, const char*, const char*
|
|
953
964
|
) nogil
|
|
954
965
|
|
|
@@ -976,6 +987,41 @@ cdef extern from "cccl/c/scan.h":
|
|
|
976
987
|
CUstream
|
|
977
988
|
) nogil
|
|
978
989
|
|
|
990
|
+
cdef CUresult cccl_device_exclusive_scan_future_value(
|
|
991
|
+
cccl_device_scan_build_result_t,
|
|
992
|
+
void *,
|
|
993
|
+
size_t *,
|
|
994
|
+
cccl_iterator_t,
|
|
995
|
+
cccl_iterator_t,
|
|
996
|
+
uint64_t,
|
|
997
|
+
cccl_op_t,
|
|
998
|
+
cccl_iterator_t,
|
|
999
|
+
CUstream
|
|
1000
|
+
) nogil
|
|
1001
|
+
|
|
1002
|
+
cdef CUresult cccl_device_inclusive_scan_future_value(
|
|
1003
|
+
cccl_device_scan_build_result_t,
|
|
1004
|
+
void *,
|
|
1005
|
+
size_t *,
|
|
1006
|
+
cccl_iterator_t,
|
|
1007
|
+
cccl_iterator_t,
|
|
1008
|
+
uint64_t,
|
|
1009
|
+
cccl_op_t,
|
|
1010
|
+
cccl_iterator_t,
|
|
1011
|
+
CUstream
|
|
1012
|
+
) nogil
|
|
1013
|
+
|
|
1014
|
+
cdef CUresult cccl_device_inclusive_scan_no_init(
|
|
1015
|
+
cccl_device_scan_build_result_t,
|
|
1016
|
+
void *,
|
|
1017
|
+
size_t *,
|
|
1018
|
+
cccl_iterator_t,
|
|
1019
|
+
cccl_iterator_t,
|
|
1020
|
+
uint64_t,
|
|
1021
|
+
cccl_op_t,
|
|
1022
|
+
CUstream
|
|
1023
|
+
) nogil
|
|
1024
|
+
|
|
979
1025
|
cdef CUresult cccl_device_scan_cleanup(
|
|
980
1026
|
cccl_device_scan_build_result_t*
|
|
981
1027
|
) nogil
|
|
@@ -989,8 +1035,9 @@ cdef class DeviceScanBuildResult:
|
|
|
989
1035
|
Iterator d_in,
|
|
990
1036
|
Iterator d_out,
|
|
991
1037
|
Op op,
|
|
992
|
-
|
|
1038
|
+
TypeInfo init_type,
|
|
993
1039
|
bint force_inclusive,
|
|
1040
|
+
cccl_init_kind_t init_kind,
|
|
994
1041
|
CommonData common_data
|
|
995
1042
|
):
|
|
996
1043
|
cdef CUresult status = -1
|
|
@@ -1008,8 +1055,9 @@ cdef class DeviceScanBuildResult:
|
|
|
1008
1055
|
d_in.iter_data,
|
|
1009
1056
|
d_out.iter_data,
|
|
1010
1057
|
op.op_data,
|
|
1011
|
-
|
|
1058
|
+
init_type.type_info,
|
|
1012
1059
|
force_inclusive,
|
|
1060
|
+
init_kind,
|
|
1013
1061
|
cc_major,
|
|
1014
1062
|
cc_minor,
|
|
1015
1063
|
cub_path,
|
|
@@ -1035,7 +1083,7 @@ cdef class DeviceScanBuildResult:
|
|
|
1035
1083
|
Iterator d_out,
|
|
1036
1084
|
size_t num_items,
|
|
1037
1085
|
Op op,
|
|
1038
|
-
Value
|
|
1086
|
+
Value init_value,
|
|
1039
1087
|
stream
|
|
1040
1088
|
):
|
|
1041
1089
|
cdef CUresult status = -1
|
|
@@ -1052,7 +1100,7 @@ cdef class DeviceScanBuildResult:
|
|
|
1052
1100
|
d_out.iter_data,
|
|
1053
1101
|
<uint64_t>num_items,
|
|
1054
1102
|
op.op_data,
|
|
1055
|
-
|
|
1103
|
+
init_value.value_data,
|
|
1056
1104
|
c_stream
|
|
1057
1105
|
)
|
|
1058
1106
|
if status != 0:
|
|
@@ -1069,7 +1117,7 @@ cdef class DeviceScanBuildResult:
|
|
|
1069
1117
|
Iterator d_out,
|
|
1070
1118
|
size_t num_items,
|
|
1071
1119
|
Op op,
|
|
1072
|
-
Value
|
|
1120
|
+
Value init_value,
|
|
1073
1121
|
stream
|
|
1074
1122
|
):
|
|
1075
1123
|
cdef CUresult status = -1
|
|
@@ -1086,7 +1134,7 @@ cdef class DeviceScanBuildResult:
|
|
|
1086
1134
|
d_out.iter_data,
|
|
1087
1135
|
<uint64_t>num_items,
|
|
1088
1136
|
op.op_data,
|
|
1089
|
-
|
|
1137
|
+
init_value.value_data,
|
|
1090
1138
|
c_stream
|
|
1091
1139
|
)
|
|
1092
1140
|
if status != 0:
|
|
@@ -1095,6 +1143,107 @@ cdef class DeviceScanBuildResult:
|
|
|
1095
1143
|
)
|
|
1096
1144
|
return storage_sz
|
|
1097
1145
|
|
|
1146
|
+
cpdef int compute_inclusive_future_value(
|
|
1147
|
+
DeviceScanBuildResult self,
|
|
1148
|
+
temp_storage_ptr,
|
|
1149
|
+
temp_storage_bytes,
|
|
1150
|
+
Iterator d_in,
|
|
1151
|
+
Iterator d_out,
|
|
1152
|
+
size_t num_items,
|
|
1153
|
+
Op op,
|
|
1154
|
+
Iterator init_value,
|
|
1155
|
+
stream
|
|
1156
|
+
):
|
|
1157
|
+
cdef CUresult status = -1
|
|
1158
|
+
cdef void *storage_ptr = (<void *><uintptr_t>temp_storage_ptr) if temp_storage_ptr else NULL
|
|
1159
|
+
cdef size_t storage_sz = <size_t>temp_storage_bytes
|
|
1160
|
+
cdef CUstream c_stream = <CUstream><uintptr_t>(stream) if stream else NULL
|
|
1161
|
+
|
|
1162
|
+
with nogil:
|
|
1163
|
+
status = cccl_device_inclusive_scan_future_value(
|
|
1164
|
+
self.build_data,
|
|
1165
|
+
storage_ptr,
|
|
1166
|
+
&storage_sz,
|
|
1167
|
+
d_in.iter_data,
|
|
1168
|
+
d_out.iter_data,
|
|
1169
|
+
<uint64_t>num_items,
|
|
1170
|
+
op.op_data,
|
|
1171
|
+
init_value.iter_data,
|
|
1172
|
+
c_stream
|
|
1173
|
+
)
|
|
1174
|
+
if status != 0:
|
|
1175
|
+
raise RuntimeError(
|
|
1176
|
+
f"Failed executing inclusive scan, error code: {status}"
|
|
1177
|
+
)
|
|
1178
|
+
return storage_sz
|
|
1179
|
+
|
|
1180
|
+
cpdef int compute_exclusive_future_value(
|
|
1181
|
+
DeviceScanBuildResult self,
|
|
1182
|
+
temp_storage_ptr,
|
|
1183
|
+
temp_storage_bytes,
|
|
1184
|
+
Iterator d_in,
|
|
1185
|
+
Iterator d_out,
|
|
1186
|
+
size_t num_items,
|
|
1187
|
+
Op op,
|
|
1188
|
+
Iterator init_value,
|
|
1189
|
+
stream
|
|
1190
|
+
):
|
|
1191
|
+
cdef CUresult status = -1
|
|
1192
|
+
cdef void *storage_ptr = (<void *><uintptr_t>temp_storage_ptr) if temp_storage_ptr else NULL
|
|
1193
|
+
cdef size_t storage_sz = <size_t>temp_storage_bytes
|
|
1194
|
+
cdef CUstream c_stream = <CUstream><uintptr_t>(stream) if stream else NULL
|
|
1195
|
+
|
|
1196
|
+
with nogil:
|
|
1197
|
+
status = cccl_device_exclusive_scan_future_value(
|
|
1198
|
+
self.build_data,
|
|
1199
|
+
storage_ptr,
|
|
1200
|
+
&storage_sz,
|
|
1201
|
+
d_in.iter_data,
|
|
1202
|
+
d_out.iter_data,
|
|
1203
|
+
<uint64_t>num_items,
|
|
1204
|
+
op.op_data,
|
|
1205
|
+
init_value.iter_data,
|
|
1206
|
+
c_stream
|
|
1207
|
+
)
|
|
1208
|
+
if status != 0:
|
|
1209
|
+
raise RuntimeError(
|
|
1210
|
+
f"Failed executing exclusive scan, error code: {status}"
|
|
1211
|
+
)
|
|
1212
|
+
return storage_sz
|
|
1213
|
+
|
|
1214
|
+
cpdef int compute_inclusive_no_init(
|
|
1215
|
+
DeviceScanBuildResult self,
|
|
1216
|
+
temp_storage_ptr,
|
|
1217
|
+
temp_storage_bytes,
|
|
1218
|
+
Iterator d_in,
|
|
1219
|
+
Iterator d_out,
|
|
1220
|
+
size_t num_items,
|
|
1221
|
+
Op op,
|
|
1222
|
+
object init_value,
|
|
1223
|
+
stream
|
|
1224
|
+
):
|
|
1225
|
+
cdef CUresult status = -1
|
|
1226
|
+
cdef void *storage_ptr = (<void *><uintptr_t>temp_storage_ptr) if temp_storage_ptr else NULL
|
|
1227
|
+
cdef size_t storage_sz = <size_t>temp_storage_bytes
|
|
1228
|
+
cdef CUstream c_stream = <CUstream><uintptr_t>(stream) if stream else NULL
|
|
1229
|
+
|
|
1230
|
+
with nogil:
|
|
1231
|
+
status = cccl_device_inclusive_scan_no_init(
|
|
1232
|
+
self.build_data,
|
|
1233
|
+
storage_ptr,
|
|
1234
|
+
&storage_sz,
|
|
1235
|
+
d_in.iter_data,
|
|
1236
|
+
d_out.iter_data,
|
|
1237
|
+
<uint64_t>num_items,
|
|
1238
|
+
op.op_data,
|
|
1239
|
+
c_stream
|
|
1240
|
+
)
|
|
1241
|
+
if status != 0:
|
|
1242
|
+
raise RuntimeError(
|
|
1243
|
+
f"Failed executing inclusive scan, error code: {status}"
|
|
1244
|
+
)
|
|
1245
|
+
return storage_sz
|
|
1246
|
+
|
|
1098
1247
|
def _get_cubin(self):
|
|
1099
1248
|
return PyBytes_FromStringAndSize(
|
|
1100
1249
|
<const char*>self.build_data.cubin,
|