cuda-cccl 0.3.0__cp311-cp311-manylinux_2_24_aarch64.whl → 0.3.1__cp311-cp311-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/cooperative/__init__.py +7 -1
- cuda/cccl/cooperative/experimental/__init__.py +21 -5
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +3 -2
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +158 -247
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +4 -4
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +8 -26
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -6
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
- cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
- cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +48 -46
- cuda/cccl/headers/include/cuda/__device/attributes.h +171 -121
- cuda/cccl/headers/include/cuda/__device/device_ref.h +30 -42
- cuda/cccl/headers/include/cuda/__device/physical_device.h +120 -91
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
- cuda/cccl/headers/include/cuda/__event/event.h +1 -0
- cuda/cccl/headers/include/cuda/__event/timed_event.h +1 -0
- cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
- cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +158 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +8 -120
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +593 -0
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +4 -3
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +1 -0
- cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
- cuda/cccl/headers/include/cuda/algorithm +1 -1
- cuda/cccl/headers/include/cuda/devices +10 -0
- cuda/cccl/headers/include/cuda/iterator +1 -0
- cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
- cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
- cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
- cuda/cccl/headers/include/cuda/std/string_view +12 -5
- cuda/cccl/headers/include/cuda/std/version +1 -4
- cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
- cuda/cccl/parallel/experimental/__init__.py +21 -74
- cuda/compute/__init__.py +77 -0
- cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +1 -1
- cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
- cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +4 -4
- cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +4 -4
- cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/cu12/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu13/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +8 -8
- cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
- cuda/coop/__init__.py +8 -0
- cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
- cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
- cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
- cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
- cuda/coop/warp/__init__.py +9 -0
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/METADATA +1 -1
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/RECORD +141 -138
- cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
- cuda/cccl/parallel/experimental/.gitignore +0 -4
- /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/numba_utils.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/typing.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -100,7 +100,14 @@ template <typename _Tp>
|
|
|
100
100
|
template <typename _Tp>
|
|
101
101
|
[[nodiscard]] _CCCL_HIDE_FROM_ABI _CCCL_DEVICE int __cccl_countl_zero_impl_device(_Tp __v) noexcept
|
|
102
102
|
{
|
|
103
|
-
|
|
103
|
+
if constexpr (sizeof(_Tp) == sizeof(uint32_t))
|
|
104
|
+
{
|
|
105
|
+
return static_cast<int>(::__clz(static_cast<int>(__v)));
|
|
106
|
+
}
|
|
107
|
+
else
|
|
108
|
+
{
|
|
109
|
+
return static_cast<int>(::__clzll(static_cast<long long>(__v)));
|
|
110
|
+
}
|
|
104
111
|
}
|
|
105
112
|
#endif // _CCCL_CUDA_COMPILATION()
|
|
106
113
|
|
|
@@ -114,11 +114,11 @@ template <typename _Tp>
|
|
|
114
114
|
{
|
|
115
115
|
if constexpr (sizeof(_Tp) == sizeof(uint32_t))
|
|
116
116
|
{
|
|
117
|
-
return ::__clz(static_cast<int>(::__brev(__v)));
|
|
117
|
+
return static_cast<int>(::__clz(static_cast<int>(::__brev(__v))));
|
|
118
118
|
}
|
|
119
119
|
else
|
|
120
120
|
{
|
|
121
|
-
return ::__clzll(static_cast<long long>(::__brevll(__v)));
|
|
121
|
+
return static_cast<int>(::__clzll(static_cast<long long>(::__brevll(__v))));
|
|
122
122
|
}
|
|
123
123
|
}
|
|
124
124
|
#endif // _CCCL_CUDA_COMPILATION()
|
|
@@ -275,10 +275,10 @@ _CCCL_API constexpr __bit_iterator<_Cp, false> __copy_aligned(
|
|
|
275
275
|
// do first word
|
|
276
276
|
if (__first.__ctz_ != 0)
|
|
277
277
|
{
|
|
278
|
-
unsigned
|
|
279
|
-
difference_type __dn = ::cuda::std::min(static_cast<difference_type>(
|
|
278
|
+
unsigned __clz_f = __bits_per_word - __first.__ctz_;
|
|
279
|
+
difference_type __dn = ::cuda::std::min(static_cast<difference_type>(__clz_f), __n);
|
|
280
280
|
__n -= __dn;
|
|
281
|
-
__storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (
|
|
281
|
+
__storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
|
|
282
282
|
__storage_type __b = *__first.__seg_ & __m;
|
|
283
283
|
*__result.__seg_ &= ~__m;
|
|
284
284
|
*__result.__seg_ |= __b;
|
|
@@ -420,8 +420,8 @@ _CCCL_API constexpr __bit_iterator<_Cp, false> __copy_backward_aligned(
|
|
|
420
420
|
{
|
|
421
421
|
difference_type __dn = ::cuda::std::min(static_cast<difference_type>(__last.__ctz_), __n);
|
|
422
422
|
__n -= __dn;
|
|
423
|
-
unsigned
|
|
424
|
-
__storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >>
|
|
423
|
+
unsigned __clz_f = __bits_per_word - __last.__ctz_;
|
|
424
|
+
__storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz_f);
|
|
425
425
|
__storage_type __b = *__last.__seg_ & __m;
|
|
426
426
|
*__result.__seg_ &= ~__m;
|
|
427
427
|
*__result.__seg_ |= __b;
|
|
@@ -635,10 +635,10 @@ _CCCL_API inline __bit_iterator<_Cr, false> __swap_ranges_aligned(
|
|
|
635
635
|
// do first word
|
|
636
636
|
if (__first.__ctz_ != 0)
|
|
637
637
|
{
|
|
638
|
-
unsigned
|
|
639
|
-
difference_type __dn = ::cuda::std::min(static_cast<difference_type>(
|
|
638
|
+
unsigned __clz_f = __bits_per_word - __first.__ctz_;
|
|
639
|
+
difference_type __dn = ::cuda::std::min(static_cast<difference_type>(__clz_f), __n);
|
|
640
640
|
__n -= __dn;
|
|
641
|
-
__storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (
|
|
641
|
+
__storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
|
|
642
642
|
__storage_type __b1 = *__first.__seg_ & __m;
|
|
643
643
|
*__first.__seg_ &= ~__m;
|
|
644
644
|
__storage_type __b2 = *__result.__seg_ & __m;
|
|
@@ -988,10 +988,10 @@ _CCCL_API constexpr bool __equal_aligned(
|
|
|
988
988
|
// do first word
|
|
989
989
|
if (__first1.__ctz_ != 0)
|
|
990
990
|
{
|
|
991
|
-
unsigned
|
|
992
|
-
difference_type __dn = ::cuda::std::min(static_cast<difference_type>(
|
|
991
|
+
unsigned __clz_f = __bits_per_word - __first1.__ctz_;
|
|
992
|
+
difference_type __dn = ::cuda::std::min(static_cast<difference_type>(__clz_f), __n);
|
|
993
993
|
__n -= __dn;
|
|
994
|
-
__storage_type __m = (~__storage_type(0) << __first1.__ctz_) & (~__storage_type(0) >> (
|
|
994
|
+
__storage_type __m = (~__storage_type(0) << __first1.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
|
|
995
995
|
if ((*__first2.__seg_ & __m) != (*__first1.__seg_ & __m))
|
|
996
996
|
{
|
|
997
997
|
return false;
|
|
@@ -43,19 +43,19 @@ template <class _Rep, class _Period = ratio<1>>
|
|
|
43
43
|
class _CCCL_TYPE_VISIBILITY_DEFAULT duration;
|
|
44
44
|
|
|
45
45
|
template <class _Tp>
|
|
46
|
-
inline
|
|
46
|
+
inline constexpr bool __is_duration_v = false;
|
|
47
47
|
|
|
48
48
|
template <class _Rep, class _Period>
|
|
49
|
-
inline
|
|
49
|
+
inline constexpr bool __is_duration_v<duration<_Rep, _Period>> = true;
|
|
50
50
|
|
|
51
51
|
template <class _Rep, class _Period>
|
|
52
|
-
inline
|
|
52
|
+
inline constexpr bool __is_duration_v<const duration<_Rep, _Period>> = true;
|
|
53
53
|
|
|
54
54
|
template <class _Rep, class _Period>
|
|
55
|
-
inline
|
|
55
|
+
inline constexpr bool __is_duration_v<volatile duration<_Rep, _Period>> = true;
|
|
56
56
|
|
|
57
57
|
template <class _Rep, class _Period>
|
|
58
|
-
inline
|
|
58
|
+
inline constexpr bool __is_duration_v<const volatile duration<_Rep, _Period>> = true;
|
|
59
59
|
|
|
60
60
|
} // namespace chrono
|
|
61
61
|
|
|
@@ -190,29 +190,29 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT duration
|
|
|
190
190
|
struct __no_overflow
|
|
191
191
|
{
|
|
192
192
|
private:
|
|
193
|
-
static
|
|
194
|
-
static
|
|
195
|
-
static
|
|
196
|
-
static
|
|
197
|
-
static
|
|
198
|
-
static
|
|
199
|
-
static
|
|
193
|
+
static constexpr intmax_t __gcd_n1_n2 = __static_gcd<_R1::num, _R2::num>::value;
|
|
194
|
+
static constexpr intmax_t __gcd_d1_d2 = __static_gcd<_R1::den, _R2::den>::value;
|
|
195
|
+
static constexpr intmax_t __n1 = _R1::num / __gcd_n1_n2;
|
|
196
|
+
static constexpr intmax_t __d1 = _R1::den / __gcd_d1_d2;
|
|
197
|
+
static constexpr intmax_t __n2 = _R2::num / __gcd_n1_n2;
|
|
198
|
+
static constexpr intmax_t __d2 = _R2::den / __gcd_d1_d2;
|
|
199
|
+
static constexpr intmax_t max = -((intmax_t(1) << (sizeof(intmax_t) * CHAR_BIT - 1)) + 1);
|
|
200
200
|
|
|
201
201
|
template <intmax_t _Xp, intmax_t _Yp, bool __overflow>
|
|
202
202
|
struct __mul // __overflow == false
|
|
203
203
|
{
|
|
204
|
-
static
|
|
204
|
+
static constexpr intmax_t value = _Xp * _Yp;
|
|
205
205
|
};
|
|
206
206
|
|
|
207
207
|
template <intmax_t _Xp, intmax_t _Yp>
|
|
208
208
|
struct __mul<_Xp, _Yp, true>
|
|
209
209
|
{
|
|
210
|
-
static
|
|
210
|
+
static constexpr intmax_t value = 1;
|
|
211
211
|
};
|
|
212
212
|
|
|
213
213
|
public:
|
|
214
|
-
static
|
|
215
|
-
using type
|
|
214
|
+
static constexpr bool value = (__n1 <= max / __d2) && (__n2 <= max / __d1);
|
|
215
|
+
using type = ratio<__mul<__n1, __d2, !value>::value, __mul<__n2, __d1, !value>::value>;
|
|
216
216
|
};
|
|
217
217
|
|
|
218
218
|
public:
|
|
@@ -40,11 +40,11 @@ namespace chrono
|
|
|
40
40
|
class _CCCL_TYPE_VISIBILITY_DEFAULT steady_clock
|
|
41
41
|
{
|
|
42
42
|
public:
|
|
43
|
-
using duration
|
|
44
|
-
using rep
|
|
45
|
-
using period
|
|
46
|
-
using time_point
|
|
47
|
-
static constexpr
|
|
43
|
+
using duration = nanoseconds;
|
|
44
|
+
using rep = duration::rep;
|
|
45
|
+
using period = duration::period;
|
|
46
|
+
using time_point = ::cuda::std::chrono::time_point<steady_clock, duration>;
|
|
47
|
+
static constexpr bool is_steady = true;
|
|
48
48
|
|
|
49
49
|
[[nodiscard]] _CCCL_API static time_point now() noexcept;
|
|
50
50
|
};
|
|
@@ -39,11 +39,11 @@ namespace chrono
|
|
|
39
39
|
class _CCCL_TYPE_VISIBILITY_DEFAULT system_clock
|
|
40
40
|
{
|
|
41
41
|
public:
|
|
42
|
-
using duration
|
|
43
|
-
using rep
|
|
44
|
-
using period
|
|
45
|
-
using time_point
|
|
46
|
-
static constexpr
|
|
42
|
+
using duration = ::cuda::std::chrono::nanoseconds;
|
|
43
|
+
using rep = duration::rep;
|
|
44
|
+
using period = duration::period;
|
|
45
|
+
using time_point = ::cuda::std::chrono::time_point<system_clock>;
|
|
46
|
+
static constexpr bool is_steady = false;
|
|
47
47
|
|
|
48
48
|
[[nodiscard]] _CCCL_API inline static time_point now() noexcept
|
|
49
49
|
{
|
|
@@ -20,7 +20,9 @@
|
|
|
20
20
|
# pragma system_header
|
|
21
21
|
#endif // no system header
|
|
22
22
|
|
|
23
|
+
#include <cuda/__fwd/complex.h>
|
|
23
24
|
#include <cuda/std/__fwd/array.h>
|
|
25
|
+
#include <cuda/std/__fwd/complex.h>
|
|
24
26
|
#include <cuda/std/__fwd/tuple.h>
|
|
25
27
|
#include <cuda/std/__tuple_dir/tuple_element.h>
|
|
26
28
|
#include <cuda/std/__tuple_dir/tuple_indices.h>
|
|
@@ -61,7 +63,27 @@ struct __make_tuple_types_flat<array<_Vt, _Np>, __tuple_indices<_Idx...>>
|
|
|
61
63
|
template <size_t>
|
|
62
64
|
using __value_type = _Vt;
|
|
63
65
|
template <class _Tp, class _ApplyFn = __apply_cvref_fn<_Tp>>
|
|
64
|
-
using __apply_quals = __tuple_types<__type_call<_ApplyFn, __value_type<_Idx>>...>;
|
|
66
|
+
using __apply_quals _CCCL_NODEBUG_ALIAS = __tuple_types<__type_call<_ApplyFn, __value_type<_Idx>>...>;
|
|
67
|
+
};
|
|
68
|
+
|
|
69
|
+
template <class _Vt, size_t... _Idx>
|
|
70
|
+
struct __make_tuple_types_flat<complex<_Vt>, __tuple_indices<_Idx...>>
|
|
71
|
+
{
|
|
72
|
+
static_assert(sizeof...(_Idx) == 2, "__make_tuple_types: complex has only 2 members");
|
|
73
|
+
template <size_t>
|
|
74
|
+
using __value_type = _Vt;
|
|
75
|
+
template <class _Tp, class _ApplyFn = __apply_cvref_fn<_Tp>>
|
|
76
|
+
using __apply_quals _CCCL_NODEBUG_ALIAS = __tuple_types<__type_call<_ApplyFn, __value_type<_Idx>>...>;
|
|
77
|
+
};
|
|
78
|
+
|
|
79
|
+
template <class _Vt, size_t... _Idx>
|
|
80
|
+
struct __make_tuple_types_flat<::cuda::complex<_Vt>, __tuple_indices<_Idx...>>
|
|
81
|
+
{
|
|
82
|
+
static_assert(sizeof...(_Idx) == 2, "__make_tuple_types: complex has only 2 members");
|
|
83
|
+
template <size_t>
|
|
84
|
+
using __value_type = _Vt;
|
|
85
|
+
template <class _Tp, class _ApplyFn = __apply_cvref_fn<_Tp>>
|
|
86
|
+
using __apply_quals _CCCL_NODEBUG_ALIAS = __tuple_types<__type_call<_ApplyFn, __value_type<_Idx>>...>;
|
|
65
87
|
};
|
|
66
88
|
|
|
67
89
|
template <class _Tp,
|
|
@@ -20,6 +20,7 @@
|
|
|
20
20
|
# pragma system_header
|
|
21
21
|
#endif // no system header
|
|
22
22
|
|
|
23
|
+
#include <cuda/__fwd/complex.h>
|
|
23
24
|
#include <cuda/std/__concepts/concept_macros.h>
|
|
24
25
|
#include <cuda/std/__fwd/array.h>
|
|
25
26
|
#include <cuda/std/__fwd/complex.h>
|
|
@@ -58,6 +59,9 @@ inline constexpr bool __tuple_like_impl<array<_Tp, _Size>> = true;
|
|
|
58
59
|
template <class _Tp>
|
|
59
60
|
inline constexpr bool __tuple_like_impl<complex<_Tp>> = true;
|
|
60
61
|
|
|
62
|
+
template <class _Tp>
|
|
63
|
+
inline constexpr bool __tuple_like_impl<::cuda::complex<_Tp>> = true;
|
|
64
|
+
|
|
61
65
|
template <class _Ip, class _Sp, ::cuda::std::ranges::subrange_kind _Kp>
|
|
62
66
|
inline constexpr bool __tuple_like_impl<::cuda::std::ranges::subrange<_Ip, _Sp, _Kp>> = true;
|
|
63
67
|
|
|
@@ -20,6 +20,7 @@
|
|
|
20
20
|
# pragma system_header
|
|
21
21
|
#endif // no system header
|
|
22
22
|
|
|
23
|
+
#include <cuda/__fwd/complex.h>
|
|
23
24
|
#include <cuda/std/__fwd/array.h>
|
|
24
25
|
#include <cuda/std/__fwd/complex.h>
|
|
25
26
|
#include <cuda/std/__fwd/pair.h>
|
|
@@ -54,6 +55,9 @@ inline constexpr bool __tuple_like_ext<array<_Tp, _Size>> = true;
|
|
|
54
55
|
template <class _Tp>
|
|
55
56
|
inline constexpr bool __tuple_like_ext<complex<_Tp>> = true;
|
|
56
57
|
|
|
58
|
+
template <class _Tp>
|
|
59
|
+
inline constexpr bool __tuple_like_ext<::cuda::complex<_Tp>> = true;
|
|
60
|
+
|
|
57
61
|
template <class... _Tp>
|
|
58
62
|
inline constexpr bool __tuple_like_ext<__tuple_types<_Tp...>> = true;
|
|
59
63
|
|
|
@@ -57,7 +57,7 @@
|
|
|
57
57
|
#include <cuda/std/version>
|
|
58
58
|
|
|
59
59
|
#if !_CCCL_COMPILER(NVRTC)
|
|
60
|
-
# include <
|
|
60
|
+
# include <string_view>
|
|
61
61
|
#endif // !_CCCL_COMPILER(NVRTC)
|
|
62
62
|
|
|
63
63
|
#include <cuda/std/__cccl/prologue.h>
|
|
@@ -727,14 +727,21 @@ _CCCL_HOST_DEVICE basic_string_view(_Range&&) -> basic_string_view<::cuda::std::
|
|
|
727
727
|
|
|
728
728
|
// operator <<
|
|
729
729
|
|
|
730
|
-
#if
|
|
730
|
+
#if !_CCCL_COMPILER(NVRTC)
|
|
731
|
+
template <class _CharT>
|
|
732
|
+
_CCCL_HOST_API ::std::basic_ostream<_CharT>&
|
|
733
|
+
operator<<(::std::basic_ostream<_CharT>& __os, basic_string_view<_CharT> __str)
|
|
734
|
+
{
|
|
735
|
+
return __os << ::std::basic_string_view<_CharT>{__str.data(), __str.size()};
|
|
736
|
+
}
|
|
737
|
+
|
|
731
738
|
template <class _CharT, class _Traits>
|
|
732
|
-
|
|
739
|
+
_CCCL_HOST_API ::std::basic_ostream<_CharT, _Traits>&
|
|
733
740
|
operator<<(::std::basic_ostream<_CharT, _Traits>& __os, basic_string_view<_CharT, _Traits> __str)
|
|
734
741
|
{
|
|
735
|
-
return __os
|
|
742
|
+
return __os << ::std::basic_string_view<_CharT, _Traits>{__str.data(), __str.size()};
|
|
736
743
|
}
|
|
737
|
-
#endif //
|
|
744
|
+
#endif // !_CCCL_COMPILER(NVRTC)
|
|
738
745
|
|
|
739
746
|
// literals
|
|
740
747
|
|
|
@@ -141,7 +141,7 @@
|
|
|
141
141
|
// # define __cccl_lib_shared_mutex 201505L
|
|
142
142
|
// # define __cccl_lib_shared_ptr_arrays 201611L
|
|
143
143
|
// # define __cccl_lib_shared_ptr_weak_type 201606L
|
|
144
|
-
|
|
144
|
+
#define __cccl_lib_string_view 201803L
|
|
145
145
|
// # define __cccl_lib_to_chars 201611L
|
|
146
146
|
// # define __cccl_lib_uncaught_exceptions 201411L
|
|
147
147
|
// # define __cccl_lib_unordered_map_try_emplace 201411L
|
|
@@ -171,7 +171,6 @@
|
|
|
171
171
|
// # define __cccl_lib_constexpr_misc 201811L
|
|
172
172
|
// # define __cccl_lib_constexpr_numeric 201911L
|
|
173
173
|
// # define __cccl_lib_constexpr_string 201907L
|
|
174
|
-
// # define __cccl_lib_constexpr_string_view 201811L
|
|
175
174
|
// # define __cccl_lib_constexpr_swap_algorithms 201806L
|
|
176
175
|
// # define __cccl_lib_constexpr_tuple 201811L
|
|
177
176
|
// # define __cccl_lib_constexpr_utility 201811L
|
|
@@ -204,8 +203,6 @@
|
|
|
204
203
|
// # define __cccl_lib_source_location 201907L
|
|
205
204
|
// # define __cccl_lib_ssize 201902L
|
|
206
205
|
// # define __cccl_lib_starts_ends_with 201711L
|
|
207
|
-
// # undef __cccl_lib_string_view
|
|
208
|
-
// # define __cccl_lib_string_view 201803L
|
|
209
206
|
// # define __cccl_lib_syncbuf 201803L
|
|
210
207
|
// # define __cccl_lib_three_way_comparison 201907L
|
|
211
208
|
# define __cccl_lib_unwrap_ref 201811L
|
|
@@ -27,6 +27,8 @@
|
|
|
27
27
|
#endif // no system header
|
|
28
28
|
#include <thrust/detail/type_deduction.h>
|
|
29
29
|
|
|
30
|
+
#include <cuda/std/__bit/countl.h>
|
|
31
|
+
#include <cuda/std/__type_traits/make_unsigned.h>
|
|
30
32
|
#include <cuda/std/limits>
|
|
31
33
|
#include <cuda/std/type_traits>
|
|
32
34
|
|
|
@@ -36,25 +38,6 @@ THRUST_NAMESPACE_BEGIN
|
|
|
36
38
|
namespace detail
|
|
37
39
|
{
|
|
38
40
|
|
|
39
|
-
template <typename Integer>
|
|
40
|
-
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE Integer clz(Integer x)
|
|
41
|
-
{
|
|
42
|
-
Integer result;
|
|
43
|
-
|
|
44
|
-
NV_IF_TARGET(NV_IS_DEVICE,
|
|
45
|
-
(result = ::__clz(x);),
|
|
46
|
-
(int num_bits = 8 * sizeof(Integer); int num_bits_minus_one = num_bits - 1; result = num_bits;
|
|
47
|
-
for (int i = num_bits_minus_one; i >= 0; --i) {
|
|
48
|
-
if ((Integer(1) << i) & x)
|
|
49
|
-
{
|
|
50
|
-
result = num_bits_minus_one - i;
|
|
51
|
-
break;
|
|
52
|
-
}
|
|
53
|
-
}));
|
|
54
|
-
|
|
55
|
-
return result;
|
|
56
|
-
}
|
|
57
|
-
|
|
58
41
|
template <typename Integer>
|
|
59
42
|
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool is_power_of_2(Integer x)
|
|
60
43
|
{
|
|
@@ -85,7 +68,7 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE Integer log2(Integer x)
|
|
|
85
68
|
Integer num_bits = 8 * sizeof(Integer);
|
|
86
69
|
Integer num_bits_minus_one = num_bits - 1;
|
|
87
70
|
|
|
88
|
-
return num_bits_minus_one -
|
|
71
|
+
return num_bits_minus_one - ::cuda::std::countl_zero(::cuda::std::__to_unsigned_like(x));
|
|
89
72
|
}
|
|
90
73
|
|
|
91
74
|
template <typename Integer>
|
|
@@ -316,6 +316,17 @@ struct iterator_traversal<::cuda::zip_iterator<Iterators...>>
|
|
|
316
316
|
using type = detail::minimum_type<iterator_traversal_t<Iterators>...>;
|
|
317
317
|
};
|
|
318
318
|
|
|
319
|
+
template <class Fn, class... Iterators>
|
|
320
|
+
struct iterator_system<::cuda::zip_transform_iterator<Fn, Iterators...>>
|
|
321
|
+
{
|
|
322
|
+
using type = detail::minimum_system_t<iterator_system_t<Iterators>...>;
|
|
323
|
+
};
|
|
324
|
+
template <class Fn, class... Iterators>
|
|
325
|
+
struct iterator_traversal<::cuda::zip_transform_iterator<Fn, Iterators...>>
|
|
326
|
+
{
|
|
327
|
+
using type = detail::minimum_type<iterator_traversal_t<Iterators>...>;
|
|
328
|
+
};
|
|
329
|
+
|
|
319
330
|
//! \} // end iterator_traits
|
|
320
331
|
|
|
321
332
|
THRUST_NAMESPACE_END
|
|
@@ -48,6 +48,13 @@
|
|
|
48
48
|
#include <thrust/system/cuda/detail/util.h>
|
|
49
49
|
#include <thrust/type_traits/is_trivially_relocatable.h>
|
|
50
50
|
|
|
51
|
+
#if _CCCL_HAS_CUDA_COMPILER()
|
|
52
|
+
# include <cub/device/dispatch/tuning/tuning_transform.cuh>
|
|
53
|
+
#endif // _CCCL_HAS_CUDA_COMPILER()
|
|
54
|
+
|
|
55
|
+
#include <cuda/__fwd/zip_iterator.h>
|
|
56
|
+
#include <cuda/std/tuple>
|
|
57
|
+
|
|
51
58
|
THRUST_NAMESPACE_BEGIN
|
|
52
59
|
namespace cuda_cub
|
|
53
60
|
{
|
|
@@ -61,6 +68,21 @@ template <class Derived, class InputIt, class OutputIt, class TransformOp>
|
|
|
61
68
|
OutputIt _CCCL_API _CCCL_FORCEINLINE
|
|
62
69
|
transform(execution_policy<Derived>& policy, InputIt first, InputIt last, OutputIt result, TransformOp transform_op);
|
|
63
70
|
|
|
71
|
+
// Forward declare to work around a cyclic include, since "cuda/detail/transform.h" includes this header
|
|
72
|
+
// We want this to unwrap zip_transform_iterator
|
|
73
|
+
namespace __transform
|
|
74
|
+
{
|
|
75
|
+
_CCCL_EXEC_CHECK_DISABLE
|
|
76
|
+
template <class Derived, class Offset, class... InputIts, class OutputIt, class TransformOp, class Predicate>
|
|
77
|
+
OutputIt _CCCL_API _CCCL_FORCEINLINE cub_transform_many(
|
|
78
|
+
execution_policy<Derived>& policy,
|
|
79
|
+
::cuda::std::tuple<InputIts...> firsts,
|
|
80
|
+
OutputIt result,
|
|
81
|
+
Offset num_items,
|
|
82
|
+
TransformOp transform_op,
|
|
83
|
+
Predicate pred);
|
|
84
|
+
} // namespace __transform
|
|
85
|
+
|
|
64
86
|
namespace __copy
|
|
65
87
|
{
|
|
66
88
|
template <class H, class D, class T, class Size>
|
|
@@ -190,6 +212,17 @@ device_to_device(execution_policy<Derived>& policy, InputIt first, InputIt last,
|
|
|
190
212
|
|
|
191
213
|
return result + n;
|
|
192
214
|
}
|
|
215
|
+
else if constexpr (::cuda::__is_zip_transform_iterator<InputIt>)
|
|
216
|
+
{
|
|
217
|
+
const auto n = ::cuda::std::distance(first, last);
|
|
218
|
+
return cuda_cub::__transform::cub_transform_many(
|
|
219
|
+
policy,
|
|
220
|
+
::cuda::std::move(first).__base(),
|
|
221
|
+
result,
|
|
222
|
+
n,
|
|
223
|
+
::cuda::std::move(first).__pred(),
|
|
224
|
+
cub::detail::transform::always_true_predicate{});
|
|
225
|
+
}
|
|
193
226
|
else
|
|
194
227
|
{
|
|
195
228
|
return cuda_cub::transform(
|
|
@@ -1,77 +1,24 @@
|
|
|
1
|
-
# Copyright (c)
|
|
1
|
+
# Copyright (c) 2025, NVIDIA CORPORATION.
|
|
2
2
|
#
|
|
3
|
-
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
4
14
|
|
|
5
|
-
|
|
6
|
-
DoubleBuffer,
|
|
7
|
-
SortOrder,
|
|
8
|
-
binary_transform,
|
|
9
|
-
exclusive_scan,
|
|
10
|
-
histogram_even,
|
|
11
|
-
inclusive_scan,
|
|
12
|
-
make_binary_transform,
|
|
13
|
-
make_exclusive_scan,
|
|
14
|
-
make_histogram_even,
|
|
15
|
-
make_inclusive_scan,
|
|
16
|
-
make_merge_sort,
|
|
17
|
-
make_radix_sort,
|
|
18
|
-
make_reduce_into,
|
|
19
|
-
make_segmented_reduce,
|
|
20
|
-
make_three_way_partition,
|
|
21
|
-
make_unary_transform,
|
|
22
|
-
make_unique_by_key,
|
|
23
|
-
merge_sort,
|
|
24
|
-
radix_sort,
|
|
25
|
-
reduce_into,
|
|
26
|
-
segmented_reduce,
|
|
27
|
-
three_way_partition,
|
|
28
|
-
unary_transform,
|
|
29
|
-
unique_by_key,
|
|
30
|
-
)
|
|
31
|
-
from .iterators import (
|
|
32
|
-
CacheModifiedInputIterator,
|
|
33
|
-
ConstantIterator,
|
|
34
|
-
CountingIterator,
|
|
35
|
-
ReverseIterator,
|
|
36
|
-
TransformIterator,
|
|
37
|
-
TransformOutputIterator,
|
|
38
|
-
ZipIterator,
|
|
39
|
-
)
|
|
40
|
-
from .op import OpKind
|
|
41
|
-
from .struct import gpu_struct
|
|
15
|
+
# alias for backwards compatibility
|
|
42
16
|
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
"
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
"histogram_even",
|
|
52
|
-
"inclusive_scan",
|
|
53
|
-
"make_binary_transform",
|
|
54
|
-
"make_exclusive_scan",
|
|
55
|
-
"make_histogram_even",
|
|
56
|
-
"make_inclusive_scan",
|
|
57
|
-
"make_merge_sort",
|
|
58
|
-
"make_radix_sort",
|
|
59
|
-
"make_reduce_into",
|
|
60
|
-
"make_segmented_reduce",
|
|
61
|
-
"make_three_way_partition",
|
|
62
|
-
"make_unary_transform",
|
|
63
|
-
"make_unique_by_key",
|
|
64
|
-
"merge_sort",
|
|
65
|
-
"OpKind",
|
|
66
|
-
"radix_sort",
|
|
67
|
-
"reduce_into",
|
|
68
|
-
"ReverseIterator",
|
|
69
|
-
"segmented_reduce",
|
|
70
|
-
"SortOrder",
|
|
71
|
-
"TransformIterator",
|
|
72
|
-
"three_way_partition",
|
|
73
|
-
"TransformOutputIterator",
|
|
74
|
-
"unary_transform",
|
|
75
|
-
"unique_by_key",
|
|
76
|
-
"ZipIterator",
|
|
77
|
-
]
|
|
17
|
+
from warnings import warn
|
|
18
|
+
|
|
19
|
+
from cuda.compute import * # noqa: F403
|
|
20
|
+
|
|
21
|
+
warn(
|
|
22
|
+
"The module cuda.cccl.parallel.experimental is deprecated. Use cuda.compute instead.",
|
|
23
|
+
FutureWarning,
|
|
24
|
+
)
|
cuda/compute/__init__.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
|
|
2
|
+
#
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
4
|
+
|
|
5
|
+
from .algorithms import (
|
|
6
|
+
DoubleBuffer,
|
|
7
|
+
SortOrder,
|
|
8
|
+
binary_transform,
|
|
9
|
+
exclusive_scan,
|
|
10
|
+
histogram_even,
|
|
11
|
+
inclusive_scan,
|
|
12
|
+
make_binary_transform,
|
|
13
|
+
make_exclusive_scan,
|
|
14
|
+
make_histogram_even,
|
|
15
|
+
make_inclusive_scan,
|
|
16
|
+
make_merge_sort,
|
|
17
|
+
make_radix_sort,
|
|
18
|
+
make_reduce_into,
|
|
19
|
+
make_segmented_reduce,
|
|
20
|
+
make_three_way_partition,
|
|
21
|
+
make_unary_transform,
|
|
22
|
+
make_unique_by_key,
|
|
23
|
+
merge_sort,
|
|
24
|
+
radix_sort,
|
|
25
|
+
reduce_into,
|
|
26
|
+
segmented_reduce,
|
|
27
|
+
three_way_partition,
|
|
28
|
+
unary_transform,
|
|
29
|
+
unique_by_key,
|
|
30
|
+
)
|
|
31
|
+
from .iterators import (
|
|
32
|
+
CacheModifiedInputIterator,
|
|
33
|
+
ConstantIterator,
|
|
34
|
+
CountingIterator,
|
|
35
|
+
ReverseIterator,
|
|
36
|
+
TransformIterator,
|
|
37
|
+
TransformOutputIterator,
|
|
38
|
+
ZipIterator,
|
|
39
|
+
)
|
|
40
|
+
from .op import OpKind
|
|
41
|
+
from .struct import gpu_struct
|
|
42
|
+
|
|
43
|
+
__all__ = [
|
|
44
|
+
"binary_transform",
|
|
45
|
+
"CacheModifiedInputIterator",
|
|
46
|
+
"ConstantIterator",
|
|
47
|
+
"CountingIterator",
|
|
48
|
+
"DoubleBuffer",
|
|
49
|
+
"exclusive_scan",
|
|
50
|
+
"gpu_struct",
|
|
51
|
+
"histogram_even",
|
|
52
|
+
"inclusive_scan",
|
|
53
|
+
"make_binary_transform",
|
|
54
|
+
"make_exclusive_scan",
|
|
55
|
+
"make_histogram_even",
|
|
56
|
+
"make_inclusive_scan",
|
|
57
|
+
"make_merge_sort",
|
|
58
|
+
"make_radix_sort",
|
|
59
|
+
"make_reduce_into",
|
|
60
|
+
"make_segmented_reduce",
|
|
61
|
+
"make_three_way_partition",
|
|
62
|
+
"make_unary_transform",
|
|
63
|
+
"make_unique_by_key",
|
|
64
|
+
"merge_sort",
|
|
65
|
+
"OpKind",
|
|
66
|
+
"radix_sort",
|
|
67
|
+
"reduce_into",
|
|
68
|
+
"ReverseIterator",
|
|
69
|
+
"segmented_reduce",
|
|
70
|
+
"SortOrder",
|
|
71
|
+
"TransformIterator",
|
|
72
|
+
"TransformOutputIterator",
|
|
73
|
+
"three_way_partition",
|
|
74
|
+
"unary_transform",
|
|
75
|
+
"unique_by_key",
|
|
76
|
+
"ZipIterator",
|
|
77
|
+
]
|
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
|
|
5
5
|
# Python signatures are declared in the companion Python stub file _bindings.pyi
|
|
6
6
|
# Make sure to update PYI with change to Python API to ensure that Python
|
|
7
|
-
# static type checker tools like mypy green-lights cuda.
|
|
7
|
+
# static type checker tools like mypy green-lights cuda.compute
|
|
8
8
|
|
|
9
9
|
from libc.string cimport memset, memcpy
|
|
10
10
|
from libc.stdint cimport uint8_t, uint32_t, uint64_t, int64_t, uintptr_t
|
|
@@ -148,7 +148,7 @@ def make_histogram_even(
|
|
|
148
148
|
Example:
|
|
149
149
|
Below, ``make_histogram_even`` is used to create a histogram object that can be reused.
|
|
150
150
|
|
|
151
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
151
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/histogram/histogram_object.py
|
|
152
152
|
:language: python
|
|
153
153
|
:start-after: # example-begin
|
|
154
154
|
|
|
@@ -190,7 +190,7 @@ def histogram_even(
|
|
|
190
190
|
Example:
|
|
191
191
|
Below, ``histogram_even`` is used to compute a histogram with evenly-spaced bins.
|
|
192
192
|
|
|
193
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
193
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/histogram/histogram_even_basic.py
|
|
194
194
|
:language: python
|
|
195
195
|
:start-after: # example-begin
|
|
196
196
|
:caption: Basic histogram example.
|