cuda-cccl 0.3.0__cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.1__cp312-cp312-manylinux_2_24_aarch64.whl

This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of cuda-cccl might be problematic.
Files changed (144)
  1. cuda/cccl/cooperative/__init__.py +7 -1
  2. cuda/cccl/cooperative/experimental/__init__.py +21 -5
  3. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
  4. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
  5. cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
  6. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
  7. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
  8. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
  9. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
  10. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
  11. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
  12. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
  13. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
  14. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
  15. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
  16. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
  17. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
  18. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
  19. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
  20. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +3 -2
  21. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
  22. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
  23. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +158 -247
  24. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +4 -4
  25. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
  26. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +8 -26
  27. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -6
  28. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
  29. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
  30. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
  31. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
  32. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
  33. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
  34. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
  35. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
  36. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
  37. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
  38. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
  39. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
  40. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
  41. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
  42. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
  43. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
  44. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
  45. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
  46. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
  47. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
  48. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
  49. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
  50. cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
  51. cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
  52. cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
  53. cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
  54. cuda/cccl/headers/include/cuda/__device/arch_traits.h +48 -46
  55. cuda/cccl/headers/include/cuda/__device/attributes.h +171 -121
  56. cuda/cccl/headers/include/cuda/__device/device_ref.h +30 -42
  57. cuda/cccl/headers/include/cuda/__device/physical_device.h +120 -91
  58. cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
  59. cuda/cccl/headers/include/cuda/__event/event.h +1 -0
  60. cuda/cccl/headers/include/cuda/__event/timed_event.h +1 -0
  61. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  62. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
  63. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +158 -0
  64. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +8 -120
  65. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +593 -0
  66. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +4 -3
  67. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +1 -0
  68. cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
  69. cuda/cccl/headers/include/cuda/algorithm +1 -1
  70. cuda/cccl/headers/include/cuda/devices +10 -0
  71. cuda/cccl/headers/include/cuda/iterator +1 -0
  72. cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
  73. cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
  74. cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
  75. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
  76. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
  77. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
  78. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
  79. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
  80. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
  81. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
  82. cuda/cccl/headers/include/cuda/std/string_view +12 -5
  83. cuda/cccl/headers/include/cuda/std/version +1 -4
  84. cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
  85. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
  86. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
  87. cuda/cccl/parallel/experimental/__init__.py +21 -74
  88. cuda/compute/__init__.py +77 -0
  89. cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +1 -1
  90. cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
  91. cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
  92. cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
  93. cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
  94. cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +4 -4
  95. cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
  96. cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
  97. cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +4 -4
  98. cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
  99. cuda/{cccl/parallel/experimental → compute}/cu12/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
  100. cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
  101. cuda/{cccl/parallel/experimental → compute}/cu13/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
  102. cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
  103. cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +8 -8
  104. cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
  105. cuda/coop/__init__.py +8 -0
  106. cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
  107. cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
  108. cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
  109. cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
  110. cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
  111. cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
  112. cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
  113. cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
  114. cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
  115. cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
  116. cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
  117. cuda/coop/warp/__init__.py +9 -0
  118. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
  119. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
  120. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
  121. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/METADATA +1 -1
  122. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/RECORD +141 -138
  123. cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
  124. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
  125. cuda/cccl/parallel/experimental/.gitignore +0 -4
  126. /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
  127. /cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +0 -0
  128. /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
  129. /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
  130. /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
  131. /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
  132. /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
  133. /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
  134. /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
  135. /cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +0 -0
  136. /cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +0 -0
  137. /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
  138. /cuda/{cccl/parallel/experimental → compute}/numba_utils.py +0 -0
  139. /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
  140. /cuda/{cccl/parallel/experimental → compute}/typing.py +0 -0
  141. /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
  142. /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
  143. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/WHEEL +0 -0
  144. {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.1.dist-info}/licenses/LICENSE +0 -0
cuda/cccl/headers/include/cuda/std/__bit/countl.h

@@ -100,7 +100,14 @@ template <typename _Tp>
  template <typename _Tp>
  [[nodiscard]] _CCCL_HIDE_FROM_ABI _CCCL_DEVICE int __cccl_countl_zero_impl_device(_Tp __v) noexcept
  {
- return (sizeof(_Tp) == sizeof(uint32_t)) ? ::__clz(static_cast<int>(__v)) : ::__clzll(static_cast<long long>(__v));
+ if constexpr (sizeof(_Tp) == sizeof(uint32_t))
+ {
+ return static_cast<int>(::__clz(static_cast<int>(__v)));
+ }
+ else
+ {
+ return static_cast<int>(::__clzll(static_cast<long long>(__v)));
+ }
  }
  #endif // _CCCL_CUDA_COMPILATION()

cuda/cccl/headers/include/cuda/std/__bit/countr.h

@@ -114,11 +114,11 @@ template <typename _Tp>
  {
  if constexpr (sizeof(_Tp) == sizeof(uint32_t))
  {
- return ::__clz(static_cast<int>(::__brev(__v)));
+ return static_cast<int>(::__clz(static_cast<int>(::__brev(__v))));
  }
  else
  {
- return ::__clzll(static_cast<long long>(::__brevll(__v)));
+ return static_cast<int>(::__clzll(static_cast<long long>(::__brevll(__v))));
  }
  }
  #endif // _CCCL_CUDA_COMPILATION()
cuda/cccl/headers/include/cuda/std/__bit/reference.h

@@ -275,10 +275,10 @@ _CCCL_API constexpr __bit_iterator<_Cp, false> __copy_aligned(
  // do first word
  if (__first.__ctz_ != 0)
  {
- unsigned __clz = __bits_per_word - __first.__ctz_;
- difference_type __dn = ::cuda::std::min(static_cast<difference_type>(__clz), __n);
+ unsigned __clz_f = __bits_per_word - __first.__ctz_;
+ difference_type __dn = ::cuda::std::min(static_cast<difference_type>(__clz_f), __n);
  __n -= __dn;
- __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz - __dn));
+ __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
  __storage_type __b = *__first.__seg_ & __m;
  *__result.__seg_ &= ~__m;
  *__result.__seg_ |= __b;

@@ -420,8 +420,8 @@ _CCCL_API constexpr __bit_iterator<_Cp, false> __copy_backward_aligned(
  {
  difference_type __dn = ::cuda::std::min(static_cast<difference_type>(__last.__ctz_), __n);
  __n -= __dn;
- unsigned __clz = __bits_per_word - __last.__ctz_;
- __storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz);
+ unsigned __clz_f = __bits_per_word - __last.__ctz_;
+ __storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz_f);
  __storage_type __b = *__last.__seg_ & __m;
  *__result.__seg_ &= ~__m;
  *__result.__seg_ |= __b;

@@ -635,10 +635,10 @@ _CCCL_API inline __bit_iterator<_Cr, false> __swap_ranges_aligned(
  // do first word
  if (__first.__ctz_ != 0)
  {
- unsigned __clz = __bits_per_word - __first.__ctz_;
- difference_type __dn = ::cuda::std::min(static_cast<difference_type>(__clz), __n);
+ unsigned __clz_f = __bits_per_word - __first.__ctz_;
+ difference_type __dn = ::cuda::std::min(static_cast<difference_type>(__clz_f), __n);
  __n -= __dn;
- __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz - __dn));
+ __storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
  __storage_type __b1 = *__first.__seg_ & __m;
  *__first.__seg_ &= ~__m;
  __storage_type __b2 = *__result.__seg_ & __m;

@@ -988,10 +988,10 @@ _CCCL_API constexpr bool __equal_aligned(
  // do first word
  if (__first1.__ctz_ != 0)
  {
- unsigned __clz = __bits_per_word - __first1.__ctz_;
- difference_type __dn = ::cuda::std::min(static_cast<difference_type>(__clz), __n);
+ unsigned __clz_f = __bits_per_word - __first1.__ctz_;
+ difference_type __dn = ::cuda::std::min(static_cast<difference_type>(__clz_f), __n);
  __n -= __dn;
- __storage_type __m = (~__storage_type(0) << __first1.__ctz_) & (~__storage_type(0) >> (__clz - __dn));
+ __storage_type __m = (~__storage_type(0) << __first1.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
  if ((*__first2.__seg_ & __m) != (*__first1.__seg_ & __m))
  {
  return false;
cuda/cccl/headers/include/cuda/std/__chrono/duration.h

@@ -43,19 +43,19 @@ template <class _Rep, class _Period = ratio<1>>
  class _CCCL_TYPE_VISIBILITY_DEFAULT duration;

  template <class _Tp>
- inline const bool __is_duration_v = false;
+ inline constexpr bool __is_duration_v = false;

  template <class _Rep, class _Period>
- inline const bool __is_duration_v<duration<_Rep, _Period>> = true;
+ inline constexpr bool __is_duration_v<duration<_Rep, _Period>> = true;

  template <class _Rep, class _Period>
- inline const bool __is_duration_v<const duration<_Rep, _Period>> = true;
+ inline constexpr bool __is_duration_v<const duration<_Rep, _Period>> = true;

  template <class _Rep, class _Period>
- inline const bool __is_duration_v<volatile duration<_Rep, _Period>> = true;
+ inline constexpr bool __is_duration_v<volatile duration<_Rep, _Period>> = true;

  template <class _Rep, class _Period>
- inline const bool __is_duration_v<const volatile duration<_Rep, _Period>> = true;
+ inline constexpr bool __is_duration_v<const volatile duration<_Rep, _Period>> = true;

  } // namespace chrono

@@ -190,29 +190,29 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT duration
  struct __no_overflow
  {
  private:
- static const intmax_t __gcd_n1_n2 = __static_gcd<_R1::num, _R2::num>::value;
- static const intmax_t __gcd_d1_d2 = __static_gcd<_R1::den, _R2::den>::value;
- static const intmax_t __n1 = _R1::num / __gcd_n1_n2;
- static const intmax_t __d1 = _R1::den / __gcd_d1_d2;
- static const intmax_t __n2 = _R2::num / __gcd_n1_n2;
- static const intmax_t __d2 = _R2::den / __gcd_d1_d2;
- static const intmax_t max = -((intmax_t(1) << (sizeof(intmax_t) * CHAR_BIT - 1)) + 1);
+ static constexpr intmax_t __gcd_n1_n2 = __static_gcd<_R1::num, _R2::num>::value;
+ static constexpr intmax_t __gcd_d1_d2 = __static_gcd<_R1::den, _R2::den>::value;
+ static constexpr intmax_t __n1 = _R1::num / __gcd_n1_n2;
+ static constexpr intmax_t __d1 = _R1::den / __gcd_d1_d2;
+ static constexpr intmax_t __n2 = _R2::num / __gcd_n1_n2;
+ static constexpr intmax_t __d2 = _R2::den / __gcd_d1_d2;
+ static constexpr intmax_t max = -((intmax_t(1) << (sizeof(intmax_t) * CHAR_BIT - 1)) + 1);

  template <intmax_t _Xp, intmax_t _Yp, bool __overflow>
  struct __mul // __overflow == false
  {
- static const intmax_t value = _Xp * _Yp;
+ static constexpr intmax_t value = _Xp * _Yp;
  };

  template <intmax_t _Xp, intmax_t _Yp>
  struct __mul<_Xp, _Yp, true>
  {
- static const intmax_t value = 1;
+ static constexpr intmax_t value = 1;
  };

  public:
- static const bool value = (__n1 <= max / __d2) && (__n2 <= max / __d1);
- using type = ratio<__mul<__n1, __d2, !value>::value, __mul<__n2, __d1, !value>::value>;
+ static constexpr bool value = (__n1 <= max / __d2) && (__n2 <= max / __d1);
+ using type = ratio<__mul<__n1, __d2, !value>::value, __mul<__n2, __d1, !value>::value>;
  };

  public:
cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h

@@ -40,11 +40,11 @@ namespace chrono
  class _CCCL_TYPE_VISIBILITY_DEFAULT steady_clock
  {
  public:
- using duration = nanoseconds;
- using rep = duration::rep;
- using period = duration::period;
- using time_point = ::cuda::std::chrono::time_point<steady_clock, duration>;
- static constexpr const bool is_steady = true;
+ using duration = nanoseconds;
+ using rep = duration::rep;
+ using period = duration::period;
+ using time_point = ::cuda::std::chrono::time_point<steady_clock, duration>;
+ static constexpr bool is_steady = true;

  [[nodiscard]] _CCCL_API static time_point now() noexcept;
  };

cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h

@@ -39,11 +39,11 @@ namespace chrono
  class _CCCL_TYPE_VISIBILITY_DEFAULT system_clock
  {
  public:
- using duration = ::cuda::std::chrono::nanoseconds;
- using rep = duration::rep;
- using period = duration::period;
- using time_point = ::cuda::std::chrono::time_point<system_clock>;
- static constexpr const bool is_steady = false;
+ using duration = ::cuda::std::chrono::nanoseconds;
+ using rep = duration::rep;
+ using period = duration::period;
+ using time_point = ::cuda::std::chrono::time_point<system_clock>;
+ static constexpr bool is_steady = false;

  [[nodiscard]] _CCCL_API inline static time_point now() noexcept
  {
cuda/cccl/headers/include/cuda/std/__floating_point/fp.h

@@ -11,7 +11,7 @@
  #ifndef _CUDA_STD___FLOATING_POINT_FP_H
  #define _CUDA_STD___FLOATING_POINT_FP_H

- #include <cuda/__cccl_config>
+ #include <cuda/std/detail/__config>

  #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
  # pragma GCC system_header
cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h

@@ -20,7 +20,9 @@
  # pragma system_header
  #endif // no system header

+ #include <cuda/__fwd/complex.h>
  #include <cuda/std/__fwd/array.h>
+ #include <cuda/std/__fwd/complex.h>
  #include <cuda/std/__fwd/tuple.h>
  #include <cuda/std/__tuple_dir/tuple_element.h>
  #include <cuda/std/__tuple_dir/tuple_indices.h>

@@ -61,7 +63,27 @@ struct __make_tuple_types_flat<array<_Vt, _Np>, __tuple_indices<_Idx...>>
  template <size_t>
  using __value_type = _Vt;
  template <class _Tp, class _ApplyFn = __apply_cvref_fn<_Tp>>
- using __apply_quals = __tuple_types<__type_call<_ApplyFn, __value_type<_Idx>>...>;
+ using __apply_quals _CCCL_NODEBUG_ALIAS = __tuple_types<__type_call<_ApplyFn, __value_type<_Idx>>...>;
+ };
+
+ template <class _Vt, size_t... _Idx>
+ struct __make_tuple_types_flat<complex<_Vt>, __tuple_indices<_Idx...>>
+ {
+ static_assert(sizeof...(_Idx) == 2, "__make_tuple_types: complex has only 2 members");
+ template <size_t>
+ using __value_type = _Vt;
+ template <class _Tp, class _ApplyFn = __apply_cvref_fn<_Tp>>
+ using __apply_quals _CCCL_NODEBUG_ALIAS = __tuple_types<__type_call<_ApplyFn, __value_type<_Idx>>...>;
+ };
+
+ template <class _Vt, size_t... _Idx>
+ struct __make_tuple_types_flat<::cuda::complex<_Vt>, __tuple_indices<_Idx...>>
+ {
+ static_assert(sizeof...(_Idx) == 2, "__make_tuple_types: complex has only 2 members");
+ template <size_t>
+ using __value_type = _Vt;
+ template <class _Tp, class _ApplyFn = __apply_cvref_fn<_Tp>>
+ using __apply_quals _CCCL_NODEBUG_ALIAS = __tuple_types<__type_call<_ApplyFn, __value_type<_Idx>>...>;
  };

  template <class _Tp,
cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h

@@ -20,6 +20,7 @@
  # pragma system_header
  #endif // no system header

+ #include <cuda/__fwd/complex.h>
  #include <cuda/std/__concepts/concept_macros.h>
  #include <cuda/std/__fwd/array.h>
  #include <cuda/std/__fwd/complex.h>

@@ -58,6 +59,9 @@ inline constexpr bool __tuple_like_impl<array<_Tp, _Size>> = true;
  template <class _Tp>
  inline constexpr bool __tuple_like_impl<complex<_Tp>> = true;

+ template <class _Tp>
+ inline constexpr bool __tuple_like_impl<::cuda::complex<_Tp>> = true;
+
  template <class _Ip, class _Sp, ::cuda::std::ranges::subrange_kind _Kp>
  inline constexpr bool __tuple_like_impl<::cuda::std::ranges::subrange<_Ip, _Sp, _Kp>> = true;

cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h

@@ -20,6 +20,7 @@
  # pragma system_header
  #endif // no system header

+ #include <cuda/__fwd/complex.h>
  #include <cuda/std/__fwd/array.h>
  #include <cuda/std/__fwd/complex.h>
  #include <cuda/std/__fwd/pair.h>

@@ -54,6 +55,9 @@ inline constexpr bool __tuple_like_ext<array<_Tp, _Size>> = true;
  template <class _Tp>
  inline constexpr bool __tuple_like_ext<complex<_Tp>> = true;

+ template <class _Tp>
+ inline constexpr bool __tuple_like_ext<::cuda::complex<_Tp>> = true;
+
  template <class... _Tp>
  inline constexpr bool __tuple_like_ext<__tuple_types<_Tp...>> = true;

cuda/cccl/headers/include/cuda/std/string_view

@@ -57,7 +57,7 @@
  #include <cuda/std/version>

  #if !_CCCL_COMPILER(NVRTC)
- # include <iosfwd>
+ # include <string_view>
  #endif // !_CCCL_COMPILER(NVRTC)

  #include <cuda/std/__cccl/prologue.h>

@@ -727,14 +727,21 @@ _CCCL_HOST_DEVICE basic_string_view(_Range&&) -> basic_string_view<::cuda::std::

  // operator <<

- #if 0 // todo: we need to implement char_traits stream types & functions
+ #if !_CCCL_COMPILER(NVRTC)
+ template <class _CharT>
+ _CCCL_HOST_API ::std::basic_ostream<_CharT>&
+ operator<<(::std::basic_ostream<_CharT>& __os, basic_string_view<_CharT> __str)
+ {
+ return __os << ::std::basic_string_view<_CharT>{__str.data(), __str.size()};
+ }
+
  template <class _CharT, class _Traits>
- _CCCL_API inline ::std::basic_ostream<_CharT, _Traits>&
+ _CCCL_HOST_API ::std::basic_ostream<_CharT, _Traits>&
  operator<<(::std::basic_ostream<_CharT, _Traits>& __os, basic_string_view<_CharT, _Traits> __str)
  {
- return __os.write(__str.data(), static_cast<::std::streamsize>(__str.size()));
+ return __os << ::std::basic_string_view<_CharT, _Traits>{__str.data(), __str.size()};
  }
- #endif // 0
+ #endif // !_CCCL_COMPILER(NVRTC)

  // literals

cuda/cccl/headers/include/cuda/std/version

@@ -141,7 +141,7 @@
  // # define __cccl_lib_shared_mutex 201505L
  // # define __cccl_lib_shared_ptr_arrays 201611L
  // # define __cccl_lib_shared_ptr_weak_type 201606L
- // # define __cccl_lib_string_view 201606L
+ #define __cccl_lib_string_view 201803L
  // # define __cccl_lib_to_chars 201611L
  // # define __cccl_lib_uncaught_exceptions 201411L
  // # define __cccl_lib_unordered_map_try_emplace 201411L

@@ -171,7 +171,6 @@
  // # define __cccl_lib_constexpr_misc 201811L
  // # define __cccl_lib_constexpr_numeric 201911L
  // # define __cccl_lib_constexpr_string 201907L
- // # define __cccl_lib_constexpr_string_view 201811L
  // # define __cccl_lib_constexpr_swap_algorithms 201806L
  // # define __cccl_lib_constexpr_tuple 201811L
  // # define __cccl_lib_constexpr_utility 201811L

@@ -204,8 +203,6 @@
  // # define __cccl_lib_source_location 201907L
  // # define __cccl_lib_ssize 201902L
  // # define __cccl_lib_starts_ends_with 201711L
- // # undef __cccl_lib_string_view
- // # define __cccl_lib_string_view 201803L
  // # define __cccl_lib_syncbuf 201803L
  // # define __cccl_lib_three_way_comparison 201907L
  # define __cccl_lib_unwrap_ref 201811L
cuda/cccl/headers/include/thrust/detail/integer_math.h

@@ -27,6 +27,8 @@
  #endif // no system header
  #include <thrust/detail/type_deduction.h>

+ #include <cuda/std/__bit/countl.h>
+ #include <cuda/std/__type_traits/make_unsigned.h>
  #include <cuda/std/limits>
  #include <cuda/std/type_traits>

@@ -36,25 +38,6 @@ THRUST_NAMESPACE_BEGIN
  namespace detail
  {

- template <typename Integer>
- _CCCL_HOST_DEVICE _CCCL_FORCEINLINE Integer clz(Integer x)
- {
- Integer result;
-
- NV_IF_TARGET(NV_IS_DEVICE,
- (result = ::__clz(x);),
- (int num_bits = 8 * sizeof(Integer); int num_bits_minus_one = num_bits - 1; result = num_bits;
- for (int i = num_bits_minus_one; i >= 0; --i) {
- if ((Integer(1) << i) & x)
- {
- result = num_bits_minus_one - i;
- break;
- }
- }));
-
- return result;
- }
-
  template <typename Integer>
  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool is_power_of_2(Integer x)
  {

@@ -85,7 +68,7 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE Integer log2(Integer x)
  Integer num_bits = 8 * sizeof(Integer);
  Integer num_bits_minus_one = num_bits - 1;

- return num_bits_minus_one - clz(x);
+ return num_bits_minus_one - ::cuda::std::countl_zero(::cuda::std::__to_unsigned_like(x));
  }

  template <typename Integer>
cuda/cccl/headers/include/thrust/iterator/iterator_traits.h

@@ -316,6 +316,17 @@ struct iterator_traversal<::cuda::zip_iterator<Iterators...>>
  using type = detail::minimum_type<iterator_traversal_t<Iterators>...>;
  };

+ template <class Fn, class... Iterators>
+ struct iterator_system<::cuda::zip_transform_iterator<Fn, Iterators...>>
+ {
+ using type = detail::minimum_system_t<iterator_system_t<Iterators>...>;
+ };
+ template <class Fn, class... Iterators>
+ struct iterator_traversal<::cuda::zip_transform_iterator<Fn, Iterators...>>
+ {
+ using type = detail::minimum_type<iterator_traversal_t<Iterators>...>;
+ };
+
  //! \} // end iterator_traits

  THRUST_NAMESPACE_END
cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h

@@ -48,6 +48,13 @@
  #include <thrust/system/cuda/detail/util.h>
  #include <thrust/type_traits/is_trivially_relocatable.h>

+ #if _CCCL_HAS_CUDA_COMPILER()
+ # include <cub/device/dispatch/tuning/tuning_transform.cuh>
+ #endif // _CCCL_HAS_CUDA_COMPILER()
+
+ #include <cuda/__fwd/zip_iterator.h>
+ #include <cuda/std/tuple>
+
  THRUST_NAMESPACE_BEGIN
  namespace cuda_cub
  {

@@ -61,6 +68,21 @@ template <class Derived, class InputIt, class OutputIt, class TransformOp>
  OutputIt _CCCL_API _CCCL_FORCEINLINE
  transform(execution_policy<Derived>& policy, InputIt first, InputIt last, OutputIt result, TransformOp transform_op);

+ // Forward declare to work around a cyclic include, since "cuda/detail/transform.h" includes this header
+ // We want this to unwrap zip_transform_iterator
+ namespace __transform
+ {
+ _CCCL_EXEC_CHECK_DISABLE
+ template <class Derived, class Offset, class... InputIts, class OutputIt, class TransformOp, class Predicate>
+ OutputIt _CCCL_API _CCCL_FORCEINLINE cub_transform_many(
+ execution_policy<Derived>& policy,
+ ::cuda::std::tuple<InputIts...> firsts,
+ OutputIt result,
+ Offset num_items,
+ TransformOp transform_op,
+ Predicate pred);
+ } // namespace __transform
+
  namespace __copy
  {
  template <class H, class D, class T, class Size>

@@ -190,6 +212,17 @@ device_to_device(execution_policy<Derived>& policy, InputIt first, InputIt last,

  return result + n;
  }
+ else if constexpr (::cuda::__is_zip_transform_iterator<InputIt>)
+ {
+ const auto n = ::cuda::std::distance(first, last);
+ return cuda_cub::__transform::cub_transform_many(
+ policy,
+ ::cuda::std::move(first).__base(),
+ result,
+ n,
+ ::cuda::std::move(first).__pred(),
+ cub::detail::transform::always_true_predicate{});
+ }
  else
  {
  return cuda_cub::transform(
cuda/cccl/parallel/experimental/__init__.py

@@ -1,77 +1,24 @@
- # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+ # Copyright (c) 2025, NVIDIA CORPORATION.
  #
- # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.

- from .algorithms import (
- DoubleBuffer,
- SortOrder,
- binary_transform,
- exclusive_scan,
- histogram_even,
- inclusive_scan,
- make_binary_transform,
- make_exclusive_scan,
- make_histogram_even,
- make_inclusive_scan,
- make_merge_sort,
- make_radix_sort,
- make_reduce_into,
- make_segmented_reduce,
- make_three_way_partition,
- make_unary_transform,
- make_unique_by_key,
- merge_sort,
- radix_sort,
- reduce_into,
- segmented_reduce,
- three_way_partition,
- unary_transform,
- unique_by_key,
- )
- from .iterators import (
- CacheModifiedInputIterator,
- ConstantIterator,
- CountingIterator,
- ReverseIterator,
- TransformIterator,
- TransformOutputIterator,
- ZipIterator,
- )
- from .op import OpKind
- from .struct import gpu_struct
+ # alias for backwards compatibility

- __all__ = [
- "binary_transform",
- "CacheModifiedInputIterator",
- "ConstantIterator",
- "CountingIterator",
- "DoubleBuffer",
- "exclusive_scan",
- "gpu_struct",
- "histogram_even",
- "inclusive_scan",
- "make_binary_transform",
- "make_exclusive_scan",
- "make_histogram_even",
- "make_inclusive_scan",
- "make_merge_sort",
- "make_radix_sort",
- "make_reduce_into",
- "make_segmented_reduce",
- "make_three_way_partition",
- "make_unary_transform",
- "make_unique_by_key",
- "merge_sort",
- "OpKind",
- "radix_sort",
- "reduce_into",
- "ReverseIterator",
- "segmented_reduce",
- "SortOrder",
- "TransformIterator",
- "three_way_partition",
- "TransformOutputIterator",
- "unary_transform",
- "unique_by_key",
- "ZipIterator",
- ]
+ from warnings import warn
+
+ from cuda.compute import * # noqa: F403
+
+ warn(
+ "The module cuda.cccl.parallel.experimental is deprecated. Use cuda.compute instead.",
+ FutureWarning,
+ )
cuda/compute/__init__.py (new file)

@@ -0,0 +1,77 @@
+ # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+ #
+ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+ from .algorithms import (
+ DoubleBuffer,
+ SortOrder,
+ binary_transform,
+ exclusive_scan,
+ histogram_even,
+ inclusive_scan,
+ make_binary_transform,
+ make_exclusive_scan,
+ make_histogram_even,
+ make_inclusive_scan,
+ make_merge_sort,
+ make_radix_sort,
+ make_reduce_into,
+ make_segmented_reduce,
+ make_three_way_partition,
+ make_unary_transform,
+ make_unique_by_key,
+ merge_sort,
+ radix_sort,
+ reduce_into,
+ segmented_reduce,
+ three_way_partition,
+ unary_transform,
+ unique_by_key,
+ )
+ from .iterators import (
+ CacheModifiedInputIterator,
+ ConstantIterator,
+ CountingIterator,
+ ReverseIterator,
+ TransformIterator,
+ TransformOutputIterator,
+ ZipIterator,
+ )
+ from .op import OpKind
+ from .struct import gpu_struct
+
+ __all__ = [
+ "binary_transform",
+ "CacheModifiedInputIterator",
+ "ConstantIterator",
+ "CountingIterator",
+ "DoubleBuffer",
+ "exclusive_scan",
+ "gpu_struct",
+ "histogram_even",
+ "inclusive_scan",
+ "make_binary_transform",
+ "make_exclusive_scan",
+ "make_histogram_even",
+ "make_inclusive_scan",
+ "make_merge_sort",
+ "make_radix_sort",
+ "make_reduce_into",
+ "make_segmented_reduce",
+ "make_three_way_partition",
+ "make_unary_transform",
+ "make_unique_by_key",
+ "merge_sort",
+ "OpKind",
+ "radix_sort",
+ "reduce_into",
+ "ReverseIterator",
+ "segmented_reduce",
+ "SortOrder",
+ "TransformIterator",
+ "TransformOutputIterator",
+ "three_way_partition",
+ "unary_transform",
+ "unique_by_key",
+ "ZipIterator",
+ ]
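
Since the deprecated module re-exports everything the new package lists in __all__, the equivalence of the two import paths can be checked directly. A small sketch, assuming both modules are importable in the current environment:

    # Sketch: verify that every public name of cuda.compute is also reachable
    # through the deprecated cuda.cccl.parallel.experimental shim.
    import warnings

    import cuda.compute as compute

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", FutureWarning)  # silence the deprecation warning
        import cuda.cccl.parallel.experimental as legacy

    missing = [name for name in compute.__all__ if not hasattr(legacy, name)]
    assert not missing, f"names missing from the shim: {missing}"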
cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx

@@ -4,7 +4,7 @@

  # Python signatures are declared in the companion Python stub file _bindings.pyi
  # Make sure to update PYI with change to Python API to ensure that Python
- # static type checker tools like mypy green-lights cuda.cccl.parallel
+ # static type checker tools like mypy green-lights cuda.compute

  from libc.string cimport memset, memcpy
  from libc.stdint cimport uint8_t, uint32_t, uint64_t, int64_t, uintptr_t
cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py

@@ -148,7 +148,7 @@ def make_histogram_even(
  Example:
  Below, ``make_histogram_even`` is used to create a histogram object that can be reused.

- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/histogram/histogram_object.py
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/histogram/histogram_object.py
  :language: python
  :start-after: # example-begin

@@ -190,7 +190,7 @@ def histogram_even(
  Example:
  Below, ``histogram_even`` is used to compute a histogram with evenly-spaced bins.

- .. literalinclude:: ../../python/cuda_cccl/tests/parallel/examples/histogram/histogram_even_basic.py
+ .. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/histogram/histogram_even_basic.py
  :language: python
  :start-after: # example-begin
  :caption: Basic histogram example.