cuda-cccl 0.1.3.2.0.dev438__cp310-cp310-manylinux_2_24_aarch64.whl → 0.3.1__cp310-cp310-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cuda-cccl might be problematic. Click here for more details.

Files changed (177):
  1. cuda/cccl/cooperative/__init__.py +7 -1
  2. cuda/cccl/cooperative/experimental/__init__.py +21 -5
  3. cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
  4. cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
  5. cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
  6. cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
  7. cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
  8. cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +25 -5
  9. cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
  10. cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
  11. cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
  12. cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
  13. cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
  14. cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
  15. cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
  16. cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
  17. cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +24 -19
  18. cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
  19. cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
  20. cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
  21. cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +3 -2
  22. cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
  23. cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
  24. cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +18 -26
  25. cuda/cccl/headers/include/cub/device/device_copy.cuh +116 -27
  26. cuda/cccl/headers/include/cub/device/device_partition.cuh +5 -1
  27. cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +158 -247
  28. cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
  29. cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +4 -4
  30. cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
  31. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +8 -26
  32. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -6
  33. cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
  34. cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +320 -262
  35. cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +10 -5
  36. cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
  37. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
  38. cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +57 -10
  39. cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +37 -13
  40. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
  41. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
  42. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
  43. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
  44. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
  45. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
  46. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
  47. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
  48. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
  49. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
  50. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
  51. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +204 -55
  52. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
  53. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +55 -19
  54. cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
  55. cuda/cccl/headers/include/cub/util_device.cuh +51 -35
  56. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
  57. cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
  58. cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
  59. cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
  60. cuda/cccl/headers/include/cuda/__algorithm/copy.h +4 -4
  61. cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
  62. cuda/cccl/headers/include/cuda/__device/all_devices.h +47 -147
  63. cuda/cccl/headers/include/cuda/__device/arch_traits.h +51 -49
  64. cuda/cccl/headers/include/cuda/__device/attributes.h +177 -127
  65. cuda/cccl/headers/include/cuda/__device/device_ref.h +32 -51
  66. cuda/cccl/headers/include/cuda/__device/physical_device.h +120 -91
  67. cuda/cccl/headers/include/cuda/__driver/driver_api.h +330 -36
  68. cuda/cccl/headers/include/cuda/__event/event.h +8 -8
  69. cuda/cccl/headers/include/cuda/__event/event_ref.h +4 -5
  70. cuda/cccl/headers/include/cuda/__event/timed_event.h +4 -4
  71. cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
  72. cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
  73. cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +3 -3
  74. cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +3 -3
  75. cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +3 -3
  76. cuda/cccl/headers/include/cuda/__iterator/zip_common.h +158 -0
  77. cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +8 -120
  78. cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +593 -0
  79. cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +14 -10
  80. cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +4 -3
  81. cuda/cccl/headers/include/cuda/__runtime/types.h +1 -1
  82. cuda/cccl/headers/include/cuda/__stream/stream.h +2 -3
  83. cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -12
  84. cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +2 -2
  85. cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
  86. cuda/cccl/headers/include/cuda/algorithm +1 -1
  87. cuda/cccl/headers/include/cuda/devices +10 -0
  88. cuda/cccl/headers/include/cuda/iterator +1 -0
  89. cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
  90. cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
  91. cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
  92. cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +2 -2
  93. cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +2 -0
  94. cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
  95. cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
  96. cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
  97. cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +10 -5
  98. cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +44 -17
  99. cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +1 -1
  100. cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +12 -12
  101. cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +1 -8
  102. cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +15 -12
  103. cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +3 -0
  104. cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
  105. cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +2 -1
  106. cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
  107. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
  108. cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
  109. cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +3 -2
  110. cuda/cccl/headers/include/cuda/std/string_view +12 -5
  111. cuda/cccl/headers/include/cuda/std/version +1 -4
  112. cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
  113. cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
  114. cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
  115. cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +8 -22
  116. cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +15 -48
  117. cuda/cccl/parallel/experimental/__init__.py +21 -70
  118. cuda/compute/__init__.py +77 -0
  119. cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +28 -0
  120. cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +141 -1
  121. cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +4 -0
  122. cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
  123. cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
  124. cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
  125. cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -4
  126. cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +4 -6
  127. cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
  128. cuda/compute/algorithms/_three_way_partition.py +261 -0
  129. cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +4 -4
  130. cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
  131. cuda/compute/cu12/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  132. cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
  133. cuda/compute/cu13/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  134. cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
  135. cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +8 -8
  136. cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
  137. cuda/coop/__init__.py +8 -0
  138. cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
  139. cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
  140. cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
  141. cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
  142. cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
  143. cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
  144. cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
  145. cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
  146. cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
  147. cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
  148. cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
  149. cuda/coop/warp/__init__.py +9 -0
  150. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
  151. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
  152. cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
  153. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/METADATA +1 -1
  154. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/RECORD +171 -166
  155. cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
  156. cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
  157. cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +0 -72
  158. cuda/cccl/parallel/experimental/.gitignore +0 -4
  159. cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  160. cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
  161. /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
  162. /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
  163. /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
  164. /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
  165. /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
  166. /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
  167. /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
  168. /cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +0 -0
  169. /cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +0 -0
  170. /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
  171. /cuda/{cccl/parallel/experimental → compute}/numba_utils.py +0 -0
  172. /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
  173. /cuda/{cccl/parallel/experimental → compute}/typing.py +0 -0
  174. /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
  175. /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
  176. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/WHEEL +0 -0
  177. {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/licenses/LICENSE +0 -0
@@ -76,25 +76,25 @@ template <class _To, class _From>
76
76
  #if _CCCL_HAS_NVFP8_E8M0()
77
77
  else if constexpr (is_same_v<_To, __nv_fp8_e8m0>)
78
78
  {
79
- return ::cuda::std::__fp_from_storage<__nv_fp8_e8m0>(::__nv_cvt_float_to_e8m0(__v, __NV_NOSAT, cudaRoundZero));
79
+ return ::cuda::std::__fp_from_storage<__nv_fp8_e8m0>(::__nv_cvt_float_to_e8m0(__v, __NV_NOSAT, ::cudaRoundZero));
80
80
  }
81
81
  #endif // _CCCL_HAS_NVFP8_E8M0()
82
82
  #if _CCCL_HAS_NVFP6_E2M3()
83
83
  else if constexpr (is_same_v<_To, __nv_fp6_e2m3>)
84
84
  {
85
- return ::cuda::std::__fp_from_storage<__nv_fp6_e2m3>(::__nv_cvt_float_to_fp6(__v, __NV_E2M3, cudaRoundNearest));
85
+ return ::cuda::std::__fp_from_storage<__nv_fp6_e2m3>(::__nv_cvt_float_to_fp6(__v, __NV_E2M3, ::cudaRoundNearest));
86
86
  }
87
87
  #endif // _CCCL_HAS_NVFP6_E2M3()
88
88
  #if _CCCL_HAS_NVFP6_E3M2()
89
89
  else if constexpr (is_same_v<_To, __nv_fp6_e3m2>)
90
90
  {
91
- return ::cuda::std::__fp_from_storage<__nv_fp6_e3m2>(::__nv_cvt_float_to_fp6(__v, __NV_E3M2, cudaRoundNearest));
91
+ return ::cuda::std::__fp_from_storage<__nv_fp6_e3m2>(::__nv_cvt_float_to_fp6(__v, __NV_E3M2, ::cudaRoundNearest));
92
92
  }
93
93
  #endif // _CCCL_HAS_NVFP6_E3M2()
94
94
  #if _CCCL_HAS_NVFP4_E2M1()
95
95
  else if constexpr (is_same_v<_To, __nv_fp4_e2m1>)
96
96
  {
97
- return ::cuda::std::__fp_from_storage<__nv_fp4_e2m1>(::__nv_cvt_float_to_fp4(__v, __NV_E2M1, cudaRoundNearest));
97
+ return ::cuda::std::__fp_from_storage<__nv_fp4_e2m1>(::__nv_cvt_float_to_fp4(__v, __NV_E2M1, ::cudaRoundNearest));
98
98
  }
99
99
  #endif // _CCCL_HAS_NVFP4_E2M1()
100
100
  else
@@ -145,25 +145,28 @@ template <class _To, class _From>
145
145
  #if _CCCL_HAS_NVFP8_E8M0()
146
146
  else if constexpr (is_same_v<_To, __nv_fp8_e8m0>)
147
147
  {
148
- return ::cuda::std::__fp_from_storage<__nv_fp8_e8m0>(::__nv_cvt_double_to_e8m0(__v, __NV_NOSAT, cudaRoundZero));
148
+ return ::cuda::std::__fp_from_storage<__nv_fp8_e8m0>(::__nv_cvt_double_to_e8m0(__v, __NV_NOSAT, ::cudaRoundZero));
149
149
  }
150
150
  #endif // _CCCL_HAS_NVFP8_E8M0()
151
151
  #if _CCCL_HAS_NVFP6_E2M3()
152
152
  else if constexpr (is_same_v<_To, __nv_fp6_e2m3>)
153
153
  {
154
- return ::cuda::std::__fp_from_storage<__nv_fp6_e2m3>(::__nv_cvt_double_to_fp6(__v, __NV_E2M3, cudaRoundNearest));
154
+ return ::cuda::std::__fp_from_storage<__nv_fp6_e2m3>(
155
+ ::__nv_cvt_double_to_fp6(__v, __NV_E2M3, ::cudaRoundNearest));
155
156
  }
156
157
  #endif // _CCCL_HAS_NVFP6_E2M3()
157
158
  #if _CCCL_HAS_NVFP6_E3M2()
158
159
  else if constexpr (is_same_v<_To, __nv_fp6_e3m2>)
159
160
  {
160
- return ::cuda::std::__fp_from_storage<__nv_fp6_e3m2>(::__nv_cvt_double_to_fp6(__v, __NV_E3M2, cudaRoundNearest));
161
+ return ::cuda::std::__fp_from_storage<__nv_fp6_e3m2>(
162
+ ::__nv_cvt_double_to_fp6(__v, __NV_E3M2, ::cudaRoundNearest));
161
163
  }
162
164
  #endif // _CCCL_HAS_NVFP6_E3M2()
163
165
  #if _CCCL_HAS_NVFP4_E2M1()
164
166
  else if constexpr (is_same_v<_To, __nv_fp4_e2m1>)
165
167
  {
166
- return ::cuda::std::__fp_from_storage<__nv_fp4_e2m1>(::__nv_cvt_double_to_fp4(__v, __NV_E2M1, cudaRoundNearest));
168
+ return ::cuda::std::__fp_from_storage<__nv_fp4_e2m1>(
169
+ ::__nv_cvt_double_to_fp4(__v, __NV_E2M1, ::cudaRoundNearest));
167
170
  }
168
171
  #endif // _CCCL_HAS_NVFP4_E2M1()
169
172
  else
@@ -352,28 +355,28 @@ template <class _To, class _From>
352
355
  else if constexpr (is_same_v<_To, __nv_fp8_e8m0>)
353
356
  {
354
357
  return ::cuda::std::__fp_from_storage<__nv_fp8_e8m0>(
355
- ::__nv_cvt_bfloat16raw_to_e8m0(__v, __NV_NOSAT, cudaRoundZero));
358
+ ::__nv_cvt_bfloat16raw_to_e8m0(__v, __NV_NOSAT, ::cudaRoundZero));
356
359
  }
357
360
  # endif // _CCCL_HAS_NVFP8_E8M0()
358
361
  # if _CCCL_HAS_NVFP6_E2M3()
359
362
  else if constexpr (is_same_v<_To, __nv_fp6_e2m3>)
360
363
  {
361
364
  return ::cuda::std::__fp_from_storage<__nv_fp6_e2m3>(
362
- ::__nv_cvt_bfloat16raw_to_fp6(__v, __NV_E2M3, cudaRoundNearest));
365
+ ::__nv_cvt_bfloat16raw_to_fp6(__v, __NV_E2M3, ::cudaRoundNearest));
363
366
  }
364
367
  # endif // _CCCL_HAS_NVFP6_E2M3()
365
368
  # if _CCCL_HAS_NVFP6_E3M2()
366
369
  else if constexpr (is_same_v<_To, __nv_fp6_e3m2>)
367
370
  {
368
371
  return ::cuda::std::__fp_from_storage<__nv_fp6_e3m2>(
369
- ::__nv_cvt_bfloat16raw_to_fp6(__v, __NV_E3M2, cudaRoundNearest));
372
+ ::__nv_cvt_bfloat16raw_to_fp6(__v, __NV_E3M2, ::cudaRoundNearest));
370
373
  }
371
374
  # endif // _CCCL_HAS_NVFP6_E3M2()
372
375
  # if _CCCL_HAS_NVFP4_E2M1()
373
376
  else if constexpr (is_same_v<_To, __nv_fp4_e2m1>)
374
377
  {
375
378
  return ::cuda::std::__fp_from_storage<__nv_fp4_e2m1>(
376
- ::__nv_cvt_bfloat16raw_to_fp4(__v, __NV_E2M1, cudaRoundNearest));
379
+ ::__nv_cvt_bfloat16raw_to_fp4(__v, __NV_E2M1, ::cudaRoundNearest));
377
380
  }
378
381
  # endif // _CCCL_HAS_NVFP4_E2M1()
379
382
  else
@@ -55,6 +55,9 @@ _CCCL_DIAG_SUPPRESS_MSVC(4100) // unreferenced formal parameter
55
55
  _CCCL_DIAG_POP
56
56
  #endif // _CCCL_HAS_NVFP4()
57
57
 
58
+ // crt/device_fp128_functions.h is available in CUDA 12.8+.
59
+ // _CCCL_HAS_FLOAT128() checks the *compiler* compatibility with __float128.
60
+ // We also need to check the toolkit version to ensure the compatibility with nvc++.
58
61
  #if _CCCL_HAS_FLOAT128() && _CCCL_DEVICE_COMPILATION() && _CCCL_CTK_AT_LEAST(12, 8)
59
62
  # if !_CCCL_COMPILER(NVRTC)
60
63
  _CCCL_DIAG_PUSH
@@ -11,7 +11,7 @@
11
11
  #ifndef _CUDA_STD___FLOATING_POINT_FP_H
12
12
  #define _CUDA_STD___FLOATING_POINT_FP_H
13
13
 
14
- #include <cuda/__cccl_config>
14
+ #include <cuda/std/detail/__config>
15
15
 
16
16
  #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17
17
  # pragma GCC system_header
@@ -439,7 +439,8 @@ public:
439
439
  [[nodiscard]] _CCCL_API constexpr bool is_exhaustive() const
440
440
  noexcept(noexcept(::cuda::std::declval<const mapping_type&>().is_exhaustive()))
441
441
  {
442
- return mapping().is_exhaustive();
442
+ auto __tmp = mapping(); // workaround for clang with nodiscard
443
+ return __tmp.is_exhaustive();
443
444
  }
444
445
  [[nodiscard]] _CCCL_API constexpr bool is_strided() const
445
446
  noexcept(noexcept(::cuda::std::declval<const mapping_type&>().is_strided()))
@@ -20,7 +20,9 @@
20
20
  # pragma system_header
21
21
  #endif // no system header
22
22
 
23
+ #include <cuda/__fwd/complex.h>
23
24
  #include <cuda/std/__fwd/array.h>
25
+ #include <cuda/std/__fwd/complex.h>
24
26
  #include <cuda/std/__fwd/tuple.h>
25
27
  #include <cuda/std/__tuple_dir/tuple_element.h>
26
28
  #include <cuda/std/__tuple_dir/tuple_indices.h>
@@ -61,7 +63,27 @@ struct __make_tuple_types_flat<array<_Vt, _Np>, __tuple_indices<_Idx...>>
61
63
  template <size_t>
62
64
  using __value_type = _Vt;
63
65
  template <class _Tp, class _ApplyFn = __apply_cvref_fn<_Tp>>
64
- using __apply_quals = __tuple_types<__type_call<_ApplyFn, __value_type<_Idx>>...>;
66
+ using __apply_quals _CCCL_NODEBUG_ALIAS = __tuple_types<__type_call<_ApplyFn, __value_type<_Idx>>...>;
67
+ };
68
+
69
+ template <class _Vt, size_t... _Idx>
70
+ struct __make_tuple_types_flat<complex<_Vt>, __tuple_indices<_Idx...>>
71
+ {
72
+ static_assert(sizeof...(_Idx) == 2, "__make_tuple_types: complex has only 2 members");
73
+ template <size_t>
74
+ using __value_type = _Vt;
75
+ template <class _Tp, class _ApplyFn = __apply_cvref_fn<_Tp>>
76
+ using __apply_quals _CCCL_NODEBUG_ALIAS = __tuple_types<__type_call<_ApplyFn, __value_type<_Idx>>...>;
77
+ };
78
+
79
+ template <class _Vt, size_t... _Idx>
80
+ struct __make_tuple_types_flat<::cuda::complex<_Vt>, __tuple_indices<_Idx...>>
81
+ {
82
+ static_assert(sizeof...(_Idx) == 2, "__make_tuple_types: complex has only 2 members");
83
+ template <size_t>
84
+ using __value_type = _Vt;
85
+ template <class _Tp, class _ApplyFn = __apply_cvref_fn<_Tp>>
86
+ using __apply_quals _CCCL_NODEBUG_ALIAS = __tuple_types<__type_call<_ApplyFn, __value_type<_Idx>>...>;
65
87
  };
66
88
 
67
89
  template <class _Tp,
@@ -20,6 +20,7 @@
20
20
  # pragma system_header
21
21
  #endif // no system header
22
22
 
23
+ #include <cuda/__fwd/complex.h>
23
24
  #include <cuda/std/__concepts/concept_macros.h>
24
25
  #include <cuda/std/__fwd/array.h>
25
26
  #include <cuda/std/__fwd/complex.h>
@@ -58,6 +59,9 @@ inline constexpr bool __tuple_like_impl<array<_Tp, _Size>> = true;
58
59
  template <class _Tp>
59
60
  inline constexpr bool __tuple_like_impl<complex<_Tp>> = true;
60
61
 
62
+ template <class _Tp>
63
+ inline constexpr bool __tuple_like_impl<::cuda::complex<_Tp>> = true;
64
+
61
65
  template <class _Ip, class _Sp, ::cuda::std::ranges::subrange_kind _Kp>
62
66
  inline constexpr bool __tuple_like_impl<::cuda::std::ranges::subrange<_Ip, _Sp, _Kp>> = true;
63
67
 
@@ -20,6 +20,7 @@
20
20
  # pragma system_header
21
21
  #endif // no system header
22
22
 
23
+ #include <cuda/__fwd/complex.h>
23
24
  #include <cuda/std/__fwd/array.h>
24
25
  #include <cuda/std/__fwd/complex.h>
25
26
  #include <cuda/std/__fwd/pair.h>
@@ -54,6 +55,9 @@ inline constexpr bool __tuple_like_ext<array<_Tp, _Size>> = true;
54
55
  template <class _Tp>
55
56
  inline constexpr bool __tuple_like_ext<complex<_Tp>> = true;
56
57
 
58
+ template <class _Tp>
59
+ inline constexpr bool __tuple_like_ext<::cuda::complex<_Tp>> = true;
60
+
57
61
  template <class... _Tp>
58
62
  inline constexpr bool __tuple_like_ext<__tuple_types<_Tp...>> = true;
59
63
 
@@ -20,10 +20,8 @@
20
20
  # pragma system_header
21
21
  #endif // no system header
22
22
 
23
- #include <cuda/std/__type_traits/integral_constant.h>
24
23
  #include <cuda/std/__type_traits/is_same.h>
25
24
  #include <cuda/std/__utility/declval.h>
26
- #include <cuda/std/cstddef>
27
25
 
28
26
  #include <cuda/std/__cccl/prologue.h>
29
27
 
@@ -49,6 +47,9 @@ struct __numeric_type
49
47
  _CCCL_API inline static double __test(unsigned long long);
50
48
  _CCCL_API inline static double __test(double);
51
49
  _CCCL_API inline static long double __test(long double);
50
+ #if _CCCL_HAS_FLOAT128()
51
+ _CCCL_API inline static __float128 __test(__float128);
52
+ #endif // _CCCL_HAS_FLOAT128()
52
53
 
53
54
  using type = decltype(__test(declval<_Tp>()));
54
55
  static const bool value = !is_same_v<type, void>;
@@ -57,7 +57,7 @@
57
57
  #include <cuda/std/version>
58
58
 
59
59
  #if !_CCCL_COMPILER(NVRTC)
60
- # include <iosfwd>
60
+ # include <string_view>
61
61
  #endif // !_CCCL_COMPILER(NVRTC)
62
62
 
63
63
  #include <cuda/std/__cccl/prologue.h>
@@ -727,14 +727,21 @@ _CCCL_HOST_DEVICE basic_string_view(_Range&&) -> basic_string_view<::cuda::std::
727
727
 
728
728
  // operator <<
729
729
 
730
- #if 0 // todo: we need to implement char_traits stream types & functions
730
+ #if !_CCCL_COMPILER(NVRTC)
731
+ template <class _CharT>
732
+ _CCCL_HOST_API ::std::basic_ostream<_CharT>&
733
+ operator<<(::std::basic_ostream<_CharT>& __os, basic_string_view<_CharT> __str)
734
+ {
735
+ return __os << ::std::basic_string_view<_CharT>{__str.data(), __str.size()};
736
+ }
737
+
731
738
  template <class _CharT, class _Traits>
732
- _CCCL_API inline ::std::basic_ostream<_CharT, _Traits>&
739
+ _CCCL_HOST_API ::std::basic_ostream<_CharT, _Traits>&
733
740
  operator<<(::std::basic_ostream<_CharT, _Traits>& __os, basic_string_view<_CharT, _Traits> __str)
734
741
  {
735
- return __os.write(__str.data(), static_cast<::std::streamsize>(__str.size()));
742
+ return __os << ::std::basic_string_view<_CharT, _Traits>{__str.data(), __str.size()};
736
743
  }
737
- #endif // 0
744
+ #endif // !_CCCL_COMPILER(NVRTC)
738
745
 
739
746
  // literals
740
747
 
@@ -141,7 +141,7 @@
141
141
  // # define __cccl_lib_shared_mutex 201505L
142
142
  // # define __cccl_lib_shared_ptr_arrays 201611L
143
143
  // # define __cccl_lib_shared_ptr_weak_type 201606L
144
- // # define __cccl_lib_string_view 201606L
144
+ #define __cccl_lib_string_view 201803L
145
145
  // # define __cccl_lib_to_chars 201611L
146
146
  // # define __cccl_lib_uncaught_exceptions 201411L
147
147
  // # define __cccl_lib_unordered_map_try_emplace 201411L
@@ -171,7 +171,6 @@
171
171
  // # define __cccl_lib_constexpr_misc 201811L
172
172
  // # define __cccl_lib_constexpr_numeric 201911L
173
173
  // # define __cccl_lib_constexpr_string 201907L
174
- // # define __cccl_lib_constexpr_string_view 201811L
175
174
  // # define __cccl_lib_constexpr_swap_algorithms 201806L
176
175
  // # define __cccl_lib_constexpr_tuple 201811L
177
176
  // # define __cccl_lib_constexpr_utility 201811L
@@ -204,8 +203,6 @@
204
203
  // # define __cccl_lib_source_location 201907L
205
204
  // # define __cccl_lib_ssize 201902L
206
205
  // # define __cccl_lib_starts_ends_with 201711L
207
- // # undef __cccl_lib_string_view
208
- // # define __cccl_lib_string_view 201803L
209
206
  // # define __cccl_lib_syncbuf 201803L
210
207
  // # define __cccl_lib_three_way_comparison 201907L
211
208
  # define __cccl_lib_unwrap_ref 201811L
@@ -27,6 +27,8 @@
27
27
  #endif // no system header
28
28
  #include <thrust/detail/type_deduction.h>
29
29
 
30
+ #include <cuda/std/__bit/countl.h>
31
+ #include <cuda/std/__type_traits/make_unsigned.h>
30
32
  #include <cuda/std/limits>
31
33
  #include <cuda/std/type_traits>
32
34
 
@@ -36,25 +38,6 @@ THRUST_NAMESPACE_BEGIN
36
38
  namespace detail
37
39
  {
38
40
 
39
- template <typename Integer>
40
- _CCCL_HOST_DEVICE _CCCL_FORCEINLINE Integer clz(Integer x)
41
- {
42
- Integer result;
43
-
44
- NV_IF_TARGET(NV_IS_DEVICE,
45
- (result = ::__clz(x);),
46
- (int num_bits = 8 * sizeof(Integer); int num_bits_minus_one = num_bits - 1; result = num_bits;
47
- for (int i = num_bits_minus_one; i >= 0; --i) {
48
- if ((Integer(1) << i) & x)
49
- {
50
- result = num_bits_minus_one - i;
51
- break;
52
- }
53
- }));
54
-
55
- return result;
56
- }
57
-
58
41
  template <typename Integer>
59
42
  _CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool is_power_of_2(Integer x)
60
43
  {
@@ -85,7 +68,7 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE Integer log2(Integer x)
85
68
  Integer num_bits = 8 * sizeof(Integer);
86
69
  Integer num_bits_minus_one = num_bits - 1;
87
70
 
88
- return num_bits_minus_one - clz(x);
71
+ return num_bits_minus_one - ::cuda::std::countl_zero(::cuda::std::__to_unsigned_like(x));
89
72
  }
90
73
 
91
74
  template <typename Integer>
@@ -316,6 +316,17 @@ struct iterator_traversal<::cuda::zip_iterator<Iterators...>>
316
316
  using type = detail::minimum_type<iterator_traversal_t<Iterators>...>;
317
317
  };
318
318
 
319
+ template <class Fn, class... Iterators>
320
+ struct iterator_system<::cuda::zip_transform_iterator<Fn, Iterators...>>
321
+ {
322
+ using type = detail::minimum_system_t<iterator_system_t<Iterators>...>;
323
+ };
324
+ template <class Fn, class... Iterators>
325
+ struct iterator_traversal<::cuda::zip_transform_iterator<Fn, Iterators...>>
326
+ {
327
+ using type = detail::minimum_type<iterator_traversal_t<Iterators>...>;
328
+ };
329
+
319
330
  //! \} // end iterator_traits
320
331
 
321
332
  THRUST_NAMESPACE_END
@@ -48,6 +48,13 @@
48
48
  #include <thrust/system/cuda/detail/util.h>
49
49
  #include <thrust/type_traits/is_trivially_relocatable.h>
50
50
 
51
+ #if _CCCL_HAS_CUDA_COMPILER()
52
+ # include <cub/device/dispatch/tuning/tuning_transform.cuh>
53
+ #endif // _CCCL_HAS_CUDA_COMPILER()
54
+
55
+ #include <cuda/__fwd/zip_iterator.h>
56
+ #include <cuda/std/tuple>
57
+
51
58
  THRUST_NAMESPACE_BEGIN
52
59
  namespace cuda_cub
53
60
  {
@@ -61,6 +68,21 @@ template <class Derived, class InputIt, class OutputIt, class TransformOp>
61
68
  OutputIt _CCCL_API _CCCL_FORCEINLINE
62
69
  transform(execution_policy<Derived>& policy, InputIt first, InputIt last, OutputIt result, TransformOp transform_op);
63
70
 
71
+ // Forward declare to work around a cyclic include, since "cuda/detail/transform.h" includes this header
72
+ // We want this to unwrap zip_transform_iterator
73
+ namespace __transform
74
+ {
75
+ _CCCL_EXEC_CHECK_DISABLE
76
+ template <class Derived, class Offset, class... InputIts, class OutputIt, class TransformOp, class Predicate>
77
+ OutputIt _CCCL_API _CCCL_FORCEINLINE cub_transform_many(
78
+ execution_policy<Derived>& policy,
79
+ ::cuda::std::tuple<InputIts...> firsts,
80
+ OutputIt result,
81
+ Offset num_items,
82
+ TransformOp transform_op,
83
+ Predicate pred);
84
+ } // namespace __transform
85
+
64
86
  namespace __copy
65
87
  {
66
88
  template <class H, class D, class T, class Size>
@@ -190,6 +212,17 @@ device_to_device(execution_policy<Derived>& policy, InputIt first, InputIt last,
190
212
 
191
213
  return result + n;
192
214
  }
215
+ else if constexpr (::cuda::__is_zip_transform_iterator<InputIt>)
216
+ {
217
+ const auto n = ::cuda::std::distance(first, last);
218
+ return cuda_cub::__transform::cub_transform_many(
219
+ policy,
220
+ ::cuda::std::move(first).__base(),
221
+ result,
222
+ n,
223
+ ::cuda::std::move(first).__pred(),
224
+ cub::detail::transform::always_true_predicate{});
225
+ }
193
226
  else
194
227
  {
195
228
  return cuda_cub::transform(
@@ -39,37 +39,23 @@
39
39
  #if _CCCL_HAS_CUDA_COMPILER()
40
40
  # include <thrust/system/cuda/config.h>
41
41
 
42
- # include <thrust/distance.h>
43
- # include <thrust/system/cuda/detail/parallel_for.h>
42
+ # include <thrust/system/cuda/detail/transform.h>
44
43
  # include <thrust/system/cuda/execution_policy.h>
45
44
 
45
+ # include <cuda/__functional/address_stability.h>
46
+ # include <cuda/std/iterator>
47
+
46
48
  THRUST_NAMESPACE_BEGIN
47
49
  namespace cuda_cub
48
50
  {
49
- namespace __tabulate
50
- {
51
- template <class Iterator, class TabulateOp>
52
- struct functor
53
- {
54
- Iterator items;
55
- TabulateOp op;
56
-
57
- template <typename Size>
58
- void _CCCL_DEVICE operator()(Size idx)
59
- {
60
- items[idx] = op(idx);
61
- }
62
- };
63
- } // namespace __tabulate
64
-
65
51
  template <class Derived, class Iterator, class TabulateOp>
66
52
  void _CCCL_HOST_DEVICE tabulate(execution_policy<Derived>& policy, Iterator first, Iterator last, TabulateOp tabulate_op)
67
53
  {
68
- using size_type = thrust::detail::it_difference_t<Iterator>;
69
- size_type count = ::cuda::std::distance(first, last);
70
- cuda_cub::parallel_for(policy, __tabulate::functor<Iterator, TabulateOp>{first, tabulate_op}, count);
54
+ using size_type = ::cuda::std::iter_difference_t<Iterator>;
55
+ const auto count = ::cuda::std::distance(first, last);
56
+ cuda_cub::transform_n(
57
+ policy, ::cuda::counting_iterator<size_type>{}, count, first, ::cuda::proclaim_copyable_arguments(tabulate_op));
71
58
  }
72
-
73
59
  } // namespace cuda_cub
74
60
  THRUST_NAMESPACE_END
75
61
  #endif
@@ -25,72 +25,39 @@
25
25
 
26
26
  THRUST_NAMESPACE_BEGIN
27
27
 
28
- namespace detail
29
- {
30
- // Type traits for contiguous iterators:
31
- template <typename Iterator>
32
- struct contiguous_iterator_traits
33
- {
34
- static_assert(thrust::is_contiguous_iterator_v<Iterator>,
35
- "contiguous_iterator_traits requires a contiguous iterator.");
36
-
37
- using raw_pointer =
38
- typename thrust::detail::pointer_traits<decltype(&*::cuda::std::declval<Iterator>())>::raw_pointer;
39
- };
40
- } // namespace detail
41
-
42
- //! Converts a contiguous iterator type to its underlying raw pointer type.
43
- template <typename ContiguousIterator>
44
- using unwrap_contiguous_iterator_t = typename detail::contiguous_iterator_traits<ContiguousIterator>::raw_pointer;
45
-
46
28
  //! Converts a contiguous iterator to its underlying raw pointer.
29
+ _CCCL_EXEC_CHECK_DISABLE
47
30
  template <typename ContiguousIterator>
48
31
  _CCCL_HOST_DEVICE auto unwrap_contiguous_iterator(ContiguousIterator it)
49
- -> unwrap_contiguous_iterator_t<ContiguousIterator>
50
32
  {
51
33
  static_assert(thrust::is_contiguous_iterator_v<ContiguousIterator>,
52
34
  "unwrap_contiguous_iterator called with non-contiguous iterator.");
53
35
  return thrust::raw_pointer_cast(&*it);
54
36
  }
55
37
 
56
- namespace detail
57
- {
58
- // Implementation for non-contiguous iterators -- passthrough.
59
- template <typename Iterator, bool IsContiguous = thrust::is_contiguous_iterator_v<Iterator>>
60
- struct try_unwrap_contiguous_iterator_impl
61
- {
62
- using type = Iterator;
63
-
64
- static _CCCL_HOST_DEVICE type get(Iterator it)
65
- {
66
- return it;
67
- }
68
- };
38
+ //! Converts a contiguous iterator type to its underlying raw pointer type.
39
+ template <typename ContiguousIterator>
40
+ using unwrap_contiguous_iterator_t = decltype(unwrap_contiguous_iterator(::cuda::std::declval<ContiguousIterator>()));
69
41
 
70
- // Implementation for contiguous iterators -- unwraps to raw pointer.
42
+ //! Takes an iterator and, if it is contiguous, unwraps it to the raw pointer it represents. Otherwise returns the
43
+ //! iterator unmodified.
44
+ _CCCL_EXEC_CHECK_DISABLE
71
45
  template <typename Iterator>
72
- struct try_unwrap_contiguous_iterator_impl<Iterator, true /*is_contiguous*/>
46
+ _CCCL_HOST_DEVICE auto try_unwrap_contiguous_iterator(Iterator it)
73
47
  {
74
- using type = unwrap_contiguous_iterator_t<Iterator>;
75
-
76
- static _CCCL_HOST_DEVICE type get(Iterator it)
48
+ if constexpr (thrust::is_contiguous_iterator_v<Iterator>)
77
49
  {
78
50
  return unwrap_contiguous_iterator(it);
79
51
  }
80
- };
81
- } // namespace detail
52
+ else
53
+ {
54
+ return it;
55
+ }
56
+ }
82
57
 
83
58
  //! Takes an iterator type and, if it is contiguous, yields the raw pointer type it represents. Otherwise returns the
84
59
  //! iterator type unmodified.
85
60
  template <typename Iterator>
86
- using try_unwrap_contiguous_iterator_t = typename detail::try_unwrap_contiguous_iterator_impl<Iterator>::type;
87
-
88
- //! Takes an iterator and, if it is contiguous, unwraps it to the raw pointer it represents. Otherwise returns the
89
- //! iterator unmodified.
90
- template <typename Iterator>
91
- _CCCL_HOST_DEVICE auto try_unwrap_contiguous_iterator(Iterator it) -> try_unwrap_contiguous_iterator_t<Iterator>
92
- {
93
- return detail::try_unwrap_contiguous_iterator_impl<Iterator>::get(it);
94
- }
61
+ using try_unwrap_contiguous_iterator_t = decltype(try_unwrap_contiguous_iterator(::cuda::std::declval<Iterator>()));
95
62
 
96
63
  THRUST_NAMESPACE_END
@@ -1,73 +1,24 @@
1
- # Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
1
+ # Copyright (c) 2025, NVIDIA CORPORATION.
2
2
  #
3
- # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
4
14
 
5
- from .algorithms import (
6
- DoubleBuffer,
7
- SortOrder,
8
- binary_transform,
9
- exclusive_scan,
10
- histogram_even,
11
- inclusive_scan,
12
- make_binary_transform,
13
- make_exclusive_scan,
14
- make_histogram_even,
15
- make_inclusive_scan,
16
- make_merge_sort,
17
- make_radix_sort,
18
- make_reduce_into,
19
- make_segmented_reduce,
20
- make_unary_transform,
21
- make_unique_by_key,
22
- merge_sort,
23
- radix_sort,
24
- reduce_into,
25
- segmented_reduce,
26
- unary_transform,
27
- unique_by_key,
28
- )
29
- from .iterators import (
30
- CacheModifiedInputIterator,
31
- ConstantIterator,
32
- CountingIterator,
33
- ReverseIterator,
34
- TransformIterator,
35
- TransformOutputIterator,
36
- ZipIterator,
37
- )
38
- from .op import OpKind
39
- from .struct import gpu_struct
15
+ # alias for backwards compatibility
40
16
 
41
- __all__ = [
42
- "binary_transform",
43
- "CacheModifiedInputIterator",
44
- "ConstantIterator",
45
- "CountingIterator",
46
- "DoubleBuffer",
47
- "exclusive_scan",
48
- "gpu_struct",
49
- "histogram_even",
50
- "inclusive_scan",
51
- "make_binary_transform",
52
- "make_exclusive_scan",
53
- "make_histogram_even",
54
- "make_inclusive_scan",
55
- "make_merge_sort",
56
- "make_radix_sort",
57
- "make_reduce_into",
58
- "make_segmented_reduce",
59
- "make_unary_transform",
60
- "make_unique_by_key",
61
- "merge_sort",
62
- "OpKind",
63
- "radix_sort",
64
- "reduce_into",
65
- "ReverseIterator",
66
- "segmented_reduce",
67
- "SortOrder",
68
- "TransformIterator",
69
- "TransformOutputIterator",
70
- "unary_transform",
71
- "unique_by_key",
72
- "ZipIterator",
73
- ]
17
+ from warnings import warn
18
+
19
+ from cuda.compute import * # noqa: F403
20
+
21
+ warn(
22
+ "The module cuda.cccl.parallel.experimental is deprecated. Use cuda.compute instead.",
23
+ FutureWarning,
24
+ )