cuda-cccl 0.1.3.2.0.dev438__cp313-cp313-manylinux_2_24_aarch64.whl → 0.3.1__cp313-cp313-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/cooperative/__init__.py +7 -1
- cuda/cccl/cooperative/experimental/__init__.py +21 -5
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +25 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +24 -19
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
- cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +3 -2
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +18 -26
- cuda/cccl/headers/include/cub/device/device_copy.cuh +116 -27
- cuda/cccl/headers/include/cub/device/device_partition.cuh +5 -1
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +158 -247
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +4 -4
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +8 -26
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -6
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +320 -262
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +10 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +57 -10
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +37 -13
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +204 -55
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +55 -19
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
- cuda/cccl/headers/include/cub/util_device.cuh +51 -35
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
- cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +4 -4
- cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
- cuda/cccl/headers/include/cuda/__device/all_devices.h +47 -147
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +51 -49
- cuda/cccl/headers/include/cuda/__device/attributes.h +177 -127
- cuda/cccl/headers/include/cuda/__device/device_ref.h +32 -51
- cuda/cccl/headers/include/cuda/__device/physical_device.h +120 -91
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +330 -36
- cuda/cccl/headers/include/cuda/__event/event.h +8 -8
- cuda/cccl/headers/include/cuda/__event/event_ref.h +4 -5
- cuda/cccl/headers/include/cuda/__event/timed_event.h +4 -4
- cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
- cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +3 -3
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +3 -3
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +3 -3
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +158 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +8 -120
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +593 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +14 -10
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +4 -3
- cuda/cccl/headers/include/cuda/__runtime/types.h +1 -1
- cuda/cccl/headers/include/cuda/__stream/stream.h +2 -3
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -12
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +2 -2
- cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
- cuda/cccl/headers/include/cuda/algorithm +1 -1
- cuda/cccl/headers/include/cuda/devices +10 -0
- cuda/cccl/headers/include/cuda/iterator +1 -0
- cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
- cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
- cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
- cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +2 -0
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +10 -5
- cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +44 -17
- cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +1 -1
- cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +12 -12
- cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +1 -8
- cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +15 -12
- cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +3 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +2 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +3 -2
- cuda/cccl/headers/include/cuda/std/string_view +12 -5
- cuda/cccl/headers/include/cuda/std/version +1 -4
- cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +8 -22
- cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +15 -48
- cuda/cccl/parallel/experimental/__init__.py +21 -70
- cuda/compute/__init__.py +77 -0
- cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +28 -0
- cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +141 -1
- cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +4 -0
- cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
- cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -4
- cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +4 -6
- cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
- cuda/compute/algorithms/_three_way_partition.py +261 -0
- cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +4 -4
- cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
- cuda/compute/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +8 -8
- cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
- cuda/coop/__init__.py +8 -0
- cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
- cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
- cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
- cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
- cuda/coop/warp/__init__.py +9 -0
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/METADATA +1 -1
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/RECORD +171 -166
- cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
- cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +0 -72
- cuda/cccl/parallel/experimental/.gitignore +0 -4
- cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/numba_utils.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/typing.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -76,25 +76,25 @@ template <class _To, class _From>
|
|
|
76
76
|
#if _CCCL_HAS_NVFP8_E8M0()
|
|
77
77
|
else if constexpr (is_same_v<_To, __nv_fp8_e8m0>)
|
|
78
78
|
{
|
|
79
|
-
return ::cuda::std::__fp_from_storage<__nv_fp8_e8m0>(::__nv_cvt_float_to_e8m0(__v, __NV_NOSAT, cudaRoundZero));
|
|
79
|
+
return ::cuda::std::__fp_from_storage<__nv_fp8_e8m0>(::__nv_cvt_float_to_e8m0(__v, __NV_NOSAT, ::cudaRoundZero));
|
|
80
80
|
}
|
|
81
81
|
#endif // _CCCL_HAS_NVFP8_E8M0()
|
|
82
82
|
#if _CCCL_HAS_NVFP6_E2M3()
|
|
83
83
|
else if constexpr (is_same_v<_To, __nv_fp6_e2m3>)
|
|
84
84
|
{
|
|
85
|
-
return ::cuda::std::__fp_from_storage<__nv_fp6_e2m3>(::__nv_cvt_float_to_fp6(__v, __NV_E2M3, cudaRoundNearest));
|
|
85
|
+
return ::cuda::std::__fp_from_storage<__nv_fp6_e2m3>(::__nv_cvt_float_to_fp6(__v, __NV_E2M3, ::cudaRoundNearest));
|
|
86
86
|
}
|
|
87
87
|
#endif // _CCCL_HAS_NVFP6_E2M3()
|
|
88
88
|
#if _CCCL_HAS_NVFP6_E3M2()
|
|
89
89
|
else if constexpr (is_same_v<_To, __nv_fp6_e3m2>)
|
|
90
90
|
{
|
|
91
|
-
return ::cuda::std::__fp_from_storage<__nv_fp6_e3m2>(::__nv_cvt_float_to_fp6(__v, __NV_E3M2, cudaRoundNearest));
|
|
91
|
+
return ::cuda::std::__fp_from_storage<__nv_fp6_e3m2>(::__nv_cvt_float_to_fp6(__v, __NV_E3M2, ::cudaRoundNearest));
|
|
92
92
|
}
|
|
93
93
|
#endif // _CCCL_HAS_NVFP6_E3M2()
|
|
94
94
|
#if _CCCL_HAS_NVFP4_E2M1()
|
|
95
95
|
else if constexpr (is_same_v<_To, __nv_fp4_e2m1>)
|
|
96
96
|
{
|
|
97
|
-
return ::cuda::std::__fp_from_storage<__nv_fp4_e2m1>(::__nv_cvt_float_to_fp4(__v, __NV_E2M1, cudaRoundNearest));
|
|
97
|
+
return ::cuda::std::__fp_from_storage<__nv_fp4_e2m1>(::__nv_cvt_float_to_fp4(__v, __NV_E2M1, ::cudaRoundNearest));
|
|
98
98
|
}
|
|
99
99
|
#endif // _CCCL_HAS_NVFP4_E2M1()
|
|
100
100
|
else
|
|
@@ -145,25 +145,28 @@ template <class _To, class _From>
|
|
|
145
145
|
#if _CCCL_HAS_NVFP8_E8M0()
|
|
146
146
|
else if constexpr (is_same_v<_To, __nv_fp8_e8m0>)
|
|
147
147
|
{
|
|
148
|
-
return ::cuda::std::__fp_from_storage<__nv_fp8_e8m0>(::__nv_cvt_double_to_e8m0(__v, __NV_NOSAT, cudaRoundZero));
|
|
148
|
+
return ::cuda::std::__fp_from_storage<__nv_fp8_e8m0>(::__nv_cvt_double_to_e8m0(__v, __NV_NOSAT, ::cudaRoundZero));
|
|
149
149
|
}
|
|
150
150
|
#endif // _CCCL_HAS_NVFP8_E8M0()
|
|
151
151
|
#if _CCCL_HAS_NVFP6_E2M3()
|
|
152
152
|
else if constexpr (is_same_v<_To, __nv_fp6_e2m3>)
|
|
153
153
|
{
|
|
154
|
-
return ::cuda::std::__fp_from_storage<__nv_fp6_e2m3>(
|
|
154
|
+
return ::cuda::std::__fp_from_storage<__nv_fp6_e2m3>(
|
|
155
|
+
::__nv_cvt_double_to_fp6(__v, __NV_E2M3, ::cudaRoundNearest));
|
|
155
156
|
}
|
|
156
157
|
#endif // _CCCL_HAS_NVFP6_E2M3()
|
|
157
158
|
#if _CCCL_HAS_NVFP6_E3M2()
|
|
158
159
|
else if constexpr (is_same_v<_To, __nv_fp6_e3m2>)
|
|
159
160
|
{
|
|
160
|
-
return ::cuda::std::__fp_from_storage<__nv_fp6_e3m2>(
|
|
161
|
+
return ::cuda::std::__fp_from_storage<__nv_fp6_e3m2>(
|
|
162
|
+
::__nv_cvt_double_to_fp6(__v, __NV_E3M2, ::cudaRoundNearest));
|
|
161
163
|
}
|
|
162
164
|
#endif // _CCCL_HAS_NVFP6_E3M2()
|
|
163
165
|
#if _CCCL_HAS_NVFP4_E2M1()
|
|
164
166
|
else if constexpr (is_same_v<_To, __nv_fp4_e2m1>)
|
|
165
167
|
{
|
|
166
|
-
return ::cuda::std::__fp_from_storage<__nv_fp4_e2m1>(
|
|
168
|
+
return ::cuda::std::__fp_from_storage<__nv_fp4_e2m1>(
|
|
169
|
+
::__nv_cvt_double_to_fp4(__v, __NV_E2M1, ::cudaRoundNearest));
|
|
167
170
|
}
|
|
168
171
|
#endif // _CCCL_HAS_NVFP4_E2M1()
|
|
169
172
|
else
|
|
@@ -352,28 +355,28 @@ template <class _To, class _From>
|
|
|
352
355
|
else if constexpr (is_same_v<_To, __nv_fp8_e8m0>)
|
|
353
356
|
{
|
|
354
357
|
return ::cuda::std::__fp_from_storage<__nv_fp8_e8m0>(
|
|
355
|
-
::__nv_cvt_bfloat16raw_to_e8m0(__v, __NV_NOSAT, cudaRoundZero));
|
|
358
|
+
::__nv_cvt_bfloat16raw_to_e8m0(__v, __NV_NOSAT, ::cudaRoundZero));
|
|
356
359
|
}
|
|
357
360
|
# endif // _CCCL_HAS_NVFP8_E8M0()
|
|
358
361
|
# if _CCCL_HAS_NVFP6_E2M3()
|
|
359
362
|
else if constexpr (is_same_v<_To, __nv_fp6_e2m3>)
|
|
360
363
|
{
|
|
361
364
|
return ::cuda::std::__fp_from_storage<__nv_fp6_e2m3>(
|
|
362
|
-
::__nv_cvt_bfloat16raw_to_fp6(__v, __NV_E2M3, cudaRoundNearest));
|
|
365
|
+
::__nv_cvt_bfloat16raw_to_fp6(__v, __NV_E2M3, ::cudaRoundNearest));
|
|
363
366
|
}
|
|
364
367
|
# endif // _CCCL_HAS_NVFP6_E2M3()
|
|
365
368
|
# if _CCCL_HAS_NVFP6_E3M2()
|
|
366
369
|
else if constexpr (is_same_v<_To, __nv_fp6_e3m2>)
|
|
367
370
|
{
|
|
368
371
|
return ::cuda::std::__fp_from_storage<__nv_fp6_e3m2>(
|
|
369
|
-
::__nv_cvt_bfloat16raw_to_fp6(__v, __NV_E3M2, cudaRoundNearest));
|
|
372
|
+
::__nv_cvt_bfloat16raw_to_fp6(__v, __NV_E3M2, ::cudaRoundNearest));
|
|
370
373
|
}
|
|
371
374
|
# endif // _CCCL_HAS_NVFP6_E3M2()
|
|
372
375
|
# if _CCCL_HAS_NVFP4_E2M1()
|
|
373
376
|
else if constexpr (is_same_v<_To, __nv_fp4_e2m1>)
|
|
374
377
|
{
|
|
375
378
|
return ::cuda::std::__fp_from_storage<__nv_fp4_e2m1>(
|
|
376
|
-
::__nv_cvt_bfloat16raw_to_fp4(__v, __NV_E2M1, cudaRoundNearest));
|
|
379
|
+
::__nv_cvt_bfloat16raw_to_fp4(__v, __NV_E2M1, ::cudaRoundNearest));
|
|
377
380
|
}
|
|
378
381
|
# endif // _CCCL_HAS_NVFP4_E2M1()
|
|
379
382
|
else
|
|
@@ -55,6 +55,9 @@ _CCCL_DIAG_SUPPRESS_MSVC(4100) // unreferenced formal parameter
|
|
|
55
55
|
_CCCL_DIAG_POP
|
|
56
56
|
#endif // _CCCL_HAS_NVFP4()
|
|
57
57
|
|
|
58
|
+
// crt/device_fp128_functions.h is available in CUDA 12.8+.
|
|
59
|
+
// _CCCL_HAS_FLOAT128() checks the *compiler* compatibility with __float128.
|
|
60
|
+
// We also need to check the toolkit version to ensure the compatibility with nvc++.
|
|
58
61
|
#if _CCCL_HAS_FLOAT128() && _CCCL_DEVICE_COMPILATION() && _CCCL_CTK_AT_LEAST(12, 8)
|
|
59
62
|
# if !_CCCL_COMPILER(NVRTC)
|
|
60
63
|
_CCCL_DIAG_PUSH
|
|
@@ -439,7 +439,8 @@ public:
|
|
|
439
439
|
[[nodiscard]] _CCCL_API constexpr bool is_exhaustive() const
|
|
440
440
|
noexcept(noexcept(::cuda::std::declval<const mapping_type&>().is_exhaustive()))
|
|
441
441
|
{
|
|
442
|
-
|
|
442
|
+
auto __tmp = mapping(); // workaround for clang with nodiscard
|
|
443
|
+
return __tmp.is_exhaustive();
|
|
443
444
|
}
|
|
444
445
|
[[nodiscard]] _CCCL_API constexpr bool is_strided() const
|
|
445
446
|
noexcept(noexcept(::cuda::std::declval<const mapping_type&>().is_strided()))
|
|
@@ -20,7 +20,9 @@
|
|
|
20
20
|
# pragma system_header
|
|
21
21
|
#endif // no system header
|
|
22
22
|
|
|
23
|
+
#include <cuda/__fwd/complex.h>
|
|
23
24
|
#include <cuda/std/__fwd/array.h>
|
|
25
|
+
#include <cuda/std/__fwd/complex.h>
|
|
24
26
|
#include <cuda/std/__fwd/tuple.h>
|
|
25
27
|
#include <cuda/std/__tuple_dir/tuple_element.h>
|
|
26
28
|
#include <cuda/std/__tuple_dir/tuple_indices.h>
|
|
@@ -61,7 +63,27 @@ struct __make_tuple_types_flat<array<_Vt, _Np>, __tuple_indices<_Idx...>>
|
|
|
61
63
|
template <size_t>
|
|
62
64
|
using __value_type = _Vt;
|
|
63
65
|
template <class _Tp, class _ApplyFn = __apply_cvref_fn<_Tp>>
|
|
64
|
-
using __apply_quals = __tuple_types<__type_call<_ApplyFn, __value_type<_Idx>>...>;
|
|
66
|
+
using __apply_quals _CCCL_NODEBUG_ALIAS = __tuple_types<__type_call<_ApplyFn, __value_type<_Idx>>...>;
|
|
67
|
+
};
|
|
68
|
+
|
|
69
|
+
template <class _Vt, size_t... _Idx>
|
|
70
|
+
struct __make_tuple_types_flat<complex<_Vt>, __tuple_indices<_Idx...>>
|
|
71
|
+
{
|
|
72
|
+
static_assert(sizeof...(_Idx) == 2, "__make_tuple_types: complex has only 2 members");
|
|
73
|
+
template <size_t>
|
|
74
|
+
using __value_type = _Vt;
|
|
75
|
+
template <class _Tp, class _ApplyFn = __apply_cvref_fn<_Tp>>
|
|
76
|
+
using __apply_quals _CCCL_NODEBUG_ALIAS = __tuple_types<__type_call<_ApplyFn, __value_type<_Idx>>...>;
|
|
77
|
+
};
|
|
78
|
+
|
|
79
|
+
template <class _Vt, size_t... _Idx>
|
|
80
|
+
struct __make_tuple_types_flat<::cuda::complex<_Vt>, __tuple_indices<_Idx...>>
|
|
81
|
+
{
|
|
82
|
+
static_assert(sizeof...(_Idx) == 2, "__make_tuple_types: complex has only 2 members");
|
|
83
|
+
template <size_t>
|
|
84
|
+
using __value_type = _Vt;
|
|
85
|
+
template <class _Tp, class _ApplyFn = __apply_cvref_fn<_Tp>>
|
|
86
|
+
using __apply_quals _CCCL_NODEBUG_ALIAS = __tuple_types<__type_call<_ApplyFn, __value_type<_Idx>>...>;
|
|
65
87
|
};
|
|
66
88
|
|
|
67
89
|
template <class _Tp,
|
|
@@ -20,6 +20,7 @@
|
|
|
20
20
|
# pragma system_header
|
|
21
21
|
#endif // no system header
|
|
22
22
|
|
|
23
|
+
#include <cuda/__fwd/complex.h>
|
|
23
24
|
#include <cuda/std/__concepts/concept_macros.h>
|
|
24
25
|
#include <cuda/std/__fwd/array.h>
|
|
25
26
|
#include <cuda/std/__fwd/complex.h>
|
|
@@ -58,6 +59,9 @@ inline constexpr bool __tuple_like_impl<array<_Tp, _Size>> = true;
|
|
|
58
59
|
template <class _Tp>
|
|
59
60
|
inline constexpr bool __tuple_like_impl<complex<_Tp>> = true;
|
|
60
61
|
|
|
62
|
+
template <class _Tp>
|
|
63
|
+
inline constexpr bool __tuple_like_impl<::cuda::complex<_Tp>> = true;
|
|
64
|
+
|
|
61
65
|
template <class _Ip, class _Sp, ::cuda::std::ranges::subrange_kind _Kp>
|
|
62
66
|
inline constexpr bool __tuple_like_impl<::cuda::std::ranges::subrange<_Ip, _Sp, _Kp>> = true;
|
|
63
67
|
|
|
@@ -20,6 +20,7 @@
|
|
|
20
20
|
# pragma system_header
|
|
21
21
|
#endif // no system header
|
|
22
22
|
|
|
23
|
+
#include <cuda/__fwd/complex.h>
|
|
23
24
|
#include <cuda/std/__fwd/array.h>
|
|
24
25
|
#include <cuda/std/__fwd/complex.h>
|
|
25
26
|
#include <cuda/std/__fwd/pair.h>
|
|
@@ -54,6 +55,9 @@ inline constexpr bool __tuple_like_ext<array<_Tp, _Size>> = true;
|
|
|
54
55
|
template <class _Tp>
|
|
55
56
|
inline constexpr bool __tuple_like_ext<complex<_Tp>> = true;
|
|
56
57
|
|
|
58
|
+
template <class _Tp>
|
|
59
|
+
inline constexpr bool __tuple_like_ext<::cuda::complex<_Tp>> = true;
|
|
60
|
+
|
|
57
61
|
template <class... _Tp>
|
|
58
62
|
inline constexpr bool __tuple_like_ext<__tuple_types<_Tp...>> = true;
|
|
59
63
|
|
|
@@ -20,10 +20,8 @@
|
|
|
20
20
|
# pragma system_header
|
|
21
21
|
#endif // no system header
|
|
22
22
|
|
|
23
|
-
#include <cuda/std/__type_traits/integral_constant.h>
|
|
24
23
|
#include <cuda/std/__type_traits/is_same.h>
|
|
25
24
|
#include <cuda/std/__utility/declval.h>
|
|
26
|
-
#include <cuda/std/cstddef>
|
|
27
25
|
|
|
28
26
|
#include <cuda/std/__cccl/prologue.h>
|
|
29
27
|
|
|
@@ -49,6 +47,9 @@ struct __numeric_type
|
|
|
49
47
|
_CCCL_API inline static double __test(unsigned long long);
|
|
50
48
|
_CCCL_API inline static double __test(double);
|
|
51
49
|
_CCCL_API inline static long double __test(long double);
|
|
50
|
+
#if _CCCL_HAS_FLOAT128()
|
|
51
|
+
_CCCL_API inline static __float128 __test(__float128);
|
|
52
|
+
#endif // _CCCL_HAS_FLOAT128()
|
|
52
53
|
|
|
53
54
|
using type = decltype(__test(declval<_Tp>()));
|
|
54
55
|
static const bool value = !is_same_v<type, void>;
|
|
@@ -57,7 +57,7 @@
|
|
|
57
57
|
#include <cuda/std/version>
|
|
58
58
|
|
|
59
59
|
#if !_CCCL_COMPILER(NVRTC)
|
|
60
|
-
# include <
|
|
60
|
+
# include <string_view>
|
|
61
61
|
#endif // !_CCCL_COMPILER(NVRTC)
|
|
62
62
|
|
|
63
63
|
#include <cuda/std/__cccl/prologue.h>
|
|
@@ -727,14 +727,21 @@ _CCCL_HOST_DEVICE basic_string_view(_Range&&) -> basic_string_view<::cuda::std::
|
|
|
727
727
|
|
|
728
728
|
// operator <<
|
|
729
729
|
|
|
730
|
-
#if
|
|
730
|
+
#if !_CCCL_COMPILER(NVRTC)
|
|
731
|
+
template <class _CharT>
|
|
732
|
+
_CCCL_HOST_API ::std::basic_ostream<_CharT>&
|
|
733
|
+
operator<<(::std::basic_ostream<_CharT>& __os, basic_string_view<_CharT> __str)
|
|
734
|
+
{
|
|
735
|
+
return __os << ::std::basic_string_view<_CharT>{__str.data(), __str.size()};
|
|
736
|
+
}
|
|
737
|
+
|
|
731
738
|
template <class _CharT, class _Traits>
|
|
732
|
-
|
|
739
|
+
_CCCL_HOST_API ::std::basic_ostream<_CharT, _Traits>&
|
|
733
740
|
operator<<(::std::basic_ostream<_CharT, _Traits>& __os, basic_string_view<_CharT, _Traits> __str)
|
|
734
741
|
{
|
|
735
|
-
return __os
|
|
742
|
+
return __os << ::std::basic_string_view<_CharT, _Traits>{__str.data(), __str.size()};
|
|
736
743
|
}
|
|
737
|
-
#endif //
|
|
744
|
+
#endif // !_CCCL_COMPILER(NVRTC)
|
|
738
745
|
|
|
739
746
|
// literals
|
|
740
747
|
|
|
@@ -141,7 +141,7 @@
|
|
|
141
141
|
// # define __cccl_lib_shared_mutex 201505L
|
|
142
142
|
// # define __cccl_lib_shared_ptr_arrays 201611L
|
|
143
143
|
// # define __cccl_lib_shared_ptr_weak_type 201606L
|
|
144
|
-
|
|
144
|
+
#define __cccl_lib_string_view 201803L
|
|
145
145
|
// # define __cccl_lib_to_chars 201611L
|
|
146
146
|
// # define __cccl_lib_uncaught_exceptions 201411L
|
|
147
147
|
// # define __cccl_lib_unordered_map_try_emplace 201411L
|
|
@@ -171,7 +171,6 @@
|
|
|
171
171
|
// # define __cccl_lib_constexpr_misc 201811L
|
|
172
172
|
// # define __cccl_lib_constexpr_numeric 201911L
|
|
173
173
|
// # define __cccl_lib_constexpr_string 201907L
|
|
174
|
-
// # define __cccl_lib_constexpr_string_view 201811L
|
|
175
174
|
// # define __cccl_lib_constexpr_swap_algorithms 201806L
|
|
176
175
|
// # define __cccl_lib_constexpr_tuple 201811L
|
|
177
176
|
// # define __cccl_lib_constexpr_utility 201811L
|
|
@@ -204,8 +203,6 @@
|
|
|
204
203
|
// # define __cccl_lib_source_location 201907L
|
|
205
204
|
// # define __cccl_lib_ssize 201902L
|
|
206
205
|
// # define __cccl_lib_starts_ends_with 201711L
|
|
207
|
-
// # undef __cccl_lib_string_view
|
|
208
|
-
// # define __cccl_lib_string_view 201803L
|
|
209
206
|
// # define __cccl_lib_syncbuf 201803L
|
|
210
207
|
// # define __cccl_lib_three_way_comparison 201907L
|
|
211
208
|
# define __cccl_lib_unwrap_ref 201811L
|
|
@@ -27,6 +27,8 @@
|
|
|
27
27
|
#endif // no system header
|
|
28
28
|
#include <thrust/detail/type_deduction.h>
|
|
29
29
|
|
|
30
|
+
#include <cuda/std/__bit/countl.h>
|
|
31
|
+
#include <cuda/std/__type_traits/make_unsigned.h>
|
|
30
32
|
#include <cuda/std/limits>
|
|
31
33
|
#include <cuda/std/type_traits>
|
|
32
34
|
|
|
@@ -36,25 +38,6 @@ THRUST_NAMESPACE_BEGIN
|
|
|
36
38
|
namespace detail
|
|
37
39
|
{
|
|
38
40
|
|
|
39
|
-
template <typename Integer>
|
|
40
|
-
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE Integer clz(Integer x)
|
|
41
|
-
{
|
|
42
|
-
Integer result;
|
|
43
|
-
|
|
44
|
-
NV_IF_TARGET(NV_IS_DEVICE,
|
|
45
|
-
(result = ::__clz(x);),
|
|
46
|
-
(int num_bits = 8 * sizeof(Integer); int num_bits_minus_one = num_bits - 1; result = num_bits;
|
|
47
|
-
for (int i = num_bits_minus_one; i >= 0; --i) {
|
|
48
|
-
if ((Integer(1) << i) & x)
|
|
49
|
-
{
|
|
50
|
-
result = num_bits_minus_one - i;
|
|
51
|
-
break;
|
|
52
|
-
}
|
|
53
|
-
}));
|
|
54
|
-
|
|
55
|
-
return result;
|
|
56
|
-
}
|
|
57
|
-
|
|
58
41
|
template <typename Integer>
|
|
59
42
|
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool is_power_of_2(Integer x)
|
|
60
43
|
{
|
|
@@ -85,7 +68,7 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE Integer log2(Integer x)
|
|
|
85
68
|
Integer num_bits = 8 * sizeof(Integer);
|
|
86
69
|
Integer num_bits_minus_one = num_bits - 1;
|
|
87
70
|
|
|
88
|
-
return num_bits_minus_one -
|
|
71
|
+
return num_bits_minus_one - ::cuda::std::countl_zero(::cuda::std::__to_unsigned_like(x));
|
|
89
72
|
}
|
|
90
73
|
|
|
91
74
|
template <typename Integer>
|
|
@@ -316,6 +316,17 @@ struct iterator_traversal<::cuda::zip_iterator<Iterators...>>
|
|
|
316
316
|
using type = detail::minimum_type<iterator_traversal_t<Iterators>...>;
|
|
317
317
|
};
|
|
318
318
|
|
|
319
|
+
template <class Fn, class... Iterators>
|
|
320
|
+
struct iterator_system<::cuda::zip_transform_iterator<Fn, Iterators...>>
|
|
321
|
+
{
|
|
322
|
+
using type = detail::minimum_system_t<iterator_system_t<Iterators>...>;
|
|
323
|
+
};
|
|
324
|
+
template <class Fn, class... Iterators>
|
|
325
|
+
struct iterator_traversal<::cuda::zip_transform_iterator<Fn, Iterators...>>
|
|
326
|
+
{
|
|
327
|
+
using type = detail::minimum_type<iterator_traversal_t<Iterators>...>;
|
|
328
|
+
};
|
|
329
|
+
|
|
319
330
|
//! \} // end iterator_traits
|
|
320
331
|
|
|
321
332
|
THRUST_NAMESPACE_END
|
|
@@ -48,6 +48,13 @@
|
|
|
48
48
|
#include <thrust/system/cuda/detail/util.h>
|
|
49
49
|
#include <thrust/type_traits/is_trivially_relocatable.h>
|
|
50
50
|
|
|
51
|
+
#if _CCCL_HAS_CUDA_COMPILER()
|
|
52
|
+
# include <cub/device/dispatch/tuning/tuning_transform.cuh>
|
|
53
|
+
#endif // _CCCL_HAS_CUDA_COMPILER()
|
|
54
|
+
|
|
55
|
+
#include <cuda/__fwd/zip_iterator.h>
|
|
56
|
+
#include <cuda/std/tuple>
|
|
57
|
+
|
|
51
58
|
THRUST_NAMESPACE_BEGIN
|
|
52
59
|
namespace cuda_cub
|
|
53
60
|
{
|
|
@@ -61,6 +68,21 @@ template <class Derived, class InputIt, class OutputIt, class TransformOp>
|
|
|
61
68
|
OutputIt _CCCL_API _CCCL_FORCEINLINE
|
|
62
69
|
transform(execution_policy<Derived>& policy, InputIt first, InputIt last, OutputIt result, TransformOp transform_op);
|
|
63
70
|
|
|
71
|
+
// Forward declare to work around a cyclic include, since "cuda/detail/transform.h" includes this header
|
|
72
|
+
// We want this to unwrap zip_transform_iterator
|
|
73
|
+
namespace __transform
|
|
74
|
+
{
|
|
75
|
+
_CCCL_EXEC_CHECK_DISABLE
|
|
76
|
+
template <class Derived, class Offset, class... InputIts, class OutputIt, class TransformOp, class Predicate>
|
|
77
|
+
OutputIt _CCCL_API _CCCL_FORCEINLINE cub_transform_many(
|
|
78
|
+
execution_policy<Derived>& policy,
|
|
79
|
+
::cuda::std::tuple<InputIts...> firsts,
|
|
80
|
+
OutputIt result,
|
|
81
|
+
Offset num_items,
|
|
82
|
+
TransformOp transform_op,
|
|
83
|
+
Predicate pred);
|
|
84
|
+
} // namespace __transform
|
|
85
|
+
|
|
64
86
|
namespace __copy
|
|
65
87
|
{
|
|
66
88
|
template <class H, class D, class T, class Size>
|
|
@@ -190,6 +212,17 @@ device_to_device(execution_policy<Derived>& policy, InputIt first, InputIt last,
|
|
|
190
212
|
|
|
191
213
|
return result + n;
|
|
192
214
|
}
|
|
215
|
+
else if constexpr (::cuda::__is_zip_transform_iterator<InputIt>)
|
|
216
|
+
{
|
|
217
|
+
const auto n = ::cuda::std::distance(first, last);
|
|
218
|
+
return cuda_cub::__transform::cub_transform_many(
|
|
219
|
+
policy,
|
|
220
|
+
::cuda::std::move(first).__base(),
|
|
221
|
+
result,
|
|
222
|
+
n,
|
|
223
|
+
::cuda::std::move(first).__pred(),
|
|
224
|
+
cub::detail::transform::always_true_predicate{});
|
|
225
|
+
}
|
|
193
226
|
else
|
|
194
227
|
{
|
|
195
228
|
return cuda_cub::transform(
|
|
@@ -39,37 +39,23 @@
|
|
|
39
39
|
#if _CCCL_HAS_CUDA_COMPILER()
|
|
40
40
|
# include <thrust/system/cuda/config.h>
|
|
41
41
|
|
|
42
|
-
# include <thrust/
|
|
43
|
-
# include <thrust/system/cuda/detail/parallel_for.h>
|
|
42
|
+
# include <thrust/system/cuda/detail/transform.h>
|
|
44
43
|
# include <thrust/system/cuda/execution_policy.h>
|
|
45
44
|
|
|
45
|
+
# include <cuda/__functional/address_stability.h>
|
|
46
|
+
# include <cuda/std/iterator>
|
|
47
|
+
|
|
46
48
|
THRUST_NAMESPACE_BEGIN
|
|
47
49
|
namespace cuda_cub
|
|
48
50
|
{
|
|
49
|
-
namespace __tabulate
|
|
50
|
-
{
|
|
51
|
-
template <class Iterator, class TabulateOp>
|
|
52
|
-
struct functor
|
|
53
|
-
{
|
|
54
|
-
Iterator items;
|
|
55
|
-
TabulateOp op;
|
|
56
|
-
|
|
57
|
-
template <typename Size>
|
|
58
|
-
void _CCCL_DEVICE operator()(Size idx)
|
|
59
|
-
{
|
|
60
|
-
items[idx] = op(idx);
|
|
61
|
-
}
|
|
62
|
-
};
|
|
63
|
-
} // namespace __tabulate
|
|
64
|
-
|
|
65
51
|
template <class Derived, class Iterator, class TabulateOp>
|
|
66
52
|
void _CCCL_HOST_DEVICE tabulate(execution_policy<Derived>& policy, Iterator first, Iterator last, TabulateOp tabulate_op)
|
|
67
53
|
{
|
|
68
|
-
using size_type
|
|
69
|
-
|
|
70
|
-
cuda_cub::
|
|
54
|
+
using size_type = ::cuda::std::iter_difference_t<Iterator>;
|
|
55
|
+
const auto count = ::cuda::std::distance(first, last);
|
|
56
|
+
cuda_cub::transform_n(
|
|
57
|
+
policy, ::cuda::counting_iterator<size_type>{}, count, first, ::cuda::proclaim_copyable_arguments(tabulate_op));
|
|
71
58
|
}
|
|
72
|
-
|
|
73
59
|
} // namespace cuda_cub
|
|
74
60
|
THRUST_NAMESPACE_END
|
|
75
61
|
#endif
|
|
@@ -25,72 +25,39 @@
|
|
|
25
25
|
|
|
26
26
|
THRUST_NAMESPACE_BEGIN
|
|
27
27
|
|
|
28
|
-
namespace detail
|
|
29
|
-
{
|
|
30
|
-
// Type traits for contiguous iterators:
|
|
31
|
-
template <typename Iterator>
|
|
32
|
-
struct contiguous_iterator_traits
|
|
33
|
-
{
|
|
34
|
-
static_assert(thrust::is_contiguous_iterator_v<Iterator>,
|
|
35
|
-
"contiguous_iterator_traits requires a contiguous iterator.");
|
|
36
|
-
|
|
37
|
-
using raw_pointer =
|
|
38
|
-
typename thrust::detail::pointer_traits<decltype(&*::cuda::std::declval<Iterator>())>::raw_pointer;
|
|
39
|
-
};
|
|
40
|
-
} // namespace detail
|
|
41
|
-
|
|
42
|
-
//! Converts a contiguous iterator type to its underlying raw pointer type.
|
|
43
|
-
template <typename ContiguousIterator>
|
|
44
|
-
using unwrap_contiguous_iterator_t = typename detail::contiguous_iterator_traits<ContiguousIterator>::raw_pointer;
|
|
45
|
-
|
|
46
28
|
//! Converts a contiguous iterator to its underlying raw pointer.
|
|
29
|
+
_CCCL_EXEC_CHECK_DISABLE
|
|
47
30
|
template <typename ContiguousIterator>
|
|
48
31
|
_CCCL_HOST_DEVICE auto unwrap_contiguous_iterator(ContiguousIterator it)
|
|
49
|
-
-> unwrap_contiguous_iterator_t<ContiguousIterator>
|
|
50
32
|
{
|
|
51
33
|
static_assert(thrust::is_contiguous_iterator_v<ContiguousIterator>,
|
|
52
34
|
"unwrap_contiguous_iterator called with non-contiguous iterator.");
|
|
53
35
|
return thrust::raw_pointer_cast(&*it);
|
|
54
36
|
}
|
|
55
37
|
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
template <typename Iterator, bool IsContiguous = thrust::is_contiguous_iterator_v<Iterator>>
|
|
60
|
-
struct try_unwrap_contiguous_iterator_impl
|
|
61
|
-
{
|
|
62
|
-
using type = Iterator;
|
|
63
|
-
|
|
64
|
-
static _CCCL_HOST_DEVICE type get(Iterator it)
|
|
65
|
-
{
|
|
66
|
-
return it;
|
|
67
|
-
}
|
|
68
|
-
};
|
|
38
|
+
//! Converts a contiguous iterator type to its underlying raw pointer type.
|
|
39
|
+
template <typename ContiguousIterator>
|
|
40
|
+
using unwrap_contiguous_iterator_t = decltype(unwrap_contiguous_iterator(::cuda::std::declval<ContiguousIterator>()));
|
|
69
41
|
|
|
70
|
-
|
|
42
|
+
//! Takes an iterator and, if it is contiguous, unwraps it to the raw pointer it represents. Otherwise returns the
|
|
43
|
+
//! iterator unmodified.
|
|
44
|
+
_CCCL_EXEC_CHECK_DISABLE
|
|
71
45
|
template <typename Iterator>
|
|
72
|
-
|
|
46
|
+
_CCCL_HOST_DEVICE auto try_unwrap_contiguous_iterator(Iterator it)
|
|
73
47
|
{
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
static _CCCL_HOST_DEVICE type get(Iterator it)
|
|
48
|
+
if constexpr (thrust::is_contiguous_iterator_v<Iterator>)
|
|
77
49
|
{
|
|
78
50
|
return unwrap_contiguous_iterator(it);
|
|
79
51
|
}
|
|
80
|
-
|
|
81
|
-
|
|
52
|
+
else
|
|
53
|
+
{
|
|
54
|
+
return it;
|
|
55
|
+
}
|
|
56
|
+
}
|
|
82
57
|
|
|
83
58
|
//! Takes an iterator type and, if it is contiguous, yields the raw pointer type it represents. Otherwise returns the
|
|
84
59
|
//! iterator type unmodified.
|
|
85
60
|
template <typename Iterator>
|
|
86
|
-
using try_unwrap_contiguous_iterator_t =
|
|
87
|
-
|
|
88
|
-
//! Takes an iterator and, if it is contiguous, unwraps it to the raw pointer it represents. Otherwise returns the
|
|
89
|
-
//! iterator unmodified.
|
|
90
|
-
template <typename Iterator>
|
|
91
|
-
_CCCL_HOST_DEVICE auto try_unwrap_contiguous_iterator(Iterator it) -> try_unwrap_contiguous_iterator_t<Iterator>
|
|
92
|
-
{
|
|
93
|
-
return detail::try_unwrap_contiguous_iterator_impl<Iterator>::get(it);
|
|
94
|
-
}
|
|
61
|
+
using try_unwrap_contiguous_iterator_t = decltype(try_unwrap_contiguous_iterator(::cuda::std::declval<Iterator>()));
|
|
95
62
|
|
|
96
63
|
THRUST_NAMESPACE_END
|
|
@@ -1,73 +1,24 @@
|
|
|
1
|
-
# Copyright (c)
|
|
1
|
+
# Copyright (c) 2025, NVIDIA CORPORATION.
|
|
2
2
|
#
|
|
3
|
-
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
4
14
|
|
|
5
|
-
|
|
6
|
-
DoubleBuffer,
|
|
7
|
-
SortOrder,
|
|
8
|
-
binary_transform,
|
|
9
|
-
exclusive_scan,
|
|
10
|
-
histogram_even,
|
|
11
|
-
inclusive_scan,
|
|
12
|
-
make_binary_transform,
|
|
13
|
-
make_exclusive_scan,
|
|
14
|
-
make_histogram_even,
|
|
15
|
-
make_inclusive_scan,
|
|
16
|
-
make_merge_sort,
|
|
17
|
-
make_radix_sort,
|
|
18
|
-
make_reduce_into,
|
|
19
|
-
make_segmented_reduce,
|
|
20
|
-
make_unary_transform,
|
|
21
|
-
make_unique_by_key,
|
|
22
|
-
merge_sort,
|
|
23
|
-
radix_sort,
|
|
24
|
-
reduce_into,
|
|
25
|
-
segmented_reduce,
|
|
26
|
-
unary_transform,
|
|
27
|
-
unique_by_key,
|
|
28
|
-
)
|
|
29
|
-
from .iterators import (
|
|
30
|
-
CacheModifiedInputIterator,
|
|
31
|
-
ConstantIterator,
|
|
32
|
-
CountingIterator,
|
|
33
|
-
ReverseIterator,
|
|
34
|
-
TransformIterator,
|
|
35
|
-
TransformOutputIterator,
|
|
36
|
-
ZipIterator,
|
|
37
|
-
)
|
|
38
|
-
from .op import OpKind
|
|
39
|
-
from .struct import gpu_struct
|
|
15
|
+
# alias for backwards compatibility
|
|
40
16
|
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
"
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
"histogram_even",
|
|
50
|
-
"inclusive_scan",
|
|
51
|
-
"make_binary_transform",
|
|
52
|
-
"make_exclusive_scan",
|
|
53
|
-
"make_histogram_even",
|
|
54
|
-
"make_inclusive_scan",
|
|
55
|
-
"make_merge_sort",
|
|
56
|
-
"make_radix_sort",
|
|
57
|
-
"make_reduce_into",
|
|
58
|
-
"make_segmented_reduce",
|
|
59
|
-
"make_unary_transform",
|
|
60
|
-
"make_unique_by_key",
|
|
61
|
-
"merge_sort",
|
|
62
|
-
"OpKind",
|
|
63
|
-
"radix_sort",
|
|
64
|
-
"reduce_into",
|
|
65
|
-
"ReverseIterator",
|
|
66
|
-
"segmented_reduce",
|
|
67
|
-
"SortOrder",
|
|
68
|
-
"TransformIterator",
|
|
69
|
-
"TransformOutputIterator",
|
|
70
|
-
"unary_transform",
|
|
71
|
-
"unique_by_key",
|
|
72
|
-
"ZipIterator",
|
|
73
|
-
]
|
|
17
|
+
from warnings import warn
|
|
18
|
+
|
|
19
|
+
from cuda.compute import * # noqa: F403
|
|
20
|
+
|
|
21
|
+
warn(
|
|
22
|
+
"The module cuda.cccl.parallel.experimental is deprecated. Use cuda.compute instead.",
|
|
23
|
+
FutureWarning,
|
|
24
|
+
)
|