cuda-cccl 0.1.3.2.0.dev438__cp313-cp313-manylinux_2_24_aarch64.whl → 0.3.1__cp313-cp313-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: the registry flags this version of cuda-cccl as possibly problematic; see the registry page for details.
- cuda/cccl/cooperative/__init__.py +7 -1
- cuda/cccl/cooperative/experimental/__init__.py +21 -5
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +25 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +24 -19
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
- cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +3 -2
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +18 -26
- cuda/cccl/headers/include/cub/device/device_copy.cuh +116 -27
- cuda/cccl/headers/include/cub/device/device_partition.cuh +5 -1
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +158 -247
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +4 -4
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +8 -26
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +1 -6
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +320 -262
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +10 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +57 -10
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +37 -13
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +204 -55
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +55 -19
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
- cuda/cccl/headers/include/cub/util_device.cuh +51 -35
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
- cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +4 -4
- cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
- cuda/cccl/headers/include/cuda/__device/all_devices.h +47 -147
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +51 -49
- cuda/cccl/headers/include/cuda/__device/attributes.h +177 -127
- cuda/cccl/headers/include/cuda/__device/device_ref.h +32 -51
- cuda/cccl/headers/include/cuda/__device/physical_device.h +120 -91
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +330 -36
- cuda/cccl/headers/include/cuda/__event/event.h +8 -8
- cuda/cccl/headers/include/cuda/__event/event_ref.h +4 -5
- cuda/cccl/headers/include/cuda/__event/timed_event.h +4 -4
- cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
- cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +3 -3
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +3 -3
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +3 -3
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +158 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +8 -120
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +593 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +14 -10
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +4 -3
- cuda/cccl/headers/include/cuda/__runtime/types.h +1 -1
- cuda/cccl/headers/include/cuda/__stream/stream.h +2 -3
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -12
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +2 -2
- cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
- cuda/cccl/headers/include/cuda/algorithm +1 -1
- cuda/cccl/headers/include/cuda/devices +10 -0
- cuda/cccl/headers/include/cuda/iterator +1 -0
- cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
- cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
- cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
- cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +2 -0
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +10 -5
- cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +44 -17
- cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +1 -1
- cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +12 -12
- cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +1 -8
- cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +15 -12
- cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +3 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +2 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +3 -2
- cuda/cccl/headers/include/cuda/std/string_view +12 -5
- cuda/cccl/headers/include/cuda/std/version +1 -4
- cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +8 -22
- cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +15 -48
- cuda/cccl/parallel/experimental/__init__.py +21 -70
- cuda/compute/__init__.py +77 -0
- cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +28 -0
- cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +141 -1
- cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +4 -0
- cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
- cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -4
- cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +4 -6
- cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
- cuda/compute/algorithms/_three_way_partition.py +261 -0
- cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +4 -4
- cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
- cuda/compute/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +8 -8
- cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
- cuda/coop/__init__.py +8 -0
- cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
- cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
- cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
- cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
- cuda/coop/warp/__init__.py +9 -0
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/METADATA +1 -1
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/RECORD +171 -166
- cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
- cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +0 -72
- cuda/cccl/parallel/experimental/.gitignore +0 -4
- cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/numba_utils.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/typing.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.1.3.2.0.dev438.dist-info → cuda_cccl-0.3.1.dist-info}/licenses/LICENSE +0 -0
@@ -24,9 +24,9 @@
 #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)

 # include <cuda/__device/device_ref.h>
+# include <cuda/__driver/driver_api.h>
 # include <cuda/__runtime/ensure_current_context.h>
 # include <cuda/__stream/stream_ref.h> // IWYU pragma: export
-# include <cuda/std/__cuda/api_wrapper.h>

 # include <cuda/std/__cccl/prologue.h>

@@ -47,8 +47,7 @@ struct stream : stream_ref
 : stream_ref(__detail::__invalid_stream)
 {
 [[maybe_unused]] __ensure_current_context __ctx_setter(__dev);
-
-::cudaStreamCreateWithPriority, "Failed to create a stream", &__stream, cudaStreamNonBlocking, __priority);
+__stream = ::cuda::__driver::__streamCreateWithPriority(cudaStreamNonBlocking, __priority);
 }

 //! @brief Construct a new `stream` object into the moved-from state.

@@ -23,11 +23,12 @@

 #if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)

+# include <cuda/__device/device_ref.h>
+# include <cuda/__driver/driver_api.h>
 # include <cuda/__event/timed_event.h>
 # include <cuda/__fwd/get_stream.h>
 # include <cuda/__runtime/ensure_current_context.h>
 # include <cuda/__utility/no_init.h>
-# include <cuda/std/__cuda/api_wrapper.h>
 # include <cuda/std/__exception/cuda_error.h>
 # include <cuda/std/cstddef>

@@ -39,7 +40,7 @@ namespace __detail
 {
 // 0 is a valid stream in CUDA, so we need some other invalid stream representation
 // Can't make it constexpr, because cudaStream_t is a pointer type
-static const ::cudaStream_t __invalid_stream = reinterpret_cast
+static const ::cudaStream_t __invalid_stream = reinterpret_cast<::cudaStream_t>(~0ULL);
 } // namespace __detail

 //! @brief A type representing a stream ID.

@@ -238,11 +239,17 @@ public:
 //! @throws cuda_error if device check fails
 _CCCL_HOST_API device_ref device() const
 {
-
-
-
-
-
+::CUdevice __device{};
+# if _CCCL_CTK_AT_LEAST(13, 0)
+__device = ::cuda::__driver::__streamGetDevice(__stream);
+# else // ^^^ _CCCL_CTK_AT_LEAST(13, 0) ^^^ / vvv _CCCL_CTK_BELOW(13, 0) vvv
+{
+::CUcontext __stream_ctx = ::cuda::__driver::__streamGetCtx(__stream);
+__ensure_current_context __setter(__stream_ctx);
+__device = ::cuda::__driver::__ctxGetDevice();
+}
+# endif // ^^^ _CCCL_CTK_BELOW(13, 0) ^^^
+return device_ref{::cuda::__driver::__cudevice_to_ordinal(__device)};
 }

 //! @brief Queries the \c stream_ref for itself. This makes \c stream_ref usable in places where we expect an
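The device() hunk above queries the stream's device directly on CTK 13.0+ and falls back to a context-based lookup on older toolkits. A minimal sketch of the same fallback written directly against the raw CUDA driver API (illustrative only: the helper name is hypothetical, error checking is omitted, and driver initialization is assumed; this is not cuda-cccl code):

    #include <cuda.h>

    // Hypothetical helper: returns the CUdevice a stream belongs to.
    CUdevice device_of(CUstream stream)
    {
      CUdevice dev{};
    #if CUDA_VERSION >= 13000
      cuStreamGetDevice(stream, &dev);   // direct query on newer toolkits
    #else
      CUcontext ctx{};
      cuStreamGetCtx(stream, &ctx);      // context the stream was created under
      cuCtxPushCurrent(ctx);             // make it current so the device can be read from it
      cuCtxGetDevice(&dev);
      cuCtxPopCurrent(&ctx);
    #endif
      return dev;                        // driver return codes are ignored in this sketch
    }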
@@ -262,21 +269,20 @@ inline void event_ref::record(stream_ref __stream) const
 }

 inline event::event(stream_ref __stream, event::flags __flags)
-: event(__stream, static_cast<unsigned
+: event(__stream, static_cast<unsigned>(__flags) | cudaEventDisableTiming)
 {
 record(__stream);
 }

-inline event::event(stream_ref __stream, unsigned
+inline event::event(stream_ref __stream, unsigned __flags)
 : event_ref(::cudaEvent_t{})
 {
 [[maybe_unused]] __ensure_current_context __ctx_setter(__stream);
-
-::cudaEventCreateWithFlags, "Failed to create CUDA event", &__event_, static_cast<unsigned int>(__flags));
+__event_ = ::cuda::__driver::__eventCreate(static_cast<unsigned>(__flags));
 }

 inline timed_event::timed_event(stream_ref __stream, event::flags __flags)
-: event(__stream, static_cast<unsigned
+: event(__stream, static_cast<unsigned>(__flags))
 {
 record(__stream);
 }

@@ -42,7 +42,7 @@ using __vtable_for _CCCL_NODEBUG_ALIAS = typename __overrides_for_t<_Interface>:
 //! __basic_vtable
 //!
 template <class _Interface, auto... _Mbrs>
-struct _CCCL_DECLSPEC_EMPTY_BASES __basic_vtable
+struct _CCCL_DECLSPEC_EMPTY_BASES _CCCL_TYPE_VISIBILITY_DEFAULT __basic_vtable
 : __rtti_base
 , __virtual_fn<_Mbrs>...
 {

@@ -105,7 +105,7 @@ struct _CCCL_DECLSPEC_EMPTY_BASES __basic_vtable
 //!

 template <class... _Interfaces>
-struct _CCCL_DECLSPEC_EMPTY_BASES __vtable_tuple
+struct _CCCL_DECLSPEC_EMPTY_BASES _CCCL_TYPE_VISIBILITY_DEFAULT __vtable_tuple
 : __rtti_ex<sizeof...(_Interfaces)>
 , __vtable_for<_Interfaces>...
 {

@@ -11,6 +11,16 @@
 #ifndef _CUDA_DEVICES
 #define _CUDA_DEVICES

+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+# pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+# pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+# pragma system_header
+#endif // no system header
+
 #include <cuda/__device/all_devices.h>
 #include <cuda/__device/arch_traits.h>
 #include <cuda/__device/attributes.h>

@@ -33,6 +33,7 @@
 #include <cuda/__iterator/transform_output_iterator.h>
 #include <cuda/__iterator/zip_function.h>
 #include <cuda/__iterator/zip_iterator.h>
+#include <cuda/__iterator/zip_transform_iterator.h>
 #include <cuda/std/iterator>

 #endif // _CUDA_ITERATOR

@@ -100,7 +100,14 @@ template <typename _Tp>
 template <typename _Tp>
 [[nodiscard]] _CCCL_HIDE_FROM_ABI _CCCL_DEVICE int __cccl_countl_zero_impl_device(_Tp __v) noexcept
 {
-
+if constexpr (sizeof(_Tp) == sizeof(uint32_t))
+{
+return static_cast<int>(::__clz(static_cast<int>(__v)));
+}
+else
+{
+return static_cast<int>(::__clzll(static_cast<long long>(__v)));
+}
 }
 #endif // _CCCL_CUDA_COMPILATION()

@@ -114,11 +114,11 @@ template <typename _Tp>
 {
 if constexpr (sizeof(_Tp) == sizeof(uint32_t))
 {
-return ::__clz(static_cast<int>(::__brev(__v)));
+return static_cast<int>(::__clz(static_cast<int>(::__brev(__v))));
 }
 else
 {
-return ::__clzll(static_cast<long long>(::__brevll(__v)));
+return static_cast<int>(::__clzll(static_cast<long long>(::__brevll(__v))));
 }
 }
 #endif // _CCCL_CUDA_COMPILATION()
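The count-trailing-zero hunk above relies on the identity that bit-reversing a value turns its trailing zeros into leading zeros, so a single __brev/__clz pair does the job. A minimal device-side sketch of that pairing (illustrative only; the function names are hypothetical and not cuda-cccl API):

    // Count trailing zeros of a 32-bit value with CUDA intrinsics.
    __device__ int trailing_zeros_u32(unsigned int v)
    {
      // __brev reverses the bit order, so counting leading zeros of the
      // reversed value counts the trailing zeros of the original one.
      return __clz(static_cast<int>(__brev(v)));
    }

    // Same idea for 64-bit values.
    __device__ int trailing_zeros_u64(unsigned long long v)
    {
      return __clzll(static_cast<long long>(__brevll(v)));
    }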
@@ -275,10 +275,10 @@ _CCCL_API constexpr __bit_iterator<_Cp, false> __copy_aligned(
 // do first word
 if (__first.__ctz_ != 0)
 {
-unsigned
-difference_type __dn = ::cuda::std::min(static_cast<difference_type>(
+unsigned __clz_f = __bits_per_word - __first.__ctz_;
+difference_type __dn = ::cuda::std::min(static_cast<difference_type>(__clz_f), __n);
 __n -= __dn;
-__storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (
+__storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
 __storage_type __b = *__first.__seg_ & __m;
 *__result.__seg_ &= ~__m;
 *__result.__seg_ |= __b;

@@ -420,8 +420,8 @@ _CCCL_API constexpr __bit_iterator<_Cp, false> __copy_backward_aligned(
 {
 difference_type __dn = ::cuda::std::min(static_cast<difference_type>(__last.__ctz_), __n);
 __n -= __dn;
-unsigned
-__storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >>
+unsigned __clz_f = __bits_per_word - __last.__ctz_;
+__storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz_f);
 __storage_type __b = *__last.__seg_ & __m;
 *__result.__seg_ &= ~__m;
 *__result.__seg_ |= __b;

@@ -635,10 +635,10 @@ _CCCL_API inline __bit_iterator<_Cr, false> __swap_ranges_aligned(
 // do first word
 if (__first.__ctz_ != 0)
 {
-unsigned
-difference_type __dn = ::cuda::std::min(static_cast<difference_type>(
+unsigned __clz_f = __bits_per_word - __first.__ctz_;
+difference_type __dn = ::cuda::std::min(static_cast<difference_type>(__clz_f), __n);
 __n -= __dn;
-__storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (
+__storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
 __storage_type __b1 = *__first.__seg_ & __m;
 *__first.__seg_ &= ~__m;
 __storage_type __b2 = *__result.__seg_ & __m;

@@ -988,10 +988,10 @@ _CCCL_API constexpr bool __equal_aligned(
 // do first word
 if (__first1.__ctz_ != 0)
 {
-unsigned
-difference_type __dn = ::cuda::std::min(static_cast<difference_type>(
+unsigned __clz_f = __bits_per_word - __first1.__ctz_;
+difference_type __dn = ::cuda::std::min(static_cast<difference_type>(__clz_f), __n);
 __n -= __dn;
-__storage_type __m = (~__storage_type(0) << __first1.__ctz_) & (~__storage_type(0) >> (
+__storage_type __m = (~__storage_type(0) << __first1.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
 if ((*__first2.__seg_ & __m) != (*__first1.__seg_ & __m))
 {
 return false;

@@ -39,10 +39,10 @@
 #if _CCCL_HAS_PDL()
 // Waits for the previous kernel to complete (when it reaches its final membar). Should be put before the first global
 // memory access in a kernel.
-# define _CCCL_PDL_GRID_DEPENDENCY_SYNC() NV_IF_TARGET(NV_PROVIDES_SM_90, cudaGridDependencySynchronize();)
+# define _CCCL_PDL_GRID_DEPENDENCY_SYNC() NV_IF_TARGET(NV_PROVIDES_SM_90, ::cudaGridDependencySynchronize();)
 // Allows the subsequent kernel in the same stream to launch. Can be put anywhere in a kernel.
 // Heuristic(ahendriksen): put it after the last load.
-# define _CCCL_PDL_TRIGGER_NEXT_LAUNCH() NV_IF_TARGET(NV_PROVIDES_SM_90, cudaTriggerProgrammaticLaunchCompletion();)
+# define _CCCL_PDL_TRIGGER_NEXT_LAUNCH() NV_IF_TARGET(NV_PROVIDES_SM_90, ::cudaTriggerProgrammaticLaunchCompletion();)
 #else // _CCCL_HAS_PDL()
 # define _CCCL_PDL_GRID_DEPENDENCY_SYNC()
 # define _CCCL_PDL_TRIGGER_NEXT_LAUNCH()

@@ -107,6 +107,8 @@
 #define _CCCL_PP_FOR_EACH_7(_Mp, _1, _2, _3, _4, _5, _6, _7) _Mp(_1) _Mp(_2) _Mp(_3) _Mp(_4) _Mp(_5) _Mp(_6) _Mp(_7)
 #define _CCCL_PP_FOR_EACH_8(_Mp, _1, _2, _3, _4, _5, _6, _7, _8) \
 _Mp(_1) _Mp(_2) _Mp(_3) _Mp(_4) _Mp(_5) _Mp(_6) _Mp(_7) _Mp(_8)
+#define _CCCL_PP_FOR_EACH_9(_Mp, _1, _2, _3, _4, _5, _6, _7, _8, _9) \
+_Mp(_1) _Mp(_2) _Mp(_3) _Mp(_4) _Mp(_5) _Mp(_6) _Mp(_7) _Mp(_8) _Mp(_9)

 #define _CCCL_PP_PROBE_EMPTY_PROBE__CCCL_PP_PROBE_EMPTY _CCCL_PP_PROBE(~)

@@ -43,19 +43,19 @@ template <class _Rep, class _Period = ratio<1>>
 class _CCCL_TYPE_VISIBILITY_DEFAULT duration;

 template <class _Tp>
-inline
+inline constexpr bool __is_duration_v = false;

 template <class _Rep, class _Period>
-inline
+inline constexpr bool __is_duration_v<duration<_Rep, _Period>> = true;

 template <class _Rep, class _Period>
-inline
+inline constexpr bool __is_duration_v<const duration<_Rep, _Period>> = true;

 template <class _Rep, class _Period>
-inline
+inline constexpr bool __is_duration_v<volatile duration<_Rep, _Period>> = true;

 template <class _Rep, class _Period>
-inline
+inline constexpr bool __is_duration_v<const volatile duration<_Rep, _Period>> = true;

 } // namespace chrono

@@ -190,29 +190,29 @@ class _CCCL_TYPE_VISIBILITY_DEFAULT duration
 struct __no_overflow
 {
 private:
-static
-static
-static
-static
-static
-static
-static
+static constexpr intmax_t __gcd_n1_n2 = __static_gcd<_R1::num, _R2::num>::value;
+static constexpr intmax_t __gcd_d1_d2 = __static_gcd<_R1::den, _R2::den>::value;
+static constexpr intmax_t __n1 = _R1::num / __gcd_n1_n2;
+static constexpr intmax_t __d1 = _R1::den / __gcd_d1_d2;
+static constexpr intmax_t __n2 = _R2::num / __gcd_n1_n2;
+static constexpr intmax_t __d2 = _R2::den / __gcd_d1_d2;
+static constexpr intmax_t max = -((intmax_t(1) << (sizeof(intmax_t) * CHAR_BIT - 1)) + 1);

 template <intmax_t _Xp, intmax_t _Yp, bool __overflow>
 struct __mul // __overflow == false
 {
-static
+static constexpr intmax_t value = _Xp * _Yp;
 };

 template <intmax_t _Xp, intmax_t _Yp>
 struct __mul<_Xp, _Yp, true>
 {
-static
+static constexpr intmax_t value = 1;
 };

 public:
-static
-using type
+static constexpr bool value = (__n1 <= max / __d2) && (__n2 <= max / __d1);
+using type = ratio<__mul<__n1, __d2, !value>::value, __mul<__n2, __d1, !value>::value>;
 };

 public:
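The __no_overflow hunk above guards the ratio multiplication with the usual division trick: a product of two positive intmax_t factors fits exactly when the first factor is at most the maximum divided by the second, which is what `(__n1 <= max / __d2) && (__n2 <= max / __d1)` checks. A standalone sketch of that guard (illustrative only; mul_fits is a hypothetical name and the positivity assumption is mine, not taken from the header):

    #include <cstdint>

    // A positive product a * b stays within intmax_t exactly when a <= INTMAX_MAX / b.
    constexpr bool mul_fits(std::intmax_t a, std::intmax_t b)
    {
      return a <= INTMAX_MAX / b; // assumes a > 0 and b > 0
    }

    static_assert(mul_fits(1'000'000'000, 9));       // 9e9 fits in 64-bit intmax_t
    static_assert(!mul_fits(INTMAX_MAX / 2 + 1, 2)); // this product would overflow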
@@ -40,11 +40,11 @@ namespace chrono
 class _CCCL_TYPE_VISIBILITY_DEFAULT steady_clock
 {
 public:
-using duration
-using rep
-using period
-using time_point
-static constexpr
+using duration = nanoseconds;
+using rep = duration::rep;
+using period = duration::period;
+using time_point = ::cuda::std::chrono::time_point<steady_clock, duration>;
+static constexpr bool is_steady = true;

 [[nodiscard]] _CCCL_API static time_point now() noexcept;
 };

@@ -39,11 +39,11 @@ namespace chrono
 class _CCCL_TYPE_VISIBILITY_DEFAULT system_clock
 {
 public:
-using duration
-using rep
-using period
-using time_point
-static constexpr
+using duration = ::cuda::std::chrono::nanoseconds;
+using rep = duration::rep;
+using period = duration::period;
+using time_point = ::cuda::std::chrono::time_point<system_clock>;
+static constexpr bool is_steady = false;

 [[nodiscard]] _CCCL_API inline static time_point now() noexcept
 {

@@ -21,16 +21,15 @@
 # pragma system_header
 #endif // no system header

-#include <cuda/std/__bit/popcount.h>
 #include <cuda/std/__concepts/concept_macros.h>
 #include <cuda/std/__floating_point/fp.h>
 #include <cuda/std/__type_traits/is_constant_evaluated.h>
+#include <cuda/std/__type_traits/is_floating_point.h>
 #include <cuda/std/__type_traits/is_integral.h>

-
-#if _CCCL_COMPILER(MSVC) || _CCCL_CUDA_COMPILER(CLANG)
+#if _CCCL_COMPILER(MSVC) || _CCCL_CUDA_COMPILER(CLANG) || !_CCCL_CUDA_COMPILATION()
 # include <math.h>
-#endif // _CCCL_COMPILER(MSVC) || _CCCL_CUDA_COMPILER(CLANG)
+#endif // _CCCL_COMPILER(MSVC) || _CCCL_CUDA_COMPILER(CLANG) || !_CCCL_CUDA_COMPILATION()

 #include <cuda/std/__cccl/prologue.h>

@@ -158,10 +157,16 @@ template <class _Tp>
 #if _CCCL_HAS_FLOAT128()
 [[nodiscard]] _CCCL_API constexpr bool isnan(__float128 __x) noexcept
 {
+// __builtin_isnan is not efficient for __float128, prefer __nv_fp128_isnan at run-time
+if (!::cuda::std::__cccl_default_is_constant_evaluated())
+{
+NV_IF_TARGET(NV_PROVIDES_SM_100, (return ::__nv_fp128_isnan(__x);)) // preserve NaN behavior even with optimization
+// flags
+}
 # if defined(_CCCL_BUILTIN_ISNAN)
 return _CCCL_BUILTIN_ISNAN(__x);
 # else // ^^^ _CCCL_BUILTIN_ISNAN ^^^ / vvv !_CCCL_BUILTIN_ISNAN vvv
-return
+return __x != __x;
 # endif // ^^^ !_CCCL_BUILTIN_ISNAN ^^^
 }
 #endif // _CCCL_HAS_FLOAT128()

@@ -24,11 +24,11 @@
 #include <cuda/__type_traits/is_floating_point.h>
 #include <cuda/std/__cmath/isnan.h>
 #include <cuda/std/__concepts/concept_macros.h>
-#include <cuda/std/
+#include <cuda/std/__type_traits/conditional.h>
 #include <cuda/std/__type_traits/is_extended_arithmetic.h>
 #include <cuda/std/__type_traits/is_integral.h>
+#include <cuda/std/__type_traits/is_same.h>
 #include <cuda/std/__type_traits/promote.h>
-#include <cuda/std/limits>

 #include <nv/target>

@@ -36,6 +36,10 @@

 _CCCL_BEGIN_NAMESPACE_CUDA_STD

+/**********************************************************************************************************************
+* fmax
+**********************************************************************************************************************/
+
 // We do explicitly also enable GCC here, because that makes the condition below simpler
 #if _CCCL_CHECK_BUILTIN(builtin_fmax) || _CCCL_COMPILER(GCC)
 _CCCL_TEMPLATE(class _Tp)

@@ -63,13 +67,12 @@ _CCCL_REQUIRES(is_floating_point_v<_Tp>)
 # define _CCCL_USE_BUILTIN_FMAX() 0
 #endif // _CCCL_BUILTIN_FABSF

-// fmax
 _CCCL_TEMPLATE(class _Tp)
 _CCCL_REQUIRES(__is_extended_arithmetic_v<_Tp>)
 [[nodiscard]] _CCCL_API constexpr conditional_t<is_integral_v<_Tp>, double, _Tp> fmax(_Tp __x, _Tp __y) noexcept
 {
 #if _CCCL_HAS_NVFP16()
-if constexpr (is_same_v<_Tp, __half>)
+if constexpr (is_same_v<_Tp, ::__half>)
 {
 # if _CCCL_CTK_AT_LEAST(12, 2)
 return ::__hmax(__x, __y);

@@ -82,7 +85,7 @@ _CCCL_REQUIRES(__is_extended_arithmetic_v<_Tp>)
 else
 #endif // _CCCL_HAS_NVFP16()
 #if _CCCL_HAS_NVBF16()
-if constexpr (is_same_v<_Tp, __nv_bfloat16>)
+if constexpr (is_same_v<_Tp, ::__nv_bfloat16>)
 {
 # if _CCCL_CTK_AT_LEAST(12, 2)
 return ::__hmax(__x, __y);

@@ -100,17 +103,27 @@ _CCCL_REQUIRES(__is_extended_arithmetic_v<_Tp>)
 }
 else
 {
-#if _CCCL_USE_BUILTIN_FMAX()
 if (!::cuda::std::__cccl_default_is_constant_evaluated())
 {
+#if _CCCL_HAS_FLOAT128()
+if constexpr (is_same_v<_Tp, __float128>)
+{
+NV_IF_TARGET(NV_PROVIDES_SM_100, (return ::__nv_fp128_fmax(__x, __y);))
+}
+else
+#endif // _CCCL_HAS_FLOAT128()
+#if _CCCL_USE_BUILTIN_FMAX()
+if constexpr (is_floating_point_v<_Tp>)
+{
 // GCC builtins do not treat NaN properly
 # if _CCCL_COMPILER(GCC)
-
+NV_IF_TARGET(NV_IS_DEVICE, (return ::cuda::std::__with_builtin_fmax(__x, __y);))
 # else // ^^^ _CCCL_COMPILER(GCC) ^^^ / vvv !_CCCL_COMPILER(GCC)
-
+return ::cuda::std::__with_builtin_fmax(__x, __y);
 # endif // !_CCCL_COMPILER(GCC)
-
+}
 #endif // _CCCL_USE_BUILTIN_FMAX
+}
 if (::cuda::std::isnan(__x))
 {
 return __y;

@@ -119,7 +132,10 @@ _CCCL_REQUIRES(__is_extended_arithmetic_v<_Tp>)
 {
 return __x;
 }
-
+else
+{
+return __x < __y ? __y : __x;
+}
 }
 }

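The fallback branch in the fmax hunks above follows the usual IEEE-style rule: a NaN operand is dropped in favour of the other argument, and only a NaN-free comparison decides the result. A small host-side sketch of exactly that rule (illustrative only; fmax_fallback is a hypothetical name, not cuda-cccl API):

    #include <cassert>
    #include <cmath>

    template <class T>
    T fmax_fallback(T x, T y)
    {
      if (std::isnan(x)) { return y; } // drop a NaN on the left
      if (std::isnan(y)) { return x; } // drop a NaN on the right
      return x < y ? y : x;            // plain comparison once both operands are NaN-free
    }

    int main()
    {
      assert(fmax_fallback(1.0, std::nan("")) == 1.0);
      assert(fmax_fallback(std::nan(""), 2.0) == 2.0);
      assert(std::isnan(fmax_fallback(std::nan(""), std::nan(""))));
      assert(fmax_fallback(1.0, 2.0) == 2.0);
    }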
@@ -144,7 +160,9 @@ _CCCL_REQUIRES(::cuda::is_floating_point_v<_Tp> _CCCL_AND ::cuda::is_floating_po
 return ::cuda::std::fmax(static_cast<__result_type>(__x), static_cast<__result_type>(__y));
 }

-
+/**********************************************************************************************************************
+* fmin
+**********************************************************************************************************************/

 // We do explicitly also enable GCC here, because that makes the condition below simpler
 #if _CCCL_CHECK_BUILTIN(builtin_fmin) || _CCCL_COMPILER(GCC)

@@ -178,7 +196,7 @@ _CCCL_REQUIRES(__is_extended_arithmetic_v<_Tp>)
 [[nodiscard]] _CCCL_API constexpr conditional_t<is_integral_v<_Tp>, double, _Tp> fmin(_Tp __x, _Tp __y) noexcept
 {
 #if _CCCL_HAS_NVFP16()
-if constexpr (is_same_v<_Tp, __half>)
+if constexpr (is_same_v<_Tp, ::__half>)
 {
 # if _CCCL_CTK_AT_LEAST(12, 2)
 return ::__hmin(__x, __y);

@@ -191,7 +209,7 @@ _CCCL_REQUIRES(__is_extended_arithmetic_v<_Tp>)
 else
 #endif // _CCCL_HAS_NVFP16()
 #if _CCCL_HAS_NVBF16()
-if constexpr (is_same_v<_Tp, __nv_bfloat16>)
+if constexpr (is_same_v<_Tp, ::__nv_bfloat16>)
 {
 # if _CCCL_CTK_AT_LEAST(12, 2)
 return ::__hmin(__x, __y);

@@ -209,17 +227,26 @@ _CCCL_REQUIRES(__is_extended_arithmetic_v<_Tp>)
 }
 else
 {
-#if _CCCL_USE_BUILTIN_FMAX()
 if (!::cuda::std::__cccl_default_is_constant_evaluated())
 {
+#if _CCCL_HAS_FLOAT128()
+if constexpr (is_same_v<_Tp, __float128>)
+{
+NV_IF_TARGET(NV_PROVIDES_SM_100, (return ::__nv_fp128_fmin(__x, __y);))
+}
+#endif // _CCCL_HAS_FLOAT128()
+#if _CCCL_USE_BUILTIN_FMAX()
+if constexpr (is_floating_point_v<_Tp>)
+{
 // GCC builtins do not treat NaN properly
 # if _CCCL_COMPILER(GCC)
-
+NV_IF_TARGET(NV_IS_DEVICE, (return ::cuda::std::__with_builtin_fmin(__x, __y);))
 # else // ^^^ _CCCL_COMPILER(GCC) ^^^ / vvv !_CCCL_COMPILER(GCC)
-
+return ::cuda::std::__with_builtin_fmin(__x, __y);
 # endif // !_CCCL_COMPILER(GCC)
-
+}
 #endif // _CCCL_USE_BUILTIN_FMAX
+}
 if (::cuda::std::isnan(__x))
 {
 return __y;

@@ -138,7 +138,7 @@ _CCCL_CONCEPT __nothrow_initializable_from =
 ? ::cuda::std::is_nothrow_constructible_v<_Tp, _Args...>
 : __nothrow_list_initializable_from<_Tp, _Args...>);

-#if !_CCCL_COMPILER(MSVC)
+#if !_CCCL_COMPILER(MSVC) && !_CCCL_CUDA_COMPILER(NVCC, <, 12, 9)

 //! Constructible with direct non-list initialization syntax from the result of
 //! a function call expression (often useful for immovable types).

@@ -23,18 +23,18 @@

 #include <cuda/std/__exception/cuda_error.h>

-#define _CCCL_TRY_CUDA_API(_NAME, _MSG, ...)
-do
-{
-const ::cudaError_t __status = _NAME(__VA_ARGS__);
-switch (__status)
-{
-case ::cudaSuccess:
-break;
-default:
-/* CUDA error state
-::cuda::__throw_cuda_error(__status, _MSG, #_NAME);
-}
+#define _CCCL_TRY_CUDA_API(_NAME, _MSG, ...) \
+do \
+{ \
+const ::cudaError_t __status = _NAME(__VA_ARGS__); \
+switch (__status) \
+{ \
+case ::cudaSuccess: \
+break; \
+default: \
+::cudaGetLastError(); /* clear CUDA error state */ \
+::cuda::__throw_cuda_error(__status, _MSG, #_NAME); \
+} \
 } while (0)

 #define _CCCL_ASSERT_CUDA_API(_NAME, _MSG, ...) \
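A hypothetical use of the reworked _CCCL_TRY_CUDA_API macro above (illustration only, not code from this package; it assumes the api_wrapper header, <cuda_runtime_api.h>, and <cstddef> are included): the wrapped runtime call runs once, and a non-success status clears the sticky error state and is rethrown as a cuda_error tagged with the API name and the supplied message.

    void* device_alloc(std::size_t bytes)
    {
      void* ptr = nullptr;
      // Expands to a do/while block that calls ::cudaMalloc(&ptr, bytes) and, on failure,
      // calls ::cudaGetLastError() and throws via ::cuda::__throw_cuda_error.
      _CCCL_TRY_CUDA_API(::cudaMalloc, "Failed to allocate device memory", &ptr, bytes);
      return ptr;
    }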
@@ -109,14 +109,7 @@ private:
 [[maybe_unused]] const char* __api = nullptr,
 [[maybe_unused]] ::cuda::std::source_location __loc = ::cuda::std::source_location::current())
 {
-
-NV_IF_ELSE_TARGET(NV_IS_HOST,
-(::cudaGetLastError(); // clear CUDA error state
-throw ::cuda::cuda_error(__status, __msg, __api, __loc);), //
-(::cuda::std::terminate();))
-#  else // ^^^ _CCCL_CUDA_COMPILATION() ^^^ / vvv !_CCCL_CUDA_COMPILATION() vvv
-throw ::cuda::cuda_error(__status, __msg, __api, __loc);
-#  endif // !_CCCL_CUDA_COMPILATION()
+NV_IF_TARGET(NV_IS_HOST, (throw ::cuda::cuda_error(__status, __msg, __api, __loc);), (::cuda::std::terminate();))
 }
 #else // ^^^ _CCCL_HAS_EXCEPTIONS() ^^^ / vvv !_CCCL_HAS_EXCEPTIONS() vvv
 class cuda_error
|
class cuda_error
|