cuda-cccl 0.3.0__cp311-cp311-manylinux_2_24_aarch64.whl → 0.3.2__cp311-cp311-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/cooperative/__init__.py +7 -1
- cuda/cccl/cooperative/experimental/__init__.py +21 -5
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
- cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
- cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
- cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
- cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +4 -2
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
- cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
- cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
- cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
- cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
- cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +170 -260
- cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +6 -7
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +12 -29
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +2 -7
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
- cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
- cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
- cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
- cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
- cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
- cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
- cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
- cuda/cccl/headers/include/cuda/__cccl_config +1 -0
- cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
- cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
- cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
- cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
- cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +247 -323
- cuda/cccl/headers/include/cuda/__device/attributes.h +174 -123
- cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
- cuda/cccl/headers/include/cuda/__device/device_ref.h +27 -49
- cuda/cccl/headers/include/cuda/__device/physical_device.h +100 -96
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
- cuda/cccl/headers/include/cuda/__event/event.h +27 -26
- cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
- cuda/cccl/headers/include/cuda/__event/timed_event.h +10 -7
- cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
- cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
- cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
- cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
- cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +21 -137
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
- cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
- cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
- cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
- cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
- cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
- cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
- cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +9 -7
- cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -16
- cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
- cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
- cuda/cccl/headers/include/cuda/algorithm +1 -1
- cuda/cccl/headers/include/cuda/cmath +1 -0
- cuda/cccl/headers/include/cuda/devices +13 -0
- cuda/cccl/headers/include/cuda/iterator +1 -0
- cuda/cccl/headers/include/cuda/memory +1 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
- cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
- cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
- cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
- cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
- cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
- cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
- cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
- cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
- cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
- cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
- cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
- cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
- cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
- cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
- cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
- cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
- cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
- cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
- cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
- cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
- cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
- cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
- cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
- cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
- cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
- cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
- cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
- cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
- cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
- cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
- cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
- cuda/cccl/headers/include/cuda/std/bitset +1 -1
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
- cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
- cuda/cccl/headers/include/cuda/std/numbers +5 -0
- cuda/cccl/headers/include/cuda/std/string_view +155 -13
- cuda/cccl/headers/include/cuda/std/version +1 -4
- cuda/cccl/headers/include/cuda/stream_ref +5 -0
- cuda/cccl/headers/include/cuda/utility +1 -0
- cuda/cccl/headers/include/nv/target +7 -2
- cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
- cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
- cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
- cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
- cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
- cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
- cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
- cuda/cccl/headers/include/thrust/device_delete.h +18 -3
- cuda/cccl/headers/include/thrust/device_free.h +16 -3
- cuda/cccl/headers/include/thrust/device_new.h +29 -8
- cuda/cccl/headers/include/thrust/host_vector.h +1 -1
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
- cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
- cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
- cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
- cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
- cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
- cuda/cccl/parallel/experimental/__init__.py +21 -74
- cuda/compute/__init__.py +79 -0
- cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +43 -1
- cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +157 -8
- cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
- cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +112 -40
- cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +36 -15
- cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
- cuda/compute/cu12/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/cu13/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +2 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +36 -8
- cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +206 -1
- cuda/{cccl/parallel/experimental → compute}/numba_utils.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/typing.py +2 -0
- cuda/coop/__init__.py +8 -0
- cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
- cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
- cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
- cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
- cuda/coop/warp/__init__.py +9 -0
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +275 -276
- cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
- cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
- cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
- cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
- cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
- cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
- cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
- cuda/cccl/parallel/experimental/.gitignore +0 -4
- cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -22,6 +22,16 @@
|
|
|
22
22
|
#endif // no system header
|
|
23
23
|
|
|
24
24
|
#include <cuda/__mdspan/restrict_accessor.h>
|
|
25
|
+
#include <cuda/std/__concepts/concept_macros.h>
|
|
26
|
+
#include <cuda/std/__fwd/array.h>
|
|
27
|
+
#include <cuda/std/__fwd/span.h>
|
|
28
|
+
#include <cuda/std/__type_traits/extent.h>
|
|
29
|
+
#include <cuda/std/__type_traits/is_convertible.h>
|
|
30
|
+
#include <cuda/std/__type_traits/is_pointer.h>
|
|
31
|
+
#include <cuda/std/__type_traits/rank.h>
|
|
32
|
+
#include <cuda/std/__type_traits/remove_all_extents.h>
|
|
33
|
+
#include <cuda/std/__type_traits/remove_pointer.h>
|
|
34
|
+
#include <cuda/std/__type_traits/remove_reference.h>
|
|
25
35
|
#include <cuda/std/mdspan>
|
|
26
36
|
|
|
27
37
|
#include <cuda/std/__cccl/prologue.h>
|
|
@@ -32,7 +42,63 @@ template <typename _ElementType,
|
|
|
32
42
|
typename _Extents,
|
|
33
43
|
typename _LayoutPolicy = ::cuda::std::layout_right,
|
|
34
44
|
typename _AccessorPolicy = ::cuda::std::default_accessor<_ElementType>>
|
|
35
|
-
|
|
45
|
+
class restrict_mdspan
|
|
46
|
+
: public ::cuda::std::mdspan<_ElementType, _Extents, _LayoutPolicy, restrict_accessor<_AccessorPolicy>>
|
|
47
|
+
{
|
|
48
|
+
public:
|
|
49
|
+
_LIBCUDACXX_DELEGATE_CONSTRUCTORS(
|
|
50
|
+
restrict_mdspan, ::cuda::std::mdspan, _ElementType, _Extents, _LayoutPolicy, restrict_accessor<_AccessorPolicy>);
|
|
51
|
+
|
|
52
|
+
_CCCL_API friend constexpr void swap(restrict_mdspan& __x, restrict_mdspan& __y) noexcept
|
|
53
|
+
{
|
|
54
|
+
swap(static_cast<__base&>(__x), static_cast<__base&>(__y));
|
|
55
|
+
}
|
|
56
|
+
};
|
|
57
|
+
|
|
58
|
+
_CCCL_TEMPLATE(class _ElementType, class... _OtherIndexTypes)
|
|
59
|
+
_CCCL_REQUIRES((sizeof...(_OtherIndexTypes) > 0)
|
|
60
|
+
_CCCL_AND(::cuda::std::is_convertible_v<_OtherIndexTypes, size_t>&&... && true))
|
|
61
|
+
_CCCL_HOST_DEVICE explicit restrict_mdspan(_ElementType*, _OtherIndexTypes...)
|
|
62
|
+
-> restrict_mdspan<_ElementType, ::cuda::std::extents<size_t, ::cuda::std::__maybe_static_ext<_OtherIndexTypes>...>>;
|
|
63
|
+
|
|
64
|
+
_CCCL_TEMPLATE(class _Pointer)
|
|
65
|
+
_CCCL_REQUIRES(::cuda::std::is_pointer_v<::cuda::std::remove_reference_t<_Pointer>>)
|
|
66
|
+
_CCCL_HOST_DEVICE restrict_mdspan(_Pointer&&)
|
|
67
|
+
-> restrict_mdspan<::cuda::std::remove_pointer_t<::cuda::std::remove_reference_t<_Pointer>>,
|
|
68
|
+
::cuda::std::extents<size_t>>;
|
|
69
|
+
|
|
70
|
+
_CCCL_TEMPLATE(class _CArray)
|
|
71
|
+
_CCCL_REQUIRES(::cuda::std::is_array_v<_CArray> _CCCL_AND(::cuda::std::rank_v<_CArray> == 1))
|
|
72
|
+
_CCCL_HOST_DEVICE restrict_mdspan(_CArray&)
|
|
73
|
+
-> restrict_mdspan<::cuda::std::remove_all_extents_t<_CArray>,
|
|
74
|
+
::cuda::std::extents<size_t, ::cuda::std::extent_v<_CArray, 0>>>;
|
|
75
|
+
|
|
76
|
+
template <class _ElementType, class _OtherIndexType, size_t _Size>
|
|
77
|
+
_CCCL_HOST_DEVICE restrict_mdspan(_ElementType*, const ::cuda::std::array<_OtherIndexType, _Size>&)
|
|
78
|
+
-> restrict_mdspan<_ElementType, ::cuda::std::dextents<size_t, _Size>>;
|
|
79
|
+
|
|
80
|
+
template <class _ElementType, class _OtherIndexType, size_t _Size>
|
|
81
|
+
_CCCL_HOST_DEVICE restrict_mdspan(_ElementType*, ::cuda::std::span<_OtherIndexType, _Size>)
|
|
82
|
+
-> restrict_mdspan<_ElementType, ::cuda::std::dextents<size_t, _Size>>;
|
|
83
|
+
|
|
84
|
+
// This one is necessary because all the constructors take `data_handle_type`s, not
|
|
85
|
+
// `_ElementType*`s, and `data_handle_type` is taken from `accessor_type::data_handle_type`, which
|
|
86
|
+
// seems to throw off automatic deduction guides.
|
|
87
|
+
template <class _ElementType, class _OtherIndexType, size_t... _ExtentsPack>
|
|
88
|
+
_CCCL_HOST_DEVICE restrict_mdspan(_ElementType*, const ::cuda::std::extents<_OtherIndexType, _ExtentsPack...>&)
|
|
89
|
+
-> restrict_mdspan<_ElementType, ::cuda::std::extents<_OtherIndexType, _ExtentsPack...>>;
|
|
90
|
+
|
|
91
|
+
template <class _ElementType, class _MappingType>
|
|
92
|
+
_CCCL_HOST_DEVICE restrict_mdspan(_ElementType*, const _MappingType&)
|
|
93
|
+
-> restrict_mdspan<_ElementType, typename _MappingType::extents_type, typename _MappingType::layout_type>;
|
|
94
|
+
|
|
95
|
+
template <class _MappingType, class _AccessorType>
|
|
96
|
+
_CCCL_HOST_DEVICE
|
|
97
|
+
restrict_mdspan(const typename _AccessorType::data_handle_type, const _MappingType&, const _AccessorType&)
|
|
98
|
+
-> restrict_mdspan<typename _AccessorType::element_type,
|
|
99
|
+
typename _MappingType::extents_type,
|
|
100
|
+
typename _MappingType::layout_type,
|
|
101
|
+
_AccessorType>;
|
|
36
102
|
|
|
37
103
|
/***********************************************************************************************************************
|
|
38
104
|
* Accessibility Traits
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
//===----------------------------------------------------------------------===//
|
|
2
|
+
//
|
|
3
|
+
// Part of libcu++, the C++ Standard Library for your entire system,
|
|
4
|
+
// under the Apache License v2.0 with LLVM Exceptions.
|
|
5
|
+
// See https://llvm.org/LICENSE.txt for license information.
|
|
6
|
+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
7
|
+
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
|
|
8
|
+
//
|
|
9
|
+
//===----------------------------------------------------------------------===//
|
|
10
|
+
|
|
11
|
+
#ifndef _CUDA___MEMORY_POINTER_IN_RANGE_H
|
|
12
|
+
#define _CUDA___MEMORY_POINTER_IN_RANGE_H
|
|
13
|
+
|
|
14
|
+
#include <cuda/std/detail/__config>
|
|
15
|
+
|
|
16
|
+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
|
|
17
|
+
# pragma GCC system_header
|
|
18
|
+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
|
|
19
|
+
# pragma clang system_header
|
|
20
|
+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
|
|
21
|
+
# pragma system_header
|
|
22
|
+
#endif // no system header
|
|
23
|
+
|
|
24
|
+
#include <cuda/std/__type_traits/is_constant_evaluated.h>
|
|
25
|
+
#include <cuda/std/cstdint>
|
|
26
|
+
#if _CCCL_HOST_COMPILATION()
|
|
27
|
+
# include <functional>
|
|
28
|
+
#endif // _CCCL_HOST_COMPILATION()
|
|
29
|
+
|
|
30
|
+
#include <cuda/std/__cccl/prologue.h>
|
|
31
|
+
|
|
32
|
+
_CCCL_BEGIN_NAMESPACE_CUDA
|
|
33
|
+
|
|
34
|
+
// Pointer comparison with <, <=, >=, > is undefined behavior in C++ (https://eel.is/c++draft/expr.rel#4) when pointers
|
|
35
|
+
// don't belong to the same object or array.
|
|
36
|
+
// - Even when a platform guarantees flat address space, the compiler can leverage UB for optimization purposes.
|
|
37
|
+
// - However, the compiler treats ::std::less<> and the other functional comparison operators in a special way, ensuring a total ordering.
|
|
38
|
+
// - For device code, we can convert pointers to uintptr_t and compare them.
|
|
39
|
+
//
|
|
40
|
+
// References:
|
|
41
|
+
// - https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2024/p3234r0.html
|
|
42
|
+
// - https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2023/p2865r2.pdf
|
|
43
|
+
// - https://www.boost.org/doc/libs/develop/libs/core/doc/html/core/pointer_in_range.html
|
|
44
|
+
// - https://pvs-studio.com/en/blog/posts/cpp/1199/
|
|
45
|
+
// - https://releases.llvm.org/20.1.0/tools/clang/docs/ReleaseNotes.html#resolutions-to-c-defect-reports
|
|
46
|
+
|
|
47
|
+
#if _CCCL_HOST_COMPILATION()
|
|
48
|
+
|
|
49
|
+
template <typename _Tp>
|
|
50
|
+
[[nodiscard]] _CCCL_API bool __ptr_in_range_host(_Tp* __ptr, _Tp* __start, _Tp* __end) noexcept
|
|
51
|
+
{
|
|
52
|
+
_CCCL_ASSERT(::std::greater_equal<>{}(__end, __start), "__ptr_in_range_host: __end must be greater than __start");
|
|
53
|
+
return ::std::greater_equal<>{}(__ptr, __start) && ::std::less<>{}(__ptr, __end);
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
#endif // _CCCL_HOST_COMPILATION()
|
|
57
|
+
|
|
58
|
+
#if _CCCL_DEVICE_COMPILATION()
|
|
59
|
+
|
|
60
|
+
template <typename _Tp>
|
|
61
|
+
[[nodiscard]] _CCCL_API bool __ptr_in_range_device(_Tp* __ptr, _Tp* __start, _Tp* __end) noexcept
|
|
62
|
+
{
|
|
63
|
+
using uintptr_t = ::cuda::std::uintptr_t;
|
|
64
|
+
auto __end_ptr = reinterpret_cast<uintptr_t>(__end);
|
|
65
|
+
auto __start_ptr = reinterpret_cast<uintptr_t>(__start);
|
|
66
|
+
auto __ptr_ptr = reinterpret_cast<uintptr_t>(__ptr);
|
|
67
|
+
_CCCL_ASSERT(__end_ptr >= __start_ptr, "__ptr_in_range_device: __end must be greater than __start");
|
|
68
|
+
return __ptr_ptr >= __start_ptr && __ptr_ptr < __end_ptr;
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
#endif // _CCCL_DEVICE_COMPILATION()
|
|
72
|
+
|
|
73
|
+
template <typename _Tp>
|
|
74
|
+
[[nodiscard]] _CCCL_API constexpr bool ptr_in_range(_Tp* __ptr, _Tp* __start, _Tp* __end) noexcept
|
|
75
|
+
{
|
|
76
|
+
if (::cuda::std::__cccl_default_is_constant_evaluated())
|
|
77
|
+
{
|
|
78
|
+
_CCCL_ASSERT(__end >= __start, "ptr_in_range: __end must be greater than __start");
|
|
79
|
+
return __ptr >= __start && __ptr < __end; // UB is not possible in a constant expression
|
|
80
|
+
}
|
|
81
|
+
else
|
|
82
|
+
{
|
|
83
|
+
NV_IF_ELSE_TARGET(NV_IS_HOST,
|
|
84
|
+
(return ::cuda::__ptr_in_range_host(__ptr, __start, __end);),
|
|
85
|
+
(return ::cuda::__ptr_in_range_device(__ptr, __start, __end);));
|
|
86
|
+
}
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
_CCCL_END_NAMESPACE_CUDA
|
|
90
|
+
|
|
91
|
+
#include <cuda/std/__cccl/epilogue.h>
|
|
92
|
+
|
|
93
|
+
#endif // _CUDA___MEMORY_POINTER_IN_RANGE_H
|
|
@@ -8,8 +8,8 @@
|
|
|
8
8
|
//
|
|
9
9
|
//===----------------------------------------------------------------------===//
|
|
10
10
|
|
|
11
|
-
#ifndef
|
|
12
|
-
#define
|
|
11
|
+
#ifndef _CUDA___MEMORY_RESOURCE_GET_MEMORY_RESOURCE_H
|
|
12
|
+
#define _CUDA___MEMORY_RESOURCE_GET_MEMORY_RESOURCE_H
|
|
13
13
|
|
|
14
14
|
#include <cuda/std/detail/__config>
|
|
15
15
|
|
|
@@ -23,11 +23,11 @@
|
|
|
23
23
|
|
|
24
24
|
#include <cuda/__memory_resource/properties.h>
|
|
25
25
|
#include <cuda/__memory_resource/resource.h>
|
|
26
|
+
#include <cuda/__stream/stream_ref.h>
|
|
26
27
|
#include <cuda/std/__concepts/equality_comparable.h>
|
|
27
28
|
#include <cuda/std/__execution/env.h>
|
|
28
29
|
#include <cuda/std/__type_traits/is_same.h>
|
|
29
30
|
#include <cuda/std/__type_traits/remove_cvref.h>
|
|
30
|
-
#include <cuda/stream_ref>
|
|
31
31
|
|
|
32
32
|
#include <cuda/std/__cccl/prologue.h>
|
|
33
33
|
|
|
@@ -79,4 +79,4 @@ _CCCL_END_NAMESPACE_CUDA_MR
|
|
|
79
79
|
|
|
80
80
|
#include <cuda/std/__cccl/epilogue.h>
|
|
81
81
|
|
|
82
|
-
#endif //
|
|
82
|
+
#endif // _CUDA___MEMORY_RESOURCE_GET_MEMORY_RESOURCE_H
|
|
@@ -21,6 +21,7 @@
|
|
|
21
21
|
# pragma system_header
|
|
22
22
|
#endif // no system header
|
|
23
23
|
|
|
24
|
+
#include <cuda/std/__type_traits/decay.h>
|
|
24
25
|
#include <cuda/std/__type_traits/type_set.h>
|
|
25
26
|
#include <cuda/std/cstddef>
|
|
26
27
|
|
|
@@ -62,6 +63,49 @@ template <class... _Properties>
|
|
|
62
63
|
inline constexpr bool __contains_execution_space_property =
|
|
63
64
|
__is_host_accessible<_Properties...> || __is_device_accessible<_Properties...>;
|
|
64
65
|
|
|
66
|
+
//! @brief A type representing a list of memory resource properties
|
|
67
|
+
//! @tparam _Properties The properties to be included in the list
|
|
68
|
+
//! It has a member template `rebind` that allows constructing a type by combining
|
|
69
|
+
//! a template and type arguments with the properties from this list. The properties
|
|
70
|
+
//! are appended after the type arguments in the resulting type.
|
|
71
|
+
template <class... _Properties>
|
|
72
|
+
struct properties_list
|
|
73
|
+
{
|
|
74
|
+
//! @brief A type alias for a type template instantiated with the properties
|
|
75
|
+
//! from this list appended to the type arguments.
|
|
76
|
+
template <template <class...> class _Fn, class... _ExtraArgs>
|
|
77
|
+
using rebind = _Fn<_ExtraArgs..., _Properties...>;
|
|
78
|
+
|
|
79
|
+
template <class _QueryProperty>
|
|
80
|
+
_CCCL_HOST_API static constexpr bool has_property([[maybe_unused]] _QueryProperty)
|
|
81
|
+
{
|
|
82
|
+
return ::cuda::std::__type_set_contains_v<::cuda::std::__make_type_set<_Properties...>, _QueryProperty>;
|
|
83
|
+
}
|
|
84
|
+
};
|
|
85
|
+
|
|
86
|
+
template <class _Tp>
|
|
87
|
+
inline constexpr bool __is_queries_list = false;
|
|
88
|
+
|
|
89
|
+
template <class... _Tp>
|
|
90
|
+
inline constexpr bool __is_queries_list<properties_list<_Tp...>> = true;
|
|
91
|
+
|
|
92
|
+
template <typename _Tp>
|
|
93
|
+
_CCCL_CONCEPT __has_default_queries =
|
|
94
|
+
_CCCL_REQUIRES_EXPR((_Tp))(requires(__is_queries_list<typename ::cuda::std::decay_t<_Tp>::default_queries>));
|
|
95
|
+
|
|
96
|
+
template <typename _Resource, bool _HasDefaultQueries = __has_default_queries<_Resource>>
|
|
97
|
+
struct __copy_default_queries;
|
|
98
|
+
|
|
99
|
+
template <typename _Resource>
|
|
100
|
+
struct __copy_default_queries<_Resource, true>
|
|
101
|
+
{
|
|
102
|
+
using default_queries = typename _Resource::default_queries;
|
|
103
|
+
};
|
|
104
|
+
|
|
105
|
+
template <typename _Resource>
|
|
106
|
+
struct __copy_default_queries<_Resource, false>
|
|
107
|
+
{};
|
|
108
|
+
|
|
65
109
|
_CCCL_END_NAMESPACE_CUDA_MR
|
|
66
110
|
|
|
67
111
|
#include <cuda/std/__cccl/epilogue.h>
|
|
@@ -22,6 +22,7 @@
|
|
|
22
22
|
#endif // no system header
|
|
23
23
|
|
|
24
24
|
#include <cuda/__memory_resource/get_property.h>
|
|
25
|
+
#include <cuda/__stream/stream_ref.h>
|
|
25
26
|
#include <cuda/std/__concepts/concept_macros.h>
|
|
26
27
|
#include <cuda/std/__concepts/convertible_to.h>
|
|
27
28
|
#include <cuda/std/__concepts/equality_comparable.h>
|
|
@@ -29,7 +30,6 @@
|
|
|
29
30
|
#include <cuda/std/__tuple_dir/sfinae_helpers.h>
|
|
30
31
|
#include <cuda/std/__type_traits/decay.h>
|
|
31
32
|
#include <cuda/std/__type_traits/fold.h>
|
|
32
|
-
#include <cuda/stream_ref>
|
|
33
33
|
|
|
34
34
|
#include <cuda/std/__cccl/prologue.h>
|
|
35
35
|
|
|
@@ -26,6 +26,7 @@
|
|
|
26
26
|
# include <cuda/__memory_resource/get_property.h>
|
|
27
27
|
# include <cuda/__memory_resource/properties.h>
|
|
28
28
|
# include <cuda/__memory_resource/resource.h>
|
|
29
|
+
# include <cuda/__stream/stream_ref.h>
|
|
29
30
|
# include <cuda/std/__concepts/concept_macros.h>
|
|
30
31
|
# include <cuda/std/__memory/addressof.h>
|
|
31
32
|
# include <cuda/std/__type_traits/is_base_of.h>
|
|
@@ -34,7 +35,6 @@
|
|
|
34
35
|
# include <cuda/std/__utility/exchange.h>
|
|
35
36
|
# include <cuda/std/__utility/move.h>
|
|
36
37
|
# include <cuda/std/cstddef>
|
|
37
|
-
# include <cuda/stream_ref>
|
|
38
38
|
|
|
39
39
|
# include <cuda/std/__cccl/prologue.h>
|
|
40
40
|
|
|
@@ -161,10 +161,7 @@ struct _Resource_vtable_builder
|
|
|
161
161
|
template <class _Resource>
|
|
162
162
|
static void _Dealloc(void* __object, void* __ptr, size_t __bytes, size_t __alignment) noexcept
|
|
163
163
|
{
|
|
164
|
-
|
|
165
|
-
// deallocate_sync functions to be noexcept. Comment out the check for now until
|
|
166
|
-
// we can fix RMM.
|
|
167
|
-
// static_assert(noexcept(static_cast<_Resource*>(__object)->deallocate(__ptr, __bytes, __alignment)));
|
|
164
|
+
static_assert(noexcept(static_cast<_Resource*>(__object)->deallocate_sync(__ptr, __bytes, __alignment)));
|
|
168
165
|
return static_cast<_Resource*>(__object)->deallocate_sync(__ptr, __bytes, __alignment);
|
|
169
166
|
}
|
|
170
167
|
|
|
@@ -176,8 +173,9 @@ struct _Resource_vtable_builder
|
|
|
176
173
|
|
|
177
174
|
template <class _Resource>
|
|
178
175
|
static void
|
|
179
|
-
_Dealloc_async(void* __object, void* __ptr, size_t __bytes, size_t __alignment, ::cuda::stream_ref __stream)
|
|
176
|
+
_Dealloc_async(void* __object, void* __ptr, size_t __bytes, size_t __alignment, ::cuda::stream_ref __stream) noexcept
|
|
180
177
|
{
|
|
178
|
+
static_assert(noexcept(static_cast<_Resource*>(__object)->deallocate(__stream, __ptr, __bytes, __alignment)));
|
|
181
179
|
return static_cast<_Resource*>(__object)->deallocate(__stream, __ptr, __bytes, __alignment);
|
|
182
180
|
}
|
|
183
181
|
|
|
@@ -653,8 +653,9 @@
|
|
|
653
653
|
#ifndef NVTX3_CPP_DEFINITIONS_V1_0
|
|
654
654
|
# define NVTX3_CPP_DEFINITIONS_V1_0
|
|
655
655
|
|
|
656
|
+
# include <cuda/std/__cccl/memory_wrapper.h>
|
|
657
|
+
|
|
656
658
|
# include <cstddef>
|
|
657
|
-
# include <memory>
|
|
658
659
|
# include <string>
|
|
659
660
|
# include <type_traits>
|
|
660
661
|
# include <utility>
|
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
#ifndef _CUDA___RUNTIME_ENSURE_CURRENT_CONTEXT_H
|
|
12
12
|
#define _CUDA___RUNTIME_ENSURE_CURRENT_CONTEXT_H
|
|
13
13
|
|
|
14
|
-
#include <cuda/
|
|
14
|
+
#include <cuda/std/detail/__config>
|
|
15
15
|
|
|
16
16
|
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
|
|
17
17
|
# pragma GCC system_header
|
|
@@ -23,7 +23,8 @@
|
|
|
23
23
|
|
|
24
24
|
#if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
|
|
25
25
|
|
|
26
|
-
# include <cuda/__device/
|
|
26
|
+
# include <cuda/__device/device_ref.h>
|
|
27
|
+
# include <cuda/__device/physical_device.h>
|
|
27
28
|
# include <cuda/__driver/driver_api.h>
|
|
28
29
|
|
|
29
30
|
# include <cuda/std/__cccl/prologue.h>
|
|
@@ -31,6 +32,7 @@
|
|
|
31
32
|
# ifndef _CCCL_DOXYGEN_INVOKED // Do not document
|
|
32
33
|
|
|
33
34
|
_CCCL_BEGIN_NAMESPACE_CUDA
|
|
35
|
+
|
|
34
36
|
class stream_ref;
|
|
35
37
|
|
|
36
38
|
//! @brief RAII helper which on construction sets the current context to the specified one.
|
|
@@ -44,9 +46,9 @@ struct [[maybe_unused]] __ensure_current_context
|
|
|
44
46
|
//! @param new_device The device to switch the context to
|
|
45
47
|
//!
|
|
46
48
|
//! @throws cuda_error if the context switch fails
|
|
47
|
-
explicit __ensure_current_context(device_ref __new_device)
|
|
49
|
+
_CCCL_HOST_API explicit __ensure_current_context(device_ref __new_device)
|
|
48
50
|
{
|
|
49
|
-
auto __ctx =
|
|
51
|
+
auto __ctx = ::cuda::__physical_devices()[__new_device.get()].__primary_context();
|
|
50
52
|
::cuda::__driver::__ctxPush(__ctx);
|
|
51
53
|
}
|
|
52
54
|
|
|
@@ -56,7 +58,7 @@ struct [[maybe_unused]] __ensure_current_context
|
|
|
56
58
|
//! @param ctx The context to switch to
|
|
57
59
|
//!
|
|
58
60
|
//! @throws cuda_error if the context switch fails
|
|
59
|
-
explicit __ensure_current_context(::CUcontext __ctx)
|
|
61
|
+
_CCCL_HOST_API explicit __ensure_current_context(::CUcontext __ctx)
|
|
60
62
|
{
|
|
61
63
|
::cuda::__driver::__ctxPush(__ctx);
|
|
62
64
|
}
|
|
@@ -67,7 +69,7 @@ struct [[maybe_unused]] __ensure_current_context
|
|
|
67
69
|
//! @param stream Stream indicating the context to switch to
|
|
68
70
|
//!
|
|
69
71
|
//! @throws cuda_error if the context switch fails
|
|
70
|
-
explicit __ensure_current_context(stream_ref __stream);
|
|
72
|
+
_CCCL_HOST_API explicit __ensure_current_context(stream_ref __stream);
|
|
71
73
|
|
|
72
74
|
__ensure_current_context(__ensure_current_context&&) = delete;
|
|
73
75
|
__ensure_current_context(__ensure_current_context const&) = delete;
|
|
@@ -79,7 +81,7 @@ struct [[maybe_unused]] __ensure_current_context
|
|
|
79
81
|
//!
|
|
80
82
|
//! @throws cuda_error if the device switch fails. If the destructor is called
|
|
81
83
|
//! during stack unwinding, the program is automatically terminated.
|
|
82
|
-
~__ensure_current_context() noexcept(false)
|
|
84
|
+
_CCCL_HOST_API ~__ensure_current_context() noexcept(false)
|
|
83
85
|
{
|
|
84
86
|
// TODO would it make sense to assert here that we pushed and popped the same thing?
|
|
85
87
|
::cuda::__driver::__ctxPop();
|
|
@@ -43,7 +43,7 @@ struct stream : stream_ref
|
|
|
43
43
|
//! Priority is defaulted to stream::default_priority
|
|
44
44
|
//!
|
|
45
45
|
//! @throws cuda_error if stream creation fails
|
|
46
|
-
explicit stream(device_ref __dev, int __priority = default_priority)
|
|
46
|
+
_CCCL_HOST_API explicit stream(device_ref __dev, int __priority = default_priority)
|
|
47
47
|
: stream_ref(__detail::__invalid_stream)
|
|
48
48
|
{
|
|
49
49
|
[[maybe_unused]] __ensure_current_context __ctx_setter(__dev);
|
|
@@ -54,7 +54,7 @@ struct stream : stream_ref
|
|
|
54
54
|
//!
|
|
55
55
|
//! @post `stream()` returns an invalid stream handle
|
|
56
56
|
// Can't be constexpr because __invalid_stream isn't
|
|
57
|
-
explicit stream(no_init_t) noexcept
|
|
57
|
+
_CCCL_HOST_API explicit stream(no_init_t) noexcept
|
|
58
58
|
: stream_ref(__detail::__invalid_stream)
|
|
59
59
|
{}
|
|
60
60
|
|
|
@@ -63,7 +63,7 @@ struct stream : stream_ref
|
|
|
63
63
|
//! @param __other
|
|
64
64
|
//!
|
|
65
65
|
//! @post `__other` is in moved-from state.
|
|
66
|
-
stream(stream&& __other) noexcept
|
|
66
|
+
_CCCL_HOST_API stream(stream&& __other) noexcept
|
|
67
67
|
: stream(::cuda::std::exchange(__other.__stream, __detail::__invalid_stream))
|
|
68
68
|
{}
|
|
69
69
|
|
|
@@ -72,7 +72,7 @@ struct stream : stream_ref
|
|
|
72
72
|
//! Destroy the `stream` object
|
|
73
73
|
//!
|
|
74
74
|
//! @note If the stream fails to be destroyed, the error is silently ignored.
|
|
75
|
-
~stream()
|
|
75
|
+
_CCCL_HOST_API ~stream()
|
|
76
76
|
{
|
|
77
77
|
if (__stream != __detail::__invalid_stream)
|
|
78
78
|
{
|
|
@@ -87,7 +87,7 @@ struct stream : stream_ref
|
|
|
87
87
|
//! @param __other
|
|
88
88
|
//!
|
|
89
89
|
//! @post `__other` is in a moved-from state.
|
|
90
|
-
stream& operator=(stream&& __other) noexcept
|
|
90
|
+
_CCCL_HOST_API stream& operator=(stream&& __other) noexcept
|
|
91
91
|
{
|
|
92
92
|
stream __tmp(::cuda::std::move(__other));
|
|
93
93
|
::cuda::std::swap(__stream, __tmp.__stream);
|
|
@@ -103,7 +103,7 @@ struct stream : stream_ref
|
|
|
103
103
|
//! @return stream The constructed `stream` object
|
|
104
104
|
//!
|
|
105
105
|
//! @note The constructed `stream` object takes ownership of the native handle.
|
|
106
|
-
[[nodiscard]] static stream from_native_handle(::cudaStream_t __handle)
|
|
106
|
+
[[nodiscard]] static _CCCL_HOST_API stream from_native_handle(::cudaStream_t __handle)
|
|
107
107
|
{
|
|
108
108
|
return stream(__handle);
|
|
109
109
|
}
|
|
@@ -119,7 +119,7 @@ struct stream : stream_ref
|
|
|
119
119
|
//! @return cudaStream_t The native handle being held by the `stream` object.
|
|
120
120
|
//!
|
|
121
121
|
//! @post The stream object is in a moved-from state.
|
|
122
|
-
[[nodiscard]] ::cudaStream_t release()
|
|
122
|
+
[[nodiscard]] _CCCL_HOST_API ::cudaStream_t release()
|
|
123
123
|
{
|
|
124
124
|
return ::cuda::std::exchange(__stream, __detail::__invalid_stream);
|
|
125
125
|
}
|
|
@@ -127,7 +127,7 @@ struct stream : stream_ref
|
|
|
127
127
|
private:
|
|
128
128
|
// Use `stream::from_native_handle(s)` to construct an owning `stream`
|
|
129
129
|
// object from a `cudaStream_t` handle.
|
|
130
|
-
explicit stream(::cudaStream_t __handle)
|
|
130
|
+
_CCCL_HOST_API explicit stream(::cudaStream_t __handle)
|
|
131
131
|
: stream_ref(__handle)
|
|
132
132
|
{}
|
|
133
133
|
};
|
|
@@ -23,12 +23,14 @@
|
|
|
23
23
|
|
|
24
24
|
#if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
|
|
25
25
|
|
|
26
|
+
# include <cuda/__device/device_ref.h>
|
|
26
27
|
# include <cuda/__driver/driver_api.h>
|
|
27
28
|
# include <cuda/__event/timed_event.h>
|
|
28
29
|
# include <cuda/__fwd/get_stream.h>
|
|
29
30
|
# include <cuda/__runtime/ensure_current_context.h>
|
|
30
31
|
# include <cuda/__utility/no_init.h>
|
|
31
32
|
# include <cuda/std/__exception/cuda_error.h>
|
|
33
|
+
# include <cuda/std/__utility/to_underlying.h>
|
|
32
34
|
# include <cuda/std/cstddef>
|
|
33
35
|
|
|
34
36
|
# include <cuda/std/__cccl/prologue.h>
|
|
@@ -60,9 +62,10 @@ public:
|
|
|
60
62
|
//!
|
|
61
63
|
//! For behavior of the default stream,
|
|
62
64
|
//! @see //! https://docs.nvidia.com/cuda/cuda-runtime-api/stream-sync-behavior.html
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
65
|
+
CCCL_DEPRECATED_BECAUSE("Using the default/null stream is generally discouraged. If you need to use it, please "
|
|
66
|
+
"construct a "
|
|
67
|
+
"stream_ref from cudaStream_t{nullptr}") _CCCL_HIDE_FROM_ABI
|
|
68
|
+
stream_ref() = default;
|
|
66
69
|
|
|
67
70
|
//! @brief Constructs a `stream_ref` from a `cudaStream_t` handle.
|
|
68
71
|
//!
|
|
@@ -123,8 +126,7 @@ public:
|
|
|
123
126
|
//! @brief Deprecated. Use sync() instead.
|
|
124
127
|
//!
|
|
125
128
|
//! @deprecated Use sync() instead.
|
|
126
|
-
|
|
127
|
-
void wait() const
|
|
129
|
+
CCCL_DEPRECATED_BECAUSE("Use sync() instead.") _CCCL_HOST_API void wait() const
|
|
128
130
|
{
|
|
129
131
|
sync();
|
|
130
132
|
}
|
|
@@ -183,7 +185,7 @@ public:
|
|
|
183
185
|
//! @throws cuda::cuda_error if the query fails.
|
|
184
186
|
//!
|
|
185
187
|
//! @return `true` if all operations have completed, or `false` if not.
|
|
186
|
-
[[
|
|
188
|
+
[[nodiscard]] CCCL_DEPRECATED_BECAUSE("Use is_done() instead.") _CCCL_HOST_API bool ready() const
|
|
187
189
|
{
|
|
188
190
|
return is_done();
|
|
189
191
|
}
|
|
@@ -215,7 +217,7 @@ public:
|
|
|
215
217
|
//! @return A new event that was recorded into this stream
|
|
216
218
|
//!
|
|
217
219
|
//! @throws cuda_error if event creation or record failed
|
|
218
|
-
[[nodiscard]] _CCCL_HOST_API event record_event(
|
|
220
|
+
[[nodiscard]] _CCCL_HOST_API event record_event(event_flags __flags = event_flags::none) const
|
|
219
221
|
{
|
|
220
222
|
return event(*this, __flags);
|
|
221
223
|
}
|
|
@@ -225,7 +227,7 @@ public:
|
|
|
225
227
|
//! @return A new timed event that was recorded into this stream
|
|
226
228
|
//!
|
|
227
229
|
//! @throws cuda_error if event creation or record failed
|
|
228
|
-
[[nodiscard]] _CCCL_HOST_API timed_event record_timed_event(
|
|
230
|
+
[[nodiscard]] _CCCL_HOST_API timed_event record_timed_event(event_flags __flags = event_flags::none) const
|
|
229
231
|
{
|
|
230
232
|
return timed_event(*this, __flags);
|
|
231
233
|
}
|
|
@@ -236,7 +238,7 @@ public:
|
|
|
236
238
|
//! returned
|
|
237
239
|
//!
|
|
238
240
|
//! @throws cuda_error if device check fails
|
|
239
|
-
_CCCL_HOST_API device_ref device() const
|
|
241
|
+
[[nodiscard]] _CCCL_HOST_API device_ref device() const
|
|
240
242
|
{
|
|
241
243
|
::CUdevice __device{};
|
|
242
244
|
# if _CCCL_CTK_AT_LEAST(13, 0)
|
|
@@ -259,7 +261,7 @@ public:
|
|
|
259
261
|
}
|
|
260
262
|
};
|
|
261
263
|
|
|
262
|
-
inline void event_ref::record(stream_ref __stream) const
|
|
264
|
+
_CCCL_HOST_API inline void event_ref::record(stream_ref __stream) const
|
|
263
265
|
{
|
|
264
266
|
_CCCL_ASSERT(__event_ != nullptr, "cuda::event_ref::record no event set");
|
|
265
267
|
_CCCL_ASSERT(__stream.get() != nullptr, "cuda::event_ref::record invalid stream passed");
|
|
@@ -267,26 +269,26 @@ inline void event_ref::record(stream_ref __stream) const
|
|
|
267
269
|
::cuda::__driver::__eventRecord(__event_, __stream.get());
|
|
268
270
|
}
|
|
269
271
|
|
|
270
|
-
inline event::event(stream_ref __stream,
|
|
271
|
-
: event(__stream,
|
|
272
|
+
_CCCL_HOST_API inline event::event(stream_ref __stream, event_flags __flags)
|
|
273
|
+
: event(__stream, ::cuda::std::to_underlying(__flags) | cudaEventDisableTiming)
|
|
272
274
|
{
|
|
273
275
|
record(__stream);
|
|
274
276
|
}
|
|
275
277
|
|
|
276
|
-
inline event::event(stream_ref __stream, unsigned __flags)
|
|
278
|
+
_CCCL_HOST_API inline event::event(stream_ref __stream, unsigned __flags)
|
|
277
279
|
: event_ref(::cudaEvent_t{})
|
|
278
280
|
{
|
|
279
281
|
[[maybe_unused]] __ensure_current_context __ctx_setter(__stream);
|
|
280
282
|
__event_ = ::cuda::__driver::__eventCreate(static_cast<unsigned>(__flags));
|
|
281
283
|
}
|
|
282
284
|
|
|
283
|
-
inline timed_event::timed_event(stream_ref __stream,
|
|
284
|
-
: event(__stream,
|
|
285
|
+
_CCCL_HOST_API inline timed_event::timed_event(stream_ref __stream, event_flags __flags)
|
|
286
|
+
: event(__stream, ::cuda::std::to_underlying(__flags))
|
|
285
287
|
{
|
|
286
288
|
record(__stream);
|
|
287
289
|
}
|
|
288
290
|
|
|
289
|
-
inline __ensure_current_context::__ensure_current_context(stream_ref __stream)
|
|
291
|
+
_CCCL_HOST_API inline __ensure_current_context::__ensure_current_context(stream_ref __stream)
|
|
290
292
|
{
|
|
291
293
|
auto __ctx = __driver::__streamGetCtx(__stream.get());
|
|
292
294
|
::cuda::__driver::__ctxPush(__ctx);
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
//===----------------------------------------------------------------------===//
|
|
2
|
+
//
|
|
3
|
+
// Part of libcu++, the C++ Standard Library for your entire system,
|
|
4
|
+
// under the Apache License v2.0 with LLVM Exceptions.
|
|
5
|
+
// See https://llvm.org/LICENSE.txt for license information.
|
|
6
|
+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
7
|
+
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
|
|
8
|
+
//
|
|
9
|
+
//===----------------------------------------------------------------------===//
|
|
10
|
+
|
|
11
|
+
#ifndef _CUDA___UTILITY_IN_RANGE_H
|
|
12
|
+
#define _CUDA___UTILITY_IN_RANGE_H
|
|
13
|
+
|
|
14
|
+
#include <cuda/std/detail/__config>
|
|
15
|
+
|
|
16
|
+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
|
|
17
|
+
# pragma GCC system_header
|
|
18
|
+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
|
|
19
|
+
# pragma clang system_header
|
|
20
|
+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
|
|
21
|
+
# pragma system_header
|
|
22
|
+
#endif // no system header
|
|
23
|
+
|
|
24
|
+
#include <cuda/__type_traits/is_floating_point.h>
|
|
25
|
+
#include <cuda/std/__cmath/isnan.h>
|
|
26
|
+
#include <cuda/std/__concepts/concept_macros.h>
|
|
27
|
+
#include <cuda/std/__type_traits/conditional.h>
|
|
28
|
+
#include <cuda/std/__type_traits/is_extended_floating_point.h>
|
|
29
|
+
#include <cuda/std/__type_traits/is_integer.h>
|
|
30
|
+
#include <cuda/std/__type_traits/is_unsigned_integer.h>
|
|
31
|
+
|
|
32
|
+
#include <cuda/std/__cccl/prologue.h>
|
|
33
|
+
|
|
34
|
+
_CCCL_BEGIN_NAMESPACE_CUDA
|
|
35
|
+
|
|
36
|
+
_CCCL_TEMPLATE(typename _Tp)
|
|
37
|
+
_CCCL_REQUIRES(::cuda::std::__cccl_is_integer_v<_Tp> || ::cuda::std::is_floating_point_v<_Tp>
|
|
38
|
+
|| ::cuda::std::__is_extended_floating_point_v<_Tp>)
|
|
39
|
+
[[nodiscard]] _CCCL_API constexpr bool in_range(_Tp __v, _Tp __start, _Tp __end) noexcept
|
|
40
|
+
{
|
|
41
|
+
_CCCL_ASSERT(::cuda::std::isnan(__start) || ::cuda::std::isnan(__end) || __end >= __start,
|
|
42
|
+
"in_range: __end must be greater than or equal to __start");
|
|
43
|
+
if constexpr (::cuda::std::__cccl_is_unsigned_integer_v<_Tp>)
|
|
44
|
+
{
|
|
45
|
+
// if __end > __start, we know that the range is always positive. Similarly, __v is positive if unsigned.
|
|
46
|
+
// this optimization is useful when __start and __end are compile-time constants, or when in_range is used multiple
|
|
47
|
+
// times with the same range
|
|
48
|
+
using _Up = ::cuda::std::conditional_t<(sizeof(_Tp) <= sizeof(unsigned)), unsigned, _Tp>; // at least 32-bit
|
|
49
|
+
const auto __start1 = static_cast<_Up>(__start);
|
|
50
|
+
const auto __end1 = static_cast<_Up>(__end);
|
|
51
|
+
const auto __v1 = static_cast<_Up>(__v);
|
|
52
|
+
const auto __range = __end1 - __start1;
|
|
53
|
+
return (__v1 - __start1) <= __range;
|
|
54
|
+
}
|
|
55
|
+
else
|
|
56
|
+
{
|
|
57
|
+
return __v >= __start && __v <= __end;
|
|
58
|
+
}
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
_CCCL_END_NAMESPACE_CUDA
|
|
62
|
+
|
|
63
|
+
#include <cuda/std/__cccl/epilogue.h>
|
|
64
|
+
|
|
65
|
+
#endif // _CUDA___UTILITY_IN_RANGE_H
|