cuda-cccl 0.3.0__cp313-cp313-manylinux_2_24_aarch64.whl → 0.3.2__cp313-cp313-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/cooperative/__init__.py +7 -1
- cuda/cccl/cooperative/experimental/__init__.py +21 -5
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
- cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
- cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
- cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
- cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +4 -2
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
- cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
- cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
- cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
- cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
- cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +170 -260
- cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +6 -7
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +12 -29
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +2 -7
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
- cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
- cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
- cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
- cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
- cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
- cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
- cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
- cuda/cccl/headers/include/cuda/__cccl_config +1 -0
- cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
- cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
- cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
- cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
- cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +247 -323
- cuda/cccl/headers/include/cuda/__device/attributes.h +174 -123
- cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
- cuda/cccl/headers/include/cuda/__device/device_ref.h +27 -49
- cuda/cccl/headers/include/cuda/__device/physical_device.h +100 -96
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
- cuda/cccl/headers/include/cuda/__event/event.h +27 -26
- cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
- cuda/cccl/headers/include/cuda/__event/timed_event.h +10 -7
- cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
- cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
- cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
- cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
- cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +21 -137
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
- cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
- cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
- cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
- cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
- cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
- cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
- cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +9 -7
- cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -16
- cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
- cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
- cuda/cccl/headers/include/cuda/algorithm +1 -1
- cuda/cccl/headers/include/cuda/cmath +1 -0
- cuda/cccl/headers/include/cuda/devices +13 -0
- cuda/cccl/headers/include/cuda/iterator +1 -0
- cuda/cccl/headers/include/cuda/memory +1 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
- cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
- cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
- cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
- cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
- cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
- cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
- cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
- cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
- cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
- cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
- cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
- cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
- cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
- cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
- cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
- cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
- cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
- cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
- cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
- cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
- cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
- cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
- cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
- cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
- cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
- cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
- cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
- cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
- cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
- cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
- cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
- cuda/cccl/headers/include/cuda/std/bitset +1 -1
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
- cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
- cuda/cccl/headers/include/cuda/std/numbers +5 -0
- cuda/cccl/headers/include/cuda/std/string_view +155 -13
- cuda/cccl/headers/include/cuda/std/version +1 -4
- cuda/cccl/headers/include/cuda/stream_ref +5 -0
- cuda/cccl/headers/include/cuda/utility +1 -0
- cuda/cccl/headers/include/nv/target +7 -2
- cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
- cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
- cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
- cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
- cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
- cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
- cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
- cuda/cccl/headers/include/thrust/device_delete.h +18 -3
- cuda/cccl/headers/include/thrust/device_free.h +16 -3
- cuda/cccl/headers/include/thrust/device_new.h +29 -8
- cuda/cccl/headers/include/thrust/host_vector.h +1 -1
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
- cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
- cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
- cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
- cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
- cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
- cuda/cccl/parallel/experimental/__init__.py +21 -74
- cuda/compute/__init__.py +79 -0
- cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +43 -1
- cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +157 -8
- cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
- cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +112 -40
- cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +36 -15
- cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
- cuda/compute/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +2 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +36 -8
- cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +206 -1
- cuda/{cccl/parallel/experimental → compute}/numba_utils.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/typing.py +2 -0
- cuda/coop/__init__.py +8 -0
- cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
- cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
- cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
- cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
- cuda/coop/warp/__init__.py +9 -0
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +275 -276
- cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
- cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
- cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
- cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
- cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
- cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
- cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
- cuda/cccl/parallel/experimental/.gitignore +0 -4
- cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
//===----------------------------------------------------------------------===//
|
|
2
|
+
//
|
|
3
|
+
// Part of libcu++, the C++ Standard Library for your entire system,
|
|
4
|
+
// under the Apache License v2.0 with LLVM Exceptions.
|
|
5
|
+
// See https://llvm.org/LICENSE.txt for license information.
|
|
6
|
+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
7
|
+
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
|
|
8
|
+
//
|
|
9
|
+
//===----------------------------------------------------------------------===//
|
|
10
|
+
|
|
11
|
+
#ifndef _CUDA___DEVICE_COMPUTE_CAPABILITY_H
|
|
12
|
+
#define _CUDA___DEVICE_COMPUTE_CAPABILITY_H
|
|
13
|
+
|
|
14
|
+
#include <cuda/std/detail/__config>
|
|
15
|
+
|
|
16
|
+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
|
|
17
|
+
# pragma GCC system_header
|
|
18
|
+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
|
|
19
|
+
# pragma clang system_header
|
|
20
|
+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
|
|
21
|
+
# pragma system_header
|
|
22
|
+
#endif // no system header
|
|
23
|
+
|
|
24
|
+
#include <cuda/__fwd/devices.h>
|
|
25
|
+
#include <cuda/std/__utility/to_underlying.h>
|
|
26
|
+
|
|
27
|
+
#include <cuda/std/__cccl/prologue.h>
|
|
28
|
+
|
|
29
|
+
_CCCL_BEGIN_NAMESPACE_CUDA
|
|
30
|
+
|
|
31
|
+
//! @brief Type representing the CUDA compute capability.
|
|
32
|
+
class compute_capability
|
|
33
|
+
{
|
|
34
|
+
int __cc_{}; //!< The stored compute capability in format 10 * major + minor.
|
|
35
|
+
|
|
36
|
+
public:
|
|
37
|
+
_CCCL_HIDE_FROM_ABI constexpr compute_capability() noexcept = default;
|
|
38
|
+
|
|
39
|
+
//! @brief Constructs the object from compute capability \c __cc. The expected format is 10 * major + minor.
|
|
40
|
+
//!
|
|
41
|
+
//! @param __cc Compute capability.
|
|
42
|
+
_CCCL_API explicit constexpr compute_capability(int __cc) noexcept
|
|
43
|
+
: __cc_{__cc}
|
|
44
|
+
{}
|
|
45
|
+
|
|
46
|
+
//! @brief Constructs the object by combining the \c __major and \c __minor compute capability.
|
|
47
|
+
//!
|
|
48
|
+
//! @param __major The major compute capability.
|
|
49
|
+
//! @param __minor The minor compute capability. Must be less than 10.
|
|
50
|
+
_CCCL_API constexpr compute_capability(int __major, int __minor) noexcept
|
|
51
|
+
: __cc_{10 * __major + __minor}
|
|
52
|
+
{
|
|
53
|
+
_CCCL_ASSERT(__minor < 10, "invalid minor compute capability");
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
//! @brief Constructs the object from the architecture id.
|
|
57
|
+
//!
|
|
58
|
+
//! @param __arch_id The architecture id.
|
|
59
|
+
_CCCL_API explicit constexpr compute_capability(arch_id __arch_id) noexcept
|
|
60
|
+
{
|
|
61
|
+
const auto __val = ::cuda::std::to_underlying(__arch_id);
|
|
62
|
+
if (__val > __arch_specific_id_multiplier)
|
|
63
|
+
{
|
|
64
|
+
__cc_ = __val / __arch_specific_id_multiplier;
|
|
65
|
+
}
|
|
66
|
+
else
|
|
67
|
+
{
|
|
68
|
+
__cc_ = __val;
|
|
69
|
+
}
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
_CCCL_HIDE_FROM_ABI constexpr compute_capability(const compute_capability&) noexcept = default;
|
|
73
|
+
|
|
74
|
+
_CCCL_HIDE_FROM_ABI constexpr compute_capability& operator=(const compute_capability& __other) noexcept = default;
|
|
75
|
+
|
|
76
|
+
//! @brief Gets the stored compute capability.
|
|
77
|
+
//!
|
|
78
|
+
//! @return The stored compute capability in format 10 * major + minor.
|
|
79
|
+
[[nodiscard]] _CCCL_API constexpr int get() const noexcept
|
|
80
|
+
{
|
|
81
|
+
return __cc_;
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
//! @brief Gets the major compute capability.
|
|
85
|
+
//!
|
|
86
|
+
//! @return Major compute capability.
|
|
87
|
+
[[nodiscard]] _CCCL_API constexpr int major() const noexcept
|
|
88
|
+
{
|
|
89
|
+
return __cc_ / 10;
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
//! @brief Gets the minor compute capability.
|
|
93
|
+
//!
|
|
94
|
+
//! @return Minor compute capability. The value is always less than 10.
|
|
95
|
+
[[nodiscard]] _CCCL_API constexpr int minor() const noexcept
|
|
96
|
+
{
|
|
97
|
+
return __cc_ % 10;
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
//! @brief Conversion operator to \c int.
|
|
101
|
+
//!
|
|
102
|
+
//! @return The stored compute capability in format 10 * major + minor.
|
|
103
|
+
_CCCL_API explicit constexpr operator int() const noexcept
|
|
104
|
+
{
|
|
105
|
+
return __cc_;
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
//! @brief Equality operator.
|
|
109
|
+
[[nodiscard]] friend _CCCL_API constexpr bool operator==(compute_capability __lhs, compute_capability __rhs) noexcept
|
|
110
|
+
{
|
|
111
|
+
return __lhs.__cc_ == __rhs.__cc_;
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
//! @brief Inequality operator.
|
|
115
|
+
[[nodiscard]] friend _CCCL_API constexpr bool operator!=(compute_capability __lhs, compute_capability __rhs) noexcept
|
|
116
|
+
{
|
|
117
|
+
return __lhs.__cc_ != __rhs.__cc_;
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
//! @brief Less than operator.
|
|
121
|
+
[[nodiscard]] friend _CCCL_API constexpr bool operator<(compute_capability __lhs, compute_capability __rhs) noexcept
|
|
122
|
+
{
|
|
123
|
+
return __lhs.__cc_ < __rhs.__cc_;
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
//! @brief Less than or equal to operator.
|
|
127
|
+
[[nodiscard]] friend _CCCL_API constexpr bool operator<=(compute_capability __lhs, compute_capability __rhs) noexcept
|
|
128
|
+
{
|
|
129
|
+
return __lhs.__cc_ <= __rhs.__cc_;
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
//! @brief Greater than operator.
|
|
133
|
+
[[nodiscard]] friend _CCCL_API constexpr bool operator>(compute_capability __lhs, compute_capability __rhs) noexcept
|
|
134
|
+
{
|
|
135
|
+
return __lhs.__cc_ > __rhs.__cc_;
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
//! @brief Greater than or equal to operator.
|
|
139
|
+
[[nodiscard]] friend _CCCL_API constexpr bool operator>=(compute_capability __lhs, compute_capability __rhs) noexcept
|
|
140
|
+
{
|
|
141
|
+
return __lhs.__cc_ >= __rhs.__cc_;
|
|
142
|
+
}
|
|
143
|
+
};
|
|
144
|
+
|
|
145
|
+
_CCCL_END_NAMESPACE_CUDA
|
|
146
|
+
|
|
147
|
+
#if _CCCL_CUDA_COMPILATION()
|
|
148
|
+
|
|
149
|
+
_CCCL_BEGIN_NAMESPACE_CUDA_DEVICE
|
|
150
|
+
|
|
151
|
+
//! @brief Returns the \c cuda::compute_capability that is currently being compiled.
|
|
152
|
+
//!
|
|
153
|
+
//! @note This API cannot be used in constexpr context when compiling with nvc++ in CUDA mode.
|
|
154
|
+
[[nodiscard]] _CCCL_DEVICE_API _CCCL_TARGET_CONSTEXPR ::cuda::compute_capability current_compute_capability() noexcept
|
|
155
|
+
{
|
|
156
|
+
# if _CCCL_CUDA_COMPILER(NVHPC)
|
|
157
|
+
return ::cuda::compute_capability{__builtin_current_device_sm()};
|
|
158
|
+
# elif _CCCL_DEVICE_COMPILATION()
|
|
159
|
+
return ::cuda::compute_capability{__CUDA_ARCH__ / 10};
|
|
160
|
+
# else // ^^^ _CCCL_DEVICE_COMPILATION() ^^^ / vvv !_CCCL_DEVICE_COMPILATION() vvv
|
|
161
|
+
return {};
|
|
162
|
+
# endif // ^^^ !_CCCL_DEVICE_COMPILATION() ^^^
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
_CCCL_END_NAMESPACE_CUDA_DEVICE
|
|
166
|
+
|
|
167
|
+
#endif // _CCCL_CUDA_COMPILATION()
|
|
168
|
+
|
|
169
|
+
#include <cuda/std/__cccl/epilogue.h>
|
|
170
|
+
|
|
171
|
+
#endif // _CUDA___DEVICE_COMPUTE_CAPABILITY_H
|
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
#ifndef _CUDA___DEVICE_DEVICE_REF_H
|
|
12
12
|
#define _CUDA___DEVICE_DEVICE_REF_H
|
|
13
13
|
|
|
14
|
-
#include <cuda/
|
|
14
|
+
#include <cuda/std/detail/__config>
|
|
15
15
|
|
|
16
16
|
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
|
|
17
17
|
# pragma GCC system_header
|
|
@@ -22,44 +22,32 @@
|
|
|
22
22
|
#endif // no system header
|
|
23
23
|
|
|
24
24
|
#if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
|
|
25
|
+
|
|
25
26
|
# include <cuda/__driver/driver_api.h>
|
|
27
|
+
# include <cuda/__fwd/devices.h>
|
|
26
28
|
# include <cuda/__runtime/types.h>
|
|
27
|
-
|
|
28
|
-
# include <string>
|
|
29
|
-
# include <vector>
|
|
29
|
+
# include <cuda/std/span>
|
|
30
|
+
# include <cuda/std/string_view>
|
|
30
31
|
|
|
31
32
|
# include <cuda/std/__cccl/prologue.h>
|
|
32
33
|
|
|
33
34
|
_CCCL_BEGIN_NAMESPACE_CUDA
|
|
34
|
-
class physical_device;
|
|
35
|
-
namespace arch
|
|
36
|
-
{
|
|
37
|
-
struct traits_t;
|
|
38
|
-
} // namespace arch
|
|
39
|
-
|
|
40
|
-
namespace __detail
|
|
41
|
-
{
|
|
42
|
-
template <::cudaDeviceAttr _Attr>
|
|
43
|
-
struct __dev_attr;
|
|
44
|
-
} // namespace __detail
|
|
45
35
|
|
|
46
36
|
//! @brief A non-owning representation of a CUDA device
|
|
47
37
|
class device_ref
|
|
48
38
|
{
|
|
49
|
-
friend class physical_device;
|
|
50
|
-
|
|
51
39
|
int __id_ = 0;
|
|
52
40
|
|
|
53
41
|
public:
|
|
54
42
|
//! @brief Create a `device_ref` object from a native device ordinal.
|
|
55
|
-
/*implicit*/ constexpr device_ref(int __id) noexcept
|
|
43
|
+
/*implicit*/ _CCCL_HOST_API constexpr device_ref(int __id) noexcept
|
|
56
44
|
: __id_(__id)
|
|
57
45
|
{}
|
|
58
46
|
|
|
59
47
|
//! @brief Retrieve the native ordinal of the `device_ref`
|
|
60
48
|
//!
|
|
61
49
|
//! @return int The native device ordinal held by the `device_ref` object
|
|
62
|
-
[[nodiscard]] constexpr int get() const noexcept
|
|
50
|
+
[[nodiscard]] _CCCL_HOST_API constexpr int get() const noexcept
|
|
63
51
|
{
|
|
64
52
|
return __id_;
|
|
65
53
|
}
|
|
@@ -72,7 +60,7 @@ public:
|
|
|
72
60
|
//! @param __lhs The first `device_ref` to compare
|
|
73
61
|
//! @param __rhs The second `device_ref` to compare
|
|
74
62
|
//! @return true if `lhs` and `rhs` refer to the same device ordinal
|
|
75
|
-
[[nodiscard]] friend constexpr bool operator==(device_ref __lhs, device_ref __rhs) noexcept
|
|
63
|
+
[[nodiscard]] friend _CCCL_HOST_API constexpr bool operator==(device_ref __lhs, device_ref __rhs) noexcept
|
|
76
64
|
{
|
|
77
65
|
return __lhs.__id_ == __rhs.__id_;
|
|
78
66
|
}
|
|
@@ -86,7 +74,7 @@ public:
|
|
|
86
74
|
//! @param __lhs The first `device_ref` to compare
|
|
87
75
|
//! @param __rhs The second `device_ref` to compare
|
|
88
76
|
//! @return true if `lhs` and `rhs` refer to different device ordinal
|
|
89
|
-
[[nodiscard]] constexpr friend bool operator!=(device_ref __lhs, device_ref __rhs) noexcept
|
|
77
|
+
[[nodiscard]] friend _CCCL_HOST_API constexpr bool operator!=(device_ref __lhs, device_ref __rhs) noexcept
|
|
90
78
|
{
|
|
91
79
|
return __lhs.__id_ != __rhs.__id_;
|
|
92
80
|
}
|
|
@@ -101,38 +89,35 @@ public:
|
|
|
101
89
|
//!
|
|
102
90
|
//! @sa device::attrs
|
|
103
91
|
template <typename _Attr>
|
|
104
|
-
[[nodiscard]] auto attribute(_Attr __attr) const
|
|
92
|
+
[[nodiscard]] _CCCL_HOST_API auto attribute(_Attr __attr) const
|
|
105
93
|
{
|
|
106
94
|
return __attr(*this);
|
|
107
95
|
}
|
|
108
96
|
|
|
109
97
|
//! @overload
|
|
110
98
|
template <::cudaDeviceAttr _Attr>
|
|
111
|
-
[[nodiscard]] auto attribute() const
|
|
99
|
+
[[nodiscard]] _CCCL_HOST_API auto attribute() const
|
|
112
100
|
{
|
|
113
|
-
return attribute(__detail::__dev_attr<_Attr>());
|
|
101
|
+
return attribute(__dev_attr<_Attr>());
|
|
114
102
|
}
|
|
115
103
|
|
|
116
104
|
//! @brief Retrieve the memory location of this device
|
|
117
105
|
//!
|
|
118
106
|
//! @return The memory location of this device
|
|
119
|
-
[[nodiscard]] operator memory_location() const noexcept
|
|
107
|
+
[[nodiscard]] _CCCL_HOST_API operator memory_location() const noexcept
|
|
120
108
|
{
|
|
121
109
|
return memory_location{::cudaMemLocationTypeDevice, get()};
|
|
122
110
|
}
|
|
123
111
|
|
|
124
|
-
//! @brief
|
|
125
|
-
|
|
126
|
-
//! @return String containing the name of this device.
|
|
127
|
-
[[nodiscard]] ::std::string name() const
|
|
128
|
-
{
|
|
129
|
-
constexpr int __max_name_length = 256;
|
|
130
|
-
::std::string __name(256, 0);
|
|
112
|
+
//! @brief Initializes the primary context of the device.
|
|
113
|
+
_CCCL_HOST_API void init() const; // implemented in <cuda/__device/physical_device.h> to avoid circular dependency
|
|
131
114
|
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
115
|
+
//! @brief Retrieve the name of this device.
|
|
116
|
+
//!
|
|
117
|
+
//! @return String view containing the name of this device.
|
|
118
|
+
[[nodiscard]] _CCCL_HOST_API ::cuda::std::string_view name() const; // implemented in
|
|
119
|
+
// <cuda/__device/physical_device.h> to avoid
|
|
120
|
+
// circular dependency
|
|
136
121
|
|
|
137
122
|
//! @brief Queries if it's possible for this device to directly access specified device's memory.
|
|
138
123
|
//!
|
|
@@ -142,31 +127,24 @@ public:
|
|
|
142
127
|
//!
|
|
143
128
|
//! @param __other_dev Device to query the peer access
|
|
144
129
|
//! @return true if it's possible for this device to access the specified device's memory
|
|
145
|
-
[[nodiscard]] bool has_peer_access_to(device_ref __other_dev) const
|
|
130
|
+
[[nodiscard]] _CCCL_HOST_API bool has_peer_access_to(device_ref __other_dev) const
|
|
146
131
|
{
|
|
147
132
|
return ::cuda::__driver::__deviceCanAccessPeer(
|
|
148
133
|
::cuda::__driver::__deviceGet(get()), ::cuda::__driver::__deviceGet(__other_dev.get()));
|
|
149
134
|
}
|
|
150
135
|
|
|
151
|
-
//! @brief Retrieve architecture traits of this device.
|
|
152
|
-
//!
|
|
153
|
-
//! Architecture traits object contains information about certain traits
|
|
154
|
-
//! that are shared by all devices belonging to given architecture.
|
|
155
|
-
//!
|
|
156
|
-
//! @return A reference to `arch_traits_t` object containing architecture traits of this device
|
|
157
|
-
const arch::traits_t& arch_traits() const;
|
|
158
|
-
|
|
159
136
|
// TODO this might return some more complex type in the future
|
|
160
137
|
// TODO we might want to include the calling device, depends on what we decide
|
|
161
138
|
// peer access APIs
|
|
162
139
|
|
|
163
|
-
//! @brief Retrieve
|
|
140
|
+
//! @brief Retrieve `device_ref`s that are peers of this device
|
|
164
141
|
//!
|
|
165
|
-
//! The device on which this API is called is not included in the vector
|
|
166
|
-
//! if a full group of peer devices is needed, it needs to be pushed_back separately.
|
|
142
|
+
//! The device on which this API is called is not included in the vector.
|
|
167
143
|
//!
|
|
168
144
|
//! @throws cuda_error if any peer access query fails
|
|
169
|
-
::std::
|
|
145
|
+
[[nodiscard]] _CCCL_HOST_API ::cuda::std::span<const device_ref> peers() const; // implemented in
|
|
146
|
+
// <cuda/__device/physical_device.h>
|
|
147
|
+
// to avoid circular dependency
|
|
170
148
|
};
|
|
171
149
|
|
|
172
150
|
_CCCL_END_NAMESPACE_CUDA
|
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
#ifndef _CUDA___DEVICE_PHYSICAL_DEVICE_H
|
|
12
12
|
#define _CUDA___DEVICE_PHYSICAL_DEVICE_H
|
|
13
13
|
|
|
14
|
-
#include <cuda/
|
|
14
|
+
#include <cuda/std/detail/__config>
|
|
15
15
|
|
|
16
16
|
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
|
|
17
17
|
# pragma GCC system_header
|
|
@@ -23,141 +23,145 @@
|
|
|
23
23
|
|
|
24
24
|
#if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
|
|
25
25
|
|
|
26
|
-
# include <cuda/__device/arch_traits.h>
|
|
27
|
-
# include <cuda/__device/attributes.h>
|
|
28
26
|
# include <cuda/__device/device_ref.h>
|
|
29
27
|
# include <cuda/__driver/driver_api.h>
|
|
28
|
+
# include <cuda/__fwd/devices.h>
|
|
29
|
+
# include <cuda/std/__cccl/memory_wrapper.h>
|
|
30
|
+
# include <cuda/std/__cstddef/types.h>
|
|
31
|
+
# include <cuda/std/span>
|
|
32
|
+
# include <cuda/std/string_view>
|
|
30
33
|
|
|
31
34
|
# include <cassert>
|
|
32
35
|
# include <mutex>
|
|
36
|
+
# include <vector>
|
|
33
37
|
|
|
34
38
|
# include <cuda/std/__cccl/prologue.h>
|
|
35
39
|
|
|
36
40
|
_CCCL_BEGIN_NAMESPACE_CUDA
|
|
37
|
-
namespace __detail
|
|
38
|
-
{
|
|
39
|
-
//! @brief A proxy object used to in-place construct a `device` object from an
|
|
40
|
-
//! integer ID. Used in __detail/all_devices.cuh.
|
|
41
|
-
struct __emplace_device
|
|
42
|
-
{
|
|
43
|
-
int __id_;
|
|
44
41
|
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
[[nodiscard]] constexpr const __emplace_device* operator->() const;
|
|
48
|
-
};
|
|
49
|
-
} // namespace __detail
|
|
50
|
-
|
|
51
|
-
//! @brief For a given attribute, type of the attribute value.
|
|
52
|
-
//!
|
|
53
|
-
//! @par Example
|
|
54
|
-
//! @code
|
|
55
|
-
//! using threads_per_block_t = device::attr_result_t<device_attributes::max_threads_per_block>;
|
|
56
|
-
//! static_assert(std::is_same_v<threads_per_block_t, int>);
|
|
57
|
-
//! @endcode
|
|
58
|
-
//!
|
|
59
|
-
//! @sa device_attributes
|
|
60
|
-
template <::cudaDeviceAttr _Attr>
|
|
61
|
-
using device_attribute_result_t = typename __detail::__dev_attr<_Attr>::type;
|
|
42
|
+
[[nodiscard]] inline ::cuda::std::span<__physical_device> __physical_devices();
|
|
62
43
|
|
|
63
44
|
// This is the element type of the global `devices` array. In the future, we
|
|
64
45
|
// can cache device properties here.
|
|
65
46
|
//
|
|
66
47
|
//! @brief An immovable "owning" representation of a CUDA device.
|
|
67
|
-
class
|
|
48
|
+
class __physical_device
|
|
68
49
|
{
|
|
50
|
+
friend _CCCL_HOST_API inline ::std::unique_ptr<__physical_device[]>
|
|
51
|
+
__make_physical_devices(::cuda::std::size_t __device_count);
|
|
52
|
+
|
|
53
|
+
::CUdevice __device_{};
|
|
54
|
+
|
|
55
|
+
::std::once_flag __primary_ctx_once_flag_{};
|
|
56
|
+
::CUcontext __primary_ctx_{};
|
|
57
|
+
|
|
58
|
+
static constexpr ::cuda::std::size_t __max_name_length{256};
|
|
59
|
+
::std::once_flag __name_once_flag_{};
|
|
60
|
+
char __name_[__max_name_length]{};
|
|
61
|
+
::cuda::std::size_t __name_length_{};
|
|
62
|
+
|
|
63
|
+
::std::once_flag __peers_once_flag_{};
|
|
64
|
+
::std::vector<device_ref> __peers_{};
|
|
65
|
+
|
|
69
66
|
public:
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
// a device object from an __emplace_device object. This is a workaround.
|
|
74
|
-
physical_device(__detail::__emplace_device __ed)
|
|
75
|
-
: physical_device(__ed.__id_)
|
|
76
|
-
{}
|
|
77
|
-
# endif // _CCCL_COMPILER(MSVC)
|
|
78
|
-
# endif // _CCCL_COMPILER(MSVC)
|
|
79
|
-
|
|
80
|
-
//! @brief Retrieve architecture traits of this device.
|
|
81
|
-
//!
|
|
82
|
-
//! Architecture traits object contains information about certain traits
|
|
83
|
-
//! that are shared by all devices belonging to given architecture.
|
|
84
|
-
//!
|
|
85
|
-
//! @return A reference to `arch_traits_t` object containing architecture traits of this device
|
|
86
|
-
const arch::traits_t& arch_traits() const noexcept
|
|
67
|
+
_CCCL_HIDE_FROM_ABI __physical_device() = default;
|
|
68
|
+
|
|
69
|
+
_CCCL_HOST_API ~__physical_device()
|
|
87
70
|
{
|
|
88
|
-
|
|
71
|
+
if (__primary_ctx_ != nullptr)
|
|
72
|
+
{
|
|
73
|
+
[[maybe_unused]] const auto __ignore = ::cuda::__driver::__primaryCtxReleaseNoThrow(__device_);
|
|
74
|
+
}
|
|
89
75
|
}
|
|
90
76
|
|
|
91
77
|
//! @brief Retrieve the primary context for this device.
|
|
92
78
|
//!
|
|
93
79
|
//! @return A reference to the primary context for this device.
|
|
94
|
-
::CUcontext
|
|
80
|
+
[[nodiscard]] _CCCL_HOST_API ::CUcontext __primary_context()
|
|
95
81
|
{
|
|
96
|
-
::std::call_once(
|
|
97
|
-
|
|
98
|
-
__primary_ctx = ::cuda::__driver::__primaryCtxRetain(__device);
|
|
82
|
+
::std::call_once(__primary_ctx_once_flag_, [this]() {
|
|
83
|
+
__primary_ctx_ = ::cuda::__driver::__primaryCtxRetain(__device_);
|
|
99
84
|
});
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
return __primary_ctx;
|
|
85
|
+
return __primary_ctx_;
|
|
103
86
|
}
|
|
104
87
|
|
|
105
|
-
|
|
88
|
+
[[nodiscard]] _CCCL_HOST_API ::cuda::std::string_view __name()
|
|
106
89
|
{
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
::cuda::__driver::
|
|
110
|
-
|
|
90
|
+
::std::call_once(__name_once_flag_, [this]() {
|
|
91
|
+
const auto __id = ::cuda::__driver::__cudevice_to_ordinal(__device_);
|
|
92
|
+
::cuda::__driver::__deviceGetName(__name_, __max_name_length, __id);
|
|
93
|
+
__name_length_ = ::cuda::std::char_traits<char>::length(__name_);
|
|
94
|
+
});
|
|
95
|
+
return ::cuda::std::string_view{__name_, __name_length_};
|
|
111
96
|
}
|
|
112
97
|
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
98
|
+
[[nodiscard]] _CCCL_HOST_API ::cuda::std::span<const device_ref> __peers()
|
|
99
|
+
{
|
|
100
|
+
::std::call_once(__peers_once_flag_, [this]() {
|
|
101
|
+
const auto __count = static_cast<int>(::cuda::__physical_devices().size());
|
|
102
|
+
const auto __id = ::cuda::__driver::__cudevice_to_ordinal(__device_);
|
|
103
|
+
__peers_.reserve(__count);
|
|
104
|
+
for (int __other_id = 0; __other_id < __count; ++__other_id)
|
|
105
|
+
{
|
|
106
|
+
// Exclude the device this API is called on. The main use case for this API
|
|
107
|
+
// is enable/disable peer access. While enable peer access can be called on
|
|
108
|
+
// device on which memory resides, disable peer access will error-out.
|
|
109
|
+
// Usage of the peer access control is smoother when *this is excluded,
|
|
110
|
+
// while it can be easily added with .push_back() on the vector if a full
|
|
111
|
+
// group of peers is needed (for cases other than peer access control)
|
|
112
|
+
if (__other_id != __id)
|
|
113
|
+
{
|
|
114
|
+
device_ref __dev{__id};
|
|
115
|
+
device_ref __other_dev{__other_id};
|
|
116
|
+
|
|
117
|
+
// While in almost all practical applications peer access should be symmetrical,
|
|
118
|
+
// it is possible to build a system with one directional peer access, check
|
|
119
|
+
// both ways here just to be safe
|
|
120
|
+
if (__dev.has_peer_access_to(__other_dev) && __other_dev.has_peer_access_to(__dev))
|
|
121
|
+
{
|
|
122
|
+
__peers_.push_back(__other_dev);
|
|
123
|
+
}
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
});
|
|
127
|
+
return ::cuda::std::span<const device_ref>{__peers_};
|
|
128
|
+
}
|
|
129
|
+
};
|
|
133
130
|
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
131
|
+
[[nodiscard]] _CCCL_HOST_API inline ::std::unique_ptr<__physical_device[]>
|
|
132
|
+
__make_physical_devices(::cuda::std::size_t __device_count)
|
|
133
|
+
{
|
|
134
|
+
::std::unique_ptr<__physical_device[]> __devices{::new __physical_device[__device_count]};
|
|
135
|
+
for (::cuda::std::size_t __i = 0; __i < __device_count; ++__i)
|
|
136
|
+
{
|
|
137
|
+
__devices[__i].__device_ = static_cast<int>(__i);
|
|
138
|
+
}
|
|
139
|
+
return __devices;
|
|
140
|
+
}
|
|
139
141
|
|
|
140
|
-
|
|
141
|
-
|
|
142
|
+
[[nodiscard]] inline ::cuda::std::span<__physical_device> __physical_devices()
|
|
143
|
+
{
|
|
144
|
+
static const auto __device_count = static_cast<::cuda::std::size_t>(::cuda::__driver::__deviceGetCount());
|
|
145
|
+
static const auto __devices = ::cuda::__make_physical_devices(__device_count);
|
|
146
|
+
return ::cuda::std::span<__physical_device>{__devices.get(), __device_count};
|
|
147
|
+
}
|
|
142
148
|
|
|
143
|
-
|
|
144
|
-
friend bool operator!=(const physical_device& __lhs, int __rhs) = delete;
|
|
145
|
-
friend bool operator!=(int __lhs, const physical_device& __rhs) = delete;
|
|
146
|
-
# endif // _CCCL_STD_VER <= 2017
|
|
147
|
-
};
|
|
149
|
+
// device_ref methods dependent on __physical_device
|
|
148
150
|
|
|
149
|
-
|
|
151
|
+
_CCCL_HOST_API inline void device_ref::init() const
|
|
150
152
|
{
|
|
151
|
-
|
|
153
|
+
(void) ::cuda::__physical_devices()[__id_].__primary_context();
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
[[nodiscard]] _CCCL_HOST_API inline ::cuda::std::string_view device_ref::name() const
|
|
152
157
|
{
|
|
153
|
-
return
|
|
158
|
+
return ::cuda::__physical_devices()[__id_].__name();
|
|
154
159
|
}
|
|
155
160
|
|
|
156
|
-
[[nodiscard]] inline
|
|
161
|
+
[[nodiscard]] _CCCL_HOST_API inline ::cuda::std::span<const device_ref> device_ref::peers() const
|
|
157
162
|
{
|
|
158
|
-
return
|
|
163
|
+
return ::cuda::__physical_devices()[__id_].__peers();
|
|
159
164
|
}
|
|
160
|
-
} // namespace __detail
|
|
161
165
|
|
|
162
166
|
_CCCL_END_NAMESPACE_CUDA
|
|
163
167
|
|