cuda-cccl 0.3.0__cp310-cp310-manylinux_2_24_aarch64.whl → 0.3.2__cp310-cp310-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/cooperative/__init__.py +7 -1
- cuda/cccl/cooperative/experimental/__init__.py +21 -5
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
- cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
- cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
- cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
- cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +4 -2
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
- cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
- cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
- cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
- cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
- cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +170 -260
- cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +6 -7
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +12 -29
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +2 -7
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
- cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
- cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
- cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
- cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
- cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
- cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
- cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
- cuda/cccl/headers/include/cuda/__cccl_config +1 -0
- cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
- cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
- cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
- cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
- cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +247 -323
- cuda/cccl/headers/include/cuda/__device/attributes.h +174 -123
- cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
- cuda/cccl/headers/include/cuda/__device/device_ref.h +27 -49
- cuda/cccl/headers/include/cuda/__device/physical_device.h +100 -96
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
- cuda/cccl/headers/include/cuda/__event/event.h +27 -26
- cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
- cuda/cccl/headers/include/cuda/__event/timed_event.h +10 -7
- cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
- cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
- cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
- cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
- cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +21 -137
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
- cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
- cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
- cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
- cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
- cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
- cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
- cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +9 -7
- cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -16
- cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
- cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
- cuda/cccl/headers/include/cuda/algorithm +1 -1
- cuda/cccl/headers/include/cuda/cmath +1 -0
- cuda/cccl/headers/include/cuda/devices +13 -0
- cuda/cccl/headers/include/cuda/iterator +1 -0
- cuda/cccl/headers/include/cuda/memory +1 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
- cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
- cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
- cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
- cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
- cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
- cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
- cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
- cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
- cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
- cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
- cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
- cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
- cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
- cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
- cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
- cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
- cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
- cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
- cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
- cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
- cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
- cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
- cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
- cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
- cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
- cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
- cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
- cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
- cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
- cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
- cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
- cuda/cccl/headers/include/cuda/std/bitset +1 -1
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
- cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
- cuda/cccl/headers/include/cuda/std/numbers +5 -0
- cuda/cccl/headers/include/cuda/std/string_view +155 -13
- cuda/cccl/headers/include/cuda/std/version +1 -4
- cuda/cccl/headers/include/cuda/stream_ref +5 -0
- cuda/cccl/headers/include/cuda/utility +1 -0
- cuda/cccl/headers/include/nv/target +7 -2
- cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
- cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
- cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
- cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
- cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
- cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
- cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
- cuda/cccl/headers/include/thrust/device_delete.h +18 -3
- cuda/cccl/headers/include/thrust/device_free.h +16 -3
- cuda/cccl/headers/include/thrust/device_new.h +29 -8
- cuda/cccl/headers/include/thrust/host_vector.h +1 -1
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
- cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
- cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
- cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
- cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
- cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
- cuda/cccl/parallel/experimental/__init__.py +21 -74
- cuda/compute/__init__.py +79 -0
- cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +43 -1
- cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +157 -8
- cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
- cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +112 -40
- cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +36 -15
- cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
- cuda/compute/cu12/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/cu13/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +2 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +36 -8
- cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +206 -1
- cuda/{cccl/parallel/experimental → compute}/numba_utils.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/typing.py +2 -0
- cuda/coop/__init__.py +8 -0
- cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
- cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
- cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
- cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
- cuda/coop/warp/__init__.py +9 -0
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +275 -276
- cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
- cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
- cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
- cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
- cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
- cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
- cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
- cuda/cccl/parallel/experimental/.gitignore +0 -4
- cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
#ifndef _CUDA___DEVICE_ATTRIBUTES_H
|
|
12
12
|
#define _CUDA___DEVICE_ATTRIBUTES_H
|
|
13
13
|
|
|
14
|
-
#include <cuda/
|
|
14
|
+
#include <cuda/std/detail/__config>
|
|
15
15
|
|
|
16
16
|
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
|
|
17
17
|
# pragma GCC system_header
|
|
@@ -23,29 +23,27 @@
|
|
|
23
23
|
|
|
24
24
|
#if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
|
|
25
25
|
|
|
26
|
+
# include <cuda/__device/compute_capability.h>
|
|
26
27
|
# include <cuda/__device/device_ref.h>
|
|
27
28
|
# include <cuda/__driver/driver_api.h>
|
|
28
|
-
# include <cuda/
|
|
29
|
-
# include <cuda/std/
|
|
29
|
+
# include <cuda/__fwd/devices.h>
|
|
30
|
+
# include <cuda/std/__cstddef/types.h>
|
|
30
31
|
|
|
31
32
|
# include <cuda/std/__cccl/prologue.h>
|
|
32
33
|
|
|
33
34
|
_CCCL_BEGIN_NAMESPACE_CUDA
|
|
34
35
|
|
|
35
|
-
namespace __detail
|
|
36
|
-
{
|
|
37
|
-
|
|
38
36
|
template <::cudaDeviceAttr _Attr, typename _Type>
|
|
39
37
|
struct __dev_attr_impl
|
|
40
38
|
{
|
|
41
39
|
using type = _Type;
|
|
42
40
|
|
|
43
|
-
[[nodiscard]] constexpr operator ::cudaDeviceAttr() const noexcept
|
|
41
|
+
[[nodiscard]] _CCCL_HOST_API constexpr operator ::cudaDeviceAttr() const noexcept
|
|
44
42
|
{
|
|
45
43
|
return _Attr;
|
|
46
44
|
}
|
|
47
45
|
|
|
48
|
-
[[nodiscard]] type operator()(device_ref __dev) const
|
|
46
|
+
[[nodiscard]] _CCCL_HOST_API type operator()(device_ref __dev) const
|
|
49
47
|
{
|
|
50
48
|
return static_cast<type>(::cuda::__driver::__deviceGetAttribute(
|
|
51
49
|
static_cast<::CUdevice_attribute>(_Attr), ::cuda::__driver::__deviceGet(__dev.get())));
|
|
@@ -55,13 +53,36 @@ struct __dev_attr_impl
|
|
|
55
53
|
template <::cudaDeviceAttr _Attr>
|
|
56
54
|
struct __dev_attr : __dev_attr_impl<_Attr, int>
|
|
57
55
|
{};
|
|
58
|
-
|
|
56
|
+
template <>
|
|
57
|
+
struct __dev_attr<::cudaDevAttrMaxSharedMemoryPerBlock> //
|
|
58
|
+
: __dev_attr_impl<::cudaDevAttrMaxSharedMemoryPerBlock, ::cuda::std::size_t>
|
|
59
|
+
{};
|
|
60
|
+
template <>
|
|
61
|
+
struct __dev_attr<::cudaDevAttrTotalConstantMemory> //
|
|
62
|
+
: __dev_attr_impl<::cudaDevAttrTotalConstantMemory, ::cuda::std::size_t>
|
|
63
|
+
{};
|
|
64
|
+
template <>
|
|
65
|
+
struct __dev_attr<::cudaDevAttrMaxPitch> //
|
|
66
|
+
: __dev_attr_impl<::cudaDevAttrMaxPitch, ::cuda::std::size_t>
|
|
67
|
+
{};
|
|
68
|
+
template <>
|
|
69
|
+
struct __dev_attr<::cudaDevAttrMaxTexture2DLinearPitch> //
|
|
70
|
+
: __dev_attr_impl<::cudaDevAttrMaxTexture2DLinearPitch, ::cuda::std::size_t>
|
|
71
|
+
{};
|
|
59
72
|
// TODO: give this a strong type for kilohertz
|
|
60
73
|
template <>
|
|
61
74
|
struct __dev_attr<::cudaDevAttrClockRate> //
|
|
62
75
|
: __dev_attr_impl<::cudaDevAttrClockRate, int>
|
|
63
76
|
{};
|
|
64
77
|
template <>
|
|
78
|
+
struct __dev_attr<::cudaDevAttrTextureAlignment> //
|
|
79
|
+
: __dev_attr_impl<::cudaDevAttrTextureAlignment, ::cuda::std::size_t>
|
|
80
|
+
{};
|
|
81
|
+
template <>
|
|
82
|
+
struct __dev_attr<::cudaDevAttrTexturePitchAlignment> //
|
|
83
|
+
: __dev_attr_impl<::cudaDevAttrTexturePitchAlignment, ::cuda::std::size_t>
|
|
84
|
+
{};
|
|
85
|
+
template <>
|
|
65
86
|
struct __dev_attr<::cudaDevAttrGpuOverlap> //
|
|
66
87
|
: __dev_attr_impl<::cudaDevAttrGpuOverlap, bool>
|
|
67
88
|
{};
|
|
@@ -107,10 +128,9 @@ template <>
|
|
|
107
128
|
struct __dev_attr<::cudaDevAttrGlobalMemoryBusWidth> //
|
|
108
129
|
: __dev_attr_impl<::cudaDevAttrGlobalMemoryBusWidth, int>
|
|
109
130
|
{};
|
|
110
|
-
// TODO: give this a strong type for bytes
|
|
111
131
|
template <>
|
|
112
132
|
struct __dev_attr<::cudaDevAttrL2CacheSize> //
|
|
113
|
-
: __dev_attr_impl<::cudaDevAttrL2CacheSize,
|
|
133
|
+
: __dev_attr_impl<::cudaDevAttrL2CacheSize, ::cuda::std::size_t>
|
|
114
134
|
{};
|
|
115
135
|
template <>
|
|
116
136
|
struct __dev_attr<::cudaDevAttrUnifiedAddressing> //
|
|
@@ -129,6 +149,10 @@ struct __dev_attr<::cudaDevAttrLocalL1CacheSupported> //
|
|
|
129
149
|
: __dev_attr_impl<::cudaDevAttrLocalL1CacheSupported, bool>
|
|
130
150
|
{};
|
|
131
151
|
template <>
|
|
152
|
+
struct __dev_attr<::cudaDevAttrMaxSharedMemoryPerMultiprocessor> //
|
|
153
|
+
: __dev_attr_impl<::cudaDevAttrMaxSharedMemoryPerMultiprocessor, ::cuda::std::size_t>
|
|
154
|
+
{};
|
|
155
|
+
template <>
|
|
132
156
|
struct __dev_attr<::cudaDevAttrManagedMemory> //
|
|
133
157
|
: __dev_attr_impl<::cudaDevAttrManagedMemory, bool>
|
|
134
158
|
{};
|
|
@@ -173,6 +197,22 @@ struct __dev_attr<::cudaDevAttrDirectManagedMemAccessFromHost> //
|
|
|
173
197
|
: __dev_attr_impl<::cudaDevAttrDirectManagedMemAccessFromHost, bool>
|
|
174
198
|
{};
|
|
175
199
|
template <>
|
|
200
|
+
struct __dev_attr<::cudaDevAttrMaxSharedMemoryPerBlockOptin> //
|
|
201
|
+
: __dev_attr_impl<::cudaDevAttrMaxSharedMemoryPerBlockOptin, ::cuda::std::size_t>
|
|
202
|
+
{};
|
|
203
|
+
template <>
|
|
204
|
+
struct __dev_attr<::cudaDevAttrMaxPersistingL2CacheSize> //
|
|
205
|
+
: __dev_attr_impl<::cudaDevAttrMaxPersistingL2CacheSize, ::cuda::std::size_t>
|
|
206
|
+
{};
|
|
207
|
+
template <>
|
|
208
|
+
struct __dev_attr<::cudaDevAttrMaxAccessPolicyWindowSize> //
|
|
209
|
+
: __dev_attr_impl<::cudaDevAttrMaxAccessPolicyWindowSize, ::cuda::std::size_t>
|
|
210
|
+
{};
|
|
211
|
+
template <>
|
|
212
|
+
struct __dev_attr<::cudaDevAttrReservedSharedMemoryPerBlock> //
|
|
213
|
+
: __dev_attr_impl<::cudaDevAttrReservedSharedMemoryPerBlock, ::cuda::std::size_t>
|
|
214
|
+
{};
|
|
215
|
+
template <>
|
|
176
216
|
struct __dev_attr<::cudaDevAttrSparseCudaArraySupported> //
|
|
177
217
|
: __dev_attr_impl<::cudaDevAttrSparseCudaArraySupported, bool>
|
|
178
218
|
{};
|
|
@@ -239,463 +279,460 @@ struct __dev_attr<::cudaDevAttrNumaConfig> //
|
|
|
239
279
|
};
|
|
240
280
|
# endif // _CCCL_CTK_AT_LEAST(12, 2)
|
|
241
281
|
|
|
242
|
-
} // namespace __detail
|
|
243
|
-
|
|
244
282
|
namespace device_attributes
|
|
245
283
|
{
|
|
246
284
|
// Maximum number of threads per block
|
|
247
|
-
using max_threads_per_block_t =
|
|
285
|
+
using max_threads_per_block_t = __dev_attr<::cudaDevAttrMaxThreadsPerBlock>;
|
|
248
286
|
static constexpr max_threads_per_block_t max_threads_per_block{};
|
|
249
287
|
|
|
250
288
|
// Maximum x-dimension of a block
|
|
251
|
-
using max_block_dim_x_t =
|
|
289
|
+
using max_block_dim_x_t = __dev_attr<::cudaDevAttrMaxBlockDimX>;
|
|
252
290
|
static constexpr max_block_dim_x_t max_block_dim_x{};
|
|
253
291
|
|
|
254
292
|
// Maximum y-dimension of a block
|
|
255
|
-
using max_block_dim_y_t =
|
|
293
|
+
using max_block_dim_y_t = __dev_attr<::cudaDevAttrMaxBlockDimY>;
|
|
256
294
|
static constexpr max_block_dim_y_t max_block_dim_y{};
|
|
257
295
|
|
|
258
296
|
// Maximum z-dimension of a block
|
|
259
|
-
using max_block_dim_z_t =
|
|
297
|
+
using max_block_dim_z_t = __dev_attr<::cudaDevAttrMaxBlockDimZ>;
|
|
260
298
|
static constexpr max_block_dim_z_t max_block_dim_z{};
|
|
261
299
|
|
|
262
300
|
// Maximum x-dimension of a grid
|
|
263
|
-
using max_grid_dim_x_t =
|
|
301
|
+
using max_grid_dim_x_t = __dev_attr<::cudaDevAttrMaxGridDimX>;
|
|
264
302
|
static constexpr max_grid_dim_x_t max_grid_dim_x{};
|
|
265
303
|
|
|
266
304
|
// Maximum y-dimension of a grid
|
|
267
|
-
using max_grid_dim_y_t =
|
|
305
|
+
using max_grid_dim_y_t = __dev_attr<::cudaDevAttrMaxGridDimY>;
|
|
268
306
|
static constexpr max_grid_dim_y_t max_grid_dim_y{};
|
|
269
307
|
|
|
270
308
|
// Maximum z-dimension of a grid
|
|
271
|
-
using max_grid_dim_z_t =
|
|
309
|
+
using max_grid_dim_z_t = __dev_attr<::cudaDevAttrMaxGridDimZ>;
|
|
272
310
|
static constexpr max_grid_dim_z_t max_grid_dim_z{};
|
|
273
311
|
|
|
274
312
|
// Maximum amount of shared memory available to a thread block in bytes
|
|
275
|
-
using max_shared_memory_per_block_t =
|
|
313
|
+
using max_shared_memory_per_block_t = __dev_attr<::cudaDevAttrMaxSharedMemoryPerBlock>;
|
|
276
314
|
static constexpr max_shared_memory_per_block_t max_shared_memory_per_block{};
|
|
277
315
|
|
|
278
316
|
// Memory available on device for __constant__ variables in a CUDA C kernel in bytes
|
|
279
|
-
using total_constant_memory_t =
|
|
317
|
+
using total_constant_memory_t = __dev_attr<::cudaDevAttrTotalConstantMemory>;
|
|
280
318
|
static constexpr total_constant_memory_t total_constant_memory{};
|
|
281
319
|
|
|
282
320
|
// Warp size in threads
|
|
283
|
-
using warp_size_t =
|
|
321
|
+
using warp_size_t = __dev_attr<::cudaDevAttrWarpSize>;
|
|
284
322
|
static constexpr warp_size_t warp_size{};
|
|
285
323
|
|
|
286
324
|
// Maximum pitch in bytes allowed by the memory copy functions that involve
|
|
287
325
|
// memory regions allocated through cudaMallocPitch()
|
|
288
|
-
using max_pitch_t =
|
|
326
|
+
using max_pitch_t = __dev_attr<::cudaDevAttrMaxPitch>;
|
|
289
327
|
static constexpr max_pitch_t max_pitch{};
|
|
290
328
|
|
|
291
329
|
// Maximum 1D texture width
|
|
292
|
-
using max_texture_1d_width_t =
|
|
330
|
+
using max_texture_1d_width_t = __dev_attr<::cudaDevAttrMaxTexture1DWidth>;
|
|
293
331
|
static constexpr max_texture_1d_width_t max_texture_1d_width{};
|
|
294
332
|
|
|
295
333
|
// Maximum width for a 1D texture bound to linear memory
|
|
296
|
-
using max_texture_1d_linear_width_t =
|
|
334
|
+
using max_texture_1d_linear_width_t = __dev_attr<::cudaDevAttrMaxTexture1DLinearWidth>;
|
|
297
335
|
static constexpr max_texture_1d_linear_width_t max_texture_1d_linear_width{};
|
|
298
336
|
|
|
299
337
|
// Maximum mipmapped 1D texture width
|
|
300
|
-
using max_texture_1d_mipmapped_width_t =
|
|
338
|
+
using max_texture_1d_mipmapped_width_t = __dev_attr<::cudaDevAttrMaxTexture1DMipmappedWidth>;
|
|
301
339
|
static constexpr max_texture_1d_mipmapped_width_t max_texture_1d_mipmapped_width{};
|
|
302
340
|
|
|
303
341
|
// Maximum 2D texture width
|
|
304
|
-
using max_texture_2d_width_t =
|
|
342
|
+
using max_texture_2d_width_t = __dev_attr<::cudaDevAttrMaxTexture2DWidth>;
|
|
305
343
|
static constexpr max_texture_2d_width_t max_texture_2d_width{};
|
|
306
344
|
|
|
307
345
|
// Maximum 2D texture height
|
|
308
|
-
using max_texture_2d_height_t =
|
|
346
|
+
using max_texture_2d_height_t = __dev_attr<::cudaDevAttrMaxTexture2DHeight>;
|
|
309
347
|
static constexpr max_texture_2d_height_t max_texture_2d_height{};
|
|
310
348
|
|
|
311
349
|
// Maximum width for a 2D texture bound to linear memory
|
|
312
|
-
using max_texture_2d_linear_width_t =
|
|
350
|
+
using max_texture_2d_linear_width_t = __dev_attr<::cudaDevAttrMaxTexture2DLinearWidth>;
|
|
313
351
|
static constexpr max_texture_2d_linear_width_t max_texture_2d_linear_width{};
|
|
314
352
|
|
|
315
353
|
// Maximum height for a 2D texture bound to linear memory
|
|
316
|
-
using max_texture_2d_linear_height_t =
|
|
354
|
+
using max_texture_2d_linear_height_t = __dev_attr<::cudaDevAttrMaxTexture2DLinearHeight>;
|
|
317
355
|
static constexpr max_texture_2d_linear_height_t max_texture_2d_linear_height{};
|
|
318
356
|
|
|
319
357
|
// Maximum pitch in bytes for a 2D texture bound to linear memory
|
|
320
|
-
using max_texture_2d_linear_pitch_t =
|
|
358
|
+
using max_texture_2d_linear_pitch_t = __dev_attr<::cudaDevAttrMaxTexture2DLinearPitch>;
|
|
321
359
|
static constexpr max_texture_2d_linear_pitch_t max_texture_2d_linear_pitch{};
|
|
322
360
|
|
|
323
361
|
// Maximum mipmapped 2D texture width
|
|
324
|
-
using max_texture_2d_mipmapped_width_t =
|
|
362
|
+
using max_texture_2d_mipmapped_width_t = __dev_attr<::cudaDevAttrMaxTexture2DMipmappedWidth>;
|
|
325
363
|
static constexpr max_texture_2d_mipmapped_width_t max_texture_2d_mipmapped_width{};
|
|
326
364
|
|
|
327
365
|
// Maximum mipmapped 2D texture height
|
|
328
|
-
using max_texture_2d_mipmapped_height_t =
|
|
366
|
+
using max_texture_2d_mipmapped_height_t = __dev_attr<::cudaDevAttrMaxTexture2DMipmappedHeight>;
|
|
329
367
|
static constexpr max_texture_2d_mipmapped_height_t max_texture_2d_mipmapped_height{};
|
|
330
368
|
|
|
331
369
|
// Maximum 3D texture width
|
|
332
|
-
using max_texture_3d_width_t =
|
|
370
|
+
using max_texture_3d_width_t = __dev_attr<::cudaDevAttrMaxTexture3DWidth>;
|
|
333
371
|
static constexpr max_texture_3d_width_t max_texture_3d_width{};
|
|
334
372
|
|
|
335
373
|
// Maximum 3D texture height
|
|
336
|
-
using max_texture_3d_height_t =
|
|
374
|
+
using max_texture_3d_height_t = __dev_attr<::cudaDevAttrMaxTexture3DHeight>;
|
|
337
375
|
static constexpr max_texture_3d_height_t max_texture_3d_height{};
|
|
338
376
|
|
|
339
377
|
// Maximum 3D texture depth
|
|
340
|
-
using max_texture_3d_depth_t =
|
|
378
|
+
using max_texture_3d_depth_t = __dev_attr<::cudaDevAttrMaxTexture3DDepth>;
|
|
341
379
|
static constexpr max_texture_3d_depth_t max_texture_3d_depth{};
|
|
342
380
|
|
|
343
381
|
// Alternate maximum 3D texture width, 0 if no alternate maximum 3D texture size is supported
|
|
344
|
-
using max_texture_3d_width_alt_t =
|
|
382
|
+
using max_texture_3d_width_alt_t = __dev_attr<::cudaDevAttrMaxTexture3DWidthAlt>;
|
|
345
383
|
static constexpr max_texture_3d_width_alt_t max_texture_3d_width_alt{};
|
|
346
384
|
|
|
347
385
|
// Alternate maximum 3D texture height, 0 if no alternate maximum 3D texture size is supported
|
|
348
|
-
using max_texture_3d_height_alt_t =
|
|
386
|
+
using max_texture_3d_height_alt_t = __dev_attr<::cudaDevAttrMaxTexture3DHeightAlt>;
|
|
349
387
|
static constexpr max_texture_3d_height_alt_t max_texture_3d_height_alt{};
|
|
350
388
|
|
|
351
389
|
// Alternate maximum 3D texture depth, 0 if no alternate maximum 3D texture size is supported
|
|
352
|
-
using max_texture_3d_depth_alt_t =
|
|
390
|
+
using max_texture_3d_depth_alt_t = __dev_attr<::cudaDevAttrMaxTexture3DDepthAlt>;
|
|
353
391
|
static constexpr max_texture_3d_depth_alt_t max_texture_3d_depth_alt{};
|
|
354
392
|
|
|
355
393
|
// Maximum cubemap texture width or height
|
|
356
|
-
using max_texture_cubemap_width_t =
|
|
394
|
+
using max_texture_cubemap_width_t = __dev_attr<::cudaDevAttrMaxTextureCubemapWidth>;
|
|
357
395
|
static constexpr max_texture_cubemap_width_t max_texture_cubemap_width{};
|
|
358
396
|
|
|
359
397
|
// Maximum 1D layered texture width
|
|
360
|
-
using max_texture_1d_layered_width_t =
|
|
398
|
+
using max_texture_1d_layered_width_t = __dev_attr<::cudaDevAttrMaxTexture1DLayeredWidth>;
|
|
361
399
|
static constexpr max_texture_1d_layered_width_t max_texture_1d_layered_width{};
|
|
362
400
|
|
|
363
401
|
// Maximum layers in a 1D layered texture
|
|
364
|
-
using max_texture_1d_layered_layers_t =
|
|
402
|
+
using max_texture_1d_layered_layers_t = __dev_attr<::cudaDevAttrMaxTexture1DLayeredLayers>;
|
|
365
403
|
static constexpr max_texture_1d_layered_layers_t max_texture_1d_layered_layers{};
|
|
366
404
|
|
|
367
405
|
// Maximum 2D layered texture width
|
|
368
|
-
using max_texture_2d_layered_width_t =
|
|
406
|
+
using max_texture_2d_layered_width_t = __dev_attr<::cudaDevAttrMaxTexture2DLayeredWidth>;
|
|
369
407
|
static constexpr max_texture_2d_layered_width_t max_texture_2d_layered_width{};
|
|
370
408
|
|
|
371
409
|
// Maximum 2D layered texture height
|
|
372
|
-
using max_texture_2d_layered_height_t =
|
|
410
|
+
using max_texture_2d_layered_height_t = __dev_attr<::cudaDevAttrMaxTexture2DLayeredHeight>;
|
|
373
411
|
static constexpr max_texture_2d_layered_height_t max_texture_2d_layered_height{};
|
|
374
412
|
|
|
375
413
|
// Maximum layers in a 2D layered texture
|
|
376
|
-
using max_texture_2d_layered_layers_t =
|
|
414
|
+
using max_texture_2d_layered_layers_t = __dev_attr<::cudaDevAttrMaxTexture2DLayeredLayers>;
|
|
377
415
|
static constexpr max_texture_2d_layered_layers_t max_texture_2d_layered_layers{};
|
|
378
416
|
|
|
379
417
|
// Maximum cubemap layered texture width or height
|
|
380
|
-
using max_texture_cubemap_layered_width_t =
|
|
418
|
+
using max_texture_cubemap_layered_width_t = __dev_attr<::cudaDevAttrMaxTextureCubemapLayeredWidth>;
|
|
381
419
|
static constexpr max_texture_cubemap_layered_width_t max_texture_cubemap_layered_width{};
|
|
382
420
|
|
|
383
421
|
// Maximum layers in a cubemap layered texture
|
|
384
|
-
using max_texture_cubemap_layered_layers_t =
|
|
422
|
+
using max_texture_cubemap_layered_layers_t = __dev_attr<::cudaDevAttrMaxTextureCubemapLayeredLayers>;
|
|
385
423
|
static constexpr max_texture_cubemap_layered_layers_t max_texture_cubemap_layered_layers{};
|
|
386
424
|
|
|
387
425
|
// Maximum 1D surface width
|
|
388
|
-
using max_surface_1d_width_t =
|
|
426
|
+
using max_surface_1d_width_t = __dev_attr<::cudaDevAttrMaxSurface1DWidth>;
|
|
389
427
|
static constexpr max_surface_1d_width_t max_surface_1d_width{};
|
|
390
428
|
|
|
391
429
|
// Maximum 2D surface width
|
|
392
|
-
using max_surface_2d_width_t =
|
|
430
|
+
using max_surface_2d_width_t = __dev_attr<::cudaDevAttrMaxSurface2DWidth>;
|
|
393
431
|
static constexpr max_surface_2d_width_t max_surface_2d_width{};
|
|
394
432
|
|
|
395
433
|
// Maximum 2D surface height
|
|
396
|
-
using max_surface_2d_height_t =
|
|
434
|
+
using max_surface_2d_height_t = __dev_attr<::cudaDevAttrMaxSurface2DHeight>;
|
|
397
435
|
static constexpr max_surface_2d_height_t max_surface_2d_height{};
|
|
398
436
|
|
|
399
437
|
// Maximum 3D surface width
|
|
400
|
-
using max_surface_3d_width_t =
|
|
438
|
+
using max_surface_3d_width_t = __dev_attr<::cudaDevAttrMaxSurface3DWidth>;
|
|
401
439
|
static constexpr max_surface_3d_width_t max_surface_3d_width{};
|
|
402
440
|
|
|
403
441
|
// Maximum 3D surface height
|
|
404
|
-
using max_surface_3d_height_t =
|
|
442
|
+
using max_surface_3d_height_t = __dev_attr<::cudaDevAttrMaxSurface3DHeight>;
|
|
405
443
|
static constexpr max_surface_3d_height_t max_surface_3d_height{};
|
|
406
444
|
|
|
407
445
|
// Maximum 3D surface depth
|
|
408
|
-
using max_surface_3d_depth_t =
|
|
446
|
+
using max_surface_3d_depth_t = __dev_attr<::cudaDevAttrMaxSurface3DDepth>;
|
|
409
447
|
static constexpr max_surface_3d_depth_t max_surface_3d_depth{};
|
|
410
448
|
|
|
411
449
|
// Maximum 1D layered surface width
|
|
412
|
-
using max_surface_1d_layered_width_t =
|
|
450
|
+
using max_surface_1d_layered_width_t = __dev_attr<::cudaDevAttrMaxSurface1DLayeredWidth>;
|
|
413
451
|
static constexpr max_surface_1d_layered_width_t max_surface_1d_layered_width{};
|
|
414
452
|
|
|
415
453
|
// Maximum layers in a 1D layered surface
|
|
416
|
-
using max_surface_1d_layered_layers_t =
|
|
454
|
+
using max_surface_1d_layered_layers_t = __dev_attr<::cudaDevAttrMaxSurface1DLayeredLayers>;
|
|
417
455
|
static constexpr max_surface_1d_layered_layers_t max_surface_1d_layered_layers{};
|
|
418
456
|
|
|
419
457
|
// Maximum 2D layered surface width
|
|
420
|
-
using max_surface_2d_layered_width_t =
|
|
458
|
+
using max_surface_2d_layered_width_t = __dev_attr<::cudaDevAttrMaxSurface2DLayeredWidth>;
|
|
421
459
|
static constexpr max_surface_2d_layered_width_t max_surface_2d_layered_width{};
|
|
422
460
|
|
|
423
461
|
// Maximum 2D layered surface height
|
|
424
|
-
using max_surface_2d_layered_height_t =
|
|
462
|
+
using max_surface_2d_layered_height_t = __dev_attr<::cudaDevAttrMaxSurface2DLayeredHeight>;
|
|
425
463
|
static constexpr max_surface_2d_layered_height_t max_surface_2d_layered_height{};
|
|
426
464
|
|
|
427
465
|
// Maximum layers in a 2D layered surface
|
|
428
|
-
using max_surface_2d_layered_layers_t =
|
|
466
|
+
using max_surface_2d_layered_layers_t = __dev_attr<::cudaDevAttrMaxSurface2DLayeredLayers>;
|
|
429
467
|
static constexpr max_surface_2d_layered_layers_t max_surface_2d_layered_layers{};
|
|
430
468
|
|
|
431
469
|
// Maximum cubemap surface width
|
|
432
|
-
using max_surface_cubemap_width_t =
|
|
470
|
+
using max_surface_cubemap_width_t = __dev_attr<::cudaDevAttrMaxSurfaceCubemapWidth>;
|
|
433
471
|
static constexpr max_surface_cubemap_width_t max_surface_cubemap_width{};
|
|
434
472
|
|
|
435
473
|
// Maximum cubemap layered surface width
|
|
436
|
-
using max_surface_cubemap_layered_width_t =
|
|
474
|
+
using max_surface_cubemap_layered_width_t = __dev_attr<::cudaDevAttrMaxSurfaceCubemapLayeredWidth>;
|
|
437
475
|
static constexpr max_surface_cubemap_layered_width_t max_surface_cubemap_layered_width{};
|
|
438
476
|
|
|
439
477
|
// Maximum layers in a cubemap layered surface
|
|
440
|
-
using max_surface_cubemap_layered_layers_t =
|
|
478
|
+
using max_surface_cubemap_layered_layers_t = __dev_attr<::cudaDevAttrMaxSurfaceCubemapLayeredLayers>;
|
|
441
479
|
static constexpr max_surface_cubemap_layered_layers_t max_surface_cubemap_layered_layers{};
|
|
442
480
|
|
|
443
481
|
// Maximum number of 32-bit registers available to a thread block
|
|
444
|
-
using max_registers_per_block_t =
|
|
482
|
+
using max_registers_per_block_t = __dev_attr<::cudaDevAttrMaxRegistersPerBlock>;
|
|
445
483
|
static constexpr max_registers_per_block_t max_registers_per_block{};
|
|
446
484
|
|
|
447
485
|
// Peak clock frequency in kilohertz
|
|
448
|
-
using clock_rate_t =
|
|
486
|
+
using clock_rate_t = __dev_attr<::cudaDevAttrClockRate>;
|
|
449
487
|
static constexpr clock_rate_t clock_rate{};
|
|
450
488
|
|
|
451
489
|
// Alignment requirement; texture base addresses aligned to textureAlign bytes
|
|
452
490
|
// do not need an offset applied to texture fetches
|
|
453
|
-
using texture_alignment_t =
|
|
491
|
+
using texture_alignment_t = __dev_attr<::cudaDevAttrTextureAlignment>;
|
|
454
492
|
static constexpr texture_alignment_t texture_alignment{};
|
|
455
493
|
|
|
456
494
|
// Pitch alignment requirement for 2D texture references bound to pitched memory
|
|
457
|
-
using texture_pitch_alignment_t =
|
|
495
|
+
using texture_pitch_alignment_t = __dev_attr<::cudaDevAttrTexturePitchAlignment>;
|
|
458
496
|
static constexpr texture_pitch_alignment_t texture_pitch_alignment{};
|
|
459
497
|
|
|
460
498
|
// true if the device can concurrently copy memory between host and device
|
|
461
499
|
// while executing a kernel, or false if not
|
|
462
|
-
using gpu_overlap_t =
|
|
500
|
+
using gpu_overlap_t = __dev_attr<::cudaDevAttrGpuOverlap>;
|
|
463
501
|
static constexpr gpu_overlap_t gpu_overlap{};
|
|
464
502
|
|
|
465
503
|
// Number of multiprocessors on the device
|
|
466
|
-
using multiprocessor_count_t =
|
|
504
|
+
using multiprocessor_count_t = __dev_attr<::cudaDevAttrMultiProcessorCount>;
|
|
467
505
|
static constexpr multiprocessor_count_t multiprocessor_count{};
|
|
468
506
|
|
|
469
507
|
// true if there is a run time limit for kernels executed on the device, or
|
|
470
508
|
// false if not
|
|
471
|
-
using kernel_exec_timeout_t =
|
|
509
|
+
using kernel_exec_timeout_t = __dev_attr<::cudaDevAttrKernelExecTimeout>;
|
|
472
510
|
static constexpr kernel_exec_timeout_t kernel_exec_timeout{};
|
|
473
511
|
|
|
474
512
|
// true if the device is integrated with the memory subsystem, or false if not
|
|
475
|
-
using integrated_t =
|
|
513
|
+
using integrated_t = __dev_attr<::cudaDevAttrIntegrated>;
|
|
476
514
|
static constexpr integrated_t integrated{};
|
|
477
515
|
|
|
478
516
|
// true if the device can map host memory into CUDA address space
|
|
479
|
-
using can_map_host_memory_t =
|
|
517
|
+
using can_map_host_memory_t = __dev_attr<::cudaDevAttrCanMapHostMemory>;
|
|
480
518
|
static constexpr can_map_host_memory_t can_map_host_memory{};
|
|
481
519
|
|
|
482
520
|
// The compute mode that the device is currently in.
|
|
483
|
-
using compute_mode_t =
|
|
521
|
+
using compute_mode_t = __dev_attr<::cudaDevAttrComputeMode>;
|
|
484
522
|
static constexpr compute_mode_t compute_mode{};
|
|
485
523
|
|
|
486
524
|
// true if the device supports executing multiple kernels within the same
|
|
487
525
|
// context simultaneously, or false if not. It is not guaranteed that multiple
|
|
488
526
|
// kernels will be resident on the device concurrently so this feature should
|
|
489
527
|
// not be relied upon for correctness.
|
|
490
|
-
using concurrent_kernels_t =
|
|
528
|
+
using concurrent_kernels_t = __dev_attr<::cudaDevAttrConcurrentKernels>;
|
|
491
529
|
static constexpr concurrent_kernels_t concurrent_kernels{};
|
|
492
530
|
|
|
493
531
|
// true if error correction is enabled on the device, false if error correction is
|
|
494
532
|
// disabled or not supported by the device
|
|
495
|
-
using ecc_enabled_t =
|
|
533
|
+
using ecc_enabled_t = __dev_attr<::cudaDevAttrEccEnabled>;
|
|
496
534
|
static constexpr ecc_enabled_t ecc_enabled{};
|
|
497
535
|
|
|
498
536
|
// PCI bus identifier of the device
|
|
499
|
-
using pci_bus_id_t =
|
|
537
|
+
using pci_bus_id_t = __dev_attr<::cudaDevAttrPciBusId>;
|
|
500
538
|
static constexpr pci_bus_id_t pci_bus_id{};
|
|
501
539
|
|
|
502
540
|
// PCI device (also known as slot) identifier of the device
|
|
503
|
-
using pci_device_id_t =
|
|
541
|
+
using pci_device_id_t = __dev_attr<::cudaDevAttrPciDeviceId>;
|
|
504
542
|
static constexpr pci_device_id_t pci_device_id{};
|
|
505
543
|
|
|
506
544
|
// true if the device is using a TCC driver. TCC is only available on Tesla
|
|
507
545
|
// hardware running Windows Vista or later.
|
|
508
|
-
using tcc_driver_t =
|
|
546
|
+
using tcc_driver_t = __dev_attr<::cudaDevAttrTccDriver>;
|
|
509
547
|
static constexpr tcc_driver_t tcc_driver{};
|
|
510
548
|
|
|
511
549
|
// Peak memory clock frequency in kilohertz
|
|
512
|
-
using memory_clock_rate_t =
|
|
550
|
+
using memory_clock_rate_t = __dev_attr<::cudaDevAttrMemoryClockRate>;
|
|
513
551
|
static constexpr memory_clock_rate_t memory_clock_rate{};
|
|
514
552
|
|
|
515
553
|
// Global memory bus width in bits
|
|
516
|
-
using global_memory_bus_width_t =
|
|
554
|
+
using global_memory_bus_width_t = __dev_attr<::cudaDevAttrGlobalMemoryBusWidth>;
|
|
517
555
|
static constexpr global_memory_bus_width_t global_memory_bus_width{};
|
|
518
556
|
|
|
519
557
|
// Size of L2 cache in bytes. 0 if the device doesn't have L2 cache.
|
|
520
|
-
using l2_cache_size_t =
|
|
558
|
+
using l2_cache_size_t = __dev_attr<::cudaDevAttrL2CacheSize>;
|
|
521
559
|
static constexpr l2_cache_size_t l2_cache_size{};
|
|
522
560
|
|
|
523
561
|
// Maximum resident threads per multiprocessor
|
|
524
|
-
using max_threads_per_multiprocessor_t =
|
|
562
|
+
using max_threads_per_multiprocessor_t = __dev_attr<::cudaDevAttrMaxThreadsPerMultiProcessor>;
|
|
525
563
|
static constexpr max_threads_per_multiprocessor_t max_threads_per_multiprocessor{};
|
|
526
564
|
|
|
527
565
|
// true if the device shares a unified address space with the host, or false
|
|
528
566
|
// if not
|
|
529
|
-
using unified_addressing_t =
|
|
567
|
+
using unified_addressing_t = __dev_attr<::cudaDevAttrUnifiedAddressing>;
|
|
530
568
|
static constexpr unified_addressing_t unified_addressing{};
|
|
531
569
|
|
|
532
570
|
// Major compute capability version number
|
|
533
|
-
using compute_capability_major_t =
|
|
571
|
+
using compute_capability_major_t = __dev_attr<::cudaDevAttrComputeCapabilityMajor>;
|
|
534
572
|
static constexpr compute_capability_major_t compute_capability_major{};
|
|
535
573
|
|
|
536
574
|
// Minor compute capability version number
|
|
537
|
-
using compute_capability_minor_t =
|
|
575
|
+
using compute_capability_minor_t = __dev_attr<::cudaDevAttrComputeCapabilityMinor>;
|
|
538
576
|
static constexpr compute_capability_minor_t compute_capability_minor{};
|
|
539
577
|
|
|
540
578
|
// true if the device supports stream priorities, or false if not
|
|
541
|
-
using stream_priorities_supported_t =
|
|
579
|
+
using stream_priorities_supported_t = __dev_attr<::cudaDevAttrStreamPrioritiesSupported>;
|
|
542
580
|
static constexpr stream_priorities_supported_t stream_priorities_supported{};
|
|
543
581
|
|
|
544
582
|
// true if device supports caching globals in L1 cache, false if not
|
|
545
|
-
using global_l1_cache_supported_t =
|
|
583
|
+
using global_l1_cache_supported_t = __dev_attr<::cudaDevAttrGlobalL1CacheSupported>;
|
|
546
584
|
static constexpr global_l1_cache_supported_t global_l1_cache_supported{};
|
|
547
585
|
|
|
548
586
|
// true if device supports caching locals in L1 cache, false if not
|
|
549
|
-
using local_l1_cache_supported_t =
|
|
587
|
+
using local_l1_cache_supported_t = __dev_attr<::cudaDevAttrLocalL1CacheSupported>;
|
|
550
588
|
static constexpr local_l1_cache_supported_t local_l1_cache_supported{};
|
|
551
589
|
|
|
552
590
|
// Maximum amount of shared memory available to a multiprocessor in bytes;
|
|
553
591
|
// this amount is shared by all thread blocks simultaneously resident on a
|
|
554
592
|
// multiprocessor
|
|
555
|
-
using max_shared_memory_per_multiprocessor_t =
|
|
593
|
+
using max_shared_memory_per_multiprocessor_t = __dev_attr<::cudaDevAttrMaxSharedMemoryPerMultiprocessor>;
|
|
556
594
|
static constexpr max_shared_memory_per_multiprocessor_t max_shared_memory_per_multiprocessor{};
|
|
557
595
|
|
|
558
596
|
// Maximum number of 32-bit registers available to a multiprocessor; this
|
|
559
597
|
// number is shared by all thread blocks simultaneously resident on a
|
|
560
598
|
// multiprocessor
|
|
561
|
-
using max_registers_per_multiprocessor_t =
|
|
599
|
+
using max_registers_per_multiprocessor_t = __dev_attr<::cudaDevAttrMaxRegistersPerMultiprocessor>;
|
|
562
600
|
static constexpr max_registers_per_multiprocessor_t max_registers_per_multiprocessor{};
|
|
563
601
|
|
|
564
602
|
// true if device supports allocating managed memory, false if not
|
|
565
|
-
using managed_memory_t =
|
|
603
|
+
using managed_memory_t = __dev_attr<::cudaDevAttrManagedMemory>;
|
|
566
604
|
static constexpr managed_memory_t managed_memory{};
|
|
567
605
|
|
|
568
606
|
// true if device is on a multi-GPU board, false if not
|
|
569
|
-
using is_multi_gpu_board_t =
|
|
607
|
+
using is_multi_gpu_board_t = __dev_attr<::cudaDevAttrIsMultiGpuBoard>;
|
|
570
608
|
static constexpr is_multi_gpu_board_t is_multi_gpu_board{};
|
|
571
609
|
|
|
572
610
|
// Unique identifier for a group of devices on the same multi-GPU board
|
|
573
|
-
using multi_gpu_board_group_id_t =
|
|
611
|
+
using multi_gpu_board_group_id_t = __dev_attr<::cudaDevAttrMultiGpuBoardGroupID>;
|
|
574
612
|
static constexpr multi_gpu_board_group_id_t multi_gpu_board_group_id{};
|
|
575
613
|
|
|
576
614
|
// true if the link between the device and the host supports native atomic
|
|
577
615
|
// operations
|
|
578
|
-
using host_native_atomic_supported_t =
|
|
616
|
+
using host_native_atomic_supported_t = __dev_attr<::cudaDevAttrHostNativeAtomicSupported>;
|
|
579
617
|
static constexpr host_native_atomic_supported_t host_native_atomic_supported{};
|
|
580
618
|
|
|
581
619
|
// Ratio of single precision performance (in floating-point operations per
|
|
582
620
|
// second) to double precision performance
|
|
583
|
-
using single_to_double_precision_perf_ratio_t =
|
|
621
|
+
using single_to_double_precision_perf_ratio_t = __dev_attr<::cudaDevAttrSingleToDoublePrecisionPerfRatio>;
|
|
584
622
|
static constexpr single_to_double_precision_perf_ratio_t single_to_double_precision_perf_ratio{};
|
|
585
623
|
|
|
586
624
|
// true if the device supports coherently accessing pageable memory without
|
|
587
625
|
// calling cudaHostRegister on it, and false otherwise
|
|
588
|
-
using pageable_memory_access_t =
|
|
626
|
+
using pageable_memory_access_t = __dev_attr<::cudaDevAttrPageableMemoryAccess>;
|
|
589
627
|
static constexpr pageable_memory_access_t pageable_memory_access{};
|
|
590
628
|
|
|
591
629
|
// true if the device can coherently access managed memory concurrently with
|
|
592
630
|
// the CPU, and false otherwise
|
|
593
|
-
using concurrent_managed_access_t =
|
|
631
|
+
using concurrent_managed_access_t = __dev_attr<::cudaDevAttrConcurrentManagedAccess>;
|
|
594
632
|
static constexpr concurrent_managed_access_t concurrent_managed_access{};
|
|
595
633
|
|
|
596
634
|
// true if the device supports Compute Preemption, false if not
|
|
597
|
-
using compute_preemption_supported_t =
|
|
635
|
+
using compute_preemption_supported_t = __dev_attr<::cudaDevAttrComputePreemptionSupported>;
|
|
598
636
|
static constexpr compute_preemption_supported_t compute_preemption_supported{};
|
|
599
637
|
|
|
600
638
|
// true if the device can access host registered memory at the same virtual
|
|
601
639
|
// address as the CPU, and false otherwise
|
|
602
|
-
using can_use_host_pointer_for_registered_mem_t =
|
|
640
|
+
using can_use_host_pointer_for_registered_mem_t = __dev_attr<::cudaDevAttrCanUseHostPointerForRegisteredMem>;
|
|
603
641
|
static constexpr can_use_host_pointer_for_registered_mem_t can_use_host_pointer_for_registered_mem{};
|
|
604
642
|
|
|
605
643
|
// true if the device supports launching cooperative kernels via
|
|
606
644
|
// cudaLaunchCooperativeKernel, and false otherwise
|
|
607
|
-
using cooperative_launch_t =
|
|
645
|
+
using cooperative_launch_t = __dev_attr<::cudaDevAttrCooperativeLaunch>;
|
|
608
646
|
static constexpr cooperative_launch_t cooperative_launch{};
|
|
609
647
|
|
|
610
648
|
// true if the device supports flushing of outstanding remote writes, and
|
|
611
649
|
// false otherwise
|
|
612
|
-
using can_flush_remote_writes_t =
|
|
650
|
+
using can_flush_remote_writes_t = __dev_attr<::cudaDevAttrCanFlushRemoteWrites>;
|
|
613
651
|
static constexpr can_flush_remote_writes_t can_flush_remote_writes{};
|
|
614
652
|
|
|
615
653
|
// true if the device supports host memory registration via cudaHostRegister,
|
|
616
654
|
// and false otherwise
|
|
617
|
-
using host_register_supported_t =
|
|
655
|
+
using host_register_supported_t = __dev_attr<::cudaDevAttrHostRegisterSupported>;
|
|
618
656
|
static constexpr host_register_supported_t host_register_supported{};
|
|
619
657
|
|
|
620
658
|
// true if the device accesses pageable memory via the host's page tables, and
|
|
621
659
|
// false otherwise
|
|
622
|
-
using pageable_memory_access_uses_host_page_tables_t =
|
|
623
|
-
__detail::__dev_attr<::cudaDevAttrPageableMemoryAccessUsesHostPageTables>;
|
|
660
|
+
using pageable_memory_access_uses_host_page_tables_t = __dev_attr<::cudaDevAttrPageableMemoryAccessUsesHostPageTables>;
|
|
624
661
|
static constexpr pageable_memory_access_uses_host_page_tables_t pageable_memory_access_uses_host_page_tables{};
|
|
625
662
|
|
|
626
663
|
// true if the host can directly access managed memory on the device without
|
|
627
664
|
// migration, and false otherwise
|
|
628
|
-
using direct_managed_mem_access_from_host_t =
|
|
665
|
+
using direct_managed_mem_access_from_host_t = __dev_attr<::cudaDevAttrDirectManagedMemAccessFromHost>;
|
|
629
666
|
static constexpr direct_managed_mem_access_from_host_t direct_managed_mem_access_from_host{};
|
|
630
667
|
|
|
631
668
|
// Maximum per block shared memory size on the device. This value can be opted
|
|
632
669
|
// into when using dynamic_shared_memory with NonPortableSize set to true
|
|
633
|
-
using max_shared_memory_per_block_optin_t =
|
|
670
|
+
using max_shared_memory_per_block_optin_t = __dev_attr<::cudaDevAttrMaxSharedMemoryPerBlockOptin>;
|
|
634
671
|
static constexpr max_shared_memory_per_block_optin_t max_shared_memory_per_block_optin{};
|
|
635
672
|
|
|
636
673
|
// Maximum number of thread blocks that can reside on a multiprocessor
|
|
637
|
-
using max_blocks_per_multiprocessor_t =
|
|
674
|
+
using max_blocks_per_multiprocessor_t = __dev_attr<::cudaDevAttrMaxBlocksPerMultiprocessor>;
|
|
638
675
|
static constexpr max_blocks_per_multiprocessor_t max_blocks_per_multiprocessor{};
|
|
639
676
|
|
|
640
677
|
// Maximum L2 persisting lines capacity setting in bytes
|
|
641
|
-
using max_persisting_l2_cache_size_t =
|
|
678
|
+
using max_persisting_l2_cache_size_t = __dev_attr<::cudaDevAttrMaxPersistingL2CacheSize>;
|
|
642
679
|
static constexpr max_persisting_l2_cache_size_t max_persisting_l2_cache_size{};
|
|
643
680
|
|
|
644
681
|
// Maximum value of cudaAccessPolicyWindow::num_bytes
|
|
645
|
-
using max_access_policy_window_size_t =
|
|
682
|
+
using max_access_policy_window_size_t = __dev_attr<::cudaDevAttrMaxAccessPolicyWindowSize>;
|
|
646
683
|
static constexpr max_access_policy_window_size_t max_access_policy_window_size{};
|
|
647
684
|
|
|
648
685
|
// Shared memory reserved by CUDA driver per block in bytes
|
|
649
|
-
using reserved_shared_memory_per_block_t =
|
|
686
|
+
using reserved_shared_memory_per_block_t = __dev_attr<::cudaDevAttrReservedSharedMemoryPerBlock>;
|
|
650
687
|
static constexpr reserved_shared_memory_per_block_t reserved_shared_memory_per_block{};
|
|
651
688
|
|
|
652
689
|
// true if the device supports sparse CUDA arrays and sparse CUDA mipmapped arrays.
|
|
653
|
-
using sparse_cuda_array_supported_t =
|
|
690
|
+
using sparse_cuda_array_supported_t = __dev_attr<::cudaDevAttrSparseCudaArraySupported>;
|
|
654
691
|
static constexpr sparse_cuda_array_supported_t sparse_cuda_array_supported{};
|
|
655
692
|
|
|
656
693
|
// Device supports using the cudaHostRegister flag cudaHostRegisterReadOnly to
|
|
657
694
|
// register memory that must be mapped as read-only to the GPU
|
|
658
|
-
using host_register_read_only_supported_t =
|
|
695
|
+
using host_register_read_only_supported_t = __dev_attr<::cudaDevAttrHostRegisterReadOnlySupported>;
|
|
659
696
|
static constexpr host_register_read_only_supported_t host_register_read_only_supported{};
|
|
660
697
|
|
|
661
698
|
// true if the device supports using the cudaMallocAsync and cudaMemPool
|
|
662
699
|
// family of APIs, and false otherwise
|
|
663
|
-
using memory_pools_supported_t =
|
|
700
|
+
using memory_pools_supported_t = __dev_attr<::cudaDevAttrMemoryPoolsSupported>;
|
|
664
701
|
static constexpr memory_pools_supported_t memory_pools_supported{};
|
|
665
702
|
|
|
666
703
|
// true if the device supports GPUDirect RDMA APIs, and false otherwise
|
|
667
|
-
using gpu_direct_rdma_supported_t =
|
|
704
|
+
using gpu_direct_rdma_supported_t = __dev_attr<::cudaDevAttrGPUDirectRDMASupported>;
|
|
668
705
|
static constexpr gpu_direct_rdma_supported_t gpu_direct_rdma_supported{};
|
|
669
706
|
|
|
670
707
|
// bitmask to be interpreted according to the
|
|
671
708
|
// cudaFlushGPUDirectRDMAWritesOptions enum
|
|
672
|
-
using gpu_direct_rdma_flush_writes_options_t =
|
|
709
|
+
using gpu_direct_rdma_flush_writes_options_t = __dev_attr<::cudaDevAttrGPUDirectRDMAFlushWritesOptions>;
|
|
673
710
|
static constexpr gpu_direct_rdma_flush_writes_options_t gpu_direct_rdma_flush_writes_options{};
|
|
674
711
|
|
|
675
712
|
// see the cudaGPUDirectRDMAWritesOrdering enum for numerical values
|
|
676
|
-
using gpu_direct_rdma_writes_ordering_t =
|
|
713
|
+
using gpu_direct_rdma_writes_ordering_t = __dev_attr<::cudaDevAttrGPUDirectRDMAWritesOrdering>;
|
|
677
714
|
static constexpr gpu_direct_rdma_writes_ordering_t gpu_direct_rdma_writes_ordering{};
|
|
678
715
|
|
|
679
716
|
// Bitmask of handle types supported with mempool based IPC
|
|
680
|
-
using memory_pool_supported_handle_types_t =
|
|
717
|
+
using memory_pool_supported_handle_types_t = __dev_attr<::cudaDevAttrMemoryPoolSupportedHandleTypes>;
|
|
681
718
|
static constexpr memory_pool_supported_handle_types_t memory_pool_supported_handle_types{};
|
|
682
719
|
|
|
683
720
|
// true if the device supports deferred mapping CUDA arrays and CUDA mipmapped
|
|
684
721
|
// arrays.
|
|
685
|
-
using deferred_mapping_cuda_array_supported_t =
|
|
722
|
+
using deferred_mapping_cuda_array_supported_t = __dev_attr<::cudaDevAttrDeferredMappingCudaArraySupported>;
|
|
686
723
|
static constexpr deferred_mapping_cuda_array_supported_t deferred_mapping_cuda_array_supported{};
|
|
687
724
|
|
|
688
725
|
// true if the device supports IPC Events, false otherwise.
|
|
689
|
-
using ipc_event_support_t =
|
|
726
|
+
using ipc_event_support_t = __dev_attr<::cudaDevAttrIpcEventSupport>;
|
|
690
727
|
static constexpr ipc_event_support_t ipc_event_support{};
|
|
691
728
|
|
|
692
729
|
# if _CCCL_CTK_AT_LEAST(12, 2)
|
|
693
730
|
// NUMA configuration of a device: value is of type cudaDeviceNumaConfig enum
|
|
694
|
-
using numa_config_t =
|
|
731
|
+
using numa_config_t = __dev_attr<::cudaDevAttrNumaConfig>;
|
|
695
732
|
static constexpr numa_config_t numa_config{};
|
|
696
733
|
|
|
697
734
|
// NUMA node ID of the GPU memory
|
|
698
|
-
using numa_id_t =
|
|
735
|
+
using numa_id_t = __dev_attr<::cudaDevAttrNumaId>;
|
|
699
736
|
static constexpr numa_id_t numa_id{};
|
|
700
737
|
# endif // _CCCL_CTK_AT_LEAST(12, 2)
|
|
701
738
|
|
|
@@ -703,15 +740,29 @@ static constexpr numa_id_t numa_id{};
|
|
|
703
740
|
// capability in a single query
|
|
704
741
|
struct compute_capability_t
|
|
705
742
|
{
|
|
706
|
-
|
|
743
|
+
using type = ::cuda::compute_capability;
|
|
744
|
+
|
|
745
|
+
[[nodiscard]] _CCCL_HOST_API type operator()(device_ref __dev_id) const
|
|
707
746
|
{
|
|
708
|
-
return
|
|
709
|
-
|
|
747
|
+
return type{::cuda::device_attributes::compute_capability_major(__dev_id),
|
|
748
|
+
::cuda::device_attributes::compute_capability_minor(__dev_id)};
|
|
710
749
|
}
|
|
711
750
|
};
|
|
712
751
|
static constexpr compute_capability_t compute_capability{};
|
|
713
752
|
} // namespace device_attributes
|
|
714
753
|
|
|
754
|
+
//! @brief For a given attribute, type of the attribute value.
|
|
755
|
+
//!
|
|
756
|
+
//! @par Example
|
|
757
|
+
//! @code
|
|
758
|
+
//! using threads_per_block_t = device_attribute_result_t<::cudaDevAttrMaxThreadsPerBlock>;
|
|
759
|
+
//! static_assert(std::is_same_v<threads_per_block_t, int>);
|
|
760
|
+
//! @endcode
|
|
761
|
+
//!
|
|
762
|
+
//! @sa device_attributes
|
|
763
|
+
template <::cudaDeviceAttr _Attr>
|
|
764
|
+
using device_attribute_result_t = typename __dev_attr<_Attr>::type;
|
|
765
|
+
|
|
715
766
|
_CCCL_END_NAMESPACE_CUDA
|
|
716
767
|
|
|
717
768
|
# include <cuda/std/__cccl/epilogue.h>
|