cuda-cccl 0.3.0__cp313-cp313-manylinux_2_24_aarch64.whl → 0.3.2__cp313-cp313-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/cooperative/__init__.py +7 -1
- cuda/cccl/cooperative/experimental/__init__.py +21 -5
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
- cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
- cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
- cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
- cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +4 -2
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
- cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
- cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
- cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
- cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
- cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +170 -260
- cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +6 -7
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +12 -29
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +2 -7
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
- cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
- cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
- cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
- cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
- cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
- cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
- cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
- cuda/cccl/headers/include/cuda/__cccl_config +1 -0
- cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
- cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
- cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
- cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
- cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +247 -323
- cuda/cccl/headers/include/cuda/__device/attributes.h +174 -123
- cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
- cuda/cccl/headers/include/cuda/__device/device_ref.h +27 -49
- cuda/cccl/headers/include/cuda/__device/physical_device.h +100 -96
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
- cuda/cccl/headers/include/cuda/__event/event.h +27 -26
- cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
- cuda/cccl/headers/include/cuda/__event/timed_event.h +10 -7
- cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
- cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
- cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
- cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
- cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +21 -137
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
- cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
- cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
- cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
- cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
- cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
- cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
- cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +9 -7
- cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -16
- cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
- cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
- cuda/cccl/headers/include/cuda/algorithm +1 -1
- cuda/cccl/headers/include/cuda/cmath +1 -0
- cuda/cccl/headers/include/cuda/devices +13 -0
- cuda/cccl/headers/include/cuda/iterator +1 -0
- cuda/cccl/headers/include/cuda/memory +1 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
- cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
- cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
- cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
- cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
- cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
- cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
- cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
- cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
- cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
- cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
- cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
- cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
- cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
- cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
- cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
- cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
- cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
- cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
- cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
- cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
- cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
- cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
- cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
- cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
- cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
- cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
- cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
- cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
- cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
- cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
- cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
- cuda/cccl/headers/include/cuda/std/bitset +1 -1
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
- cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
- cuda/cccl/headers/include/cuda/std/numbers +5 -0
- cuda/cccl/headers/include/cuda/std/string_view +155 -13
- cuda/cccl/headers/include/cuda/std/version +1 -4
- cuda/cccl/headers/include/cuda/stream_ref +5 -0
- cuda/cccl/headers/include/cuda/utility +1 -0
- cuda/cccl/headers/include/nv/target +7 -2
- cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
- cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
- cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
- cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
- cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
- cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
- cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
- cuda/cccl/headers/include/thrust/device_delete.h +18 -3
- cuda/cccl/headers/include/thrust/device_free.h +16 -3
- cuda/cccl/headers/include/thrust/device_new.h +29 -8
- cuda/cccl/headers/include/thrust/host_vector.h +1 -1
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
- cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
- cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
- cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
- cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
- cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
- cuda/cccl/parallel/experimental/__init__.py +21 -74
- cuda/compute/__init__.py +79 -0
- cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +43 -1
- cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +157 -8
- cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
- cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +112 -40
- cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +36 -15
- cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
- cuda/compute/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +2 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +36 -8
- cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +206 -1
- cuda/{cccl/parallel/experimental → compute}/numba_utils.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/typing.py +2 -0
- cuda/coop/__init__.py +8 -0
- cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
- cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
- cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
- cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
- cuda/coop/warp/__init__.py +9 -0
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +275 -276
- cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
- cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
- cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
- cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
- cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
- cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
- cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
- cuda/cccl/parallel/experimental/.gitignore +0 -4
- cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
#ifndef _CUDA___DEVICE_ARCH_TRAITS_H
|
|
12
12
|
#define _CUDA___DEVICE_ARCH_TRAITS_H
|
|
13
13
|
|
|
14
|
-
#include <cuda/
|
|
14
|
+
#include <cuda/std/detail/__config>
|
|
15
15
|
|
|
16
16
|
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
|
|
17
17
|
# pragma GCC system_header
|
|
@@ -21,116 +21,93 @@
|
|
|
21
21
|
# pragma system_header
|
|
22
22
|
#endif // no system header
|
|
23
23
|
|
|
24
|
-
#if _CCCL_HAS_CTK()
|
|
25
|
-
|
|
24
|
+
#if _CCCL_HAS_CTK()
|
|
25
|
+
|
|
26
|
+
# include <cuda/__device/arch_id.h>
|
|
27
|
+
# include <cuda/__device/compute_capability.h>
|
|
28
|
+
# include <cuda/__fwd/devices.h>
|
|
26
29
|
# include <cuda/std/__exception/cuda_error.h>
|
|
30
|
+
# include <cuda/std/__type_traits/always_false.h>
|
|
31
|
+
# include <cuda/std/cstdint>
|
|
27
32
|
# include <cuda/std/limits>
|
|
28
33
|
|
|
29
34
|
# include <cuda/std/__cccl/prologue.h>
|
|
30
35
|
|
|
31
36
|
_CCCL_BEGIN_NAMESPACE_CUDA
|
|
32
|
-
namespace arch
|
|
33
|
-
{
|
|
34
|
-
|
|
35
|
-
inline constexpr int __arch_specific_id_multiplier = 100000;
|
|
36
37
|
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
enum class id : int
|
|
41
|
-
{
|
|
42
|
-
sm_60 = 60,
|
|
43
|
-
sm_61 = 61,
|
|
44
|
-
sm_70 = 70,
|
|
45
|
-
sm_75 = 75,
|
|
46
|
-
sm_80 = 80,
|
|
47
|
-
sm_86 = 86,
|
|
48
|
-
sm_89 = 89,
|
|
49
|
-
sm_90 = 90,
|
|
50
|
-
sm_100 = 100,
|
|
51
|
-
sm_103 = 103,
|
|
52
|
-
sm_110 = 110,
|
|
53
|
-
sm_120 = 120,
|
|
54
|
-
sm_90a = 90 * __arch_specific_id_multiplier,
|
|
55
|
-
sm_100a = 100 * __arch_specific_id_multiplier,
|
|
56
|
-
sm_103a = 103 * __arch_specific_id_multiplier,
|
|
57
|
-
sm_110a = 110 * __arch_specific_id_multiplier,
|
|
58
|
-
sm_120a = 120 * __arch_specific_id_multiplier,
|
|
59
|
-
};
|
|
60
|
-
|
|
61
|
-
// @brief Architecture traits
|
|
62
|
-
// This type contains information about an architecture that is constant across devices of that architecture.
|
|
63
|
-
struct traits_t
|
|
38
|
+
//! @brief Architecture traits
|
|
39
|
+
//! This type contains information about an architecture that is constant across devices of that architecture.
|
|
40
|
+
struct arch_traits_t
|
|
64
41
|
{
|
|
65
42
|
// Maximum number of threads per block
|
|
66
|
-
|
|
43
|
+
int max_threads_per_block;
|
|
67
44
|
|
|
68
45
|
// Maximum x-dimension of a block
|
|
69
|
-
|
|
46
|
+
int max_block_dim_x;
|
|
70
47
|
|
|
71
48
|
// Maximum y-dimension of a block
|
|
72
|
-
|
|
49
|
+
int max_block_dim_y;
|
|
73
50
|
|
|
74
51
|
// Maximum z-dimension of a block
|
|
75
|
-
|
|
52
|
+
int max_block_dim_z;
|
|
76
53
|
|
|
77
54
|
// Maximum x-dimension of a grid
|
|
78
|
-
|
|
55
|
+
int max_grid_dim_x;
|
|
79
56
|
|
|
80
57
|
// Maximum y-dimension of a grid
|
|
81
|
-
|
|
58
|
+
int max_grid_dim_y;
|
|
82
59
|
|
|
83
60
|
// Maximum z-dimension of a grid
|
|
84
|
-
|
|
61
|
+
int max_grid_dim_z;
|
|
85
62
|
|
|
86
63
|
// Maximum amount of shared memory available to a thread block in bytes
|
|
87
|
-
|
|
64
|
+
::cuda::std::size_t max_shared_memory_per_block;
|
|
88
65
|
|
|
89
66
|
// Memory available on device for __constant__ variables in a CUDA C kernel in bytes
|
|
90
|
-
|
|
67
|
+
::cuda::std::size_t total_constant_memory;
|
|
91
68
|
|
|
92
69
|
// Warp size in threads
|
|
93
|
-
|
|
70
|
+
int warp_size;
|
|
94
71
|
|
|
95
72
|
// Maximum number of concurrent grids on the device
|
|
96
|
-
|
|
73
|
+
int max_resident_grids;
|
|
97
74
|
|
|
98
75
|
// true if the device can concurrently copy memory between host and device
|
|
99
76
|
// while executing a kernel, or false if not
|
|
100
|
-
|
|
77
|
+
bool gpu_overlap;
|
|
101
78
|
|
|
102
79
|
// true if the device can map host memory into CUDA address space
|
|
103
|
-
|
|
80
|
+
bool can_map_host_memory;
|
|
104
81
|
|
|
105
82
|
// true if the device supports executing multiple kernels within the same
|
|
106
83
|
// context simultaneously, or false if not. It is not guaranteed that multiple
|
|
107
84
|
// kernels will be resident on the device concurrently so this feature should
|
|
108
85
|
// not be relied upon for correctness.
|
|
109
|
-
|
|
86
|
+
bool concurrent_kernels;
|
|
110
87
|
|
|
111
88
|
// true if the device supports stream priorities, or false if not
|
|
112
|
-
|
|
89
|
+
bool stream_priorities_supported;
|
|
113
90
|
|
|
114
91
|
// true if device supports caching globals in L1 cache, false if not
|
|
115
|
-
|
|
92
|
+
bool global_l1_cache_supported;
|
|
116
93
|
|
|
117
94
|
// true if device supports caching locals in L1 cache, false if not
|
|
118
|
-
|
|
95
|
+
bool local_l1_cache_supported;
|
|
119
96
|
|
|
120
97
|
// TODO: We might want to have these per-arch
|
|
121
98
|
// Maximum number of 32-bit registers available to a thread block
|
|
122
|
-
|
|
99
|
+
int max_registers_per_block;
|
|
123
100
|
|
|
124
101
|
// Maximum number of 32-bit registers available to a multiprocessor; this
|
|
125
102
|
// number is shared by all thread blocks simultaneously resident on a
|
|
126
103
|
// multiprocessor
|
|
127
|
-
|
|
104
|
+
int max_registers_per_multiprocessor;
|
|
128
105
|
|
|
129
106
|
// Maximum number of 32-bit registers available to a thread
|
|
130
|
-
|
|
107
|
+
int max_registers_per_thread;
|
|
131
108
|
|
|
132
109
|
// Identifier for the architecture
|
|
133
|
-
|
|
110
|
+
::cuda::arch_id arch_id;
|
|
134
111
|
|
|
135
112
|
// Major compute capability version number
|
|
136
113
|
int compute_capability_major;
|
|
@@ -139,12 +116,12 @@ struct traits_t
|
|
|
139
116
|
int compute_capability_minor;
|
|
140
117
|
|
|
141
118
|
// Compute capability version number in 100 * major + 10 * minor format
|
|
142
|
-
|
|
119
|
+
::cuda::compute_capability compute_capability;
|
|
143
120
|
|
|
144
121
|
// Maximum amount of shared memory available to a multiprocessor in bytes;
|
|
145
122
|
// this amount is shared by all thread blocks simultaneously resident on a
|
|
146
123
|
// multiprocessor
|
|
147
|
-
|
|
124
|
+
::cuda::std::size_t max_shared_memory_per_multiprocessor;
|
|
148
125
|
|
|
149
126
|
// Maximum number of thread blocks that can reside on a multiprocessor
|
|
150
127
|
int max_blocks_per_multiprocessor;
|
|
@@ -156,11 +133,11 @@ struct traits_t
|
|
|
156
133
|
int max_warps_per_multiprocessor;
|
|
157
134
|
|
|
158
135
|
// Shared memory reserved by CUDA driver per block in bytes
|
|
159
|
-
|
|
136
|
+
::cuda::std::size_t reserved_shared_memory_per_block;
|
|
160
137
|
|
|
161
138
|
// Maximum per block shared memory size on the device. This value can be opted
|
|
162
139
|
// into when using dynamic_shared_memory with NonPortableSize set to true
|
|
163
|
-
|
|
140
|
+
::cuda::std::size_t max_shared_memory_per_block_optin;
|
|
164
141
|
|
|
165
142
|
// TODO: Do we want these?:
|
|
166
143
|
// true if architecture supports clusters
|
|
@@ -179,65 +156,81 @@ struct traits_t
|
|
|
179
156
|
bool tma_supported;
|
|
180
157
|
};
|
|
181
158
|
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
159
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t __common_arch_traits(arch_id __arch_id) noexcept
|
|
160
|
+
{
|
|
161
|
+
const compute_capability __cc{__arch_id};
|
|
162
|
+
|
|
163
|
+
arch_traits_t __traits{};
|
|
164
|
+
__traits.max_threads_per_block = 1024;
|
|
165
|
+
__traits.max_block_dim_x = 1024;
|
|
166
|
+
__traits.max_block_dim_y = 1024;
|
|
167
|
+
__traits.max_block_dim_z = 64;
|
|
168
|
+
__traits.max_grid_dim_x = ::cuda::std::numeric_limits<::cuda::std::int32_t>::max();
|
|
169
|
+
__traits.max_grid_dim_y = 64 * 1024 - 1;
|
|
170
|
+
__traits.max_grid_dim_z = 64 * 1024 - 1;
|
|
171
|
+
__traits.max_shared_memory_per_block = 48 * 1024;
|
|
172
|
+
__traits.total_constant_memory = 64 * 1024;
|
|
173
|
+
__traits.warp_size = 32;
|
|
174
|
+
__traits.max_resident_grids = 128;
|
|
175
|
+
__traits.gpu_overlap = true;
|
|
176
|
+
__traits.can_map_host_memory = true;
|
|
177
|
+
__traits.concurrent_kernels = true;
|
|
178
|
+
__traits.stream_priorities_supported = true;
|
|
179
|
+
__traits.global_l1_cache_supported = true;
|
|
180
|
+
__traits.local_l1_cache_supported = true;
|
|
181
|
+
__traits.max_registers_per_block = 64 * 1024;
|
|
182
|
+
__traits.max_registers_per_multiprocessor = 64 * 1024;
|
|
183
|
+
__traits.max_registers_per_thread = 255;
|
|
184
|
+
__traits.arch_id = __arch_id;
|
|
185
|
+
__traits.compute_capability_major = __cc.major();
|
|
186
|
+
__traits.compute_capability_minor = __cc.minor();
|
|
187
|
+
__traits.compute_capability = __cc;
|
|
188
|
+
// __traits.max_shared_memory_per_multiprocessor; // set up individually
|
|
189
|
+
// __traits.max_blocks_per_multiprocessor; // set up individually
|
|
190
|
+
// __traits.max_threads_per_multiprocessor; // set up individually
|
|
191
|
+
// __traits.max_warps_per_multiprocessor; // set up individually
|
|
192
|
+
__traits.reserved_shared_memory_per_block = (__cc >= compute_capability{80}) ? 1024 : 0;
|
|
193
|
+
// __traits.max_shared_memory_per_block_optin; // set up individually
|
|
194
|
+
__traits.cluster_supported = (__cc >= compute_capability{90});
|
|
195
|
+
__traits.redux_intrinisic = (__cc >= compute_capability{80});
|
|
196
|
+
__traits.elect_intrinsic = (__cc >= compute_capability{90});
|
|
197
|
+
__traits.cp_async_supported = (__cc >= compute_capability{80});
|
|
198
|
+
__traits.tma_supported = (__cc >= compute_capability{90});
|
|
199
|
+
return __traits;
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
//! @brief Gets the architecture traits for the given architecture id \c _Id.
|
|
203
|
+
template <arch_id _Id>
|
|
204
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits() noexcept;
|
|
186
205
|
|
|
187
206
|
template <>
|
|
188
|
-
[[nodiscard]]
|
|
207
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_60>() noexcept
|
|
189
208
|
{
|
|
190
|
-
|
|
191
|
-
__traits.arch_id = id::sm_60;
|
|
192
|
-
__traits.compute_capability_major = 6;
|
|
193
|
-
__traits.compute_capability_minor = 0;
|
|
194
|
-
__traits.compute_capability = 60;
|
|
209
|
+
auto __traits = ::cuda::__common_arch_traits(arch_id::sm_60);
|
|
195
210
|
__traits.max_shared_memory_per_multiprocessor = 64 * 1024;
|
|
196
211
|
__traits.max_blocks_per_multiprocessor = 32;
|
|
197
212
|
__traits.max_threads_per_multiprocessor = 2048;
|
|
198
213
|
__traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
|
|
199
|
-
__traits.reserved_shared_memory_per_block = 0;
|
|
200
214
|
__traits.max_shared_memory_per_block_optin = 48 * 1024;
|
|
201
|
-
|
|
202
|
-
__traits.cluster_supported = false;
|
|
203
|
-
__traits.redux_intrinisic = false;
|
|
204
|
-
__traits.elect_intrinsic = false;
|
|
205
|
-
__traits.cp_async_supported = false;
|
|
206
|
-
__traits.tma_supported = false;
|
|
207
215
|
return __traits;
|
|
208
216
|
};
|
|
209
217
|
|
|
210
218
|
template <>
|
|
211
|
-
[[nodiscard]]
|
|
219
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_61>() noexcept
|
|
212
220
|
{
|
|
213
|
-
|
|
214
|
-
__traits.arch_id = id::sm_61;
|
|
215
|
-
__traits.compute_capability_major = 6;
|
|
216
|
-
__traits.compute_capability_minor = 1;
|
|
217
|
-
__traits.compute_capability = 61;
|
|
221
|
+
auto __traits = ::cuda::__common_arch_traits(arch_id::sm_61);
|
|
218
222
|
__traits.max_shared_memory_per_multiprocessor = 96 * 1024;
|
|
219
223
|
__traits.max_blocks_per_multiprocessor = 32;
|
|
220
224
|
__traits.max_threads_per_multiprocessor = 2048;
|
|
221
225
|
__traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
|
|
222
|
-
__traits.reserved_shared_memory_per_block = 0;
|
|
223
226
|
__traits.max_shared_memory_per_block_optin = 48 * 1024;
|
|
224
|
-
|
|
225
|
-
__traits.cluster_supported = false;
|
|
226
|
-
__traits.redux_intrinisic = false;
|
|
227
|
-
__traits.elect_intrinsic = false;
|
|
228
|
-
__traits.cp_async_supported = false;
|
|
229
|
-
__traits.tma_supported = false;
|
|
230
227
|
return __traits;
|
|
231
228
|
};
|
|
232
229
|
|
|
233
230
|
template <>
|
|
234
|
-
[[nodiscard]]
|
|
231
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_70>() noexcept
|
|
235
232
|
{
|
|
236
|
-
|
|
237
|
-
__traits.arch_id = id::sm_70;
|
|
238
|
-
__traits.compute_capability_major = 7;
|
|
239
|
-
__traits.compute_capability_minor = 0;
|
|
240
|
-
__traits.compute_capability = 70;
|
|
233
|
+
auto __traits = ::cuda::__common_arch_traits(arch_id::sm_70);
|
|
241
234
|
__traits.max_shared_memory_per_multiprocessor = 96 * 1024;
|
|
242
235
|
__traits.max_blocks_per_multiprocessor = 32;
|
|
243
236
|
__traits.max_threads_per_multiprocessor = 2048;
|
|
@@ -245,369 +238,300 @@ template <>
|
|
|
245
238
|
__traits.reserved_shared_memory_per_block = 0;
|
|
246
239
|
__traits.max_shared_memory_per_block_optin =
|
|
247
240
|
__traits.max_shared_memory_per_multiprocessor - __traits.reserved_shared_memory_per_block;
|
|
248
|
-
|
|
249
|
-
__traits.cluster_supported = false;
|
|
250
|
-
__traits.redux_intrinisic = false;
|
|
251
|
-
__traits.elect_intrinsic = false;
|
|
252
|
-
__traits.cp_async_supported = false;
|
|
253
|
-
__traits.tma_supported = false;
|
|
254
241
|
return __traits;
|
|
255
242
|
};
|
|
256
243
|
|
|
257
244
|
template <>
|
|
258
|
-
[[nodiscard]]
|
|
245
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_75>() noexcept
|
|
259
246
|
{
|
|
260
|
-
|
|
261
|
-
__traits.arch_id = id::sm_75;
|
|
262
|
-
__traits.compute_capability_major = 7;
|
|
263
|
-
__traits.compute_capability_minor = 5;
|
|
264
|
-
__traits.compute_capability = 75;
|
|
247
|
+
auto __traits = ::cuda::__common_arch_traits(arch_id::sm_75);
|
|
265
248
|
__traits.max_shared_memory_per_multiprocessor = 64 * 1024;
|
|
266
249
|
__traits.max_blocks_per_multiprocessor = 16;
|
|
267
250
|
__traits.max_threads_per_multiprocessor = 1024;
|
|
268
251
|
__traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
|
|
269
|
-
__traits.reserved_shared_memory_per_block = 0;
|
|
270
252
|
__traits.max_shared_memory_per_block_optin =
|
|
271
253
|
__traits.max_shared_memory_per_multiprocessor - __traits.reserved_shared_memory_per_block;
|
|
272
|
-
|
|
273
|
-
__traits.cluster_supported = false;
|
|
274
|
-
__traits.redux_intrinisic = false;
|
|
275
|
-
__traits.elect_intrinsic = false;
|
|
276
|
-
__traits.cp_async_supported = false;
|
|
277
|
-
__traits.tma_supported = false;
|
|
278
254
|
return __traits;
|
|
279
255
|
};
|
|
280
256
|
|
|
281
257
|
template <>
|
|
282
|
-
[[nodiscard]]
|
|
258
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_80>() noexcept
|
|
283
259
|
{
|
|
284
|
-
|
|
285
|
-
__traits.arch_id = id::sm_80;
|
|
286
|
-
__traits.compute_capability_major = 8;
|
|
287
|
-
__traits.compute_capability_minor = 0;
|
|
288
|
-
__traits.compute_capability = 80;
|
|
260
|
+
auto __traits = ::cuda::__common_arch_traits(arch_id::sm_80);
|
|
289
261
|
__traits.max_shared_memory_per_multiprocessor = 164 * 1024;
|
|
290
262
|
__traits.max_blocks_per_multiprocessor = 32;
|
|
291
263
|
__traits.max_threads_per_multiprocessor = 2048;
|
|
292
264
|
__traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
|
|
293
|
-
__traits.reserved_shared_memory_per_block = 1024;
|
|
294
265
|
__traits.max_shared_memory_per_block_optin =
|
|
295
266
|
__traits.max_shared_memory_per_multiprocessor - __traits.reserved_shared_memory_per_block;
|
|
296
|
-
|
|
297
|
-
__traits.cluster_supported = false;
|
|
298
|
-
__traits.redux_intrinisic = true;
|
|
299
|
-
__traits.elect_intrinsic = false;
|
|
300
|
-
__traits.cp_async_supported = true;
|
|
301
|
-
__traits.tma_supported = false;
|
|
302
267
|
return __traits;
|
|
303
268
|
};
|
|
304
269
|
|
|
305
270
|
template <>
|
|
306
|
-
[[nodiscard]]
|
|
271
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_86>() noexcept
|
|
307
272
|
{
|
|
308
|
-
|
|
309
|
-
__traits.arch_id = id::sm_86;
|
|
310
|
-
__traits.compute_capability_major = 8;
|
|
311
|
-
__traits.compute_capability_minor = 6;
|
|
312
|
-
__traits.compute_capability = 86;
|
|
273
|
+
auto __traits = ::cuda::__common_arch_traits(arch_id::sm_86);
|
|
313
274
|
__traits.max_shared_memory_per_multiprocessor = 100 * 1024;
|
|
314
275
|
__traits.max_blocks_per_multiprocessor = 16;
|
|
315
276
|
__traits.max_threads_per_multiprocessor = 1536;
|
|
316
277
|
__traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
|
|
317
|
-
__traits.reserved_shared_memory_per_block = 1024;
|
|
318
278
|
__traits.max_shared_memory_per_block_optin =
|
|
319
279
|
__traits.max_shared_memory_per_multiprocessor - __traits.reserved_shared_memory_per_block;
|
|
280
|
+
return __traits;
|
|
281
|
+
};
|
|
320
282
|
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
__traits
|
|
325
|
-
__traits.
|
|
283
|
+
template <>
|
|
284
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_87>() noexcept
|
|
285
|
+
{
|
|
286
|
+
auto __traits = ::cuda::__common_arch_traits(arch_id::sm_87);
|
|
287
|
+
__traits.max_shared_memory_per_multiprocessor = 164 * 1024;
|
|
288
|
+
__traits.max_blocks_per_multiprocessor = 16;
|
|
289
|
+
__traits.max_threads_per_multiprocessor = 1536;
|
|
290
|
+
__traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
|
|
291
|
+
__traits.max_shared_memory_per_block_optin =
|
|
292
|
+
__traits.max_shared_memory_per_multiprocessor - __traits.reserved_shared_memory_per_block;
|
|
293
|
+
return __traits;
|
|
294
|
+
};
|
|
295
|
+
|
|
296
|
+
template <>
|
|
297
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_88>() noexcept
|
|
298
|
+
{
|
|
299
|
+
auto __traits = ::cuda::arch_traits<arch_id::sm_86>();
|
|
300
|
+
__traits.arch_id = arch_id::sm_88;
|
|
301
|
+
__traits.compute_capability_major = 8;
|
|
302
|
+
__traits.compute_capability_minor = 8;
|
|
303
|
+
__traits.compute_capability = compute_capability{88};
|
|
326
304
|
return __traits;
|
|
327
305
|
};
|
|
328
306
|
|
|
329
307
|
template <>
|
|
330
|
-
[[nodiscard]]
|
|
308
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_89>() noexcept
|
|
331
309
|
{
|
|
332
|
-
|
|
333
|
-
__traits.arch_id = id::sm_89;
|
|
334
|
-
__traits.compute_capability_major = 8;
|
|
335
|
-
__traits.compute_capability_minor = 9;
|
|
336
|
-
__traits.compute_capability = 89;
|
|
310
|
+
auto __traits = ::cuda::__common_arch_traits(arch_id::sm_89);
|
|
337
311
|
__traits.max_shared_memory_per_multiprocessor = 100 * 1024;
|
|
338
312
|
__traits.max_blocks_per_multiprocessor = 24;
|
|
339
313
|
__traits.max_threads_per_multiprocessor = 1536;
|
|
340
314
|
__traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
|
|
341
|
-
__traits.reserved_shared_memory_per_block = 1024;
|
|
342
315
|
__traits.max_shared_memory_per_block_optin =
|
|
343
316
|
__traits.max_shared_memory_per_multiprocessor - __traits.reserved_shared_memory_per_block;
|
|
344
|
-
|
|
345
|
-
__traits.cluster_supported = false;
|
|
346
|
-
__traits.redux_intrinisic = true;
|
|
347
|
-
__traits.elect_intrinsic = false;
|
|
348
|
-
__traits.cp_async_supported = true;
|
|
349
|
-
__traits.tma_supported = false;
|
|
350
317
|
return __traits;
|
|
351
318
|
};
|
|
352
319
|
|
|
353
320
|
template <>
|
|
354
|
-
[[nodiscard]]
|
|
321
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_90>() noexcept
|
|
355
322
|
{
|
|
356
|
-
|
|
357
|
-
__traits.arch_id = id::sm_90;
|
|
358
|
-
__traits.compute_capability_major = 9;
|
|
359
|
-
__traits.compute_capability_minor = 0;
|
|
360
|
-
__traits.compute_capability = 90;
|
|
323
|
+
auto __traits = ::cuda::__common_arch_traits(arch_id::sm_90);
|
|
361
324
|
__traits.max_shared_memory_per_multiprocessor = 228 * 1024;
|
|
362
325
|
__traits.max_blocks_per_multiprocessor = 32;
|
|
363
326
|
__traits.max_threads_per_multiprocessor = 2048;
|
|
364
327
|
__traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
|
|
365
|
-
__traits.reserved_shared_memory_per_block = 1024;
|
|
366
328
|
__traits.max_shared_memory_per_block_optin =
|
|
367
329
|
__traits.max_shared_memory_per_multiprocessor - __traits.reserved_shared_memory_per_block;
|
|
368
|
-
|
|
369
|
-
__traits.cluster_supported = true;
|
|
370
|
-
__traits.redux_intrinisic = true;
|
|
371
|
-
__traits.elect_intrinsic = true;
|
|
372
|
-
__traits.cp_async_supported = true;
|
|
373
|
-
__traits.tma_supported = true;
|
|
374
330
|
return __traits;
|
|
375
331
|
};
|
|
376
332
|
|
|
377
333
|
// No sm_90a specific fields for now.
|
|
378
334
|
template <>
|
|
379
|
-
[[nodiscard]]
|
|
335
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_90a>() noexcept
|
|
380
336
|
{
|
|
381
|
-
|
|
337
|
+
auto __traits = ::cuda::arch_traits<arch_id::sm_90>();
|
|
338
|
+
__traits.arch_id = arch_id::sm_90a;
|
|
339
|
+
return __traits;
|
|
382
340
|
};
|
|
383
341
|
|
|
384
342
|
template <>
|
|
385
|
-
[[nodiscard]]
|
|
343
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_100>() noexcept
|
|
386
344
|
{
|
|
387
|
-
|
|
388
|
-
__traits.arch_id = id::sm_100;
|
|
389
|
-
__traits.compute_capability_major = 10;
|
|
390
|
-
__traits.compute_capability_minor = 0;
|
|
391
|
-
__traits.compute_capability = 100;
|
|
345
|
+
auto __traits = ::cuda::__common_arch_traits(arch_id::sm_100); // seed from sm_100 (not sm_90) so arch_id / compute capability report 10.0; sm_100a inherits these values
|
|
392
346
|
__traits.max_shared_memory_per_multiprocessor = 228 * 1024;
|
|
393
347
|
__traits.max_blocks_per_multiprocessor = 32;
|
|
394
348
|
__traits.max_threads_per_multiprocessor = 2048;
|
|
395
349
|
__traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
|
|
396
|
-
__traits.reserved_shared_memory_per_block = 1024;
|
|
397
350
|
__traits.max_shared_memory_per_block_optin =
|
|
398
351
|
__traits.max_shared_memory_per_multiprocessor - __traits.reserved_shared_memory_per_block;
|
|
399
|
-
|
|
400
|
-
__traits.cluster_supported = true;
|
|
401
|
-
__traits.redux_intrinisic = true;
|
|
402
|
-
__traits.elect_intrinsic = true;
|
|
403
|
-
__traits.cp_async_supported = true;
|
|
404
|
-
__traits.tma_supported = true;
|
|
405
352
|
return __traits;
|
|
406
353
|
};
|
|
407
354
|
|
|
408
355
|
template <>
|
|
409
|
-
[[nodiscard]]
|
|
356
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_100a>() noexcept
|
|
410
357
|
{
|
|
411
|
-
|
|
358
|
+
auto __traits = ::cuda::arch_traits<arch_id::sm_100>();
|
|
359
|
+
__traits.arch_id = arch_id::sm_100a;
|
|
360
|
+
return __traits;
|
|
412
361
|
};
|
|
413
362
|
|
|
414
363
|
template <>
|
|
415
|
-
[[nodiscard]]
|
|
364
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_103>() noexcept
|
|
416
365
|
{
|
|
417
|
-
|
|
418
|
-
__traits.arch_id =
|
|
366
|
+
auto __traits = ::cuda::arch_traits<arch_id::sm_100>();
|
|
367
|
+
__traits.arch_id = arch_id::sm_103;
|
|
419
368
|
__traits.compute_capability_major = 10;
|
|
420
369
|
__traits.compute_capability_minor = 3;
|
|
421
|
-
__traits.compute_capability = 103;
|
|
370
|
+
__traits.compute_capability = compute_capability{103};
|
|
422
371
|
return __traits;
|
|
423
372
|
};
|
|
424
373
|
|
|
425
374
|
template <>
|
|
426
|
-
[[nodiscard]]
|
|
375
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_103a>() noexcept
|
|
427
376
|
{
|
|
428
|
-
|
|
377
|
+
auto __traits = ::cuda::arch_traits<arch_id::sm_103>();
|
|
378
|
+
__traits.arch_id = arch_id::sm_103a;
|
|
379
|
+
return __traits;
|
|
429
380
|
};
|
|
430
381
|
|
|
431
382
|
template <>
|
|
432
|
-
[[nodiscard]]
|
|
383
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_110>() noexcept
|
|
433
384
|
{
|
|
434
|
-
|
|
435
|
-
__traits.arch_id
|
|
436
|
-
__traits.compute_capability_major
|
|
437
|
-
__traits.compute_capability_minor
|
|
438
|
-
__traits.compute_capability
|
|
385
|
+
auto __traits = ::cuda::arch_traits<arch_id::sm_100>();
|
|
386
|
+
__traits.arch_id = arch_id::sm_110;
|
|
387
|
+
__traits.compute_capability_major = 11;
|
|
388
|
+
__traits.compute_capability_minor = 0;
|
|
389
|
+
__traits.compute_capability = compute_capability{110};
|
|
390
|
+
__traits.max_blocks_per_multiprocessor = 24;
|
|
391
|
+
__traits.max_threads_per_multiprocessor = 1536;
|
|
392
|
+
__traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
|
|
439
393
|
return __traits;
|
|
440
394
|
};
|
|
441
395
|
|
|
442
396
|
template <>
|
|
443
|
-
[[nodiscard]]
|
|
397
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_110a>() noexcept
|
|
444
398
|
{
|
|
445
|
-
|
|
399
|
+
auto __traits = ::cuda::arch_traits<arch_id::sm_110>();
|
|
400
|
+
__traits.arch_id = arch_id::sm_110a;
|
|
401
|
+
return __traits;
|
|
446
402
|
};
|
|
447
403
|
|
|
448
404
|
template <>
|
|
449
|
-
[[nodiscard]]
|
|
405
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_120>() noexcept
|
|
450
406
|
{
|
|
451
|
-
|
|
452
|
-
__traits.arch_id = id::sm_120;
|
|
453
|
-
__traits.compute_capability_major = 12;
|
|
454
|
-
__traits.compute_capability_minor = 0;
|
|
455
|
-
__traits.compute_capability = 120;
|
|
407
|
+
auto __traits = ::cuda::__common_arch_traits(arch_id::sm_120);
|
|
456
408
|
__traits.max_shared_memory_per_multiprocessor = 100 * 1024;
|
|
457
|
-
__traits.max_blocks_per_multiprocessor =
|
|
409
|
+
__traits.max_blocks_per_multiprocessor = 24;
|
|
458
410
|
__traits.max_threads_per_multiprocessor = 1536;
|
|
459
411
|
__traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
|
|
460
|
-
__traits.reserved_shared_memory_per_block = 1024;
|
|
461
412
|
__traits.max_shared_memory_per_block_optin =
|
|
462
413
|
__traits.max_shared_memory_per_multiprocessor - __traits.reserved_shared_memory_per_block;
|
|
414
|
+
return __traits;
|
|
415
|
+
};
|
|
463
416
|
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
__traits
|
|
468
|
-
__traits.
|
|
417
|
+
template <>
|
|
418
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_120a>() noexcept
|
|
419
|
+
{
|
|
420
|
+
auto __traits = ::cuda::arch_traits<arch_id::sm_120>();
|
|
421
|
+
__traits.arch_id = arch_id::sm_120a;
|
|
469
422
|
return __traits;
|
|
470
423
|
};
|
|
471
424
|
|
|
472
425
|
template <>
|
|
473
|
-
[[nodiscard]]
|
|
426
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_121>() noexcept
|
|
474
427
|
{
|
|
475
|
-
|
|
428
|
+
auto __traits = ::cuda::arch_traits<arch_id::sm_120>();
|
|
429
|
+
__traits.arch_id = arch_id::sm_121;
|
|
430
|
+
__traits.compute_capability_major = 12;
|
|
431
|
+
__traits.compute_capability_minor = 1;
|
|
432
|
+
__traits.compute_capability = compute_capability{121};
|
|
433
|
+
return __traits;
|
|
476
434
|
};
|
|
477
435
|
|
|
478
|
-
|
|
436
|
+
template <>
|
|
437
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits<arch_id::sm_121a>() noexcept
|
|
438
|
+
{
|
|
439
|
+
auto __traits = ::cuda::arch_traits<arch_id::sm_121>();
|
|
440
|
+
__traits.arch_id = arch_id::sm_121a;
|
|
441
|
+
return __traits;
|
|
442
|
+
};
|
|
479
443
|
|
|
480
|
-
|
|
444
|
+
//! @brief Gets the architecture traits for the given architecture id \c __id.
|
|
445
|
+
//!
|
|
446
|
+
//! @throws \c cuda::cuda_error if the \c __id is not a known architecture.
|
|
447
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits_for(arch_id __id)
|
|
481
448
|
{
|
|
482
449
|
switch (__id)
|
|
483
450
|
{
|
|
484
|
-
case
|
|
485
|
-
return ::cuda::
|
|
486
|
-
case
|
|
487
|
-
return ::cuda::
|
|
488
|
-
case
|
|
489
|
-
return ::cuda::
|
|
490
|
-
case
|
|
491
|
-
return ::cuda::
|
|
492
|
-
case
|
|
493
|
-
return ::cuda::
|
|
494
|
-
case
|
|
495
|
-
return ::cuda::
|
|
496
|
-
case
|
|
497
|
-
return ::cuda::
|
|
498
|
-
case
|
|
499
|
-
return ::cuda::
|
|
500
|
-
case
|
|
501
|
-
return ::cuda::
|
|
502
|
-
case
|
|
503
|
-
return ::cuda::
|
|
504
|
-
case
|
|
505
|
-
return ::cuda::
|
|
506
|
-
case
|
|
507
|
-
return ::cuda::
|
|
508
|
-
case
|
|
509
|
-
return ::cuda::
|
|
510
|
-
case
|
|
511
|
-
return ::cuda::
|
|
512
|
-
case
|
|
513
|
-
return ::cuda::
|
|
514
|
-
case
|
|
515
|
-
return ::cuda::
|
|
516
|
-
case
|
|
517
|
-
return ::cuda::
|
|
451
|
+
case arch_id::sm_60:
|
|
452
|
+
return ::cuda::arch_traits<arch_id::sm_60>();
|
|
453
|
+
case arch_id::sm_61:
|
|
454
|
+
return ::cuda::arch_traits<arch_id::sm_61>();
|
|
455
|
+
case arch_id::sm_70:
|
|
456
|
+
return ::cuda::arch_traits<arch_id::sm_70>();
|
|
457
|
+
case arch_id::sm_75:
|
|
458
|
+
return ::cuda::arch_traits<arch_id::sm_75>();
|
|
459
|
+
case arch_id::sm_80:
|
|
460
|
+
return ::cuda::arch_traits<arch_id::sm_80>();
|
|
461
|
+
case arch_id::sm_86:
|
|
462
|
+
return ::cuda::arch_traits<arch_id::sm_86>();
|
|
463
|
+
case arch_id::sm_87:
|
|
464
|
+
return ::cuda::arch_traits<arch_id::sm_87>();
|
|
465
|
+
case arch_id::sm_88:
|
|
466
|
+
return ::cuda::arch_traits<arch_id::sm_88>();
|
|
467
|
+
case arch_id::sm_89:
|
|
468
|
+
return ::cuda::arch_traits<arch_id::sm_89>();
|
|
469
|
+
case arch_id::sm_90:
|
|
470
|
+
return ::cuda::arch_traits<arch_id::sm_90>();
|
|
471
|
+
case arch_id::sm_90a:
|
|
472
|
+
return ::cuda::arch_traits<arch_id::sm_90a>();
|
|
473
|
+
case arch_id::sm_100:
|
|
474
|
+
return ::cuda::arch_traits<arch_id::sm_100>();
|
|
475
|
+
case arch_id::sm_100a:
|
|
476
|
+
return ::cuda::arch_traits<arch_id::sm_100a>();
|
|
477
|
+
case arch_id::sm_103:
|
|
478
|
+
return ::cuda::arch_traits<arch_id::sm_103>();
|
|
479
|
+
case arch_id::sm_103a:
|
|
480
|
+
return ::cuda::arch_traits<arch_id::sm_103a>();
|
|
481
|
+
case arch_id::sm_110:
|
|
482
|
+
return ::cuda::arch_traits<arch_id::sm_110>();
|
|
483
|
+
case arch_id::sm_110a:
|
|
484
|
+
return ::cuda::arch_traits<arch_id::sm_110a>();
|
|
485
|
+
case arch_id::sm_120:
|
|
486
|
+
return ::cuda::arch_traits<arch_id::sm_120>();
|
|
487
|
+
case arch_id::sm_120a:
|
|
488
|
+
return ::cuda::arch_traits<arch_id::sm_120a>();
|
|
489
|
+
case arch_id::sm_121:
|
|
490
|
+
return ::cuda::arch_traits<arch_id::sm_121>();
|
|
491
|
+
case arch_id::sm_121a:
|
|
492
|
+
return ::cuda::arch_traits<arch_id::sm_121a>();
|
|
518
493
|
default:
|
|
519
494
|
::cuda::__throw_cuda_error(::cudaErrorInvalidValue, "Traits requested for an unknown architecture");
|
|
520
495
|
break;
|
|
521
496
|
}
|
|
522
497
|
}
|
|
523
498
|
|
|
524
|
-
|
|
499
|
+
//! @brief Gets the architecture traits for the given compute capability \c __cc.
|
|
500
|
+
//!
|
|
501
|
+
//! @throws \c cuda::cuda_error if the \c __cc doesn't have a corresponding architecture id.
|
|
502
|
+
[[nodiscard]] _CCCL_API constexpr arch_traits_t arch_traits_for(compute_capability __cc)
|
|
525
503
|
{
|
|
526
|
-
|
|
527
|
-
{
|
|
528
|
-
::cuda::__throw_cuda_error(::cudaErrorInvalidValue, "Compute capability out of range");
|
|
529
|
-
}
|
|
530
|
-
return static_cast<id>(compute_capability);
|
|
504
|
+
return ::cuda::arch_traits_for(::cuda::to_arch_id(__cc));
|
|
531
505
|
}
|
|
532
506
|
|
|
533
|
-
|
|
534
|
-
{
|
|
535
|
-
return ::cuda::arch::traits_for_id(::cuda::arch::id_for_compute_capability(compute_capability));
|
|
536
|
-
}
|
|
507
|
+
_CCCL_END_NAMESPACE_CUDA
|
|
537
508
|
|
|
538
|
-
|
|
539
|
-
{
|
|
540
|
-
switch (value)
|
|
541
|
-
{
|
|
542
|
-
case 90:
|
|
543
|
-
return id::sm_90a;
|
|
544
|
-
case 100:
|
|
545
|
-
return id::sm_100a;
|
|
546
|
-
case 103:
|
|
547
|
-
return id::sm_103a;
|
|
548
|
-
case 110:
|
|
549
|
-
return id::sm_110a;
|
|
550
|
-
case 120:
|
|
551
|
-
return id::sm_120a;
|
|
552
|
-
default:
|
|
553
|
-
::cuda::__throw_cuda_error(::cudaErrorInvalidValue, "Compute capability out of range");
|
|
554
|
-
break;
|
|
555
|
-
}
|
|
556
|
-
}
|
|
509
|
+
# if _CCCL_CUDA_COMPILATION()
|
|
557
510
|
|
|
558
|
-
|
|
559
|
-
[[nodiscard]] _CCCL_DEVICE inline constexpr arch::traits_t current_traits()
|
|
560
|
-
{
|
|
561
|
-
// fixme: this doesn't work with nvc++ -cuda
|
|
562
|
-
# ifdef __CUDA_ARCH__
|
|
563
|
-
# ifdef __CUDA_ARCH_SPECIFIC__
|
|
564
|
-
return ::cuda::arch::traits_for_id(::cuda::arch::__special_id_for_compute_capability(__CUDA_ARCH_SPECIFIC__ / 10));
|
|
565
|
-
# else
|
|
566
|
-
return ::cuda::arch::traits_for_compute_capability(__CUDA_ARCH__ / 10);
|
|
567
|
-
# endif // __CUDA_ARCH_SPECIFIC__
|
|
568
|
-
# else // __CUDA_ARCH__
|
|
569
|
-
// Should be unreachable in __device__ function
|
|
570
|
-
return ::cuda::arch::traits_t{};
|
|
571
|
-
# endif // __CUDA_ARCH__
|
|
572
|
-
}
|
|
511
|
+
_CCCL_BEGIN_NAMESPACE_CUDA_DEVICE
|
|
573
512
|
|
|
574
|
-
|
|
575
|
-
|
|
513
|
+
//! @brief Returns the \c cuda::arch_traits_t of the architecture that is currently being compiled.
|
|
514
|
+
//!
|
|
515
|
+
//! If the current architecture is not a known architecture from \c cuda::arch_id enumeration, the compilation
|
|
516
|
+
//! will fail.
|
|
517
|
+
//!
|
|
518
|
+
//! @note This API cannot be used in constexpr context when compiling with nvc++ in CUDA mode.
|
|
519
|
+
template <class _Dummy = void>
|
|
520
|
+
[[nodiscard]] _CCCL_DEVICE_API _CCCL_TARGET_CONSTEXPR ::cuda::arch_traits_t current_arch_traits() noexcept
|
|
576
521
|
{
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
}
|
|
581
|
-
|
|
582
|
-
{
|
|
583
|
-
// If the architecture is unknown, we need to craft the arch_traits from attributes
|
|
584
|
-
arch::traits_t __traits{};
|
|
585
|
-
__traits.compute_capability_major = __compute_capability / 10;
|
|
586
|
-
__traits.compute_capability_minor = __compute_capability % 10;
|
|
587
|
-
__traits.compute_capability = __compute_capability;
|
|
588
|
-
__traits.max_shared_memory_per_multiprocessor =
|
|
589
|
-
::cuda::device_attributes::max_shared_memory_per_multiprocessor(__device);
|
|
590
|
-
__traits.max_blocks_per_multiprocessor = ::cuda::device_attributes::max_blocks_per_multiprocessor(__device);
|
|
591
|
-
__traits.max_threads_per_multiprocessor = ::cuda::device_attributes::max_threads_per_multiprocessor(__device);
|
|
592
|
-
__traits.max_warps_per_multiprocessor = __traits.max_threads_per_multiprocessor / __traits.warp_size;
|
|
593
|
-
__traits.reserved_shared_memory_per_block = ::cuda::device_attributes::reserved_shared_memory_per_block(__device);
|
|
594
|
-
__traits.max_shared_memory_per_block_optin =
|
|
595
|
-
__traits.max_shared_memory_per_multiprocessor - __traits.reserved_shared_memory_per_block;
|
|
596
|
-
|
|
597
|
-
__traits.cluster_supported = __compute_capability >= 90;
|
|
598
|
-
__traits.redux_intrinisic = __compute_capability >= 80;
|
|
599
|
-
__traits.elect_intrinsic = __compute_capability >= 90;
|
|
600
|
-
__traits.cp_async_supported = __compute_capability >= 80;
|
|
601
|
-
__traits.tma_supported = __compute_capability >= 90;
|
|
602
|
-
return __traits;
|
|
603
|
-
}
|
|
522
|
+
# if _CCCL_DEVICE_COMPILATION()
|
|
523
|
+
return ::cuda::arch_traits_for(::cuda::device::current_arch_id<_Dummy>());
|
|
524
|
+
# else // ^^^ _CCCL_DEVICE_COMPILATION() ^^^ / vvv !_CCCL_DEVICE_COMPILATION() vvv
|
|
525
|
+
return {};
|
|
526
|
+
# endif // ^^^ !_CCCL_DEVICE_COMPILATION() ^^^
|
|
604
527
|
}
|
|
605
|
-
} // namespace arch
|
|
606
528
|
|
|
607
|
-
|
|
529
|
+
_CCCL_END_NAMESPACE_CUDA_DEVICE
|
|
530
|
+
|
|
531
|
+
# endif // _CCCL_CUDA_COMPILATION
|
|
608
532
|
|
|
609
533
|
# include <cuda/std/__cccl/epilogue.h>
|
|
610
534
|
|
|
611
|
-
#endif // _CCCL_HAS_CTK()
|
|
535
|
+
#endif // _CCCL_HAS_CTK()
|
|
612
536
|
|
|
613
537
|
#endif // _CUDA___DEVICE_ARCH_TRAITS_H
|