cuda-cccl 0.1.3.1.0.dev1486__cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/__init__.py +14 -0
- cuda/cccl/cooperative/__init__.py +3 -0
- cuda/cccl/cooperative/experimental/__init__.py +8 -0
- cuda/cccl/cooperative/experimental/_caching.py +48 -0
- cuda/cccl/cooperative/experimental/_common.py +276 -0
- cuda/cccl/cooperative/experimental/_nvrtc.py +91 -0
- cuda/cccl/cooperative/experimental/_scan_op.py +181 -0
- cuda/cccl/cooperative/experimental/_types.py +953 -0
- cuda/cccl/cooperative/experimental/_typing.py +107 -0
- cuda/cccl/cooperative/experimental/block/__init__.py +33 -0
- cuda/cccl/cooperative/experimental/block/_block_load_store.py +215 -0
- cuda/cccl/cooperative/experimental/block/_block_merge_sort.py +125 -0
- cuda/cccl/cooperative/experimental/block/_block_radix_sort.py +214 -0
- cuda/cccl/cooperative/experimental/block/_block_reduce.py +294 -0
- cuda/cccl/cooperative/experimental/block/_block_scan.py +983 -0
- cuda/cccl/cooperative/experimental/warp/__init__.py +9 -0
- cuda/cccl/cooperative/experimental/warp/_warp_merge_sort.py +98 -0
- cuda/cccl/cooperative/experimental/warp/_warp_reduce.py +153 -0
- cuda/cccl/cooperative/experimental/warp/_warp_scan.py +78 -0
- cuda/cccl/headers/__init__.py +7 -0
- cuda/cccl/headers/include/__init__.py +1 -0
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +261 -0
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1181 -0
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +84 -0
- cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +919 -0
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +227 -0
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +752 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +766 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +286 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +704 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +557 -0
- cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +678 -0
- cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +804 -0
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +997 -0
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +561 -0
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +473 -0
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +292 -0
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1032 -0
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +342 -0
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +592 -0
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +614 -0
- cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1346 -0
- cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +965 -0
- cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1217 -0
- cuda/cccl/headers/include/cub/block/block_exchange.cuh +1306 -0
- cuda/cccl/headers/include/cub/block/block_histogram.cuh +420 -0
- cuda/cccl/headers/include/cub/block/block_load.cuh +1259 -0
- cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +787 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1218 -0
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2193 -0
- cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +150 -0
- cuda/cccl/headers/include/cub/block/block_reduce.cuh +629 -0
- cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +437 -0
- cuda/cccl/headers/include/cub/block/block_scan.cuh +2600 -0
- cuda/cccl/headers/include/cub/block/block_shuffle.cuh +346 -0
- cuda/cccl/headers/include/cub/block/block_store.cuh +1246 -0
- cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +620 -0
- cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +86 -0
- cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +240 -0
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +252 -0
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +238 -0
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +259 -0
- cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +790 -0
- cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +538 -0
- cuda/cccl/headers/include/cub/config.cuh +60 -0
- cuda/cccl/headers/include/cub/cub.cuh +112 -0
- cuda/cccl/headers/include/cub/detail/array_utils.cuh +77 -0
- cuda/cccl/headers/include/cub/detail/choose_offset.cuh +155 -0
- cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +93 -0
- cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +96 -0
- cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +246 -0
- cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +120 -0
- cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +74 -0
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +118 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/README.md +71 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/array.h +68 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/json.h +61 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/object.h +100 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/string.h +71 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/value.h +93 -0
- cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +63 -0
- cuda/cccl/headers/include/cub/detail/rfa.cuh +724 -0
- cuda/cccl/headers/include/cub/detail/strong_load.cuh +189 -0
- cuda/cccl/headers/include/cub/detail/strong_store.cuh +220 -0
- cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +355 -0
- cuda/cccl/headers/include/cub/detail/type_traits.cuh +206 -0
- cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +72 -0
- cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +56 -0
- cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +596 -0
- cuda/cccl/headers/include/cub/device/device_copy.cuh +187 -0
- cuda/cccl/headers/include/cub/device/device_for.cuh +994 -0
- cuda/cccl/headers/include/cub/device/device_histogram.cuh +1507 -0
- cuda/cccl/headers/include/cub/device/device_memcpy.cuh +195 -0
- cuda/cccl/headers/include/cub/device/device_merge.cuh +202 -0
- cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +979 -0
- cuda/cccl/headers/include/cub/device/device_partition.cuh +664 -0
- cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3431 -0
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +1387 -0
- cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +368 -0
- cuda/cccl/headers/include/cub/device/device_scan.cuh +1901 -0
- cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1496 -0
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1512 -0
- cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2811 -0
- cuda/cccl/headers/include/cub/device/device_select.cuh +1224 -0
- cuda/cccl/headers/include/cub/device/device_transform.cuh +313 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +314 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +109 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +718 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +45 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +197 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1051 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +305 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +473 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1748 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +1316 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +625 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +502 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +548 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +497 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +598 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +1374 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +838 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +341 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +439 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +552 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +397 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +543 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +218 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/histogram.cuh +505 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/merge_sort.cuh +338 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/radix_sort.cuh +799 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +523 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +194 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +330 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +437 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/unique_by_key.cuh +176 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +70 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +121 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +63 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +278 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +91 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +118 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1068 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +397 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +945 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +675 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +555 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +1013 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +249 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1587 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +407 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +283 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +874 -0
- cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +215 -0
- cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +106 -0
- cuda/cccl/headers/include/cub/grid/grid_queue.cuh +202 -0
- cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +256 -0
- cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +238 -0
- cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +252 -0
- cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +322 -0
- cuda/cccl/headers/include/cub/thread/thread_load.cuh +347 -0
- cuda/cccl/headers/include/cub/thread/thread_operators.cuh +629 -0
- cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +504 -0
- cuda/cccl/headers/include/cub/thread/thread_scan.cuh +340 -0
- cuda/cccl/headers/include/cub/thread/thread_search.cuh +198 -0
- cuda/cccl/headers/include/cub/thread/thread_simd.cuh +406 -0
- cuda/cccl/headers/include/cub/thread/thread_sort.cuh +101 -0
- cuda/cccl/headers/include/cub/thread/thread_store.cuh +364 -0
- cuda/cccl/headers/include/cub/util_allocator.cuh +921 -0
- cuda/cccl/headers/include/cub/util_arch.cuh +163 -0
- cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +95 -0
- cuda/cccl/headers/include/cub/util_debug.cuh +207 -0
- cuda/cccl/headers/include/cub/util_device.cuh +779 -0
- cuda/cccl/headers/include/cub/util_macro.cuh +91 -0
- cuda/cccl/headers/include/cub/util_math.cuh +115 -0
- cuda/cccl/headers/include/cub/util_namespace.cuh +176 -0
- cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +55 -0
- cuda/cccl/headers/include/cub/util_ptx.cuh +513 -0
- cuda/cccl/headers/include/cub/util_temporary_storage.cuh +122 -0
- cuda/cccl/headers/include/cub/util_type.cuh +1111 -0
- cuda/cccl/headers/include/cub/util_vsmem.cuh +251 -0
- cuda/cccl/headers/include/cub/version.cuh +89 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +329 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +177 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +729 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +405 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +688 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +437 -0
- cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +405 -0
- cuda/cccl/headers/include/cub/warp/warp_load.cuh +614 -0
- cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +169 -0
- cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +822 -0
- cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1156 -0
- cuda/cccl/headers/include/cub/warp/warp_store.cuh +520 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +169 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +172 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +210 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +84 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +127 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +209 -0
- cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
- cuda/cccl/headers/include/cuda/__barrier/aligned_size.h +61 -0
- cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier.h +66 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +100 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +454 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +72 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_native_handle.h +45 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +61 -0
- cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +171 -0
- cuda/cccl/headers/include/cuda/__bit/bitfield.h +122 -0
- cuda/cccl/headers/include/cuda/__bit/bitmask.h +88 -0
- cuda/cccl/headers/include/cuda/__cccl_config +36 -0
- cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +126 -0
- cuda/cccl/headers/include/cuda/__cmath/ilog.h +195 -0
- cuda/cccl/headers/include/cuda/__cmath/ipow.h +107 -0
- cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
- cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
- cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
- cuda/cccl/headers/include/cuda/__cmath/round_down.h +104 -0
- cuda/cccl/headers/include/cuda/__cmath/round_up.h +106 -0
- cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
- cuda/cccl/headers/include/cuda/__execution/determinism.h +90 -0
- cuda/cccl/headers/include/cuda/__execution/require.h +67 -0
- cuda/cccl/headers/include/cuda/__execution/tune.h +62 -0
- cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
- cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +279 -0
- cuda/cccl/headers/include/cuda/__functional/get_device_address.h +58 -0
- cuda/cccl/headers/include/cuda/__functional/maximum.h +58 -0
- cuda/cccl/headers/include/cuda/__functional/minimum.h +58 -0
- cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +108 -0
- cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
- cuda/cccl/headers/include/cuda/__fwd/barrier_native_handle.h +42 -0
- cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
- cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
- cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +261 -0
- cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +407 -0
- cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +314 -0
- cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +323 -0
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +481 -0
- cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +457 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +63 -0
- cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +123 -0
- cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +51 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +60 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +98 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +162 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +49 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +99 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +170 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +59 -0
- cuda/cccl/headers/include/cuda/__memory/address_space.h +86 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +94 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +158 -0
- cuda/cccl/headers/include/cuda/__memory_resource/properties.h +73 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource.h +129 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +653 -0
- cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
- cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +57 -0
- cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
- cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +101 -0
- cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2982 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +240 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +193 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +52 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +957 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +288 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +596 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1445 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +117 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +62 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +101 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +62 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +15074 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +385 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +176 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +94 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +137 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +138 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +280 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +282 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2148 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1272 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +228 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +430 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1830 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +105 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +81 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +612 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +44 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +4446 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +4061 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +6438 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +36 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +4582 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +44 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +67 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +750 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +275 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
- cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +151 -0
- cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
- cuda/cccl/headers/include/cuda/__stream/get_stream.h +97 -0
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +165 -0
- cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
- cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
- cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +66 -0
- cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +249 -0
- cuda/cccl/headers/include/cuda/access_property +26 -0
- cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
- cuda/cccl/headers/include/cuda/atomic +27 -0
- cuda/cccl/headers/include/cuda/barrier +262 -0
- cuda/cccl/headers/include/cuda/bit +29 -0
- cuda/cccl/headers/include/cuda/cmath +35 -0
- cuda/cccl/headers/include/cuda/discard_memory +61 -0
- cuda/cccl/headers/include/cuda/functional +31 -0
- cuda/cccl/headers/include/cuda/iterator +31 -0
- cuda/cccl/headers/include/cuda/latch +27 -0
- cuda/cccl/headers/include/cuda/mdspan +28 -0
- cuda/cccl/headers/include/cuda/memory +28 -0
- cuda/cccl/headers/include/cuda/memory_resource +41 -0
- cuda/cccl/headers/include/cuda/numeric +28 -0
- cuda/cccl/headers/include/cuda/pipeline +579 -0
- cuda/cccl/headers/include/cuda/ptx +118 -0
- cuda/cccl/headers/include/cuda/semaphore +31 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +60 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +46 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +46 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +52 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +64 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +143 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +79 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +74 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +129 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find.h +64 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +92 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +51 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +58 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +50 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +69 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +188 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +83 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +72 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +96 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +70 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +88 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +71 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +141 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/move.h +88 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +89 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +46 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +121 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +95 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +89 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +103 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +99 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +69 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +264 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +123 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +135 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +129 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +72 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +77 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +156 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +96 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +127 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
- cuda/cccl/headers/include/cuda/std/__algorithm_ +26 -0
- cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
- cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +138 -0
- cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +218 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +401 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +3971 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +177 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +211 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
- cuda/cccl/headers/include/cuda/std/__atomic/order.h +159 -0
- cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
- cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
- cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +250 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +105 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +73 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +228 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types.h +52 -0
- cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
- cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
- cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
- cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
- cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +84 -0
- cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +77 -0
- cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +183 -0
- cuda/cccl/headers/include/cuda/std/__bit/countl.h +167 -0
- cuda/cccl/headers/include/cuda/std/__bit/countr.h +185 -0
- cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
- cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
- cuda/cccl/headers/include/cuda/std/__bit/integral.h +124 -0
- cuda/cccl/headers/include/cuda/std/__bit/popcount.h +154 -0
- cuda/cccl/headers/include/cuda/std/__bit/reference.h +1274 -0
- cuda/cccl/headers/include/cuda/std/__bit/rotate.h +94 -0
- cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
- cuda/cccl/headers/include/cuda/std/__cccl/assert.h +146 -0
- cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +207 -0
- cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +1343 -0
- cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +216 -0
- cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +43 -0
- cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +53 -0
- cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +69 -0
- cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +129 -0
- cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +124 -0
- cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +326 -0
- cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +35 -0
- cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +68 -0
- cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +129 -0
- cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +73 -0
- cuda/cccl/headers/include/cuda/std/__cccl/os.h +48 -0
- cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1234 -0
- cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +267 -0
- cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +176 -0
- cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
- cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
- cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
- cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
- cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
- cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +112 -0
- cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
- cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
- cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +148 -0
- cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
- cuda/cccl/headers/include/cuda/std/__charconv_ +30 -0
- cuda/cccl/headers/include/cuda/std/__cmath/abs.h +240 -0
- cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +187 -0
- cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +620 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +207 -0
- cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +181 -0
- cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +250 -0
- cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +213 -0
- cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +250 -0
- cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +323 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +163 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +201 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +176 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +129 -0
- cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +106 -0
- cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +503 -0
- cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +236 -0
- cuda/cccl/headers/include/cuda/std/__cmath/nvbf16.h +58 -0
- cuda/cccl/headers/include/cuda/std/__cmath/nvfp16.h +58 -0
- cuda/cccl/headers/include/cuda/std/__cmath/roots.h +180 -0
- cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +877 -0
- cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +155 -0
- cuda/cccl/headers/include/cuda/std/__cmath/traits.h +170 -0
- cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +292 -0
- cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +351 -0
- cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +350 -0
- cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +135 -0
- cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
- cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
- cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
- cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +46 -0
- cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
- cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
- cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +274 -0
- cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +107 -0
- cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +71 -0
- cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +57 -0
- cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
- cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
- cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +100 -0
- cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
- cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
- cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
- cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
- cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
- cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +42 -0
- cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
- cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
- cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
- cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
- cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +69 -0
- cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +62 -0
- cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +72 -0
- cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +143 -0
- cuda/cccl/headers/include/cuda/std/__exception/terminate.h +73 -0
- cuda/cccl/headers/include/cuda/std/__execution/env.h +436 -0
- cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
- cuda/cccl/headers/include/cuda/std/__expected/expected.h +2002 -0
- cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1078 -0
- cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
- cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +178 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +809 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +172 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +103 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +39 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +64 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/nvfp_types.h +58 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
- cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
- cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
- cuda/cccl/headers/include/cuda/std/__functional/bind.h +352 -0
- cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +88 -0
- cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +73 -0
- cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +75 -0
- cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +75 -0
- cuda/cccl/headers/include/cuda/std/__functional/compose.h +69 -0
- cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +75 -0
- cuda/cccl/headers/include/cuda/std/__functional/function.h +1277 -0
- cuda/cccl/headers/include/cuda/std/__functional/hash.h +650 -0
- cuda/cccl/headers/include/cuda/std/__functional/identity.h +61 -0
- cuda/cccl/headers/include/cuda/std/__functional/invoke.h +560 -0
- cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +43 -0
- cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +67 -0
- cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +214 -0
- cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +121 -0
- cuda/cccl/headers/include/cuda/std/__functional/operations.h +534 -0
- cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +127 -0
- cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +65 -0
- cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +64 -0
- cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
- cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +113 -0
- cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
- cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +67 -0
- cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
- cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +278 -0
- cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +35 -0
- cuda/cccl/headers/include/cuda/std/__fwd/array.h +36 -0
- cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +49 -0
- cuda/cccl/headers/include/cuda/std/__fwd/complex.h +34 -0
- cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
- cuda/cccl/headers/include/cuda/std/__fwd/get.h +123 -0
- cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
- cuda/cccl/headers/include/cuda/std/__fwd/iterator_traits.h +40 -0
- cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +73 -0
- cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
- cuda/cccl/headers/include/cuda/std/__fwd/pair.h +34 -0
- cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +34 -0
- cuda/cccl/headers/include/cuda/std/__fwd/span.h +38 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string.h +83 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +59 -0
- cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
- cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +34 -0
- cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
- cuda/cccl/headers/include/cuda/std/__internal/features.h +71 -0
- cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +102 -0
- cuda/cccl/headers/include/cuda/std/__iterator/access.h +132 -0
- cuda/cccl/headers/include/cuda/std/__iterator/advance.h +230 -0
- cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +103 -0
- cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +264 -0
- cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +608 -0
- cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +469 -0
- cuda/cccl/headers/include/cuda/std/__iterator/data.h +63 -0
- cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
- cuda/cccl/headers/include/cuda/std/__iterator/distance.h +126 -0
- cuda/cccl/headers/include/cuda/std/__iterator/empty.h +54 -0
- cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
- cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +98 -0
- cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +152 -0
- cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
- cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +105 -0
- cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +141 -0
- cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +161 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +163 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +935 -0
- cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
- cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +401 -0
- cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
- cuda/cccl/headers/include/cuda/std/__iterator/next.h +102 -0
- cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +99 -0
- cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +101 -0
- cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
- cuda/cccl/headers/include/cuda/std/__iterator/prev.h +92 -0
- cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
- cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +185 -0
- cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +146 -0
- cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +615 -0
- cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
- cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
- cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +88 -0
- cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +259 -0
- cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
- cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +617 -0
- cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +781 -0
- cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +78 -0
- cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +55 -0
- cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +140 -0
- cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +134 -0
- cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +328 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +100 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +139 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +74 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +363 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +765 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +317 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +310 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +615 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +512 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +193 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +190 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +347 -0
- cuda/cccl/headers/include/cuda/std/__memory/addressof.h +64 -0
- cuda/cccl/headers/include/cuda/std/__memory/align.h +87 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +81 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator.h +320 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +84 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +569 -0
- cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +60 -0
- cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
- cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +231 -0
- cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +248 -0
- cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
- cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +43 -0
- cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +260 -0
- cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +92 -0
- cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +686 -0
- cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +771 -0
- cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +55 -0
- cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
- cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
- cuda/cccl/headers/include/cuda/std/__new/allocate.h +126 -0
- cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
- cuda/cccl/headers/include/cuda/std/__new/launder.h +49 -0
- cuda/cccl/headers/include/cuda/std/__new_ +29 -0
- cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +57 -0
- cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
- cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
- cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +80 -0
- cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
- cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
- cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
- cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +100 -0
- cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +70 -0
- cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +61 -0
- cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
- cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
- cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
- cuda/cccl/headers/include/cuda/std/__ranges/access.h +304 -0
- cuda/cccl/headers/include/cuda/std/__ranges/all.h +97 -0
- cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +313 -0
- cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
- cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
- cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
- cuda/cccl/headers/include/cuda/std/__ranges/empty.h +111 -0
- cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
- cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
- cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +77 -0
- cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
- cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +271 -0
- cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +410 -0
- cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +161 -0
- cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +114 -0
- cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +175 -0
- cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
- cuda/cccl/headers/include/cuda/std/__ranges/rend.h +182 -0
- cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +343 -0
- cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +156 -0
- cuda/cccl/headers/include/cuda/std/__ranges/size.h +200 -0
- cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +513 -0
- cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +263 -0
- cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +531 -0
- cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +53 -0
- cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +181 -0
- cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
- cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +233 -0
- cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
- cuda/cccl/headers/include/cuda/std/__string/char_traits.h +191 -0
- cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +591 -0
- cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +299 -0
- cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
- cuda/cccl/headers/include/cuda/std/__string_ +29 -0
- cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
- cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +105 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +144 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +98 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +236 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +216 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +70 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +90 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +73 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +242 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +149 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +69 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +174 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +51 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +44 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +83 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +68 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +54 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +51 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +214 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +73 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +68 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +81 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +84 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +79 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +87 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +63 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +119 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_reference_wrapper.h +50 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +84 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +203 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +70 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +82 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +61 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +55 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +73 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +58 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +123 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1069 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +132 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +66 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
- cuda/cccl/headers/include/cuda/std/__utility/as_const.h +52 -0
- cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +32 -0
- cuda/cccl/headers/include/cuda/std/__utility/cmp.h +116 -0
- cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +103 -0
- cuda/cccl/headers/include/cuda/std/__utility/declval.h +63 -0
- cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +162 -0
- cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
- cuda/cccl/headers/include/cuda/std/__utility/forward.h +59 -0
- cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +56 -0
- cuda/cccl/headers/include/cuda/std/__utility/in_place.h +77 -0
- cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +251 -0
- cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
- cuda/cccl/headers/include/cuda/std/__utility/move.h +75 -0
- cuda/cccl/headers/include/cuda/std/__utility/pair.h +808 -0
- cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
- cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +763 -0
- cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
- cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +63 -0
- cuda/cccl/headers/include/cuda/std/__utility/swap.h +65 -0
- cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
- cuda/cccl/headers/include/cuda/std/__utility/typeid.h +425 -0
- cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
- cuda/cccl/headers/include/cuda/std/array +527 -0
- cuda/cccl/headers/include/cuda/std/atomic +823 -0
- cuda/cccl/headers/include/cuda/std/barrier +43 -0
- cuda/cccl/headers/include/cuda/std/bit +35 -0
- cuda/cccl/headers/include/cuda/std/bitset +1026 -0
- cuda/cccl/headers/include/cuda/std/cassert +28 -0
- cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
- cuda/cccl/headers/include/cuda/std/cfloat +59 -0
- cuda/cccl/headers/include/cuda/std/chrono +26 -0
- cuda/cccl/headers/include/cuda/std/climits +61 -0
- cuda/cccl/headers/include/cuda/std/cmath +25 -0
- cuda/cccl/headers/include/cuda/std/complex +25 -0
- cuda/cccl/headers/include/cuda/std/concepts +48 -0
- cuda/cccl/headers/include/cuda/std/cstddef +28 -0
- cuda/cccl/headers/include/cuda/std/cstdint +178 -0
- cuda/cccl/headers/include/cuda/std/cstdlib +30 -0
- cuda/cccl/headers/include/cuda/std/cstring +111 -0
- cuda/cccl/headers/include/cuda/std/ctime +147 -0
- cuda/cccl/headers/include/cuda/std/detail/__config +45 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +258 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +2692 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +3689 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/cmath +685 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/complex +1610 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/iosfwd +128 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/optional +1786 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/stdexcept +120 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +1378 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +2160 -0
- cuda/cccl/headers/include/cuda/std/execution +27 -0
- cuda/cccl/headers/include/cuda/std/expected +30 -0
- cuda/cccl/headers/include/cuda/std/functional +56 -0
- cuda/cccl/headers/include/cuda/std/initializer_list +36 -0
- cuda/cccl/headers/include/cuda/std/inplace_vector +2171 -0
- cuda/cccl/headers/include/cuda/std/iterator +70 -0
- cuda/cccl/headers/include/cuda/std/latch +34 -0
- cuda/cccl/headers/include/cuda/std/limits +28 -0
- cuda/cccl/headers/include/cuda/std/linalg +30 -0
- cuda/cccl/headers/include/cuda/std/mdspan +38 -0
- cuda/cccl/headers/include/cuda/std/memory +39 -0
- cuda/cccl/headers/include/cuda/std/numbers +335 -0
- cuda/cccl/headers/include/cuda/std/numeric +41 -0
- cuda/cccl/headers/include/cuda/std/optional +25 -0
- cuda/cccl/headers/include/cuda/std/ranges +68 -0
- cuda/cccl/headers/include/cuda/std/ratio +417 -0
- cuda/cccl/headers/include/cuda/std/semaphore +31 -0
- cuda/cccl/headers/include/cuda/std/source_location +83 -0
- cuda/cccl/headers/include/cuda/std/span +640 -0
- cuda/cccl/headers/include/cuda/std/string_view +814 -0
- cuda/cccl/headers/include/cuda/std/tuple +26 -0
- cuda/cccl/headers/include/cuda/std/type_traits +176 -0
- cuda/cccl/headers/include/cuda/std/utility +70 -0
- cuda/cccl/headers/include/cuda/std/variant +25 -0
- cuda/cccl/headers/include/cuda/std/version +245 -0
- cuda/cccl/headers/include/cuda/stream_ref +54 -0
- cuda/cccl/headers/include/cuda/type_traits +27 -0
- cuda/cccl/headers/include/cuda/version +16 -0
- cuda/cccl/headers/include/cuda/warp +28 -0
- cuda/cccl/headers/include/cuda/work_stealing +26 -0
- cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
- cuda/cccl/headers/include/nv/detail/__target_macros +599 -0
- cuda/cccl/headers/include/nv/target +229 -0
- cuda/cccl/headers/include/thrust/addressof.h +22 -0
- cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
- cuda/cccl/headers/include/thrust/advance.h +59 -0
- cuda/cccl/headers/include/thrust/allocate_unique.h +299 -0
- cuda/cccl/headers/include/thrust/binary_search.h +1910 -0
- cuda/cccl/headers/include/thrust/complex.h +859 -0
- cuda/cccl/headers/include/thrust/copy.h +506 -0
- cuda/cccl/headers/include/thrust/count.h +245 -0
- cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +95 -0
- cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +37 -0
- cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +350 -0
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +371 -0
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +45 -0
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +242 -0
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +39 -0
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +137 -0
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +39 -0
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +99 -0
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +53 -0
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +68 -0
- cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +76 -0
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +102 -0
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +79 -0
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +81 -0
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +39 -0
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +98 -0
- cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +99 -0
- cuda/cccl/headers/include/thrust/detail/binary_search.inl +525 -0
- cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
- cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +255 -0
- cuda/cccl/headers/include/thrust/detail/complex/c99math.h +64 -0
- cuda/cccl/headers/include/thrust/detail/complex/catrig.h +875 -0
- cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +589 -0
- cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +233 -0
- cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +161 -0
- cuda/cccl/headers/include/thrust/detail/complex/cexp.h +195 -0
- cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +173 -0
- cuda/cccl/headers/include/thrust/detail/complex/clog.h +223 -0
- cuda/cccl/headers/include/thrust/detail/complex/clogf.h +210 -0
- cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
- cuda/cccl/headers/include/thrust/detail/complex/cpow.h +50 -0
- cuda/cccl/headers/include/thrust/detail/complex/cproj.h +81 -0
- cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
- cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +168 -0
- cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
- cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
- cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +208 -0
- cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +133 -0
- cuda/cccl/headers/include/thrust/detail/complex/math_private.h +138 -0
- cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
- cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
- cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
- cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
- cuda/cccl/headers/include/thrust/detail/config/device_system.h +55 -0
- cuda/cccl/headers/include/thrust/detail/config/host_system.h +48 -0
- cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
- cuda/cccl/headers/include/thrust/detail/config/namespace.h +162 -0
- cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
- cuda/cccl/headers/include/thrust/detail/config.h +36 -0
- cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +228 -0
- cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +273 -0
- cuda/cccl/headers/include/thrust/detail/copy.h +72 -0
- cuda/cccl/headers/include/thrust/detail/copy.inl +129 -0
- cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
- cuda/cccl/headers/include/thrust/detail/copy_if.inl +102 -0
- cuda/cccl/headers/include/thrust/detail/count.h +55 -0
- cuda/cccl/headers/include/thrust/detail/count.inl +89 -0
- cuda/cccl/headers/include/thrust/detail/device_delete.inl +52 -0
- cuda/cccl/headers/include/thrust/detail/device_free.inl +47 -0
- cuda/cccl/headers/include/thrust/detail/device_malloc.inl +60 -0
- cuda/cccl/headers/include/thrust/detail/device_new.inl +61 -0
- cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
- cuda/cccl/headers/include/thrust/detail/equal.inl +93 -0
- cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
- cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +80 -0
- cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
- cuda/cccl/headers/include/thrust/detail/execution_policy.h +80 -0
- cuda/cccl/headers/include/thrust/detail/extrema.inl +184 -0
- cuda/cccl/headers/include/thrust/detail/fill.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/find.inl +113 -0
- cuda/cccl/headers/include/thrust/detail/for_each.inl +84 -0
- cuda/cccl/headers/include/thrust/detail/function.h +49 -0
- cuda/cccl/headers/include/thrust/detail/functional/actor.h +214 -0
- cuda/cccl/headers/include/thrust/detail/functional/operators.h +386 -0
- cuda/cccl/headers/include/thrust/detail/gather.inl +173 -0
- cuda/cccl/headers/include/thrust/detail/generate.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +62 -0
- cuda/cccl/headers/include/thrust/detail/inner_product.inl +118 -0
- cuda/cccl/headers/include/thrust/detail/integer_math.h +130 -0
- cuda/cccl/headers/include/thrust/detail/internal_functional.h +285 -0
- cuda/cccl/headers/include/thrust/detail/logical.inl +113 -0
- cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +92 -0
- cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
- cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +209 -0
- cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +40 -0
- cuda/cccl/headers/include/thrust/detail/merge.inl +276 -0
- cuda/cccl/headers/include/thrust/detail/mismatch.inl +94 -0
- cuda/cccl/headers/include/thrust/detail/mpl/math.h +164 -0
- cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +37 -0
- cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +124 -0
- cuda/cccl/headers/include/thrust/detail/partition.inl +378 -0
- cuda/cccl/headers/include/thrust/detail/pointer.h +217 -0
- cuda/cccl/headers/include/thrust/detail/pointer.inl +172 -0
- cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
- cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
- cuda/cccl/headers/include/thrust/detail/range/head_flags.h +116 -0
- cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +130 -0
- cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
- cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +189 -0
- cuda/cccl/headers/include/thrust/detail/reduce.inl +377 -0
- cuda/cccl/headers/include/thrust/detail/reference.h +500 -0
- cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
- cuda/cccl/headers/include/thrust/detail/remove.inl +213 -0
- cuda/cccl/headers/include/thrust/detail/replace.inl +231 -0
- cuda/cccl/headers/include/thrust/detail/reverse.inl +88 -0
- cuda/cccl/headers/include/thrust/detail/scan.inl +518 -0
- cuda/cccl/headers/include/thrust/detail/scatter.inl +157 -0
- cuda/cccl/headers/include/thrust/detail/seq.h +54 -0
- cuda/cccl/headers/include/thrust/detail/sequence.inl +109 -0
- cuda/cccl/headers/include/thrust/detail/set_operations.inl +981 -0
- cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/sort.inl +373 -0
- cuda/cccl/headers/include/thrust/detail/static_assert.h +58 -0
- cuda/cccl/headers/include/thrust/detail/static_map.h +167 -0
- cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +65 -0
- cuda/cccl/headers/include/thrust/detail/tabulate.inl +62 -0
- cuda/cccl/headers/include/thrust/detail/temporary_array.h +153 -0
- cuda/cccl/headers/include/thrust/detail/temporary_array.inl +138 -0
- cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +81 -0
- cuda/cccl/headers/include/thrust/detail/transform.inl +250 -0
- cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +69 -0
- cuda/cccl/headers/include/thrust/detail/transform_scan.inl +161 -0
- cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +131 -0
- cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +61 -0
- cuda/cccl/headers/include/thrust/detail/type_deduction.h +62 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +47 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +167 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +69 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +60 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_discard_iterator.h +44 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +46 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +89 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
- cuda/cccl/headers/include/thrust/detail/type_traits.h +136 -0
- cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +90 -0
- cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/unique.inl +373 -0
- cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
- cuda/cccl/headers/include/thrust/detail/util/align.h +59 -0
- cuda/cccl/headers/include/thrust/detail/vector_base.h +630 -0
- cuda/cccl/headers/include/thrust/detail/vector_base.inl +1242 -0
- cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
- cuda/cccl/headers/include/thrust/device_delete.h +59 -0
- cuda/cccl/headers/include/thrust/device_free.h +72 -0
- cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
- cuda/cccl/headers/include/thrust/device_malloc.h +108 -0
- cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
- cuda/cccl/headers/include/thrust/device_new.h +91 -0
- cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
- cuda/cccl/headers/include/thrust/device_ptr.h +202 -0
- cuda/cccl/headers/include/thrust/device_reference.h +986 -0
- cuda/cccl/headers/include/thrust/device_vector.h +574 -0
- cuda/cccl/headers/include/thrust/distance.h +43 -0
- cuda/cccl/headers/include/thrust/equal.h +247 -0
- cuda/cccl/headers/include/thrust/execution_policy.h +384 -0
- cuda/cccl/headers/include/thrust/extrema.h +657 -0
- cuda/cccl/headers/include/thrust/fill.h +201 -0
- cuda/cccl/headers/include/thrust/find.h +382 -0
- cuda/cccl/headers/include/thrust/for_each.h +261 -0
- cuda/cccl/headers/include/thrust/functional.h +396 -0
- cuda/cccl/headers/include/thrust/gather.h +464 -0
- cuda/cccl/headers/include/thrust/generate.h +193 -0
- cuda/cccl/headers/include/thrust/host_vector.h +576 -0
- cuda/cccl/headers/include/thrust/inner_product.h +264 -0
- cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +219 -0
- cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +335 -0
- cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
- cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
- cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +38 -0
- cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +38 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +81 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +51 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +62 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +57 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +199 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_traversal_tags.h +50 -0
- cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +53 -0
- cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
- cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
- cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +81 -0
- cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +174 -0
- cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +164 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +215 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +660 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +245 -0
- cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +192 -0
- cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
- cuda/cccl/headers/include/thrust/iterator/retag.h +74 -0
- cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +221 -0
- cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +184 -0
- cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
- cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +149 -0
- cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +226 -0
- cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +351 -0
- cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
- cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +357 -0
- cuda/cccl/headers/include/thrust/logical.h +290 -0
- cuda/cccl/headers/include/thrust/memory.h +395 -0
- cuda/cccl/headers/include/thrust/merge.h +725 -0
- cuda/cccl/headers/include/thrust/mismatch.h +261 -0
- cuda/cccl/headers/include/thrust/mr/allocator.h +229 -0
- cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +41 -0
- cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +526 -0
- cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +118 -0
- cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +68 -0
- cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +67 -0
- cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +38 -0
- cuda/cccl/headers/include/thrust/mr/memory_resource.h +217 -0
- cuda/cccl/headers/include/thrust/mr/new.h +100 -0
- cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +63 -0
- cuda/cccl/headers/include/thrust/mr/pool.h +526 -0
- cuda/cccl/headers/include/thrust/mr/pool_options.h +174 -0
- cuda/cccl/headers/include/thrust/mr/sync_pool.h +114 -0
- cuda/cccl/headers/include/thrust/mr/tls_pool.h +65 -0
- cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
- cuda/cccl/headers/include/thrust/mr/validator.h +56 -0
- cuda/cccl/headers/include/thrust/pair.h +102 -0
- cuda/cccl/headers/include/thrust/partition.h +1383 -0
- cuda/cccl/headers/include/thrust/per_device_resource.h +98 -0
- cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +184 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +155 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +104 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +151 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +53 -0
- cuda/cccl/headers/include/thrust/random/detail/mod.h +101 -0
- cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +187 -0
- cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +160 -0
- cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +63 -0
- cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +201 -0
- cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +198 -0
- cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +198 -0
- cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +183 -0
- cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +217 -0
- cuda/cccl/headers/include/thrust/random/discard_block_engine.h +240 -0
- cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +289 -0
- cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +217 -0
- cuda/cccl/headers/include/thrust/random/normal_distribution.h +257 -0
- cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +247 -0
- cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +261 -0
- cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +258 -0
- cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +255 -0
- cuda/cccl/headers/include/thrust/random.h +120 -0
- cuda/cccl/headers/include/thrust/reduce.h +1112 -0
- cuda/cccl/headers/include/thrust/remove.h +768 -0
- cuda/cccl/headers/include/thrust/replace.h +827 -0
- cuda/cccl/headers/include/thrust/reverse.h +213 -0
- cuda/cccl/headers/include/thrust/scan.h +1671 -0
- cuda/cccl/headers/include/thrust/scatter.h +446 -0
- cuda/cccl/headers/include/thrust/sequence.h +277 -0
- cuda/cccl/headers/include/thrust/set_operations.h +3026 -0
- cuda/cccl/headers/include/thrust/shuffle.h +182 -0
- cuda/cccl/headers/include/thrust/sort.h +1320 -0
- cuda/cccl/headers/include/thrust/swap.h +147 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +90 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +60 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/par.h +62 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/vector.inl +130 -0
- cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +161 -0
- cuda/cccl/headers/include/thrust/system/cpp/memory.h +109 -0
- cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +75 -0
- cuda/cccl/headers/include/thrust/system/cpp/pointer.h +119 -0
- cuda/cccl/headers/include/thrust/system/cpp/vector.h +99 -0
- cuda/cccl/headers/include/thrust/system/cuda/config.h +123 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +219 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +129 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +255 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +289 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/load_iterator.h +58 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/make_load_iterator.h +60 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +191 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +630 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +243 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +210 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +64 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +96 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +113 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +476 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +82 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +272 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +91 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +85 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +75 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_cross_system.h +204 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_device_to_device.h +98 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +69 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +61 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +121 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +57 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +228 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +217 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/par.h +237 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/par_to_seq.h +95 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +81 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +405 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +72 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +961 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +1000 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +107 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +164 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +88 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +342 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +415 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +79 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1736 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +482 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +75 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +75 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +403 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +143 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +119 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +94 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +91 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +648 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +311 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +251 -0
- cuda/cccl/headers/include/thrust/system/cuda/error.h +175 -0
- cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +39 -0
- cuda/cccl/headers/include/thrust/system/cuda/memory.h +122 -0
- cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +122 -0
- cuda/cccl/headers/include/thrust/system/cuda/pointer.h +140 -0
- cuda/cccl/headers/include/thrust/system/cuda/vector.h +108 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +52 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/count.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/find.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +48 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +64 -0
- cuda/cccl/headers/include/thrust/system/detail/errno.h +125 -0
- cuda/cccl/headers/include/thrust/system/detail/error_category.inl +302 -0
- cuda/cccl/headers/include/thrust/system/detail/error_code.inl +173 -0
- cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +121 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +59 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +85 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +167 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +391 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +70 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +64 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +152 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/count.h +54 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +90 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +55 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +66 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +72 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +258 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +60 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/find.h +55 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +143 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +64 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +79 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +102 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +63 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +66 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +78 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +65 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +70 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +83 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +105 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +154 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +55 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +74 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +135 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +213 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +49 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +77 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +106 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +89 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +192 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +92 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +127 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +101 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +181 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +54 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +72 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +78 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +141 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +78 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +91 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +132 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +238 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +79 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +91 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +96 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +55 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/sequence.inl +95 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +288 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +482 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +60 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +131 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +119 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +181 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +50 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +82 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +47 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +60 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +53 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +60 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +88 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +109 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform.inl +185 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +56 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +62 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +86 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +119 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +172 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +121 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +77 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +119 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +87 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +132 -0
- cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +123 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +76 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +48 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +142 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +55 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +125 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +55 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +77 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +78 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +116 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +68 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +80 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +129 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +49 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +147 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +56 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +81 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +151 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +309 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +70 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +104 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +185 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +160 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +151 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +212 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +65 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +187 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +61 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +362 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +54 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +130 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +54 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +592 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +64 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +121 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +112 -0
- cuda/cccl/headers/include/thrust/system/detail/system_error.inl +108 -0
- cuda/cccl/headers/include/thrust/system/error_code.h +512 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +54 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +77 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +50 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +74 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +56 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +59 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/count.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +50 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +65 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +113 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +66 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/find.h +53 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +56 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +87 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +93 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/par.h +62 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +88 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +102 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +54 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +54 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +78 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +64 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +65 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +59 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +103 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +72 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +87 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +60 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +259 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +60 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +71 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +67 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +75 -0
- cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +160 -0
- cuda/cccl/headers/include/thrust/system/omp/memory.h +111 -0
- cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +75 -0
- cuda/cccl/headers/include/thrust/system/omp/pointer.h +120 -0
- cuda/cccl/headers/include/thrust/system/omp/vector.h +99 -0
- cuda/cccl/headers/include/thrust/system/system_error.h +184 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +54 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +50 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +73 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +47 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +136 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +92 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +66 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +49 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +51 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +91 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +94 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +77 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +327 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/par.h +62 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +84 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +98 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +54 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +137 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +61 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +400 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +140 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +76 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +87 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +59 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +312 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +60 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +295 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +60 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +71 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +67 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +75 -0
- cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +160 -0
- cuda/cccl/headers/include/thrust/system/tbb/memory.h +111 -0
- cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +75 -0
- cuda/cccl/headers/include/thrust/system/tbb/pointer.h +120 -0
- cuda/cccl/headers/include/thrust/system/tbb/vector.h +99 -0
- cuda/cccl/headers/include/thrust/system_error.h +57 -0
- cuda/cccl/headers/include/thrust/tabulate.h +125 -0
- cuda/cccl/headers/include/thrust/transform.h +903 -0
- cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
- cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
- cuda/cccl/headers/include/thrust/tuple.h +142 -0
- cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +261 -0
- cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +182 -0
- cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
- cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +184 -0
- cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +116 -0
- cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +306 -0
- cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
- cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +93 -0
- cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
- cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
- cuda/cccl/headers/include/thrust/unique.h +1090 -0
- cuda/cccl/headers/include/thrust/universal_allocator.h +90 -0
- cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
- cuda/cccl/headers/include/thrust/universal_vector.h +71 -0
- cuda/cccl/headers/include/thrust/version.h +93 -0
- cuda/cccl/headers/include/thrust/zip_function.h +176 -0
- cuda/cccl/headers/include_paths.py +72 -0
- cuda/cccl/parallel/__init__.py +3 -0
- cuda/cccl/parallel/experimental/__init__.py +3 -0
- cuda/cccl/parallel/experimental/_bindings.py +24 -0
- cuda/cccl/parallel/experimental/_bindings.pyi +388 -0
- cuda/cccl/parallel/experimental/_bindings_impl.cpython-312-x86_64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/_bindings_impl.pyx +2158 -0
- cuda/cccl/parallel/experimental/_caching.py +71 -0
- cuda/cccl/parallel/experimental/_cccl_interop.py +371 -0
- cuda/cccl/parallel/experimental/_utils/__init__.py +0 -0
- cuda/cccl/parallel/experimental/_utils/protocols.py +132 -0
- cuda/cccl/parallel/experimental/algorithms/__init__.py +28 -0
- cuda/cccl/parallel/experimental/algorithms/_merge_sort.py +172 -0
- cuda/cccl/parallel/experimental/algorithms/_radix_sort.py +244 -0
- cuda/cccl/parallel/experimental/algorithms/_reduce.py +136 -0
- cuda/cccl/parallel/experimental/algorithms/_scan.py +179 -0
- cuda/cccl/parallel/experimental/algorithms/_segmented_reduce.py +183 -0
- cuda/cccl/parallel/experimental/algorithms/_transform.py +213 -0
- cuda/cccl/parallel/experimental/algorithms/_unique_by_key.py +179 -0
- cuda/cccl/parallel/experimental/cccl/.gitkeep +0 -0
- cuda/cccl/parallel/experimental/cccl/libcccl.c.parallel.so +0 -0
- cuda/cccl/parallel/experimental/iterators/__init__.py +157 -0
- cuda/cccl/parallel/experimental/iterators/_iterators.py +650 -0
- cuda/cccl/parallel/experimental/numba_utils.py +6 -0
- cuda/cccl/parallel/experimental/struct.py +150 -0
- cuda/cccl/parallel/experimental/typing.py +27 -0
- cuda/cccl/py.typed +0 -0
- cuda_cccl-0.1.3.1.0.dev1486.dist-info/METADATA +29 -0
- cuda_cccl-0.1.3.1.0.dev1486.dist-info/RECORD +1819 -0
- cuda_cccl-0.1.3.1.0.dev1486.dist-info/WHEEL +6 -0
- cuda_cccl-0.1.3.1.0.dev1486.dist-info/licenses/LICENSE +1 -0
|
@@ -0,0 +1,1830 @@
|
|
|
1
|
+
// This file was automatically generated. Do not edit.
|
|
2
|
+
|
|
3
|
+
#ifndef _CUDA_PTX_GENERATED_ST_H_
|
|
4
|
+
#define _CUDA_PTX_GENERATED_ST_H_
|
|
5
|
+
|
|
6
|
+
/*
|
|
7
|
+
// st.space.b8 [addr], src; // PTX ISA 10, SM_50
|
|
8
|
+
// .space = { .global }
|
|
9
|
+
template <typename B8, enable_if_t<sizeof(B8) == 1, bool> = true>
|
|
10
|
+
__device__ static inline void st(
|
|
11
|
+
cuda::ptx::space_global_t,
|
|
12
|
+
B8* addr,
|
|
13
|
+
B8 src);
|
|
14
|
+
*/
|
|
15
|
+
#if __cccl_ptx_isa >= 100
// Only declared in this header. The fallback branch below references it so that, per the original
// note, unsupported architectures fail with a linker error naming the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_is_not_supported_before_SM_50__();

// Emits `st.global.b8 [addr], src;` for any 1-byte type _B8.
//   space_global_t - tag argument; it has no name and is never read, it only selects this overload.
//   __addr         - destination, converted with __as_ptr_gmem and bound as a 64-bit ("l") operand.
//   __src          - value to store, converted with __b8_as_u32 and bound as a 32-bit ("r") operand.
template <typename _B8, _CUDA_VSTD::enable_if_t<sizeof(_B8) == 1, bool> = true>
_CCCL_DEVICE static inline void st(space_global_t, _B8* __addr, _B8 __src)
{
  static_assert(sizeof(_B8) == 1, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 500
  const auto __gmem_addr = __as_ptr_gmem(__addr);
  const auto __src_bits  = __b8_as_u32(__src);
  // No outputs; the "memory" clobber tells the compiler the statement touches memory.
  asm("st.global.b8 [%0], %1;" : : "l"(__gmem_addr), "r"(__src_bits) : "memory");
#  else
  // Target older than SM_50: keep a reference to the undefined symbol so the link step reports it.
  __cuda_ptx_st_is_not_supported_before_SM_50__();
#  endif
}
#endif // __cccl_ptx_isa >= 100
|
|
30
|
+
|
|
31
|
+
/*
|
|
32
|
+
// st.space.b16 [addr], src; // PTX ISA 10, SM_50
|
|
33
|
+
// .space = { .global }
|
|
34
|
+
template <typename B16, enable_if_t<sizeof(B16) == 2, bool> = true>
|
|
35
|
+
__device__ static inline void st(
|
|
36
|
+
cuda::ptx::space_global_t,
|
|
37
|
+
B16* addr,
|
|
38
|
+
B16 src);
|
|
39
|
+
*/
|
|
40
|
+
#if __cccl_ptx_isa >= 100
// Only declared in this header; referenced by the fallback branch to force a descriptive linker error.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_is_not_supported_before_SM_50__();

// Emits `st.global.b16 [addr], src;` for any 2-byte type _B16.
//   space_global_t - unnamed tag argument used purely for overload selection.
//   __addr         - destination, converted with __as_ptr_gmem and bound as a 64-bit ("l") operand.
//   __src          - value to store; its bytes are read as an int16_t and bound as a 16-bit ("h") operand.
template <typename _B16, _CUDA_VSTD::enable_if_t<sizeof(_B16) == 2, bool> = true>
_CCCL_DEVICE static inline void st(space_global_t, _B16* __addr, _B16 __src)
{
  static_assert(sizeof(_B16) == 2, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 500
  const auto __gmem_addr = __as_ptr_gmem(__addr);
  // Reinterpret the by-value copy of the argument as a 16-bit integer.
  const _CUDA_VSTD::int16_t __src_bits = *reinterpret_cast<const _CUDA_VSTD::int16_t*>(&__src);
  asm("st.global.b16 [%0], %1;" : : "l"(__gmem_addr), "h"(__src_bits) : "memory");
#  else
  // Target older than SM_50: keep a reference to the undefined symbol so the link step reports it.
  __cuda_ptx_st_is_not_supported_before_SM_50__();
#  endif
}
#endif // __cccl_ptx_isa >= 100
|
|
58
|
+
|
|
59
|
+
/*
|
|
60
|
+
// st.space.b32 [addr], src; // PTX ISA 10, SM_50
|
|
61
|
+
// .space = { .global }
|
|
62
|
+
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
|
|
63
|
+
__device__ static inline void st(
|
|
64
|
+
cuda::ptx::space_global_t,
|
|
65
|
+
B32* addr,
|
|
66
|
+
B32 src);
|
|
67
|
+
*/
|
|
68
|
+
#if __cccl_ptx_isa >= 100
// Only declared in this header; referenced by the fallback branch to force a descriptive linker error.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_is_not_supported_before_SM_50__();

// Emits `st.global.b32 [addr], src;` for any 4-byte type _B32.
//   space_global_t - unnamed tag argument used purely for overload selection.
//   __addr         - destination, converted with __as_ptr_gmem and bound as a 64-bit ("l") operand.
//   __src          - value to store; its bytes are read as an int32_t and bound as a 32-bit ("r") operand.
template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
_CCCL_DEVICE static inline void st(space_global_t, _B32* __addr, _B32 __src)
{
  static_assert(sizeof(_B32) == 4, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 500
  const auto __gmem_addr = __as_ptr_gmem(__addr);
  // Reinterpret the by-value copy of the argument as a 32-bit integer.
  const _CUDA_VSTD::int32_t __src_bits = *reinterpret_cast<const _CUDA_VSTD::int32_t*>(&__src);
  asm("st.global.b32 [%0], %1;" : : "l"(__gmem_addr), "r"(__src_bits) : "memory");
#  else
  // Target older than SM_50: keep a reference to the undefined symbol so the link step reports it.
  __cuda_ptx_st_is_not_supported_before_SM_50__();
#  endif
}
#endif // __cccl_ptx_isa >= 100
|
|
86
|
+
|
|
87
|
+
/*
|
|
88
|
+
// st.space.b64 [addr], src; // PTX ISA 10, SM_50
|
|
89
|
+
// .space = { .global }
|
|
90
|
+
template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true>
|
|
91
|
+
__device__ static inline void st(
|
|
92
|
+
cuda::ptx::space_global_t,
|
|
93
|
+
B64* addr,
|
|
94
|
+
B64 src);
|
|
95
|
+
*/
|
|
96
|
+
#if __cccl_ptx_isa >= 100
// Only declared in this header; referenced by the fallback branch to force a descriptive linker error.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_is_not_supported_before_SM_50__();

// Emits `st.global.b64 [addr], src;` for any 8-byte type _B64.
//   space_global_t - unnamed tag argument used purely for overload selection.
//   __addr         - destination, converted with __as_ptr_gmem and bound as a 64-bit ("l") operand.
//   __src          - value to store; its bytes are read as an int64_t and bound as a 64-bit ("l") operand.
template <typename _B64, _CUDA_VSTD::enable_if_t<sizeof(_B64) == 8, bool> = true>
_CCCL_DEVICE static inline void st(space_global_t, _B64* __addr, _B64 __src)
{
  static_assert(sizeof(_B64) == 8, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 500
  const auto __gmem_addr = __as_ptr_gmem(__addr);
  // Reinterpret the by-value copy of the argument as a 64-bit integer.
  const _CUDA_VSTD::int64_t __src_bits = *reinterpret_cast<const _CUDA_VSTD::int64_t*>(&__src);
  asm("st.global.b64 [%0], %1;" : : "l"(__gmem_addr), "l"(__src_bits) : "memory");
#  else
  // Target older than SM_50: keep a reference to the undefined symbol so the link step reports it.
  __cuda_ptx_st_is_not_supported_before_SM_50__();
#  endif
}
#endif // __cccl_ptx_isa >= 100
|
|
114
|
+
|
|
115
|
+
/*
|
|
116
|
+
// st.space.b128 [addr], src; // PTX ISA 83, SM_70
|
|
117
|
+
// .space = { .global }
|
|
118
|
+
template <typename B128, enable_if_t<sizeof(B128) == 16, bool> = true>
|
|
119
|
+
__device__ static inline void st(
|
|
120
|
+
cuda::ptx::space_global_t,
|
|
121
|
+
B128* addr,
|
|
122
|
+
B128 src);
|
|
123
|
+
*/
|
|
124
|
+
#if __cccl_ptx_isa >= 830
// Only declared in this header; referenced by the fallback branch to force a descriptive linker error.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_is_not_supported_before_SM_70__();

// Emits `st.global.b128 [addr], src;` for any 16-byte type _B128.
//   space_global_t - unnamed tag argument used purely for overload selection.
//   __addr         - destination, converted with __as_ptr_gmem and bound as a 64-bit ("l") operand.
//   __src          - value to store; viewed as a longlong2 whose .x and .y members are passed as two
//                    64-bit operands and packed into a scratch .b128 register with mov.b128.
template <typename _B128, _CUDA_VSTD::enable_if_t<sizeof(_B128) == 16, bool> = true>
_CCCL_DEVICE static inline void st(space_global_t, _B128* __addr, _B128 __src)
{
  static_assert(sizeof(_B128) == 16, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
  const auto __gmem_addr    = __as_ptr_gmem(__addr);
  const longlong2* __halves = reinterpret_cast<longlong2*>(&__src);
  // The braces in the PTX text open a scope so the B128_src register declaration stays local.
  asm("{\n\t .reg .b128 B128_src; \n\t"
      "mov.b128 B128_src, {%1, %2}; \n"
      "st.global.b128 [%0], B128_src;\n\t"
      "}"
      :
      : "l"(__gmem_addr), "l"(__halves->x), "l"(__halves->y)
      : "memory");
#  else
  // Target older than SM_70: keep a reference to the undefined symbol so the link step reports it.
  __cuda_ptx_st_is_not_supported_before_SM_70__();
#  endif
}
#endif // __cccl_ptx_isa >= 830
|
|
147
|
+
|
|
148
|
+
/*
|
|
149
|
+
// st.space.L2::cache_hint.b8 [addr], src, cache_policy; // PTX ISA 74, SM_80
|
|
150
|
+
// .space = { .global }
|
|
151
|
+
template <typename B8, enable_if_t<sizeof(B8) == 1, bool> = true>
|
|
152
|
+
__device__ static inline void st_L2_cache_hint(
|
|
153
|
+
cuda::ptx::space_global_t,
|
|
154
|
+
B8* addr,
|
|
155
|
+
B8 src,
|
|
156
|
+
uint64_t cache_policy);
|
|
157
|
+
*/
|
|
158
|
+
#if __cccl_ptx_isa >= 740
// Only declared in this header; referenced by the fallback branch to force a descriptive linker error.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L2_cache_hint_is_not_supported_before_SM_80__();

// Emits `st.global.L2::cache_hint.b8 [addr], src, cache_policy;` for any 1-byte type _B8.
//   space_global_t - unnamed tag argument used purely for overload selection.
//   __addr         - destination, converted with __as_ptr_gmem and bound as a 64-bit ("l") operand.
//   __src          - value to store, converted with __b8_as_u32 and bound as a 32-bit ("r") operand.
//   __cache_policy - 64-bit value forwarded unchanged as the instruction's cache-policy operand.
template <typename _B8, _CUDA_VSTD::enable_if_t<sizeof(_B8) == 1, bool> = true>
_CCCL_DEVICE static inline void
st_L2_cache_hint(space_global_t, _B8* __addr, _B8 __src, _CUDA_VSTD::uint64_t __cache_policy)
{
  static_assert(sizeof(_B8) == 1, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
  const auto __gmem_addr = __as_ptr_gmem(__addr);
  const auto __src_bits  = __b8_as_u32(__src);
  asm("st.global.L2::cache_hint.b8 [%0], %1, %2;"
      :
      : "l"(__gmem_addr), "r"(__src_bits), "l"(__cache_policy)
      : "memory");
#  else
  // Target older than SM_80: keep a reference to the undefined symbol so the link step reports it.
  __cuda_ptx_st_L2_cache_hint_is_not_supported_before_SM_80__();
#  endif
}
#endif // __cccl_ptx_isa >= 740
|
|
177
|
+
|
|
178
|
+
/*
|
|
179
|
+
// st.space.L2::cache_hint.b16 [addr], src, cache_policy; // PTX ISA 74, SM_80
|
|
180
|
+
// .space = { .global }
|
|
181
|
+
template <typename B16, enable_if_t<sizeof(B16) == 2, bool> = true>
|
|
182
|
+
__device__ static inline void st_L2_cache_hint(
|
|
183
|
+
cuda::ptx::space_global_t,
|
|
184
|
+
B16* addr,
|
|
185
|
+
B16 src,
|
|
186
|
+
uint64_t cache_policy);
|
|
187
|
+
*/
|
|
188
|
+
#if __cccl_ptx_isa >= 740
// Only declared in this header; referenced by the fallback branch to force a descriptive linker error.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L2_cache_hint_is_not_supported_before_SM_80__();

// Emits `st.global.L2::cache_hint.b16 [addr], src, cache_policy;` for any 2-byte type _B16.
//   space_global_t - unnamed tag argument used purely for overload selection.
//   __addr         - destination, converted with __as_ptr_gmem and bound as a 64-bit ("l") operand.
//   __src          - value to store; its bytes are read as an int16_t and bound as a 16-bit ("h") operand.
//   __cache_policy - 64-bit value forwarded unchanged as the instruction's cache-policy operand.
template <typename _B16, _CUDA_VSTD::enable_if_t<sizeof(_B16) == 2, bool> = true>
_CCCL_DEVICE static inline void
st_L2_cache_hint(space_global_t, _B16* __addr, _B16 __src, _CUDA_VSTD::uint64_t __cache_policy)
{
  static_assert(sizeof(_B16) == 2, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
  const auto __gmem_addr               = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int16_t __src_bits = *reinterpret_cast<const _CUDA_VSTD::int16_t*>(&__src);
  asm("st.global.L2::cache_hint.b16 [%0], %1, %2;"
      :
      : "l"(__gmem_addr), "h"(__src_bits), "l"(__cache_policy)
      : "memory");
#  else
  // Target older than SM_80: keep a reference to the undefined symbol so the link step reports it.
  __cuda_ptx_st_L2_cache_hint_is_not_supported_before_SM_80__();
#  endif
}
#endif // __cccl_ptx_isa >= 740
|
|
209
|
+
|
|
210
|
+
/*
|
|
211
|
+
// st.space.L2::cache_hint.b32 [addr], src, cache_policy; // PTX ISA 74, SM_80
|
|
212
|
+
// .space = { .global }
|
|
213
|
+
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
|
|
214
|
+
__device__ static inline void st_L2_cache_hint(
|
|
215
|
+
cuda::ptx::space_global_t,
|
|
216
|
+
B32* addr,
|
|
217
|
+
B32 src,
|
|
218
|
+
uint64_t cache_policy);
|
|
219
|
+
*/
|
|
220
|
+
#if __cccl_ptx_isa >= 740
// Only declared in this header; referenced by the fallback branch to force a descriptive linker error.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L2_cache_hint_is_not_supported_before_SM_80__();

// Emits `st.global.L2::cache_hint.b32 [addr], src, cache_policy;` for any 4-byte type _B32.
//   space_global_t - unnamed tag argument used purely for overload selection.
//   __addr         - destination, converted with __as_ptr_gmem and bound as a 64-bit ("l") operand.
//   __src          - value to store; its bytes are read as an int32_t and bound as a 32-bit ("r") operand.
//   __cache_policy - 64-bit value forwarded unchanged as the instruction's cache-policy operand.
template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
_CCCL_DEVICE static inline void
st_L2_cache_hint(space_global_t, _B32* __addr, _B32 __src, _CUDA_VSTD::uint64_t __cache_policy)
{
  static_assert(sizeof(_B32) == 4, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
  const auto __gmem_addr               = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int32_t __src_bits = *reinterpret_cast<const _CUDA_VSTD::int32_t*>(&__src);
  asm("st.global.L2::cache_hint.b32 [%0], %1, %2;"
      :
      : "l"(__gmem_addr), "r"(__src_bits), "l"(__cache_policy)
      : "memory");
#  else
  // Target older than SM_80: keep a reference to the undefined symbol so the link step reports it.
  __cuda_ptx_st_L2_cache_hint_is_not_supported_before_SM_80__();
#  endif
}
#endif // __cccl_ptx_isa >= 740
|
|
241
|
+
|
|
242
|
+
/*
|
|
243
|
+
// st.space.L2::cache_hint.b64 [addr], src, cache_policy; // PTX ISA 74, SM_80
|
|
244
|
+
// .space = { .global }
|
|
245
|
+
template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true>
|
|
246
|
+
__device__ static inline void st_L2_cache_hint(
|
|
247
|
+
cuda::ptx::space_global_t,
|
|
248
|
+
B64* addr,
|
|
249
|
+
B64 src,
|
|
250
|
+
uint64_t cache_policy);
|
|
251
|
+
*/
|
|
252
|
+
#if __cccl_ptx_isa >= 740
// Only declared in this header; referenced by the fallback branch to force a descriptive linker error.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L2_cache_hint_is_not_supported_before_SM_80__();

// Emits `st.global.L2::cache_hint.b64 [addr], src, cache_policy;` for any 8-byte type _B64.
//   space_global_t - unnamed tag argument used purely for overload selection.
//   __addr         - destination, converted with __as_ptr_gmem and bound as a 64-bit ("l") operand.
//   __src          - value to store; its bytes are read as an int64_t and bound as a 64-bit ("l") operand.
//   __cache_policy - 64-bit value forwarded unchanged as the instruction's cache-policy operand.
template <typename _B64, _CUDA_VSTD::enable_if_t<sizeof(_B64) == 8, bool> = true>
_CCCL_DEVICE static inline void
st_L2_cache_hint(space_global_t, _B64* __addr, _B64 __src, _CUDA_VSTD::uint64_t __cache_policy)
{
  static_assert(sizeof(_B64) == 8, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
  const auto __gmem_addr               = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int64_t __src_bits = *reinterpret_cast<const _CUDA_VSTD::int64_t*>(&__src);
  asm("st.global.L2::cache_hint.b64 [%0], %1, %2;"
      :
      : "l"(__gmem_addr), "l"(__src_bits), "l"(__cache_policy)
      : "memory");
#  else
  // Target older than SM_80: keep a reference to the undefined symbol so the link step reports it.
  __cuda_ptx_st_L2_cache_hint_is_not_supported_before_SM_80__();
#  endif
}
#endif // __cccl_ptx_isa >= 740
|
|
273
|
+
|
|
274
|
+
/*
|
|
275
|
+
// st.space.L2::cache_hint.b128 [addr], src, cache_policy; // PTX ISA 83, SM_80
|
|
276
|
+
// .space = { .global }
|
|
277
|
+
template <typename B128, enable_if_t<sizeof(B128) == 16, bool> = true>
|
|
278
|
+
__device__ static inline void st_L2_cache_hint(
|
|
279
|
+
cuda::ptx::space_global_t,
|
|
280
|
+
B128* addr,
|
|
281
|
+
B128 src,
|
|
282
|
+
uint64_t cache_policy);
|
|
283
|
+
*/
|
|
284
|
+
#if __cccl_ptx_isa >= 830
// Only declared in this header; referenced by the fallback branch to force a descriptive linker error.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L2_cache_hint_is_not_supported_before_SM_80__();

// Emits `st.global.L2::cache_hint.b128 [addr], src, cache_policy;` for any 16-byte type _B128.
//   space_global_t - unnamed tag argument used purely for overload selection.
//   __addr         - destination, converted with __as_ptr_gmem and bound as a 64-bit ("l") operand.
//   __src          - value to store; viewed as a longlong2 whose .x and .y members are passed as two
//                    64-bit operands and packed into a scratch .b128 register with mov.b128.
//   __cache_policy - 64-bit value forwarded unchanged as the instruction's cache-policy operand.
template <typename _B128, _CUDA_VSTD::enable_if_t<sizeof(_B128) == 16, bool> = true>
_CCCL_DEVICE static inline void
st_L2_cache_hint(space_global_t, _B128* __addr, _B128 __src, _CUDA_VSTD::uint64_t __cache_policy)
{
  static_assert(sizeof(_B128) == 16, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
  const auto __gmem_addr    = __as_ptr_gmem(__addr);
  const longlong2* __halves = reinterpret_cast<longlong2*>(&__src);
  // The braces in the PTX text open a scope so the B128_src register declaration stays local.
  asm("{\n\t .reg .b128 B128_src; \n\t"
      "mov.b128 B128_src, {%1, %2}; \n"
      "st.global.L2::cache_hint.b128 [%0], B128_src, %3;\n\t"
      "}"
      :
      : "l"(__gmem_addr), "l"(__halves->x), "l"(__halves->y), "l"(__cache_policy)
      : "memory");
#  else
  // Target older than SM_80: keep a reference to the undefined symbol so the link step reports it.
  __cuda_ptx_st_L2_cache_hint_is_not_supported_before_SM_80__();
#  endif
}
#endif // __cccl_ptx_isa >= 830
|
|
309
|
+
|
|
310
|
+
/*
|
|
311
|
+
// st.space.L1::evict_normal.b8 [addr], src; // PTX ISA 74, SM_70
|
|
312
|
+
// .space = { .global }
|
|
313
|
+
template <typename B8, enable_if_t<sizeof(B8) == 1, bool> = true>
|
|
314
|
+
__device__ static inline void st_L1_evict_normal(
|
|
315
|
+
cuda::ptx::space_global_t,
|
|
316
|
+
B8* addr,
|
|
317
|
+
B8 src);
|
|
318
|
+
*/
|
|
319
|
+
#if __cccl_ptx_isa >= 740
// Only declared in this header; referenced by the fallback branch to force a descriptive linker error.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_normal_is_not_supported_before_SM_70__();

// Emits `st.global.L1::evict_normal.b8 [addr], src;` for any 1-byte type _B8.
//   space_global_t - unnamed tag argument used purely for overload selection.
//   __addr         - destination, converted with __as_ptr_gmem and bound as a 64-bit ("l") operand.
//   __src          - value to store, converted with __b8_as_u32 and bound as a 32-bit ("r") operand.
template <typename _B8, _CUDA_VSTD::enable_if_t<sizeof(_B8) == 1, bool> = true>
_CCCL_DEVICE static inline void st_L1_evict_normal(space_global_t, _B8* __addr, _B8 __src)
{
  static_assert(sizeof(_B8) == 1, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
  const auto __gmem_addr = __as_ptr_gmem(__addr);
  const auto __src_bits  = __b8_as_u32(__src);
  asm("st.global.L1::evict_normal.b8 [%0], %1;" : : "l"(__gmem_addr), "r"(__src_bits) : "memory");
#  else
  // Target older than SM_70: keep a reference to the undefined symbol so the link step reports it.
  __cuda_ptx_st_L1_evict_normal_is_not_supported_before_SM_70__();
#  endif
}
#endif // __cccl_ptx_isa >= 740
|
|
334
|
+
|
|
335
|
+
/*
|
|
336
|
+
// st.space.L1::evict_normal.b16 [addr], src; // PTX ISA 74, SM_70
|
|
337
|
+
// .space = { .global }
|
|
338
|
+
template <typename B16, enable_if_t<sizeof(B16) == 2, bool> = true>
|
|
339
|
+
__device__ static inline void st_L1_evict_normal(
|
|
340
|
+
cuda::ptx::space_global_t,
|
|
341
|
+
B16* addr,
|
|
342
|
+
B16 src);
|
|
343
|
+
*/
|
|
344
|
+
#if __cccl_ptx_isa >= 740
// Only declared in this header; referenced by the fallback branch to force a descriptive linker error.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_normal_is_not_supported_before_SM_70__();

// Emits `st.global.L1::evict_normal.b16 [addr], src;` for any 2-byte type _B16.
//   space_global_t - unnamed tag argument used purely for overload selection.
//   __addr         - destination, converted with __as_ptr_gmem and bound as a 64-bit ("l") operand.
//   __src          - value to store; its bytes are read as an int16_t and bound as a 16-bit ("h") operand.
template <typename _B16, _CUDA_VSTD::enable_if_t<sizeof(_B16) == 2, bool> = true>
_CCCL_DEVICE static inline void st_L1_evict_normal(space_global_t, _B16* __addr, _B16 __src)
{
  static_assert(sizeof(_B16) == 2, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
  const auto __gmem_addr               = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int16_t __src_bits = *reinterpret_cast<const _CUDA_VSTD::int16_t*>(&__src);
  asm("st.global.L1::evict_normal.b16 [%0], %1;" : : "l"(__gmem_addr), "h"(__src_bits) : "memory");
#  else
  // Target older than SM_70: keep a reference to the undefined symbol so the link step reports it.
  __cuda_ptx_st_L1_evict_normal_is_not_supported_before_SM_70__();
#  endif
}
#endif // __cccl_ptx_isa >= 740
|
|
362
|
+
|
|
363
|
+
/*
|
|
364
|
+
// st.space.L1::evict_normal.b32 [addr], src; // PTX ISA 74, SM_70
|
|
365
|
+
// .space = { .global }
|
|
366
|
+
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
|
|
367
|
+
__device__ static inline void st_L1_evict_normal(
|
|
368
|
+
cuda::ptx::space_global_t,
|
|
369
|
+
B32* addr,
|
|
370
|
+
B32 src);
|
|
371
|
+
*/
|
|
372
|
+
#if __cccl_ptx_isa >= 740
// Only declared in this header; referenced by the fallback branch to force a descriptive linker error.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_normal_is_not_supported_before_SM_70__();

// Emits `st.global.L1::evict_normal.b32 [addr], src;` for any 4-byte type _B32.
//   space_global_t - unnamed tag argument used purely for overload selection.
//   __addr         - destination, converted with __as_ptr_gmem and bound as a 64-bit ("l") operand.
//   __src          - value to store; its bytes are read as an int32_t and bound as a 32-bit ("r") operand.
template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
_CCCL_DEVICE static inline void st_L1_evict_normal(space_global_t, _B32* __addr, _B32 __src)
{
  static_assert(sizeof(_B32) == 4, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
  const auto __gmem_addr               = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int32_t __src_bits = *reinterpret_cast<const _CUDA_VSTD::int32_t*>(&__src);
  asm("st.global.L1::evict_normal.b32 [%0], %1;" : : "l"(__gmem_addr), "r"(__src_bits) : "memory");
#  else
  // Target older than SM_70: keep a reference to the undefined symbol so the link step reports it.
  __cuda_ptx_st_L1_evict_normal_is_not_supported_before_SM_70__();
#  endif
}
#endif // __cccl_ptx_isa >= 740
|
|
390
|
+
|
|
391
|
+
/*
|
|
392
|
+
// st.space.L1::evict_normal.b64 [addr], src; // PTX ISA 74, SM_70
|
|
393
|
+
// .space = { .global }
|
|
394
|
+
template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true>
|
|
395
|
+
__device__ static inline void st_L1_evict_normal(
|
|
396
|
+
cuda::ptx::space_global_t,
|
|
397
|
+
B64* addr,
|
|
398
|
+
B64 src);
|
|
399
|
+
*/
|
|
400
|
+
#if __cccl_ptx_isa >= 740
// Only declared in this header; referenced by the fallback branch to force a descriptive linker error.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_normal_is_not_supported_before_SM_70__();

// Emits `st.global.L1::evict_normal.b64 [addr], src;` for any 8-byte type _B64.
//   space_global_t - unnamed tag argument used purely for overload selection.
//   __addr         - destination, converted with __as_ptr_gmem and bound as a 64-bit ("l") operand.
//   __src          - value to store; its bytes are read as an int64_t and bound as a 64-bit ("l") operand.
template <typename _B64, _CUDA_VSTD::enable_if_t<sizeof(_B64) == 8, bool> = true>
_CCCL_DEVICE static inline void st_L1_evict_normal(space_global_t, _B64* __addr, _B64 __src)
{
  static_assert(sizeof(_B64) == 8, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
  const auto __gmem_addr               = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int64_t __src_bits = *reinterpret_cast<const _CUDA_VSTD::int64_t*>(&__src);
  asm("st.global.L1::evict_normal.b64 [%0], %1;" : : "l"(__gmem_addr), "l"(__src_bits) : "memory");
#  else
  // Target older than SM_70: keep a reference to the undefined symbol so the link step reports it.
  __cuda_ptx_st_L1_evict_normal_is_not_supported_before_SM_70__();
#  endif
}
#endif // __cccl_ptx_isa >= 740
|
|
418
|
+
|
|
419
|
+
/*
|
|
420
|
+
// st.space.L1::evict_normal.b128 [addr], src; // PTX ISA 83, SM_70
|
|
421
|
+
// .space = { .global }
|
|
422
|
+
template <typename B128, enable_if_t<sizeof(B128) == 16, bool> = true>
|
|
423
|
+
__device__ static inline void st_L1_evict_normal(
|
|
424
|
+
cuda::ptx::space_global_t,
|
|
425
|
+
B128* addr,
|
|
426
|
+
B128 src);
|
|
427
|
+
*/
|
|
428
|
+
#if __cccl_ptx_isa >= 830
// Only declared in this header; referenced by the fallback branch to force a descriptive linker error.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_normal_is_not_supported_before_SM_70__();

// Emits `st.global.L1::evict_normal.b128 [addr], src;` for any 16-byte type _B128.
//   space_global_t - unnamed tag argument used purely for overload selection.
//   __addr         - destination, converted with __as_ptr_gmem and bound as a 64-bit ("l") operand.
//   __src          - value to store; viewed as a longlong2 whose .x and .y members are passed as two
//                    64-bit operands and packed into a scratch .b128 register with mov.b128.
template <typename _B128, _CUDA_VSTD::enable_if_t<sizeof(_B128) == 16, bool> = true>
_CCCL_DEVICE static inline void st_L1_evict_normal(space_global_t, _B128* __addr, _B128 __src)
{
  static_assert(sizeof(_B128) == 16, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
  const auto __gmem_addr    = __as_ptr_gmem(__addr);
  const longlong2* __halves = reinterpret_cast<longlong2*>(&__src);
  // The braces in the PTX text open a scope so the B128_src register declaration stays local.
  asm("{\n\t .reg .b128 B128_src; \n\t"
      "mov.b128 B128_src, {%1, %2}; \n"
      "st.global.L1::evict_normal.b128 [%0], B128_src;\n\t"
      "}"
      :
      : "l"(__gmem_addr), "l"(__halves->x), "l"(__halves->y)
      : "memory");
#  else
  // Target older than SM_70: keep a reference to the undefined symbol so the link step reports it.
  __cuda_ptx_st_L1_evict_normal_is_not_supported_before_SM_70__();
#  endif
}
#endif // __cccl_ptx_isa >= 830
|
|
451
|
+
|
|
452
|
+
/*
|
|
453
|
+
// st.space.L1::evict_normal.L2::cache_hint.b8 [addr], src, cache_policy; // PTX ISA 74, SM_80
|
|
454
|
+
// .space = { .global }
|
|
455
|
+
template <typename B8, enable_if_t<sizeof(B8) == 1, bool> = true>
|
|
456
|
+
__device__ static inline void st_L1_evict_normal_L2_cache_hint(
|
|
457
|
+
cuda::ptx::space_global_t,
|
|
458
|
+
B8* addr,
|
|
459
|
+
B8 src,
|
|
460
|
+
uint64_t cache_policy);
|
|
461
|
+
*/
|
|
462
|
+
#if __cccl_ptx_isa >= 740
// Only declared in this header; referenced by the fallback branch to force a descriptive linker error.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_normal_L2_cache_hint_is_not_supported_before_SM_80__();

// Emits `st.global.L1::evict_normal.L2::cache_hint.b8 [addr], src, cache_policy;` for any 1-byte type _B8.
//   space_global_t - unnamed tag argument used purely for overload selection.
//   __addr         - destination, converted with __as_ptr_gmem and bound as a 64-bit ("l") operand.
//   __src          - value to store, converted with __b8_as_u32 and bound as a 32-bit ("r") operand.
//   __cache_policy - 64-bit value forwarded unchanged as the instruction's cache-policy operand.
template <typename _B8, _CUDA_VSTD::enable_if_t<sizeof(_B8) == 1, bool> = true>
_CCCL_DEVICE static inline void
st_L1_evict_normal_L2_cache_hint(space_global_t, _B8* __addr, _B8 __src, _CUDA_VSTD::uint64_t __cache_policy)
{
  static_assert(sizeof(_B8) == 1, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
  const auto __gmem_addr = __as_ptr_gmem(__addr);
  const auto __src_bits  = __b8_as_u32(__src);
  asm("st.global.L1::evict_normal.L2::cache_hint.b8 [%0], %1, %2;"
      :
      : "l"(__gmem_addr), "r"(__src_bits), "l"(__cache_policy)
      : "memory");
#  else
  // Target older than SM_80: keep a reference to the undefined symbol so the link step reports it.
  __cuda_ptx_st_L1_evict_normal_L2_cache_hint_is_not_supported_before_SM_80__();
#  endif
}
#endif // __cccl_ptx_isa >= 740
|
|
481
|
+
|
|
482
|
+
/*
|
|
483
|
+
// st.space.L1::evict_normal.L2::cache_hint.b16 [addr], src, cache_policy; // PTX ISA 74, SM_80
|
|
484
|
+
// .space = { .global }
|
|
485
|
+
template <typename B16, enable_if_t<sizeof(B16) == 2, bool> = true>
|
|
486
|
+
__device__ static inline void st_L1_evict_normal_L2_cache_hint(
|
|
487
|
+
cuda::ptx::space_global_t,
|
|
488
|
+
B16* addr,
|
|
489
|
+
B16 src,
|
|
490
|
+
uint64_t cache_policy);
|
|
491
|
+
*/
|
|
492
|
+
#if __cccl_ptx_isa >= 740
// Only declared in this header; referenced by the fallback branch to force a descriptive linker error.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_normal_L2_cache_hint_is_not_supported_before_SM_80__();

// Emits `st.global.L1::evict_normal.L2::cache_hint.b16 [addr], src, cache_policy;` for any 2-byte type _B16.
//   space_global_t - unnamed tag argument used purely for overload selection.
//   __addr         - destination, converted with __as_ptr_gmem and bound as a 64-bit ("l") operand.
//   __src          - value to store; its bytes are read as an int16_t and bound as a 16-bit ("h") operand.
//   __cache_policy - 64-bit value forwarded unchanged as the instruction's cache-policy operand.
template <typename _B16, _CUDA_VSTD::enable_if_t<sizeof(_B16) == 2, bool> = true>
_CCCL_DEVICE static inline void
st_L1_evict_normal_L2_cache_hint(space_global_t, _B16* __addr, _B16 __src, _CUDA_VSTD::uint64_t __cache_policy)
{
  static_assert(sizeof(_B16) == 2, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
  const auto __gmem_addr               = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int16_t __src_bits = *reinterpret_cast<const _CUDA_VSTD::int16_t*>(&__src);
  asm("st.global.L1::evict_normal.L2::cache_hint.b16 [%0], %1, %2;"
      :
      : "l"(__gmem_addr), "h"(__src_bits), "l"(__cache_policy)
      : "memory");
#  else
  // Target older than SM_80: keep a reference to the undefined symbol so the link step reports it.
  __cuda_ptx_st_L1_evict_normal_L2_cache_hint_is_not_supported_before_SM_80__();
#  endif
}
#endif // __cccl_ptx_isa >= 740
|
|
513
|
+
|
|
514
|
+
/*
|
|
515
|
+
// st.space.L1::evict_normal.L2::cache_hint.b32 [addr], src, cache_policy; // PTX ISA 74, SM_80
|
|
516
|
+
// .space = { .global }
|
|
517
|
+
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
|
|
518
|
+
__device__ static inline void st_L1_evict_normal_L2_cache_hint(
|
|
519
|
+
cuda::ptx::space_global_t,
|
|
520
|
+
B32* addr,
|
|
521
|
+
B32 src,
|
|
522
|
+
uint64_t cache_policy);
|
|
523
|
+
*/
|
|
524
|
+
#if __cccl_ptx_isa >= 740
// Only declared in this header; referenced by the fallback branch to force a descriptive linker error.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_normal_L2_cache_hint_is_not_supported_before_SM_80__();

// Emits `st.global.L1::evict_normal.L2::cache_hint.b32 [addr], src, cache_policy;` for any 4-byte type _B32.
//   space_global_t - unnamed tag argument used purely for overload selection.
//   __addr         - destination, converted with __as_ptr_gmem and bound as a 64-bit ("l") operand.
//   __src          - value to store; its bytes are read as an int32_t and bound as a 32-bit ("r") operand.
//   __cache_policy - 64-bit value forwarded unchanged as the instruction's cache-policy operand.
template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
_CCCL_DEVICE static inline void
st_L1_evict_normal_L2_cache_hint(space_global_t, _B32* __addr, _B32 __src, _CUDA_VSTD::uint64_t __cache_policy)
{
  static_assert(sizeof(_B32) == 4, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
  const auto __gmem_addr               = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int32_t __src_bits = *reinterpret_cast<const _CUDA_VSTD::int32_t*>(&__src);
  asm("st.global.L1::evict_normal.L2::cache_hint.b32 [%0], %1, %2;"
      :
      : "l"(__gmem_addr), "r"(__src_bits), "l"(__cache_policy)
      : "memory");
#  else
  // Target older than SM_80: keep a reference to the undefined symbol so the link step reports it.
  __cuda_ptx_st_L1_evict_normal_L2_cache_hint_is_not_supported_before_SM_80__();
#  endif
}
#endif // __cccl_ptx_isa >= 740
|
|
545
|
+
|
|
546
|
+
/*
|
|
547
|
+
// st.space.L1::evict_normal.L2::cache_hint.b64 [addr], src, cache_policy; // PTX ISA 74, SM_80
|
|
548
|
+
// .space = { .global }
|
|
549
|
+
template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true>
|
|
550
|
+
__device__ static inline void st_L1_evict_normal_L2_cache_hint(
|
|
551
|
+
cuda::ptx::space_global_t,
|
|
552
|
+
B64* addr,
|
|
553
|
+
B64 src,
|
|
554
|
+
uint64_t cache_policy);
|
|
555
|
+
*/
|
|
556
|
+
#if __cccl_ptx_isa >= 740
// Only declared in this header; referenced by the fallback branch to force a descriptive linker error.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_normal_L2_cache_hint_is_not_supported_before_SM_80__();

// Emits `st.global.L1::evict_normal.L2::cache_hint.b64 [addr], src, cache_policy;` for any 8-byte type _B64.
//   space_global_t - unnamed tag argument used purely for overload selection.
//   __addr         - destination, converted with __as_ptr_gmem and bound as a 64-bit ("l") operand.
//   __src          - value to store; its bytes are read as an int64_t and bound as a 64-bit ("l") operand.
//   __cache_policy - 64-bit value forwarded unchanged as the instruction's cache-policy operand.
template <typename _B64, _CUDA_VSTD::enable_if_t<sizeof(_B64) == 8, bool> = true>
_CCCL_DEVICE static inline void
st_L1_evict_normal_L2_cache_hint(space_global_t, _B64* __addr, _B64 __src, _CUDA_VSTD::uint64_t __cache_policy)
{
  static_assert(sizeof(_B64) == 8, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
  const auto __gmem_addr               = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int64_t __src_bits = *reinterpret_cast<const _CUDA_VSTD::int64_t*>(&__src);
  asm("st.global.L1::evict_normal.L2::cache_hint.b64 [%0], %1, %2;"
      :
      : "l"(__gmem_addr), "l"(__src_bits), "l"(__cache_policy)
      : "memory");
#  else
  // Target older than SM_80: keep a reference to the undefined symbol so the link step reports it.
  __cuda_ptx_st_L1_evict_normal_L2_cache_hint_is_not_supported_before_SM_80__();
#  endif
}
#endif // __cccl_ptx_isa >= 740
|
|
577
|
+
|
|
578
|
+
/*
|
|
579
|
+
// st.space.L1::evict_normal.L2::cache_hint.b128 [addr], src, cache_policy; // PTX ISA 83, SM_80
|
|
580
|
+
// .space = { .global }
|
|
581
|
+
template <typename B128, enable_if_t<sizeof(B128) == 16, bool> = true>
|
|
582
|
+
__device__ static inline void st_L1_evict_normal_L2_cache_hint(
|
|
583
|
+
cuda::ptx::space_global_t,
|
|
584
|
+
B128* addr,
|
|
585
|
+
B128 src,
|
|
586
|
+
uint64_t cache_policy);
|
|
587
|
+
*/
|
|
588
|
+
#if __cccl_ptx_isa >= 830
// Only declared in this header; referenced by the fallback branch to force a descriptive linker error.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_normal_L2_cache_hint_is_not_supported_before_SM_80__();

// Emits `st.global.L1::evict_normal.L2::cache_hint.b128 [addr], src, cache_policy;` for any 16-byte type _B128.
//   space_global_t - unnamed tag argument used purely for overload selection.
//   __addr         - destination, converted with __as_ptr_gmem and bound as a 64-bit ("l") operand.
//   __src          - value to store; viewed as a longlong2 whose .x and .y members are passed as two
//                    64-bit operands and packed into a scratch .b128 register with mov.b128.
//   __cache_policy - 64-bit value forwarded unchanged as the instruction's cache-policy operand.
template <typename _B128, _CUDA_VSTD::enable_if_t<sizeof(_B128) == 16, bool> = true>
_CCCL_DEVICE static inline void
st_L1_evict_normal_L2_cache_hint(space_global_t, _B128* __addr, _B128 __src, _CUDA_VSTD::uint64_t __cache_policy)
{
  static_assert(sizeof(_B128) == 16, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
  const auto __gmem_addr    = __as_ptr_gmem(__addr);
  const longlong2* __halves = reinterpret_cast<longlong2*>(&__src);
  // The braces in the PTX text open a scope so the B128_src register declaration stays local.
  asm("{\n\t .reg .b128 B128_src; \n\t"
      "mov.b128 B128_src, {%1, %2}; \n"
      "st.global.L1::evict_normal.L2::cache_hint.b128 [%0], B128_src, %3;\n\t"
      "}"
      :
      : "l"(__gmem_addr), "l"(__halves->x), "l"(__halves->y), "l"(__cache_policy)
      : "memory");
#  else
  // Target older than SM_80: keep a reference to the undefined symbol so the link step reports it.
  __cuda_ptx_st_L1_evict_normal_L2_cache_hint_is_not_supported_before_SM_80__();
#  endif
}
#endif // __cccl_ptx_isa >= 830
|
|
613
|
+
|
|
614
|
+
/*
|
|
615
|
+
// st.space.L1::evict_unchanged.b8 [addr], src; // PTX ISA 74, SM_70
|
|
616
|
+
// .space = { .global }
|
|
617
|
+
template <typename B8, enable_if_t<sizeof(B8) == 1, bool> = true>
|
|
618
|
+
__device__ static inline void st_L1_evict_unchanged(
|
|
619
|
+
cuda::ptx::space_global_t,
|
|
620
|
+
B8* addr,
|
|
621
|
+
B8 src);
|
|
622
|
+
*/
|
|
623
|
+
#if __cccl_ptx_isa >= 740
// Only declared in this header; referenced by the fallback branch to force a descriptive linker error.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_unchanged_is_not_supported_before_SM_70__();

// Emits `st.global.L1::evict_unchanged.b8 [addr], src;` for any 1-byte type _B8.
//   space_global_t - unnamed tag argument used purely for overload selection.
//   __addr         - destination, converted with __as_ptr_gmem and bound as a 64-bit ("l") operand.
//   __src          - value to store, converted with __b8_as_u32 and bound as a 32-bit ("r") operand.
template <typename _B8, _CUDA_VSTD::enable_if_t<sizeof(_B8) == 1, bool> = true>
_CCCL_DEVICE static inline void st_L1_evict_unchanged(space_global_t, _B8* __addr, _B8 __src)
{
  static_assert(sizeof(_B8) == 1, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
  const auto __gmem_addr = __as_ptr_gmem(__addr);
  const auto __src_bits  = __b8_as_u32(__src);
  asm("st.global.L1::evict_unchanged.b8 [%0], %1;" : : "l"(__gmem_addr), "r"(__src_bits) : "memory");
#  else
  // Target older than SM_70: keep a reference to the undefined symbol so the link step reports it.
  __cuda_ptx_st_L1_evict_unchanged_is_not_supported_before_SM_70__();
#  endif
}
#endif // __cccl_ptx_isa >= 740
|
|
638
|
+
|
|
639
|
+
/*
|
|
640
|
+
// st.space.L1::evict_unchanged.b16 [addr], src; // PTX ISA 74, SM_70
|
|
641
|
+
// .space = { .global }
|
|
642
|
+
template <typename B16, enable_if_t<sizeof(B16) == 2, bool> = true>
|
|
643
|
+
__device__ static inline void st_L1_evict_unchanged(
|
|
644
|
+
cuda::ptx::space_global_t,
|
|
645
|
+
B16* addr,
|
|
646
|
+
B16 src);
|
|
647
|
+
*/
|
|
648
|
+
#if __cccl_ptx_isa >= 740
// Only declared in this header; referenced by the fallback branch to force a descriptive linker error.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_unchanged_is_not_supported_before_SM_70__();

// Emits `st.global.L1::evict_unchanged.b16 [addr], src;` for any 2-byte type _B16.
//   space_global_t - unnamed tag argument used purely for overload selection.
//   __addr         - destination, converted with __as_ptr_gmem and bound as a 64-bit ("l") operand.
//   __src          - value to store; its bytes are read as an int16_t and bound as a 16-bit ("h") operand.
template <typename _B16, _CUDA_VSTD::enable_if_t<sizeof(_B16) == 2, bool> = true>
_CCCL_DEVICE static inline void st_L1_evict_unchanged(space_global_t, _B16* __addr, _B16 __src)
{
  static_assert(sizeof(_B16) == 2, "");
#  if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
  const auto __gmem_addr               = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int16_t __src_bits = *reinterpret_cast<const _CUDA_VSTD::int16_t*>(&__src);
  asm("st.global.L1::evict_unchanged.b16 [%0], %1;" : : "l"(__gmem_addr), "h"(__src_bits) : "memory");
#  else
  // Target older than SM_70: keep a reference to the undefined symbol so the link step reports it.
  __cuda_ptx_st_L1_evict_unchanged_is_not_supported_before_SM_70__();
#  endif
}
#endif // __cccl_ptx_isa >= 740
|
|
666
|
+
|
|
667
|
+
/*
|
|
668
|
+
// st.space.L1::evict_unchanged.b32 [addr], src; // PTX ISA 74, SM_70
|
|
669
|
+
// .space = { .global }
|
|
670
|
+
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
|
|
671
|
+
__device__ static inline void st_L1_evict_unchanged(
|
|
672
|
+
cuda::ptx::space_global_t,
|
|
673
|
+
B32* addr,
|
|
674
|
+
B32 src);
|
|
675
|
+
*/
|
|
676
|
+
#if __cccl_ptx_isa >= 740
// Declaration only: the fallback branch below calls it so that unsupported targets fail at link time
// with a symbol name that spells out the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_unchanged_is_not_supported_before_SM_70__();

// 32-bit overload of st_L1_evict_unchanged: emits `st.global.L1::evict_unchanged.b32 [addr], src`.
//   (tag)   space_global_t, unnamed; only used to select the .global form.
//   __addr  destination pointer, converted with __as_ptr_gmem for the address operand.
//   __src   value to store; its object representation is read as an int32_t for the "r" operand.
template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
_CCCL_DEVICE static inline void st_L1_evict_unchanged(space_global_t, _B32* __addr, _B32 __src)
{
  // The state space is fixed to .global by the tag parameter type.
  static_assert(sizeof(_B32) == 4, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
  const auto __gmem_addr               = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int32_t __src_bits = *reinterpret_cast<const _CUDA_VSTD::int32_t*>(&__src);
  asm("st.global.L1::evict_unchanged.b32 [%0], %1;" : : "l"(__gmem_addr), "r"(__src_bits) : "memory");
# else
  // Target architecture is too old: reference the undefined symbol to force a descriptive linker error.
  __cuda_ptx_st_L1_evict_unchanged_is_not_supported_before_SM_70__();
# endif
}
#endif // __cccl_ptx_isa >= 740
|
|
694
|
+
|
|
695
|
+
/*
|
|
696
|
+
// st.space.L1::evict_unchanged.b64 [addr], src; // PTX ISA 74, SM_70
|
|
697
|
+
// .space = { .global }
|
|
698
|
+
template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true>
|
|
699
|
+
__device__ static inline void st_L1_evict_unchanged(
|
|
700
|
+
cuda::ptx::space_global_t,
|
|
701
|
+
B64* addr,
|
|
702
|
+
B64 src);
|
|
703
|
+
*/
|
|
704
|
+
#if __cccl_ptx_isa >= 740
// Declaration only: the fallback branch below calls it so that unsupported targets fail at link time
// with a symbol name that spells out the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_unchanged_is_not_supported_before_SM_70__();

// 64-bit overload of st_L1_evict_unchanged: emits `st.global.L1::evict_unchanged.b64 [addr], src`.
//   (tag)   space_global_t, unnamed; only used to select the .global form.
//   __addr  destination pointer, converted with __as_ptr_gmem for the address operand.
//   __src   value to store; its object representation is read as an int64_t for the "l" operand.
template <typename _B64, _CUDA_VSTD::enable_if_t<sizeof(_B64) == 8, bool> = true>
_CCCL_DEVICE static inline void st_L1_evict_unchanged(space_global_t, _B64* __addr, _B64 __src)
{
  // The state space is fixed to .global by the tag parameter type.
  static_assert(sizeof(_B64) == 8, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
  const auto __gmem_addr               = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int64_t __src_bits = *reinterpret_cast<const _CUDA_VSTD::int64_t*>(&__src);
  asm("st.global.L1::evict_unchanged.b64 [%0], %1;" : : "l"(__gmem_addr), "l"(__src_bits) : "memory");
# else
  // Target architecture is too old: reference the undefined symbol to force a descriptive linker error.
  __cuda_ptx_st_L1_evict_unchanged_is_not_supported_before_SM_70__();
# endif
}
#endif // __cccl_ptx_isa >= 740
|
|
722
|
+
|
|
723
|
+
/*
|
|
724
|
+
// st.space.L1::evict_unchanged.b128 [addr], src; // PTX ISA 83, SM_70
|
|
725
|
+
// .space = { .global }
|
|
726
|
+
template <typename B128, enable_if_t<sizeof(B128) == 16, bool> = true>
|
|
727
|
+
__device__ static inline void st_L1_evict_unchanged(
|
|
728
|
+
cuda::ptx::space_global_t,
|
|
729
|
+
B128* addr,
|
|
730
|
+
B128 src);
|
|
731
|
+
*/
|
|
732
|
+
#if __cccl_ptx_isa >= 830
// Declaration only: the fallback branch below calls it so that unsupported targets fail at link time
// with a symbol name that spells out the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_unchanged_is_not_supported_before_SM_70__();

// 128-bit overload of st_L1_evict_unchanged: emits `st.global.L1::evict_unchanged.b128 [addr], src`.
//   (tag)   space_global_t, unnamed; only used to select the .global form.
//   __addr  destination pointer, converted with __as_ptr_gmem for the address operand.
//   __src   value to store; viewed as a longlong2 whose .x/.y halves are packed into a .b128 register.
template <typename _B128, _CUDA_VSTD::enable_if_t<sizeof(_B128) == 16, bool> = true>
_CCCL_DEVICE static inline void st_L1_evict_unchanged(space_global_t, _B128* __addr, _B128 __src)
{
  // The state space is fixed to .global by the tag parameter type.
  static_assert(sizeof(_B128) == 16, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
  const auto __gmem_addr    = __as_ptr_gmem(__addr);
  longlong2* const __halves = reinterpret_cast<longlong2*>(&__src);
  // The two 64-bit halves are passed separately and combined inside the asm block via mov.b128.
  asm("{\n\t .reg .b128 B128_src; \n\t"
      "mov.b128 B128_src, {%1, %2}; \n"
      "st.global.L1::evict_unchanged.b128 [%0], B128_src;\n\t"
      "}"
      :
      : "l"(__gmem_addr), "l"(__halves->x), "l"(__halves->y)
      : "memory");
# else
  // Target architecture is too old: reference the undefined symbol to force a descriptive linker error.
  __cuda_ptx_st_L1_evict_unchanged_is_not_supported_before_SM_70__();
# endif
}
#endif // __cccl_ptx_isa >= 830
|
|
755
|
+
|
|
756
|
+
/*
|
|
757
|
+
// st.space.L1::evict_unchanged.L2::cache_hint.b8 [addr], src, cache_policy; // PTX ISA 74, SM_80
|
|
758
|
+
// .space = { .global }
|
|
759
|
+
template <typename B8, enable_if_t<sizeof(B8) == 1, bool> = true>
|
|
760
|
+
__device__ static inline void st_L1_evict_unchanged_L2_cache_hint(
|
|
761
|
+
cuda::ptx::space_global_t,
|
|
762
|
+
B8* addr,
|
|
763
|
+
B8 src,
|
|
764
|
+
uint64_t cache_policy);
|
|
765
|
+
*/
|
|
766
|
+
#if __cccl_ptx_isa >= 740
// Declaration only: the fallback branch below calls it so that unsupported targets fail at link time
// with a symbol name that spells out the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_unchanged_L2_cache_hint_is_not_supported_before_SM_80__();

// 8-bit overload: emits `st.global.L1::evict_unchanged.L2::cache_hint.b8 [addr], src, cache_policy`.
//   (tag)           space_global_t, unnamed; only used to select the .global form.
//   __addr          destination pointer, converted with __as_ptr_gmem for the address operand.
//   __src           value to store, widened with __b8_as_u32 into a 32-bit register operand.
//   __cache_policy  64-bit value forwarded unchanged as the instruction's cache-policy operand.
template <typename _B8, _CUDA_VSTD::enable_if_t<sizeof(_B8) == 1, bool> = true>
_CCCL_DEVICE static inline void
st_L1_evict_unchanged_L2_cache_hint(space_global_t, _B8* __addr, _B8 __src, _CUDA_VSTD::uint64_t __cache_policy)
{
  // The state space is fixed to .global by the tag parameter type.
  static_assert(sizeof(_B8) == 1, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
  const auto __gmem_addr = __as_ptr_gmem(__addr);
  const auto __src_bits  = __b8_as_u32(__src);
  asm("st.global.L1::evict_unchanged.L2::cache_hint.b8 [%0], %1, %2;"
      :
      : "l"(__gmem_addr), "r"(__src_bits), "l"(__cache_policy)
      : "memory");
# else
  // Target architecture is too old: reference the undefined symbol to force a descriptive linker error.
  __cuda_ptx_st_L1_evict_unchanged_L2_cache_hint_is_not_supported_before_SM_80__();
# endif
}
#endif // __cccl_ptx_isa >= 740
|
|
785
|
+
|
|
786
|
+
/*
|
|
787
|
+
// st.space.L1::evict_unchanged.L2::cache_hint.b16 [addr], src, cache_policy; // PTX ISA 74, SM_80
|
|
788
|
+
// .space = { .global }
|
|
789
|
+
template <typename B16, enable_if_t<sizeof(B16) == 2, bool> = true>
|
|
790
|
+
__device__ static inline void st_L1_evict_unchanged_L2_cache_hint(
|
|
791
|
+
cuda::ptx::space_global_t,
|
|
792
|
+
B16* addr,
|
|
793
|
+
B16 src,
|
|
794
|
+
uint64_t cache_policy);
|
|
795
|
+
*/
|
|
796
|
+
#if __cccl_ptx_isa >= 740
// Declaration only: the fallback branch below calls it so that unsupported targets fail at link time
// with a symbol name that spells out the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_unchanged_L2_cache_hint_is_not_supported_before_SM_80__();

// 16-bit overload: emits `st.global.L1::evict_unchanged.L2::cache_hint.b16 [addr], src, cache_policy`.
//   (tag)           space_global_t, unnamed; only used to select the .global form.
//   __addr          destination pointer, converted with __as_ptr_gmem for the address operand.
//   __src           value to store; its object representation is read as an int16_t for the "h" operand.
//   __cache_policy  64-bit value forwarded unchanged as the instruction's cache-policy operand.
template <typename _B16, _CUDA_VSTD::enable_if_t<sizeof(_B16) == 2, bool> = true>
_CCCL_DEVICE static inline void
st_L1_evict_unchanged_L2_cache_hint(space_global_t, _B16* __addr, _B16 __src, _CUDA_VSTD::uint64_t __cache_policy)
{
  // The state space is fixed to .global by the tag parameter type.
  static_assert(sizeof(_B16) == 2, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
  const auto __gmem_addr               = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int16_t __src_bits = *reinterpret_cast<const _CUDA_VSTD::int16_t*>(&__src);
  asm("st.global.L1::evict_unchanged.L2::cache_hint.b16 [%0], %1, %2;"
      :
      : "l"(__gmem_addr), "h"(__src_bits), "l"(__cache_policy)
      : "memory");
# else
  // Target architecture is too old: reference the undefined symbol to force a descriptive linker error.
  __cuda_ptx_st_L1_evict_unchanged_L2_cache_hint_is_not_supported_before_SM_80__();
# endif
}
#endif // __cccl_ptx_isa >= 740
|
|
817
|
+
|
|
818
|
+
/*
|
|
819
|
+
// st.space.L1::evict_unchanged.L2::cache_hint.b32 [addr], src, cache_policy; // PTX ISA 74, SM_80
|
|
820
|
+
// .space = { .global }
|
|
821
|
+
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
|
|
822
|
+
__device__ static inline void st_L1_evict_unchanged_L2_cache_hint(
|
|
823
|
+
cuda::ptx::space_global_t,
|
|
824
|
+
B32* addr,
|
|
825
|
+
B32 src,
|
|
826
|
+
uint64_t cache_policy);
|
|
827
|
+
*/
|
|
828
|
+
#if __cccl_ptx_isa >= 740
// Declaration only: the fallback branch below calls it so that unsupported targets fail at link time
// with a symbol name that spells out the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_unchanged_L2_cache_hint_is_not_supported_before_SM_80__();

// 32-bit overload: emits `st.global.L1::evict_unchanged.L2::cache_hint.b32 [addr], src, cache_policy`.
//   (tag)           space_global_t, unnamed; only used to select the .global form.
//   __addr          destination pointer, converted with __as_ptr_gmem for the address operand.
//   __src           value to store; its object representation is read as an int32_t for the "r" operand.
//   __cache_policy  64-bit value forwarded unchanged as the instruction's cache-policy operand.
template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
_CCCL_DEVICE static inline void
st_L1_evict_unchanged_L2_cache_hint(space_global_t, _B32* __addr, _B32 __src, _CUDA_VSTD::uint64_t __cache_policy)
{
  // The state space is fixed to .global by the tag parameter type.
  static_assert(sizeof(_B32) == 4, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
  const auto __gmem_addr               = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int32_t __src_bits = *reinterpret_cast<const _CUDA_VSTD::int32_t*>(&__src);
  asm("st.global.L1::evict_unchanged.L2::cache_hint.b32 [%0], %1, %2;"
      :
      : "l"(__gmem_addr), "r"(__src_bits), "l"(__cache_policy)
      : "memory");
# else
  // Target architecture is too old: reference the undefined symbol to force a descriptive linker error.
  __cuda_ptx_st_L1_evict_unchanged_L2_cache_hint_is_not_supported_before_SM_80__();
# endif
}
#endif // __cccl_ptx_isa >= 740
|
|
849
|
+
|
|
850
|
+
/*
|
|
851
|
+
// st.space.L1::evict_unchanged.L2::cache_hint.b64 [addr], src, cache_policy; // PTX ISA 74, SM_80
|
|
852
|
+
// .space = { .global }
|
|
853
|
+
template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true>
|
|
854
|
+
__device__ static inline void st_L1_evict_unchanged_L2_cache_hint(
|
|
855
|
+
cuda::ptx::space_global_t,
|
|
856
|
+
B64* addr,
|
|
857
|
+
B64 src,
|
|
858
|
+
uint64_t cache_policy);
|
|
859
|
+
*/
|
|
860
|
+
#if __cccl_ptx_isa >= 740
// Declaration only: the fallback branch below calls it so that unsupported targets fail at link time
// with a symbol name that spells out the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_unchanged_L2_cache_hint_is_not_supported_before_SM_80__();

// 64-bit overload: emits `st.global.L1::evict_unchanged.L2::cache_hint.b64 [addr], src, cache_policy`.
//   (tag)           space_global_t, unnamed; only used to select the .global form.
//   __addr          destination pointer, converted with __as_ptr_gmem for the address operand.
//   __src           value to store; its object representation is read as an int64_t for the "l" operand.
//   __cache_policy  64-bit value forwarded unchanged as the instruction's cache-policy operand.
template <typename _B64, _CUDA_VSTD::enable_if_t<sizeof(_B64) == 8, bool> = true>
_CCCL_DEVICE static inline void
st_L1_evict_unchanged_L2_cache_hint(space_global_t, _B64* __addr, _B64 __src, _CUDA_VSTD::uint64_t __cache_policy)
{
  // The state space is fixed to .global by the tag parameter type.
  static_assert(sizeof(_B64) == 8, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
  const auto __gmem_addr               = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int64_t __src_bits = *reinterpret_cast<const _CUDA_VSTD::int64_t*>(&__src);
  asm("st.global.L1::evict_unchanged.L2::cache_hint.b64 [%0], %1, %2;"
      :
      : "l"(__gmem_addr), "l"(__src_bits), "l"(__cache_policy)
      : "memory");
# else
  // Target architecture is too old: reference the undefined symbol to force a descriptive linker error.
  __cuda_ptx_st_L1_evict_unchanged_L2_cache_hint_is_not_supported_before_SM_80__();
# endif
}
#endif // __cccl_ptx_isa >= 740
|
|
881
|
+
|
|
882
|
+
/*
|
|
883
|
+
// st.space.L1::evict_unchanged.L2::cache_hint.b128 [addr], src, cache_policy; // PTX ISA 83, SM_80
|
|
884
|
+
// .space = { .global }
|
|
885
|
+
template <typename B128, enable_if_t<sizeof(B128) == 16, bool> = true>
|
|
886
|
+
__device__ static inline void st_L1_evict_unchanged_L2_cache_hint(
|
|
887
|
+
cuda::ptx::space_global_t,
|
|
888
|
+
B128* addr,
|
|
889
|
+
B128 src,
|
|
890
|
+
uint64_t cache_policy);
|
|
891
|
+
*/
|
|
892
|
+
#if __cccl_ptx_isa >= 830
// Declaration only: the fallback branch below calls it so that unsupported targets fail at link time
// with a symbol name that spells out the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_unchanged_L2_cache_hint_is_not_supported_before_SM_80__();

// 128-bit overload: emits `st.global.L1::evict_unchanged.L2::cache_hint.b128 [addr], src, cache_policy`.
//   (tag)           space_global_t, unnamed; only used to select the .global form.
//   __addr          destination pointer, converted with __as_ptr_gmem for the address operand.
//   __src           value to store; viewed as a longlong2 whose .x/.y halves are packed into a .b128 register.
//   __cache_policy  64-bit value forwarded unchanged as the instruction's cache-policy operand.
template <typename _B128, _CUDA_VSTD::enable_if_t<sizeof(_B128) == 16, bool> = true>
_CCCL_DEVICE static inline void
st_L1_evict_unchanged_L2_cache_hint(space_global_t, _B128* __addr, _B128 __src, _CUDA_VSTD::uint64_t __cache_policy)
{
  // The state space is fixed to .global by the tag parameter type.
  static_assert(sizeof(_B128) == 16, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
  const auto __gmem_addr    = __as_ptr_gmem(__addr);
  longlong2* const __halves = reinterpret_cast<longlong2*>(&__src);
  // The two 64-bit halves are passed separately and combined inside the asm block via mov.b128.
  asm("{\n\t .reg .b128 B128_src; \n\t"
      "mov.b128 B128_src, {%1, %2}; \n"
      "st.global.L1::evict_unchanged.L2::cache_hint.b128 [%0], B128_src, %3;\n\t"
      "}"
      :
      : "l"(__gmem_addr), "l"(__halves->x), "l"(__halves->y), "l"(__cache_policy)
      : "memory");
# else
  // Target architecture is too old: reference the undefined symbol to force a descriptive linker error.
  __cuda_ptx_st_L1_evict_unchanged_L2_cache_hint_is_not_supported_before_SM_80__();
# endif
}
#endif // __cccl_ptx_isa >= 830
|
|
917
|
+
|
|
918
|
+
/*
|
|
919
|
+
// st.space.L1::evict_first.b8 [addr], src; // PTX ISA 74, SM_70
|
|
920
|
+
// .space = { .global }
|
|
921
|
+
template <typename B8, enable_if_t<sizeof(B8) == 1, bool> = true>
|
|
922
|
+
__device__ static inline void st_L1_evict_first(
|
|
923
|
+
cuda::ptx::space_global_t,
|
|
924
|
+
B8* addr,
|
|
925
|
+
B8 src);
|
|
926
|
+
*/
|
|
927
|
+
#if __cccl_ptx_isa >= 740
// Declaration only: the fallback branch below calls it so that unsupported targets fail at link time
// with a symbol name that spells out the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_first_is_not_supported_before_SM_70__();

// 8-bit overload of st_L1_evict_first: emits `st.global.L1::evict_first.b8 [addr], src`.
//   (tag)   space_global_t, unnamed; only used to select the .global form.
//   __addr  destination pointer, converted with __as_ptr_gmem for the address operand.
//   __src   value to store, widened with __b8_as_u32 into a 32-bit register operand.
template <typename _B8, _CUDA_VSTD::enable_if_t<sizeof(_B8) == 1, bool> = true>
_CCCL_DEVICE static inline void st_L1_evict_first(space_global_t, _B8* __addr, _B8 __src)
{
  // The state space is fixed to .global by the tag parameter type.
  static_assert(sizeof(_B8) == 1, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
  const auto __gmem_addr = __as_ptr_gmem(__addr);
  const auto __src_bits  = __b8_as_u32(__src);
  asm("st.global.L1::evict_first.b8 [%0], %1;" : : "l"(__gmem_addr), "r"(__src_bits) : "memory");
# else
  // Target architecture is too old: reference the undefined symbol to force a descriptive linker error.
  __cuda_ptx_st_L1_evict_first_is_not_supported_before_SM_70__();
# endif
}
#endif // __cccl_ptx_isa >= 740
|
|
942
|
+
|
|
943
|
+
/*
|
|
944
|
+
// st.space.L1::evict_first.b16 [addr], src; // PTX ISA 74, SM_70
|
|
945
|
+
// .space = { .global }
|
|
946
|
+
template <typename B16, enable_if_t<sizeof(B16) == 2, bool> = true>
|
|
947
|
+
__device__ static inline void st_L1_evict_first(
|
|
948
|
+
cuda::ptx::space_global_t,
|
|
949
|
+
B16* addr,
|
|
950
|
+
B16 src);
|
|
951
|
+
*/
|
|
952
|
+
#if __cccl_ptx_isa >= 740
// Declaration only: the fallback branch below calls it so that unsupported targets fail at link time
// with a symbol name that spells out the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_first_is_not_supported_before_SM_70__();

// 16-bit overload of st_L1_evict_first: emits `st.global.L1::evict_first.b16 [addr], src`.
//   (tag)   space_global_t, unnamed; only used to select the .global form.
//   __addr  destination pointer, converted with __as_ptr_gmem for the address operand.
//   __src   value to store; its object representation is read as an int16_t for the "h" operand.
template <typename _B16, _CUDA_VSTD::enable_if_t<sizeof(_B16) == 2, bool> = true>
_CCCL_DEVICE static inline void st_L1_evict_first(space_global_t, _B16* __addr, _B16 __src)
{
  // The state space is fixed to .global by the tag parameter type.
  static_assert(sizeof(_B16) == 2, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
  const auto __gmem_addr               = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int16_t __src_bits = *reinterpret_cast<const _CUDA_VSTD::int16_t*>(&__src);
  asm("st.global.L1::evict_first.b16 [%0], %1;" : : "l"(__gmem_addr), "h"(__src_bits) : "memory");
# else
  // Target architecture is too old: reference the undefined symbol to force a descriptive linker error.
  __cuda_ptx_st_L1_evict_first_is_not_supported_before_SM_70__();
# endif
}
#endif // __cccl_ptx_isa >= 740
|
|
970
|
+
|
|
971
|
+
/*
|
|
972
|
+
// st.space.L1::evict_first.b32 [addr], src; // PTX ISA 74, SM_70
|
|
973
|
+
// .space = { .global }
|
|
974
|
+
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
|
|
975
|
+
__device__ static inline void st_L1_evict_first(
|
|
976
|
+
cuda::ptx::space_global_t,
|
|
977
|
+
B32* addr,
|
|
978
|
+
B32 src);
|
|
979
|
+
*/
|
|
980
|
+
#if __cccl_ptx_isa >= 740
// Declaration only: the fallback branch below calls it so that unsupported targets fail at link time
// with a symbol name that spells out the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_first_is_not_supported_before_SM_70__();

// 32-bit overload of st_L1_evict_first: emits `st.global.L1::evict_first.b32 [addr], src`.
//   (tag)   space_global_t, unnamed; only used to select the .global form.
//   __addr  destination pointer, converted with __as_ptr_gmem for the address operand.
//   __src   value to store; its object representation is read as an int32_t for the "r" operand.
template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
_CCCL_DEVICE static inline void st_L1_evict_first(space_global_t, _B32* __addr, _B32 __src)
{
  // The state space is fixed to .global by the tag parameter type.
  static_assert(sizeof(_B32) == 4, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
  const auto __gmem_addr               = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int32_t __src_bits = *reinterpret_cast<const _CUDA_VSTD::int32_t*>(&__src);
  asm("st.global.L1::evict_first.b32 [%0], %1;" : : "l"(__gmem_addr), "r"(__src_bits) : "memory");
# else
  // Target architecture is too old: reference the undefined symbol to force a descriptive linker error.
  __cuda_ptx_st_L1_evict_first_is_not_supported_before_SM_70__();
# endif
}
#endif // __cccl_ptx_isa >= 740
|
|
998
|
+
|
|
999
|
+
/*
|
|
1000
|
+
// st.space.L1::evict_first.b64 [addr], src; // PTX ISA 74, SM_70
|
|
1001
|
+
// .space = { .global }
|
|
1002
|
+
template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true>
|
|
1003
|
+
__device__ static inline void st_L1_evict_first(
|
|
1004
|
+
cuda::ptx::space_global_t,
|
|
1005
|
+
B64* addr,
|
|
1006
|
+
B64 src);
|
|
1007
|
+
*/
|
|
1008
|
+
#if __cccl_ptx_isa >= 740
// Declaration only: the fallback branch below calls it so that unsupported targets fail at link time
// with a symbol name that spells out the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_first_is_not_supported_before_SM_70__();

// 64-bit overload of st_L1_evict_first: emits `st.global.L1::evict_first.b64 [addr], src`.
//   (tag)   space_global_t, unnamed; only used to select the .global form.
//   __addr  destination pointer, converted with __as_ptr_gmem for the address operand.
//   __src   value to store; its object representation is read as an int64_t for the "l" operand.
template <typename _B64, _CUDA_VSTD::enable_if_t<sizeof(_B64) == 8, bool> = true>
_CCCL_DEVICE static inline void st_L1_evict_first(space_global_t, _B64* __addr, _B64 __src)
{
  // The state space is fixed to .global by the tag parameter type.
  static_assert(sizeof(_B64) == 8, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
  const auto __gmem_addr               = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int64_t __src_bits = *reinterpret_cast<const _CUDA_VSTD::int64_t*>(&__src);
  asm("st.global.L1::evict_first.b64 [%0], %1;" : : "l"(__gmem_addr), "l"(__src_bits) : "memory");
# else
  // Target architecture is too old: reference the undefined symbol to force a descriptive linker error.
  __cuda_ptx_st_L1_evict_first_is_not_supported_before_SM_70__();
# endif
}
#endif // __cccl_ptx_isa >= 740
|
|
1026
|
+
|
|
1027
|
+
/*
|
|
1028
|
+
// st.space.L1::evict_first.b128 [addr], src; // PTX ISA 83, SM_70
|
|
1029
|
+
// .space = { .global }
|
|
1030
|
+
template <typename B128, enable_if_t<sizeof(B128) == 16, bool> = true>
|
|
1031
|
+
__device__ static inline void st_L1_evict_first(
|
|
1032
|
+
cuda::ptx::space_global_t,
|
|
1033
|
+
B128* addr,
|
|
1034
|
+
B128 src);
|
|
1035
|
+
*/
|
|
1036
|
+
#if __cccl_ptx_isa >= 830
// Declaration only: the fallback branch below calls it so that unsupported targets fail at link time
// with a symbol name that spells out the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_first_is_not_supported_before_SM_70__();

// 128-bit overload of st_L1_evict_first: emits `st.global.L1::evict_first.b128 [addr], src`.
//   (tag)   space_global_t, unnamed; only used to select the .global form.
//   __addr  destination pointer, converted with __as_ptr_gmem for the address operand.
//   __src   value to store; viewed as a longlong2 whose .x/.y halves are packed into a .b128 register.
template <typename _B128, _CUDA_VSTD::enable_if_t<sizeof(_B128) == 16, bool> = true>
_CCCL_DEVICE static inline void st_L1_evict_first(space_global_t, _B128* __addr, _B128 __src)
{
  // The state space is fixed to .global by the tag parameter type.
  static_assert(sizeof(_B128) == 16, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
  const auto __gmem_addr    = __as_ptr_gmem(__addr);
  longlong2* const __halves = reinterpret_cast<longlong2*>(&__src);
  // The two 64-bit halves are passed separately and combined inside the asm block via mov.b128.
  asm("{\n\t .reg .b128 B128_src; \n\t"
      "mov.b128 B128_src, {%1, %2}; \n"
      "st.global.L1::evict_first.b128 [%0], B128_src;\n\t"
      "}"
      :
      : "l"(__gmem_addr), "l"(__halves->x), "l"(__halves->y)
      : "memory");
# else
  // Target architecture is too old: reference the undefined symbol to force a descriptive linker error.
  __cuda_ptx_st_L1_evict_first_is_not_supported_before_SM_70__();
# endif
}
#endif // __cccl_ptx_isa >= 830
|
|
1059
|
+
|
|
1060
|
+
/*
|
|
1061
|
+
// st.space.L1::evict_first.L2::cache_hint.b8 [addr], src, cache_policy; // PTX ISA 74, SM_80
|
|
1062
|
+
// .space = { .global }
|
|
1063
|
+
template <typename B8, enable_if_t<sizeof(B8) == 1, bool> = true>
|
|
1064
|
+
__device__ static inline void st_L1_evict_first_L2_cache_hint(
|
|
1065
|
+
cuda::ptx::space_global_t,
|
|
1066
|
+
B8* addr,
|
|
1067
|
+
B8 src,
|
|
1068
|
+
uint64_t cache_policy);
|
|
1069
|
+
*/
|
|
1070
|
+
#if __cccl_ptx_isa >= 740
// Declaration only: the fallback branch below calls it so that unsupported targets fail at link time
// with a symbol name that spells out the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_first_L2_cache_hint_is_not_supported_before_SM_80__();

// 8-bit overload: emits `st.global.L1::evict_first.L2::cache_hint.b8 [addr], src, cache_policy`.
//   (tag)           space_global_t, unnamed; only used to select the .global form.
//   __addr          destination pointer, converted with __as_ptr_gmem for the address operand.
//   __src           value to store, widened with __b8_as_u32 into a 32-bit register operand.
//   __cache_policy  64-bit value forwarded unchanged as the instruction's cache-policy operand.
template <typename _B8, _CUDA_VSTD::enable_if_t<sizeof(_B8) == 1, bool> = true>
_CCCL_DEVICE static inline void
st_L1_evict_first_L2_cache_hint(space_global_t, _B8* __addr, _B8 __src, _CUDA_VSTD::uint64_t __cache_policy)
{
  // The state space is fixed to .global by the tag parameter type.
  static_assert(sizeof(_B8) == 1, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
  const auto __gmem_addr = __as_ptr_gmem(__addr);
  const auto __src_bits  = __b8_as_u32(__src);
  asm("st.global.L1::evict_first.L2::cache_hint.b8 [%0], %1, %2;"
      :
      : "l"(__gmem_addr), "r"(__src_bits), "l"(__cache_policy)
      : "memory");
# else
  // Target architecture is too old: reference the undefined symbol to force a descriptive linker error.
  __cuda_ptx_st_L1_evict_first_L2_cache_hint_is_not_supported_before_SM_80__();
# endif
}
#endif // __cccl_ptx_isa >= 740
|
|
1089
|
+
|
|
1090
|
+
/*
|
|
1091
|
+
// st.space.L1::evict_first.L2::cache_hint.b16 [addr], src, cache_policy; // PTX ISA 74, SM_80
|
|
1092
|
+
// .space = { .global }
|
|
1093
|
+
template <typename B16, enable_if_t<sizeof(B16) == 2, bool> = true>
|
|
1094
|
+
__device__ static inline void st_L1_evict_first_L2_cache_hint(
|
|
1095
|
+
cuda::ptx::space_global_t,
|
|
1096
|
+
B16* addr,
|
|
1097
|
+
B16 src,
|
|
1098
|
+
uint64_t cache_policy);
|
|
1099
|
+
*/
|
|
1100
|
+
#if __cccl_ptx_isa >= 740
// Declaration only: the fallback branch below calls it so that unsupported targets fail at link time
// with a symbol name that spells out the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_first_L2_cache_hint_is_not_supported_before_SM_80__();

// 16-bit overload: emits `st.global.L1::evict_first.L2::cache_hint.b16 [addr], src, cache_policy`.
//   (tag)           space_global_t, unnamed; only used to select the .global form.
//   __addr          destination pointer, converted with __as_ptr_gmem for the address operand.
//   __src           value to store; its object representation is read as an int16_t for the "h" operand.
//   __cache_policy  64-bit value forwarded unchanged as the instruction's cache-policy operand.
template <typename _B16, _CUDA_VSTD::enable_if_t<sizeof(_B16) == 2, bool> = true>
_CCCL_DEVICE static inline void
st_L1_evict_first_L2_cache_hint(space_global_t, _B16* __addr, _B16 __src, _CUDA_VSTD::uint64_t __cache_policy)
{
  // The state space is fixed to .global by the tag parameter type.
  static_assert(sizeof(_B16) == 2, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
  const auto __gmem_addr               = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int16_t __src_bits = *reinterpret_cast<const _CUDA_VSTD::int16_t*>(&__src);
  asm("st.global.L1::evict_first.L2::cache_hint.b16 [%0], %1, %2;"
      :
      : "l"(__gmem_addr), "h"(__src_bits), "l"(__cache_policy)
      : "memory");
# else
  // Target architecture is too old: reference the undefined symbol to force a descriptive linker error.
  __cuda_ptx_st_L1_evict_first_L2_cache_hint_is_not_supported_before_SM_80__();
# endif
}
#endif // __cccl_ptx_isa >= 740
|
|
1121
|
+
|
|
1122
|
+
/*
|
|
1123
|
+
// st.space.L1::evict_first.L2::cache_hint.b32 [addr], src, cache_policy; // PTX ISA 74, SM_80
|
|
1124
|
+
// .space = { .global }
|
|
1125
|
+
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
|
|
1126
|
+
__device__ static inline void st_L1_evict_first_L2_cache_hint(
|
|
1127
|
+
cuda::ptx::space_global_t,
|
|
1128
|
+
B32* addr,
|
|
1129
|
+
B32 src,
|
|
1130
|
+
uint64_t cache_policy);
|
|
1131
|
+
*/
|
|
1132
|
+
#if __cccl_ptx_isa >= 740
// Declaration only: the fallback branch below calls it so that unsupported targets fail at link time
// with a symbol name that spells out the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_first_L2_cache_hint_is_not_supported_before_SM_80__();

// 32-bit overload: emits `st.global.L1::evict_first.L2::cache_hint.b32 [addr], src, cache_policy`.
//   (tag)           space_global_t, unnamed; only used to select the .global form.
//   __addr          destination pointer, converted with __as_ptr_gmem for the address operand.
//   __src           value to store; its object representation is read as an int32_t for the "r" operand.
//   __cache_policy  64-bit value forwarded unchanged as the instruction's cache-policy operand.
template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
_CCCL_DEVICE static inline void
st_L1_evict_first_L2_cache_hint(space_global_t, _B32* __addr, _B32 __src, _CUDA_VSTD::uint64_t __cache_policy)
{
  // The state space is fixed to .global by the tag parameter type.
  static_assert(sizeof(_B32) == 4, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
  const auto __gmem_addr               = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int32_t __src_bits = *reinterpret_cast<const _CUDA_VSTD::int32_t*>(&__src);
  asm("st.global.L1::evict_first.L2::cache_hint.b32 [%0], %1, %2;"
      :
      : "l"(__gmem_addr), "r"(__src_bits), "l"(__cache_policy)
      : "memory");
# else
  // Target architecture is too old: reference the undefined symbol to force a descriptive linker error.
  __cuda_ptx_st_L1_evict_first_L2_cache_hint_is_not_supported_before_SM_80__();
# endif
}
#endif // __cccl_ptx_isa >= 740
|
|
1153
|
+
|
|
1154
|
+
/*
|
|
1155
|
+
// st.space.L1::evict_first.L2::cache_hint.b64 [addr], src, cache_policy; // PTX ISA 74, SM_80
|
|
1156
|
+
// .space = { .global }
|
|
1157
|
+
template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true>
|
|
1158
|
+
__device__ static inline void st_L1_evict_first_L2_cache_hint(
|
|
1159
|
+
cuda::ptx::space_global_t,
|
|
1160
|
+
B64* addr,
|
|
1161
|
+
B64 src,
|
|
1162
|
+
uint64_t cache_policy);
|
|
1163
|
+
*/
|
|
1164
|
+
#if __cccl_ptx_isa >= 740
// Declaration only: the fallback branch below calls it so that unsupported targets fail at link time
// with a symbol name that spells out the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_first_L2_cache_hint_is_not_supported_before_SM_80__();

// 64-bit overload: emits `st.global.L1::evict_first.L2::cache_hint.b64 [addr], src, cache_policy`.
//   (tag)           space_global_t, unnamed; only used to select the .global form.
//   __addr          destination pointer, converted with __as_ptr_gmem for the address operand.
//   __src           value to store; its object representation is read as an int64_t for the "l" operand.
//   __cache_policy  64-bit value forwarded unchanged as the instruction's cache-policy operand.
template <typename _B64, _CUDA_VSTD::enable_if_t<sizeof(_B64) == 8, bool> = true>
_CCCL_DEVICE static inline void
st_L1_evict_first_L2_cache_hint(space_global_t, _B64* __addr, _B64 __src, _CUDA_VSTD::uint64_t __cache_policy)
{
  // The state space is fixed to .global by the tag parameter type.
  static_assert(sizeof(_B64) == 8, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
  const auto __gmem_addr               = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int64_t __src_bits = *reinterpret_cast<const _CUDA_VSTD::int64_t*>(&__src);
  asm("st.global.L1::evict_first.L2::cache_hint.b64 [%0], %1, %2;"
      :
      : "l"(__gmem_addr), "l"(__src_bits), "l"(__cache_policy)
      : "memory");
# else
  // Target architecture is too old: reference the undefined symbol to force a descriptive linker error.
  __cuda_ptx_st_L1_evict_first_L2_cache_hint_is_not_supported_before_SM_80__();
# endif
}
#endif // __cccl_ptx_isa >= 740
|
|
1185
|
+
|
|
1186
|
+
/*
|
|
1187
|
+
// st.space.L1::evict_first.L2::cache_hint.b128 [addr], src, cache_policy; // PTX ISA 83, SM_80
|
|
1188
|
+
// .space = { .global }
|
|
1189
|
+
template <typename B128, enable_if_t<sizeof(B128) == 16, bool> = true>
|
|
1190
|
+
__device__ static inline void st_L1_evict_first_L2_cache_hint(
|
|
1191
|
+
cuda::ptx::space_global_t,
|
|
1192
|
+
B128* addr,
|
|
1193
|
+
B128 src,
|
|
1194
|
+
uint64_t cache_policy);
|
|
1195
|
+
*/
|
|
1196
|
+
#if __cccl_ptx_isa >= 830
// Declaration only: the fallback branch below calls it so that unsupported targets fail at link time
// with a symbol name that spells out the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_first_L2_cache_hint_is_not_supported_before_SM_80__();

// 128-bit overload: emits `st.global.L1::evict_first.L2::cache_hint.b128 [addr], src, cache_policy`.
//   (tag)           space_global_t, unnamed; only used to select the .global form.
//   __addr          destination pointer, converted with __as_ptr_gmem for the address operand.
//   __src           value to store; viewed as a longlong2 whose .x/.y halves are packed into a .b128 register.
//   __cache_policy  64-bit value forwarded unchanged as the instruction's cache-policy operand.
template <typename _B128, _CUDA_VSTD::enable_if_t<sizeof(_B128) == 16, bool> = true>
_CCCL_DEVICE static inline void
st_L1_evict_first_L2_cache_hint(space_global_t, _B128* __addr, _B128 __src, _CUDA_VSTD::uint64_t __cache_policy)
{
  // The state space is fixed to .global by the tag parameter type.
  static_assert(sizeof(_B128) == 16, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
  const auto __gmem_addr    = __as_ptr_gmem(__addr);
  longlong2* const __halves = reinterpret_cast<longlong2*>(&__src);
  // The two 64-bit halves are passed separately and combined inside the asm block via mov.b128.
  asm("{\n\t .reg .b128 B128_src; \n\t"
      "mov.b128 B128_src, {%1, %2}; \n"
      "st.global.L1::evict_first.L2::cache_hint.b128 [%0], B128_src, %3;\n\t"
      "}"
      :
      : "l"(__gmem_addr), "l"(__halves->x), "l"(__halves->y), "l"(__cache_policy)
      : "memory");
# else
  // Target architecture is too old: reference the undefined symbol to force a descriptive linker error.
  __cuda_ptx_st_L1_evict_first_L2_cache_hint_is_not_supported_before_SM_80__();
# endif
}
#endif // __cccl_ptx_isa >= 830
|
|
1221
|
+
|
|
1222
|
+
/*
|
|
1223
|
+
// st.space.L1::evict_last.b8 [addr], src; // PTX ISA 74, SM_70
|
|
1224
|
+
// .space = { .global }
|
|
1225
|
+
template <typename B8, enable_if_t<sizeof(B8) == 1, bool> = true>
|
|
1226
|
+
__device__ static inline void st_L1_evict_last(
|
|
1227
|
+
cuda::ptx::space_global_t,
|
|
1228
|
+
B8* addr,
|
|
1229
|
+
B8 src);
|
|
1230
|
+
*/
|
|
1231
|
+
#if __cccl_ptx_isa >= 740
// Declaration only: the fallback branch below calls it so that unsupported targets fail at link time
// with a symbol name that spells out the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_last_is_not_supported_before_SM_70__();

// 8-bit overload of st_L1_evict_last: emits `st.global.L1::evict_last.b8 [addr], src`.
//   (tag)   space_global_t, unnamed; only used to select the .global form.
//   __addr  destination pointer, converted with __as_ptr_gmem for the address operand.
//   __src   value to store, widened with __b8_as_u32 into a 32-bit register operand.
template <typename _B8, _CUDA_VSTD::enable_if_t<sizeof(_B8) == 1, bool> = true>
_CCCL_DEVICE static inline void st_L1_evict_last(space_global_t, _B8* __addr, _B8 __src)
{
  // The state space is fixed to .global by the tag parameter type.
  static_assert(sizeof(_B8) == 1, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
  const auto __gmem_addr = __as_ptr_gmem(__addr);
  const auto __src_bits  = __b8_as_u32(__src);
  asm("st.global.L1::evict_last.b8 [%0], %1;" : : "l"(__gmem_addr), "r"(__src_bits) : "memory");
# else
  // Target architecture is too old: reference the undefined symbol to force a descriptive linker error.
  __cuda_ptx_st_L1_evict_last_is_not_supported_before_SM_70__();
# endif
}
#endif // __cccl_ptx_isa >= 740
|
|
1246
|
+
|
|
1247
|
+
/*
|
|
1248
|
+
// st.space.L1::evict_last.b16 [addr], src; // PTX ISA 74, SM_70
|
|
1249
|
+
// .space = { .global }
|
|
1250
|
+
template <typename B16, enable_if_t<sizeof(B16) == 2, bool> = true>
|
|
1251
|
+
__device__ static inline void st_L1_evict_last(
|
|
1252
|
+
cuda::ptx::space_global_t,
|
|
1253
|
+
B16* addr,
|
|
1254
|
+
B16 src);
|
|
1255
|
+
*/
|
|
1256
|
+
#if __cccl_ptx_isa >= 740
// Declaration only: the fallback branch below calls it so that unsupported targets fail at link time
// with a symbol name that spells out the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_last_is_not_supported_before_SM_70__();

// 16-bit overload of st_L1_evict_last: emits `st.global.L1::evict_last.b16 [addr], src`.
//   (tag)   space_global_t, unnamed; only used to select the .global form.
//   __addr  destination pointer, converted with __as_ptr_gmem for the address operand.
//   __src   value to store; its object representation is read as an int16_t for the "h" operand.
template <typename _B16, _CUDA_VSTD::enable_if_t<sizeof(_B16) == 2, bool> = true>
_CCCL_DEVICE static inline void st_L1_evict_last(space_global_t, _B16* __addr, _B16 __src)
{
  // The state space is fixed to .global by the tag parameter type.
  static_assert(sizeof(_B16) == 2, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
  const auto __gmem_addr               = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int16_t __src_bits = *reinterpret_cast<const _CUDA_VSTD::int16_t*>(&__src);
  asm("st.global.L1::evict_last.b16 [%0], %1;" : : "l"(__gmem_addr), "h"(__src_bits) : "memory");
# else
  // Target architecture is too old: reference the undefined symbol to force a descriptive linker error.
  __cuda_ptx_st_L1_evict_last_is_not_supported_before_SM_70__();
# endif
}
#endif // __cccl_ptx_isa >= 740
|
|
1274
|
+
|
|
1275
|
+
/*
|
|
1276
|
+
// st.space.L1::evict_last.b32 [addr], src; // PTX ISA 74, SM_70
|
|
1277
|
+
// .space = { .global }
|
|
1278
|
+
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
|
|
1279
|
+
__device__ static inline void st_L1_evict_last(
|
|
1280
|
+
cuda::ptx::space_global_t,
|
|
1281
|
+
B32* addr,
|
|
1282
|
+
B32 src);
|
|
1283
|
+
*/
|
|
1284
|
+
#if __cccl_ptx_isa >= 740
|
|
1285
|
+
// Declaration only. Per the note in the fallback branch below, calling this on an unsupported
// architecture is meant to end in a link error whose symbol name explains the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_last_is_not_supported_before_SM_70__();
// Wrapper around `st.global.L1::evict_last.b32 [addr], src;`.
// Participates in overload resolution only for 4-byte types; the space_global_t tag selects .global.
template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
_CCCL_DEVICE static inline void st_L1_evict_last(space_global_t, _B32* __addr, _B32 __src)
{
  // The tag parameter type already pins __space to space_global.
  static_assert(sizeof(_B32) == 4, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
  // %0 is the destination address; %1 is the raw 32 bits of __src, read through an int32_t view for "r".
  const auto __dst                  = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int32_t __bits = *reinterpret_cast<const _CUDA_VSTD::int32_t*>(&__src);
  asm("st.global.L1::evict_last.b32 [%0], %1;" : : "l"(__dst), "r"(__bits) : "memory");
# else
  // No store is emitted on this path; the call below is the diagnostic (see the declaration above the template).
  __cuda_ptx_st_L1_evict_last_is_not_supported_before_SM_70__();
# endif
}
|
|
1301
|
+
#endif // __cccl_ptx_isa >= 740
|
|
1302
|
+
|
|
1303
|
+
/*
|
|
1304
|
+
// st.space.L1::evict_last.b64 [addr], src; // PTX ISA 74, SM_70
|
|
1305
|
+
// .space = { .global }
|
|
1306
|
+
template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true>
|
|
1307
|
+
__device__ static inline void st_L1_evict_last(
|
|
1308
|
+
cuda::ptx::space_global_t,
|
|
1309
|
+
B64* addr,
|
|
1310
|
+
B64 src);
|
|
1311
|
+
*/
|
|
1312
|
+
#if __cccl_ptx_isa >= 740
|
|
1313
|
+
// Declaration only. Per the note in the fallback branch below, calling this on an unsupported
// architecture is meant to end in a link error whose symbol name explains the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_last_is_not_supported_before_SM_70__();
// Wrapper around `st.global.L1::evict_last.b64 [addr], src;`.
// Participates in overload resolution only for 8-byte types; the space_global_t tag selects .global.
template <typename _B64, _CUDA_VSTD::enable_if_t<sizeof(_B64) == 8, bool> = true>
_CCCL_DEVICE static inline void st_L1_evict_last(space_global_t, _B64* __addr, _B64 __src)
{
  // The tag parameter type already pins __space to space_global.
  static_assert(sizeof(_B64) == 8, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
  // %0 is the destination address; %1 is the raw 64 bits of __src, read through an int64_t view for "l".
  const auto __dst                  = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int64_t __bits = *reinterpret_cast<const _CUDA_VSTD::int64_t*>(&__src);
  asm("st.global.L1::evict_last.b64 [%0], %1;" : : "l"(__dst), "l"(__bits) : "memory");
# else
  // No store is emitted on this path; the call below is the diagnostic (see the declaration above the template).
  __cuda_ptx_st_L1_evict_last_is_not_supported_before_SM_70__();
# endif
}
|
|
1329
|
+
#endif // __cccl_ptx_isa >= 740
|
|
1330
|
+
|
|
1331
|
+
/*
|
|
1332
|
+
// st.space.L1::evict_last.b128 [addr], src; // PTX ISA 83, SM_70
|
|
1333
|
+
// .space = { .global }
|
|
1334
|
+
template <typename B128, enable_if_t<sizeof(B128) == 16, bool> = true>
|
|
1335
|
+
__device__ static inline void st_L1_evict_last(
|
|
1336
|
+
cuda::ptx::space_global_t,
|
|
1337
|
+
B128* addr,
|
|
1338
|
+
B128 src);
|
|
1339
|
+
*/
|
|
1340
|
+
#if __cccl_ptx_isa >= 830
|
|
1341
|
+
// Declaration only. Per the note in the fallback branch below, calling this on an unsupported
// architecture is meant to end in a link error whose symbol name explains the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_last_is_not_supported_before_SM_70__();
// Wrapper around `st.global.L1::evict_last.b128 [addr], src;`.
// Participates in overload resolution only for 16-byte types; the space_global_t tag selects .global.
template <typename _B128, _CUDA_VSTD::enable_if_t<sizeof(_B128) == 16, bool> = true>
_CCCL_DEVICE static inline void st_L1_evict_last(space_global_t, _B128* __addr, _B128 __src)
{
  // The tag parameter type already pins __space to space_global.
  static_assert(sizeof(_B128) == 16, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
  // The 16-byte value is handed to the asm as two 64-bit operands (%1, %2) taken from a longlong2 view of
  // __src; inside the asm scope they are packed into a temporary .b128 register that is then stored.
  const auto __dst          = __as_ptr_gmem(__addr);
  const longlong2& __halves = *reinterpret_cast<longlong2*>(&__src);
  asm("{\n\t .reg .b128 B128_src; \n\t"
      "mov.b128 B128_src, {%1, %2}; \n"
      "st.global.L1::evict_last.b128 [%0], B128_src;\n\t"
      "}"
      :
      : "l"(__dst), "l"(__halves.x), "l"(__halves.y)
      : "memory");
# else
  // No store is emitted on this path; the call below is the diagnostic (see the declaration above the template).
  __cuda_ptx_st_L1_evict_last_is_not_supported_before_SM_70__();
# endif
}
|
|
1362
|
+
#endif // __cccl_ptx_isa >= 830
|
|
1363
|
+
|
|
1364
|
+
/*
|
|
1365
|
+
// st.space.L1::evict_last.L2::cache_hint.b8 [addr], src, cache_policy; // PTX ISA 74, SM_80
|
|
1366
|
+
// .space = { .global }
|
|
1367
|
+
template <typename B8, enable_if_t<sizeof(B8) == 1, bool> = true>
|
|
1368
|
+
__device__ static inline void st_L1_evict_last_L2_cache_hint(
|
|
1369
|
+
cuda::ptx::space_global_t,
|
|
1370
|
+
B8* addr,
|
|
1371
|
+
B8 src,
|
|
1372
|
+
uint64_t cache_policy);
|
|
1373
|
+
*/
|
|
1374
|
+
#if __cccl_ptx_isa >= 740
|
|
1375
|
+
// Declaration only. Per the note in the fallback branch below, calling this on an unsupported
// architecture is meant to end in a link error whose symbol name explains the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_last_L2_cache_hint_is_not_supported_before_SM_80__();
// Wrapper around `st.global.L1::evict_last.L2::cache_hint.b8 [addr], src, cache_policy;`.
// Participates in overload resolution only for 1-byte types; the space_global_t tag selects .global.
template <typename _B8, _CUDA_VSTD::enable_if_t<sizeof(_B8) == 1, bool> = true>
_CCCL_DEVICE static inline void
st_L1_evict_last_L2_cache_hint(space_global_t, _B8* __addr, _B8 __src, _CUDA_VSTD::uint64_t __cache_policy)
{
  // The tag parameter type already pins __space to space_global.
  static_assert(sizeof(_B8) == 1, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
  // %0 destination address, %1 the byte widened for the "r" operand, %2 the 64-bit cache policy operand.
  const auto __dst  = __as_ptr_gmem(__addr);
  const auto __bits = __b8_as_u32(__src);
  asm("st.global.L1::evict_last.L2::cache_hint.b8 [%0], %1, %2;"
      :
      : "l"(__dst), "r"(__bits), "l"(__cache_policy)
      : "memory");
# else
  // No store is emitted on this path; the call below is the diagnostic (see the declaration above the template).
  __cuda_ptx_st_L1_evict_last_L2_cache_hint_is_not_supported_before_SM_80__();
# endif
}
|
|
1392
|
+
#endif // __cccl_ptx_isa >= 740
|
|
1393
|
+
|
|
1394
|
+
/*
|
|
1395
|
+
// st.space.L1::evict_last.L2::cache_hint.b16 [addr], src, cache_policy; // PTX ISA 74, SM_80
|
|
1396
|
+
// .space = { .global }
|
|
1397
|
+
template <typename B16, enable_if_t<sizeof(B16) == 2, bool> = true>
|
|
1398
|
+
__device__ static inline void st_L1_evict_last_L2_cache_hint(
|
|
1399
|
+
cuda::ptx::space_global_t,
|
|
1400
|
+
B16* addr,
|
|
1401
|
+
B16 src,
|
|
1402
|
+
uint64_t cache_policy);
|
|
1403
|
+
*/
|
|
1404
|
+
#if __cccl_ptx_isa >= 740
|
|
1405
|
+
// Declaration only. Per the note in the fallback branch below, calling this on an unsupported
// architecture is meant to end in a link error whose symbol name explains the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_last_L2_cache_hint_is_not_supported_before_SM_80__();
// Wrapper around `st.global.L1::evict_last.L2::cache_hint.b16 [addr], src, cache_policy;`.
// Participates in overload resolution only for 2-byte types; the space_global_t tag selects .global.
template <typename _B16, _CUDA_VSTD::enable_if_t<sizeof(_B16) == 2, bool> = true>
_CCCL_DEVICE static inline void
st_L1_evict_last_L2_cache_hint(space_global_t, _B16* __addr, _B16 __src, _CUDA_VSTD::uint64_t __cache_policy)
{
  // The tag parameter type already pins __space to space_global.
  static_assert(sizeof(_B16) == 2, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
  // %0 destination address, %1 raw 16 bits of __src (int16_t view, "h"), %2 the 64-bit cache policy operand.
  const auto __dst                  = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int16_t __bits = *reinterpret_cast<const _CUDA_VSTD::int16_t*>(&__src);
  asm("st.global.L1::evict_last.L2::cache_hint.b16 [%0], %1, %2;"
      :
      : "l"(__dst), "h"(__bits), "l"(__cache_policy)
      : "memory");
# else
  // No store is emitted on this path; the call below is the diagnostic (see the declaration above the template).
  __cuda_ptx_st_L1_evict_last_L2_cache_hint_is_not_supported_before_SM_80__();
# endif
}
|
|
1424
|
+
#endif // __cccl_ptx_isa >= 740
|
|
1425
|
+
|
|
1426
|
+
/*
|
|
1427
|
+
// st.space.L1::evict_last.L2::cache_hint.b32 [addr], src, cache_policy; // PTX ISA 74, SM_80
|
|
1428
|
+
// .space = { .global }
|
|
1429
|
+
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
|
|
1430
|
+
__device__ static inline void st_L1_evict_last_L2_cache_hint(
|
|
1431
|
+
cuda::ptx::space_global_t,
|
|
1432
|
+
B32* addr,
|
|
1433
|
+
B32 src,
|
|
1434
|
+
uint64_t cache_policy);
|
|
1435
|
+
*/
|
|
1436
|
+
#if __cccl_ptx_isa >= 740
|
|
1437
|
+
// Declaration only. Per the note in the fallback branch below, calling this on an unsupported
// architecture is meant to end in a link error whose symbol name explains the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_last_L2_cache_hint_is_not_supported_before_SM_80__();
// Wrapper around `st.global.L1::evict_last.L2::cache_hint.b32 [addr], src, cache_policy;`.
// Participates in overload resolution only for 4-byte types; the space_global_t tag selects .global.
template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
_CCCL_DEVICE static inline void
st_L1_evict_last_L2_cache_hint(space_global_t, _B32* __addr, _B32 __src, _CUDA_VSTD::uint64_t __cache_policy)
{
  // The tag parameter type already pins __space to space_global.
  static_assert(sizeof(_B32) == 4, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
  // %0 destination address, %1 raw 32 bits of __src (int32_t view, "r"), %2 the 64-bit cache policy operand.
  const auto __dst                  = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int32_t __bits = *reinterpret_cast<const _CUDA_VSTD::int32_t*>(&__src);
  asm("st.global.L1::evict_last.L2::cache_hint.b32 [%0], %1, %2;"
      :
      : "l"(__dst), "r"(__bits), "l"(__cache_policy)
      : "memory");
# else
  // No store is emitted on this path; the call below is the diagnostic (see the declaration above the template).
  __cuda_ptx_st_L1_evict_last_L2_cache_hint_is_not_supported_before_SM_80__();
# endif
}
|
|
1456
|
+
#endif // __cccl_ptx_isa >= 740
|
|
1457
|
+
|
|
1458
|
+
/*
|
|
1459
|
+
// st.space.L1::evict_last.L2::cache_hint.b64 [addr], src, cache_policy; // PTX ISA 74, SM_80
|
|
1460
|
+
// .space = { .global }
|
|
1461
|
+
template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true>
|
|
1462
|
+
__device__ static inline void st_L1_evict_last_L2_cache_hint(
|
|
1463
|
+
cuda::ptx::space_global_t,
|
|
1464
|
+
B64* addr,
|
|
1465
|
+
B64 src,
|
|
1466
|
+
uint64_t cache_policy);
|
|
1467
|
+
*/
|
|
1468
|
+
#if __cccl_ptx_isa >= 740
|
|
1469
|
+
// Declaration only. Per the note in the fallback branch below, calling this on an unsupported
// architecture is meant to end in a link error whose symbol name explains the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_last_L2_cache_hint_is_not_supported_before_SM_80__();
// Wrapper around `st.global.L1::evict_last.L2::cache_hint.b64 [addr], src, cache_policy;`.
// Participates in overload resolution only for 8-byte types; the space_global_t tag selects .global.
template <typename _B64, _CUDA_VSTD::enable_if_t<sizeof(_B64) == 8, bool> = true>
_CCCL_DEVICE static inline void
st_L1_evict_last_L2_cache_hint(space_global_t, _B64* __addr, _B64 __src, _CUDA_VSTD::uint64_t __cache_policy)
{
  // The tag parameter type already pins __space to space_global.
  static_assert(sizeof(_B64) == 8, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
  // %0 destination address, %1 raw 64 bits of __src (int64_t view, "l"), %2 the 64-bit cache policy operand.
  const auto __dst                  = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int64_t __bits = *reinterpret_cast<const _CUDA_VSTD::int64_t*>(&__src);
  asm("st.global.L1::evict_last.L2::cache_hint.b64 [%0], %1, %2;"
      :
      : "l"(__dst), "l"(__bits), "l"(__cache_policy)
      : "memory");
# else
  // No store is emitted on this path; the call below is the diagnostic (see the declaration above the template).
  __cuda_ptx_st_L1_evict_last_L2_cache_hint_is_not_supported_before_SM_80__();
# endif
}
|
|
1488
|
+
#endif // __cccl_ptx_isa >= 740
|
|
1489
|
+
|
|
1490
|
+
/*
|
|
1491
|
+
// st.space.L1::evict_last.L2::cache_hint.b128 [addr], src, cache_policy; // PTX ISA 83, SM_80
|
|
1492
|
+
// .space = { .global }
|
|
1493
|
+
template <typename B128, enable_if_t<sizeof(B128) == 16, bool> = true>
|
|
1494
|
+
__device__ static inline void st_L1_evict_last_L2_cache_hint(
|
|
1495
|
+
cuda::ptx::space_global_t,
|
|
1496
|
+
B128* addr,
|
|
1497
|
+
B128 src,
|
|
1498
|
+
uint64_t cache_policy);
|
|
1499
|
+
*/
|
|
1500
|
+
#if __cccl_ptx_isa >= 830
|
|
1501
|
+
// Declaration only. Per the note in the fallback branch below, calling this on an unsupported
// architecture is meant to end in a link error whose symbol name explains the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_evict_last_L2_cache_hint_is_not_supported_before_SM_80__();
// Wrapper around `st.global.L1::evict_last.L2::cache_hint.b128 [addr], src, cache_policy;`.
// Participates in overload resolution only for 16-byte types; the space_global_t tag selects .global.
template <typename _B128, _CUDA_VSTD::enable_if_t<sizeof(_B128) == 16, bool> = true>
_CCCL_DEVICE static inline void
st_L1_evict_last_L2_cache_hint(space_global_t, _B128* __addr, _B128 __src, _CUDA_VSTD::uint64_t __cache_policy)
{
  // The tag parameter type already pins __space to space_global.
  static_assert(sizeof(_B128) == 16, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
  // The 16-byte value is handed to the asm as two 64-bit operands (%1, %2) taken from a longlong2 view of
  // __src and packed into a temporary .b128 register; %3 carries the 64-bit cache policy operand.
  const auto __dst          = __as_ptr_gmem(__addr);
  const longlong2& __halves = *reinterpret_cast<longlong2*>(&__src);
  asm("{\n\t .reg .b128 B128_src; \n\t"
      "mov.b128 B128_src, {%1, %2}; \n"
      "st.global.L1::evict_last.L2::cache_hint.b128 [%0], B128_src, %3;\n\t"
      "}"
      :
      : "l"(__dst), "l"(__halves.x), "l"(__halves.y), "l"(__cache_policy)
      : "memory");
# else
  // No store is emitted on this path; the call below is the diagnostic (see the declaration above the template).
  __cuda_ptx_st_L1_evict_last_L2_cache_hint_is_not_supported_before_SM_80__();
# endif
}
|
|
1524
|
+
#endif // __cccl_ptx_isa >= 830
|
|
1525
|
+
|
|
1526
|
+
/*
|
|
1527
|
+
// st.space.L1::no_allocate.b8 [addr], src; // PTX ISA 74, SM_70
|
|
1528
|
+
// .space = { .global }
|
|
1529
|
+
template <typename B8, enable_if_t<sizeof(B8) == 1, bool> = true>
|
|
1530
|
+
__device__ static inline void st_L1_no_allocate(
|
|
1531
|
+
cuda::ptx::space_global_t,
|
|
1532
|
+
B8* addr,
|
|
1533
|
+
B8 src);
|
|
1534
|
+
*/
|
|
1535
|
+
#if __cccl_ptx_isa >= 740
|
|
1536
|
+
// Declaration only. Per the note in the fallback branch below, calling this on an unsupported
// architecture is meant to end in a link error whose symbol name explains the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_no_allocate_is_not_supported_before_SM_70__();
// Wrapper around `st.global.L1::no_allocate.b8 [addr], src;`.
// Participates in overload resolution only for 1-byte types; the space_global_t tag selects .global.
template <typename _B8, _CUDA_VSTD::enable_if_t<sizeof(_B8) == 1, bool> = true>
_CCCL_DEVICE static inline void st_L1_no_allocate(space_global_t, _B8* __addr, _B8 __src)
{
  // The tag parameter type already pins __space to space_global.
  static_assert(sizeof(_B8) == 1, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
  // Operands are evaluated up front: %0 is the destination address, %1 the byte widened for the "r" operand.
  const auto __dst  = __as_ptr_gmem(__addr);
  const auto __bits = __b8_as_u32(__src);
  asm("st.global.L1::no_allocate.b8 [%0], %1;" : : "l"(__dst), "r"(__bits) : "memory");
# else
  // No store is emitted on this path; the call below is the diagnostic (see the declaration above the template).
  __cuda_ptx_st_L1_no_allocate_is_not_supported_before_SM_70__();
# endif
}
|
|
1549
|
+
#endif // __cccl_ptx_isa >= 740
|
|
1550
|
+
|
|
1551
|
+
/*
|
|
1552
|
+
// st.space.L1::no_allocate.b16 [addr], src; // PTX ISA 74, SM_70
|
|
1553
|
+
// .space = { .global }
|
|
1554
|
+
template <typename B16, enable_if_t<sizeof(B16) == 2, bool> = true>
|
|
1555
|
+
__device__ static inline void st_L1_no_allocate(
|
|
1556
|
+
cuda::ptx::space_global_t,
|
|
1557
|
+
B16* addr,
|
|
1558
|
+
B16 src);
|
|
1559
|
+
*/
|
|
1560
|
+
#if __cccl_ptx_isa >= 740
|
|
1561
|
+
// Declaration only. Per the note in the fallback branch below, calling this on an unsupported
// architecture is meant to end in a link error whose symbol name explains the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_no_allocate_is_not_supported_before_SM_70__();
// Wrapper around `st.global.L1::no_allocate.b16 [addr], src;`.
// Participates in overload resolution only for 2-byte types; the space_global_t tag selects .global.
template <typename _B16, _CUDA_VSTD::enable_if_t<sizeof(_B16) == 2, bool> = true>
_CCCL_DEVICE static inline void st_L1_no_allocate(space_global_t, _B16* __addr, _B16 __src)
{
  // The tag parameter type already pins __space to space_global.
  static_assert(sizeof(_B16) == 2, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
  // %0 is the destination address; %1 is the raw 16 bits of __src, read through an int16_t view for "h".
  const auto __dst                  = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int16_t __bits = *reinterpret_cast<const _CUDA_VSTD::int16_t*>(&__src);
  asm("st.global.L1::no_allocate.b16 [%0], %1;" : : "l"(__dst), "h"(__bits) : "memory");
# else
  // No store is emitted on this path; the call below is the diagnostic (see the declaration above the template).
  __cuda_ptx_st_L1_no_allocate_is_not_supported_before_SM_70__();
# endif
}
|
|
1577
|
+
#endif // __cccl_ptx_isa >= 740
|
|
1578
|
+
|
|
1579
|
+
/*
|
|
1580
|
+
// st.space.L1::no_allocate.b32 [addr], src; // PTX ISA 74, SM_70
|
|
1581
|
+
// .space = { .global }
|
|
1582
|
+
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
|
|
1583
|
+
__device__ static inline void st_L1_no_allocate(
|
|
1584
|
+
cuda::ptx::space_global_t,
|
|
1585
|
+
B32* addr,
|
|
1586
|
+
B32 src);
|
|
1587
|
+
*/
|
|
1588
|
+
#if __cccl_ptx_isa >= 740
|
|
1589
|
+
// Declaration only. Per the note in the fallback branch below, calling this on an unsupported
// architecture is meant to end in a link error whose symbol name explains the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_no_allocate_is_not_supported_before_SM_70__();
// Wrapper around `st.global.L1::no_allocate.b32 [addr], src;`.
// Participates in overload resolution only for 4-byte types; the space_global_t tag selects .global.
template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
_CCCL_DEVICE static inline void st_L1_no_allocate(space_global_t, _B32* __addr, _B32 __src)
{
  // The tag parameter type already pins __space to space_global.
  static_assert(sizeof(_B32) == 4, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
  // %0 is the destination address; %1 is the raw 32 bits of __src, read through an int32_t view for "r".
  const auto __dst                  = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int32_t __bits = *reinterpret_cast<const _CUDA_VSTD::int32_t*>(&__src);
  asm("st.global.L1::no_allocate.b32 [%0], %1;" : : "l"(__dst), "r"(__bits) : "memory");
# else
  // No store is emitted on this path; the call below is the diagnostic (see the declaration above the template).
  __cuda_ptx_st_L1_no_allocate_is_not_supported_before_SM_70__();
# endif
}
|
|
1605
|
+
#endif // __cccl_ptx_isa >= 740
|
|
1606
|
+
|
|
1607
|
+
/*
|
|
1608
|
+
// st.space.L1::no_allocate.b64 [addr], src; // PTX ISA 74, SM_70
|
|
1609
|
+
// .space = { .global }
|
|
1610
|
+
template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true>
|
|
1611
|
+
__device__ static inline void st_L1_no_allocate(
|
|
1612
|
+
cuda::ptx::space_global_t,
|
|
1613
|
+
B64* addr,
|
|
1614
|
+
B64 src);
|
|
1615
|
+
*/
|
|
1616
|
+
#if __cccl_ptx_isa >= 740
|
|
1617
|
+
// Declaration only. Per the note in the fallback branch below, calling this on an unsupported
// architecture is meant to end in a link error whose symbol name explains the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_no_allocate_is_not_supported_before_SM_70__();
// Wrapper around `st.global.L1::no_allocate.b64 [addr], src;`.
// Participates in overload resolution only for 8-byte types; the space_global_t tag selects .global.
template <typename _B64, _CUDA_VSTD::enable_if_t<sizeof(_B64) == 8, bool> = true>
_CCCL_DEVICE static inline void st_L1_no_allocate(space_global_t, _B64* __addr, _B64 __src)
{
  // The tag parameter type already pins __space to space_global.
  static_assert(sizeof(_B64) == 8, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
  // %0 is the destination address; %1 is the raw 64 bits of __src, read through an int64_t view for "l".
  const auto __dst                  = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int64_t __bits = *reinterpret_cast<const _CUDA_VSTD::int64_t*>(&__src);
  asm("st.global.L1::no_allocate.b64 [%0], %1;" : : "l"(__dst), "l"(__bits) : "memory");
# else
  // No store is emitted on this path; the call below is the diagnostic (see the declaration above the template).
  __cuda_ptx_st_L1_no_allocate_is_not_supported_before_SM_70__();
# endif
}
|
|
1633
|
+
#endif // __cccl_ptx_isa >= 740
|
|
1634
|
+
|
|
1635
|
+
/*
|
|
1636
|
+
// st.space.L1::no_allocate.b128 [addr], src; // PTX ISA 83, SM_70
|
|
1637
|
+
// .space = { .global }
|
|
1638
|
+
template <typename B128, enable_if_t<sizeof(B128) == 16, bool> = true>
|
|
1639
|
+
__device__ static inline void st_L1_no_allocate(
|
|
1640
|
+
cuda::ptx::space_global_t,
|
|
1641
|
+
B128* addr,
|
|
1642
|
+
B128 src);
|
|
1643
|
+
*/
|
|
1644
|
+
#if __cccl_ptx_isa >= 830
|
|
1645
|
+
// Declaration only. Per the note in the fallback branch below, calling this on an unsupported
// architecture is meant to end in a link error whose symbol name explains the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_no_allocate_is_not_supported_before_SM_70__();
// Wrapper around `st.global.L1::no_allocate.b128 [addr], src;`.
// Participates in overload resolution only for 16-byte types; the space_global_t tag selects .global.
template <typename _B128, _CUDA_VSTD::enable_if_t<sizeof(_B128) == 16, bool> = true>
_CCCL_DEVICE static inline void st_L1_no_allocate(space_global_t, _B128* __addr, _B128 __src)
{
  // The tag parameter type already pins __space to space_global.
  static_assert(sizeof(_B128) == 16, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 700
  // The 16-byte value is handed to the asm as two 64-bit operands (%1, %2) taken from a longlong2 view of
  // __src; inside the asm scope they are packed into a temporary .b128 register that is then stored.
  const auto __dst          = __as_ptr_gmem(__addr);
  const longlong2& __halves = *reinterpret_cast<longlong2*>(&__src);
  asm("{\n\t .reg .b128 B128_src; \n\t"
      "mov.b128 B128_src, {%1, %2}; \n"
      "st.global.L1::no_allocate.b128 [%0], B128_src;\n\t"
      "}"
      :
      : "l"(__dst), "l"(__halves.x), "l"(__halves.y)
      : "memory");
# else
  // No store is emitted on this path; the call below is the diagnostic (see the declaration above the template).
  __cuda_ptx_st_L1_no_allocate_is_not_supported_before_SM_70__();
# endif
}
|
|
1666
|
+
#endif // __cccl_ptx_isa >= 830
|
|
1667
|
+
|
|
1668
|
+
/*
|
|
1669
|
+
// st.space.L1::no_allocate.L2::cache_hint.b8 [addr], src, cache_policy; // PTX ISA 74, SM_80
|
|
1670
|
+
// .space = { .global }
|
|
1671
|
+
template <typename B8, enable_if_t<sizeof(B8) == 1, bool> = true>
|
|
1672
|
+
__device__ static inline void st_L1_no_allocate_L2_cache_hint(
|
|
1673
|
+
cuda::ptx::space_global_t,
|
|
1674
|
+
B8* addr,
|
|
1675
|
+
B8 src,
|
|
1676
|
+
uint64_t cache_policy);
|
|
1677
|
+
*/
|
|
1678
|
+
#if __cccl_ptx_isa >= 740
|
|
1679
|
+
// Declaration only. Per the note in the fallback branch below, calling this on an unsupported
// architecture is meant to end in a link error whose symbol name explains the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_no_allocate_L2_cache_hint_is_not_supported_before_SM_80__();
// Wrapper around `st.global.L1::no_allocate.L2::cache_hint.b8 [addr], src, cache_policy;`.
// Participates in overload resolution only for 1-byte types; the space_global_t tag selects .global.
template <typename _B8, _CUDA_VSTD::enable_if_t<sizeof(_B8) == 1, bool> = true>
_CCCL_DEVICE static inline void
st_L1_no_allocate_L2_cache_hint(space_global_t, _B8* __addr, _B8 __src, _CUDA_VSTD::uint64_t __cache_policy)
{
  // The tag parameter type already pins __space to space_global.
  static_assert(sizeof(_B8) == 1, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
  // %0 destination address, %1 the byte widened for the "r" operand, %2 the 64-bit cache policy operand.
  const auto __dst  = __as_ptr_gmem(__addr);
  const auto __bits = __b8_as_u32(__src);
  asm("st.global.L1::no_allocate.L2::cache_hint.b8 [%0], %1, %2;"
      :
      : "l"(__dst), "r"(__bits), "l"(__cache_policy)
      : "memory");
# else
  // No store is emitted on this path; the call below is the diagnostic (see the declaration above the template).
  __cuda_ptx_st_L1_no_allocate_L2_cache_hint_is_not_supported_before_SM_80__();
# endif
}
|
|
1696
|
+
#endif // __cccl_ptx_isa >= 740
|
|
1697
|
+
|
|
1698
|
+
/*
|
|
1699
|
+
// st.space.L1::no_allocate.L2::cache_hint.b16 [addr], src, cache_policy; // PTX ISA 74, SM_80
|
|
1700
|
+
// .space = { .global }
|
|
1701
|
+
template <typename B16, enable_if_t<sizeof(B16) == 2, bool> = true>
|
|
1702
|
+
__device__ static inline void st_L1_no_allocate_L2_cache_hint(
|
|
1703
|
+
cuda::ptx::space_global_t,
|
|
1704
|
+
B16* addr,
|
|
1705
|
+
B16 src,
|
|
1706
|
+
uint64_t cache_policy);
|
|
1707
|
+
*/
|
|
1708
|
+
#if __cccl_ptx_isa >= 740
|
|
1709
|
+
// Declaration only. Per the note in the fallback branch below, calling this on an unsupported
// architecture is meant to end in a link error whose symbol name explains the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_no_allocate_L2_cache_hint_is_not_supported_before_SM_80__();
// Wrapper around `st.global.L1::no_allocate.L2::cache_hint.b16 [addr], src, cache_policy;`.
// Participates in overload resolution only for 2-byte types; the space_global_t tag selects .global.
template <typename _B16, _CUDA_VSTD::enable_if_t<sizeof(_B16) == 2, bool> = true>
_CCCL_DEVICE static inline void
st_L1_no_allocate_L2_cache_hint(space_global_t, _B16* __addr, _B16 __src, _CUDA_VSTD::uint64_t __cache_policy)
{
  // The tag parameter type already pins __space to space_global.
  static_assert(sizeof(_B16) == 2, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
  // %0 destination address, %1 raw 16 bits of __src (int16_t view, "h"), %2 the 64-bit cache policy operand.
  const auto __dst                  = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int16_t __bits = *reinterpret_cast<const _CUDA_VSTD::int16_t*>(&__src);
  asm("st.global.L1::no_allocate.L2::cache_hint.b16 [%0], %1, %2;"
      :
      : "l"(__dst), "h"(__bits), "l"(__cache_policy)
      : "memory");
# else
  // No store is emitted on this path; the call below is the diagnostic (see the declaration above the template).
  __cuda_ptx_st_L1_no_allocate_L2_cache_hint_is_not_supported_before_SM_80__();
# endif
}
|
|
1728
|
+
#endif // __cccl_ptx_isa >= 740
|
|
1729
|
+
|
|
1730
|
+
/*
|
|
1731
|
+
// st.space.L1::no_allocate.L2::cache_hint.b32 [addr], src, cache_policy; // PTX ISA 74, SM_80
|
|
1732
|
+
// .space = { .global }
|
|
1733
|
+
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true>
|
|
1734
|
+
__device__ static inline void st_L1_no_allocate_L2_cache_hint(
|
|
1735
|
+
cuda::ptx::space_global_t,
|
|
1736
|
+
B32* addr,
|
|
1737
|
+
B32 src,
|
|
1738
|
+
uint64_t cache_policy);
|
|
1739
|
+
*/
|
|
1740
|
+
#if __cccl_ptx_isa >= 740
|
|
1741
|
+
// Declaration only. Per the note in the fallback branch below, calling this on an unsupported
// architecture is meant to end in a link error whose symbol name explains the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_no_allocate_L2_cache_hint_is_not_supported_before_SM_80__();
// Wrapper around `st.global.L1::no_allocate.L2::cache_hint.b32 [addr], src, cache_policy;`.
// Participates in overload resolution only for 4-byte types; the space_global_t tag selects .global.
template <typename _B32, _CUDA_VSTD::enable_if_t<sizeof(_B32) == 4, bool> = true>
_CCCL_DEVICE static inline void
st_L1_no_allocate_L2_cache_hint(space_global_t, _B32* __addr, _B32 __src, _CUDA_VSTD::uint64_t __cache_policy)
{
  // The tag parameter type already pins __space to space_global.
  static_assert(sizeof(_B32) == 4, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
  // %0 destination address, %1 raw 32 bits of __src (int32_t view, "r"), %2 the 64-bit cache policy operand.
  const auto __dst                  = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int32_t __bits = *reinterpret_cast<const _CUDA_VSTD::int32_t*>(&__src);
  asm("st.global.L1::no_allocate.L2::cache_hint.b32 [%0], %1, %2;"
      :
      : "l"(__dst), "r"(__bits), "l"(__cache_policy)
      : "memory");
# else
  // No store is emitted on this path; the call below is the diagnostic (see the declaration above the template).
  __cuda_ptx_st_L1_no_allocate_L2_cache_hint_is_not_supported_before_SM_80__();
# endif
}
|
|
1760
|
+
#endif // __cccl_ptx_isa >= 740
|
|
1761
|
+
|
|
1762
|
+
/*
|
|
1763
|
+
// st.space.L1::no_allocate.L2::cache_hint.b64 [addr], src, cache_policy; // PTX ISA 74, SM_80
|
|
1764
|
+
// .space = { .global }
|
|
1765
|
+
template <typename B64, enable_if_t<sizeof(B64) == 8, bool> = true>
|
|
1766
|
+
__device__ static inline void st_L1_no_allocate_L2_cache_hint(
|
|
1767
|
+
cuda::ptx::space_global_t,
|
|
1768
|
+
B64* addr,
|
|
1769
|
+
B64 src,
|
|
1770
|
+
uint64_t cache_policy);
|
|
1771
|
+
*/
|
|
1772
|
+
#if __cccl_ptx_isa >= 740
|
|
1773
|
+
// Declaration only. Per the note in the fallback branch below, calling this on an unsupported
// architecture is meant to end in a link error whose symbol name explains the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_no_allocate_L2_cache_hint_is_not_supported_before_SM_80__();
// Wrapper around `st.global.L1::no_allocate.L2::cache_hint.b64 [addr], src, cache_policy;`.
// Participates in overload resolution only for 8-byte types; the space_global_t tag selects .global.
template <typename _B64, _CUDA_VSTD::enable_if_t<sizeof(_B64) == 8, bool> = true>
_CCCL_DEVICE static inline void
st_L1_no_allocate_L2_cache_hint(space_global_t, _B64* __addr, _B64 __src, _CUDA_VSTD::uint64_t __cache_policy)
{
  // The tag parameter type already pins __space to space_global.
  static_assert(sizeof(_B64) == 8, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
  // %0 destination address, %1 raw 64 bits of __src (int64_t view, "l"), %2 the 64-bit cache policy operand.
  const auto __dst                  = __as_ptr_gmem(__addr);
  const _CUDA_VSTD::int64_t __bits = *reinterpret_cast<const _CUDA_VSTD::int64_t*>(&__src);
  asm("st.global.L1::no_allocate.L2::cache_hint.b64 [%0], %1, %2;"
      :
      : "l"(__dst), "l"(__bits), "l"(__cache_policy)
      : "memory");
# else
  // No store is emitted on this path; the call below is the diagnostic (see the declaration above the template).
  __cuda_ptx_st_L1_no_allocate_L2_cache_hint_is_not_supported_before_SM_80__();
# endif
}
|
|
1792
|
+
#endif // __cccl_ptx_isa >= 740
|
|
1793
|
+
|
|
1794
|
+
/*
|
|
1795
|
+
// st.space.L1::no_allocate.L2::cache_hint.b128 [addr], src, cache_policy; // PTX ISA 83, SM_80
|
|
1796
|
+
// .space = { .global }
|
|
1797
|
+
template <typename B128, enable_if_t<sizeof(B128) == 16, bool> = true>
|
|
1798
|
+
__device__ static inline void st_L1_no_allocate_L2_cache_hint(
|
|
1799
|
+
cuda::ptx::space_global_t,
|
|
1800
|
+
B128* addr,
|
|
1801
|
+
B128 src,
|
|
1802
|
+
uint64_t cache_policy);
|
|
1803
|
+
*/
|
|
1804
|
+
#if __cccl_ptx_isa >= 830
|
|
1805
|
+
// Declaration only. Per the note in the fallback branch below, calling this on an unsupported
// architecture is meant to end in a link error whose symbol name explains the requirement.
extern "C" _CCCL_DEVICE void __cuda_ptx_st_L1_no_allocate_L2_cache_hint_is_not_supported_before_SM_80__();
// Wrapper around `st.global.L1::no_allocate.L2::cache_hint.b128 [addr], src, cache_policy;`.
// Participates in overload resolution only for 16-byte types; the space_global_t tag selects .global.
template <typename _B128, _CUDA_VSTD::enable_if_t<sizeof(_B128) == 16, bool> = true>
_CCCL_DEVICE static inline void
st_L1_no_allocate_L2_cache_hint(space_global_t, _B128* __addr, _B128 __src, _CUDA_VSTD::uint64_t __cache_policy)
{
  // The tag parameter type already pins __space to space_global.
  static_assert(sizeof(_B128) == 16, "");
# if _CCCL_CUDA_COMPILER(NVHPC) || __CUDA_ARCH__ >= 800
  // The 16-byte value is handed to the asm as two 64-bit operands (%1, %2) taken from a longlong2 view of
  // __src and packed into a temporary .b128 register; %3 carries the 64-bit cache policy operand.
  const auto __dst          = __as_ptr_gmem(__addr);
  const longlong2& __halves = *reinterpret_cast<longlong2*>(&__src);
  asm("{\n\t .reg .b128 B128_src; \n\t"
      "mov.b128 B128_src, {%1, %2}; \n"
      "st.global.L1::no_allocate.L2::cache_hint.b128 [%0], B128_src, %3;\n\t"
      "}"
      :
      : "l"(__dst), "l"(__halves.x), "l"(__halves.y), "l"(__cache_policy)
      : "memory");
# else
  // No store is emitted on this path; the call below is the diagnostic (see the declaration above the template).
  __cuda_ptx_st_L1_no_allocate_L2_cache_hint_is_not_supported_before_SM_80__();
# endif
}
|
|
1828
|
+
#endif // __cccl_ptx_isa >= 830
|
|
1829
|
+
|
|
1830
|
+
#endif // _CUDA_PTX_GENERATED_ST_H_
|