cuda-cccl 0.1.3.1.0.dev1678__cp313-cp313-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/__init__.py +14 -0
- cuda/cccl/cooperative/__init__.py +3 -0
- cuda/cccl/cooperative/experimental/__init__.py +8 -0
- cuda/cccl/cooperative/experimental/_caching.py +48 -0
- cuda/cccl/cooperative/experimental/_common.py +273 -0
- cuda/cccl/cooperative/experimental/_nvrtc.py +91 -0
- cuda/cccl/cooperative/experimental/_scan_op.py +181 -0
- cuda/cccl/cooperative/experimental/_types.py +935 -0
- cuda/cccl/cooperative/experimental/_typing.py +107 -0
- cuda/cccl/cooperative/experimental/block/__init__.py +33 -0
- cuda/cccl/cooperative/experimental/block/_block_load_store.py +215 -0
- cuda/cccl/cooperative/experimental/block/_block_merge_sort.py +125 -0
- cuda/cccl/cooperative/experimental/block/_block_radix_sort.py +214 -0
- cuda/cccl/cooperative/experimental/block/_block_reduce.py +294 -0
- cuda/cccl/cooperative/experimental/block/_block_scan.py +983 -0
- cuda/cccl/cooperative/experimental/warp/__init__.py +9 -0
- cuda/cccl/cooperative/experimental/warp/_warp_merge_sort.py +98 -0
- cuda/cccl/cooperative/experimental/warp/_warp_reduce.py +153 -0
- cuda/cccl/cooperative/experimental/warp/_warp_scan.py +78 -0
- cuda/cccl/headers/__init__.py +7 -0
- cuda/cccl/headers/include/__init__.py +1 -0
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +261 -0
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1181 -0
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +84 -0
- cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +925 -0
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +227 -0
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +753 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +766 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +286 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +704 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +557 -0
- cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +678 -0
- cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +804 -0
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +997 -0
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +561 -0
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +473 -0
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +292 -0
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1032 -0
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +342 -0
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +592 -0
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +614 -0
- cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1346 -0
- cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +965 -0
- cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1217 -0
- cuda/cccl/headers/include/cub/block/block_exchange.cuh +1306 -0
- cuda/cccl/headers/include/cub/block/block_histogram.cuh +420 -0
- cuda/cccl/headers/include/cub/block/block_load.cuh +1259 -0
- cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +787 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1218 -0
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2193 -0
- cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +150 -0
- cuda/cccl/headers/include/cub/block/block_reduce.cuh +629 -0
- cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +437 -0
- cuda/cccl/headers/include/cub/block/block_scan.cuh +2583 -0
- cuda/cccl/headers/include/cub/block/block_shuffle.cuh +346 -0
- cuda/cccl/headers/include/cub/block/block_store.cuh +1246 -0
- cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +620 -0
- cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +86 -0
- cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +240 -0
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +252 -0
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +238 -0
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +259 -0
- cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +790 -0
- cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +538 -0
- cuda/cccl/headers/include/cub/config.cuh +60 -0
- cuda/cccl/headers/include/cub/cub.cuh +112 -0
- cuda/cccl/headers/include/cub/detail/array_utils.cuh +77 -0
- cuda/cccl/headers/include/cub/detail/choose_offset.cuh +155 -0
- cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +93 -0
- cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +96 -0
- cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +246 -0
- cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +120 -0
- cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +74 -0
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +118 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/README.md +71 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/array.h +68 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/json.h +61 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/object.h +100 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/string.h +71 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/value.h +93 -0
- cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +63 -0
- cuda/cccl/headers/include/cub/detail/rfa.cuh +724 -0
- cuda/cccl/headers/include/cub/detail/strong_load.cuh +189 -0
- cuda/cccl/headers/include/cub/detail/strong_store.cuh +220 -0
- cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +384 -0
- cuda/cccl/headers/include/cub/detail/type_traits.cuh +206 -0
- cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +72 -0
- cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +56 -0
- cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +596 -0
- cuda/cccl/headers/include/cub/device/device_copy.cuh +187 -0
- cuda/cccl/headers/include/cub/device/device_for.cuh +990 -0
- cuda/cccl/headers/include/cub/device/device_histogram.cuh +1507 -0
- cuda/cccl/headers/include/cub/device/device_memcpy.cuh +195 -0
- cuda/cccl/headers/include/cub/device/device_merge.cuh +202 -0
- cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +979 -0
- cuda/cccl/headers/include/cub/device/device_partition.cuh +664 -0
- cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3435 -0
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +1815 -0
- cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +368 -0
- cuda/cccl/headers/include/cub/device/device_scan.cuh +1901 -0
- cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1496 -0
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1512 -0
- cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2811 -0
- cuda/cccl/headers/include/cub/device/device_select.cuh +1224 -0
- cuda/cccl/headers/include/cub/device/device_transform.cuh +313 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +314 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +109 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +718 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +45 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +197 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1051 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +305 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +473 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1748 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +1316 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +625 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +497 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +548 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +497 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +598 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +1374 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +838 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +341 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +439 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +552 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +467 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +543 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +218 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/histogram.cuh +505 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/merge_sort.cuh +338 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/radix_sort.cuh +799 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +525 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +194 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +330 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +936 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/unique_by_key.cuh +176 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +70 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +121 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +63 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +278 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +91 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +118 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1068 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +397 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +945 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +675 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +555 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +1013 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +249 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1587 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +407 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +353 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +874 -0
- cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +215 -0
- cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +106 -0
- cuda/cccl/headers/include/cub/grid/grid_queue.cuh +202 -0
- cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +256 -0
- cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +238 -0
- cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +252 -0
- cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +322 -0
- cuda/cccl/headers/include/cub/thread/thread_load.cuh +347 -0
- cuda/cccl/headers/include/cub/thread/thread_operators.cuh +629 -0
- cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +504 -0
- cuda/cccl/headers/include/cub/thread/thread_scan.cuh +340 -0
- cuda/cccl/headers/include/cub/thread/thread_search.cuh +198 -0
- cuda/cccl/headers/include/cub/thread/thread_simd.cuh +406 -0
- cuda/cccl/headers/include/cub/thread/thread_sort.cuh +101 -0
- cuda/cccl/headers/include/cub/thread/thread_store.cuh +364 -0
- cuda/cccl/headers/include/cub/util_allocator.cuh +921 -0
- cuda/cccl/headers/include/cub/util_arch.cuh +167 -0
- cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +95 -0
- cuda/cccl/headers/include/cub/util_debug.cuh +207 -0
- cuda/cccl/headers/include/cub/util_device.cuh +779 -0
- cuda/cccl/headers/include/cub/util_macro.cuh +91 -0
- cuda/cccl/headers/include/cub/util_math.cuh +115 -0
- cuda/cccl/headers/include/cub/util_namespace.cuh +176 -0
- cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +55 -0
- cuda/cccl/headers/include/cub/util_ptx.cuh +513 -0
- cuda/cccl/headers/include/cub/util_temporary_storage.cuh +122 -0
- cuda/cccl/headers/include/cub/util_type.cuh +1136 -0
- cuda/cccl/headers/include/cub/util_vsmem.cuh +251 -0
- cuda/cccl/headers/include/cub/version.cuh +89 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +329 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +177 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +729 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +405 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +688 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +437 -0
- cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +405 -0
- cuda/cccl/headers/include/cub/warp/warp_load.cuh +614 -0
- cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +169 -0
- cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +822 -0
- cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1156 -0
- cuda/cccl/headers/include/cub/warp/warp_store.cuh +520 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +172 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +210 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +84 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +127 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +209 -0
- cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
- cuda/cccl/headers/include/cuda/__barrier/aligned_size.h +61 -0
- cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +100 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +454 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +72 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_native_handle.h +45 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
- cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +171 -0
- cuda/cccl/headers/include/cuda/__bit/bitfield.h +121 -0
- cuda/cccl/headers/include/cuda/__bit/bitmask.h +88 -0
- cuda/cccl/headers/include/cuda/__cccl_config +36 -0
- cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +124 -0
- cuda/cccl/headers/include/cuda/__cmath/ilog.h +195 -0
- cuda/cccl/headers/include/cuda/__cmath/ipow.h +107 -0
- cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
- cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
- cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
- cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
- cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
- cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
- cuda/cccl/headers/include/cuda/__execution/determinism.h +90 -0
- cuda/cccl/headers/include/cuda/__execution/require.h +74 -0
- cuda/cccl/headers/include/cuda/__execution/tune.h +69 -0
- cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
- cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +276 -0
- cuda/cccl/headers/include/cuda/__functional/get_device_address.h +58 -0
- cuda/cccl/headers/include/cuda/__functional/maximum.h +58 -0
- cuda/cccl/headers/include/cuda/__functional/minimum.h +58 -0
- cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +108 -0
- cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
- cuda/cccl/headers/include/cuda/__fwd/barrier_native_handle.h +42 -0
- cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
- cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
- cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +257 -0
- cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +460 -0
- cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +314 -0
- cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +421 -0
- cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +321 -0
- cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +333 -0
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +465 -0
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +456 -0
- cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +462 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +63 -0
- cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +122 -0
- cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +51 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +60 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +98 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +162 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +49 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +99 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +170 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +59 -0
- cuda/cccl/headers/include/cuda/__memory/address_space.h +86 -0
- cuda/cccl/headers/include/cuda/__memory/align_down.h +56 -0
- cuda/cccl/headers/include/cuda/__memory/align_up.h +56 -0
- cuda/cccl/headers/include/cuda/__memory/is_aligned.h +47 -0
- cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +94 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +157 -0
- cuda/cccl/headers/include/cuda/__memory_resource/properties.h +73 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource.h +129 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +653 -0
- cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
- cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +57 -0
- cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
- cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +101 -0
- cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2982 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +240 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +193 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +52 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +957 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +288 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +596 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1445 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +117 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +62 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +101 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +62 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +15074 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +385 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +176 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +94 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +137 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +138 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +280 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +282 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2148 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1272 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +228 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +430 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1830 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +105 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +81 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +612 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +44 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +4446 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +4061 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +6438 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +36 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +4582 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +44 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +67 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +750 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
- cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +151 -0
- cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
- cuda/cccl/headers/include/cuda/__stream/get_stream.h +110 -0
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +163 -0
- cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
- cuda/cccl/headers/include/cuda/__utility/static_for.h +74 -0
- cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
- cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
- cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +249 -0
- cuda/cccl/headers/include/cuda/access_property +26 -0
- cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
- cuda/cccl/headers/include/cuda/atomic +27 -0
- cuda/cccl/headers/include/cuda/barrier +262 -0
- cuda/cccl/headers/include/cuda/bit +29 -0
- cuda/cccl/headers/include/cuda/cmath +35 -0
- cuda/cccl/headers/include/cuda/discard_memory +60 -0
- cuda/cccl/headers/include/cuda/functional +31 -0
- cuda/cccl/headers/include/cuda/iterator +34 -0
- cuda/cccl/headers/include/cuda/latch +27 -0
- cuda/cccl/headers/include/cuda/mdspan +28 -0
- cuda/cccl/headers/include/cuda/memory +32 -0
- cuda/cccl/headers/include/cuda/memory_resource +41 -0
- cuda/cccl/headers/include/cuda/numeric +28 -0
- cuda/cccl/headers/include/cuda/pipeline +577 -0
- cuda/cccl/headers/include/cuda/ptx +124 -0
- cuda/cccl/headers/include/cuda/semaphore +31 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +52 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +64 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +142 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +92 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +96 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +140 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/move.h +87 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +94 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +101 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +98 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +155 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
- cuda/cccl/headers/include/cuda/std/__algorithm_ +26 -0
- cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
- cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +138 -0
- cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +218 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +401 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +3971 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +177 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +211 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
- cuda/cccl/headers/include/cuda/std/__atomic/order.h +159 -0
- cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
- cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
- cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +250 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +105 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +73 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +228 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types.h +52 -0
- cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
- cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
- cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
- cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
- cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
- cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +77 -0
- cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +183 -0
- cuda/cccl/headers/include/cuda/std/__bit/countl.h +167 -0
- cuda/cccl/headers/include/cuda/std/__bit/countr.h +185 -0
- cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
- cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
- cuda/cccl/headers/include/cuda/std/__bit/integral.h +124 -0
- cuda/cccl/headers/include/cuda/std/__bit/popcount.h +154 -0
- cuda/cccl/headers/include/cuda/std/__bit/reference.h +1270 -0
- cuda/cccl/headers/include/cuda/std/__bit/rotate.h +94 -0
- cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
- cuda/cccl/headers/include/cuda/std/__cccl/assert.h +150 -0
- cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +207 -0
- cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +787 -0
- cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +217 -0
- cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +43 -0
- cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +53 -0
- cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +69 -0
- cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +128 -0
- cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +126 -0
- cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +326 -0
- cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +79 -0
- cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +68 -0
- cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +133 -0
- cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +73 -0
- cuda/cccl/headers/include/cuda/std/__cccl/os.h +48 -0
- cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1276 -0
- cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +267 -0
- cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +176 -0
- cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
- cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
- cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
- cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
- cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
- cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +115 -0
- cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
- cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
- cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +148 -0
- cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
- cuda/cccl/headers/include/cuda/std/__charconv_ +30 -0
- cuda/cccl/headers/include/cuda/std/__cmath/abs.h +246 -0
- cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +193 -0
- cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +724 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fma.h +125 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +216 -0
- cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +205 -0
- cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +286 -0
- cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +224 -0
- cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +286 -0
- cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +371 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +167 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +205 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +180 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
- cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +104 -0
- cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +582 -0
- cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +248 -0
- cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
- cuda/cccl/headers/include/cuda/std/__cmath/nvbf16.h +58 -0
- cuda/cccl/headers/include/cuda/std/__cmath/nvfp16.h +58 -0
- cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +206 -0
- cuda/cccl/headers/include/cuda/std/__cmath/roots.h +199 -0
- cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +984 -0
- cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +155 -0
- cuda/cccl/headers/include/cuda/std/__cmath/traits.h +170 -0
- cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +328 -0
- cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
- cuda/cccl/headers/include/cuda/std/__complex/complex.h +676 -0
- cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +388 -0
- cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +117 -0
- cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +215 -0
- cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
- cuda/cccl/headers/include/cuda/std/__complex/literals.h +106 -0
- cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +53 -0
- cuda/cccl/headers/include/cuda/std/__complex/math.h +159 -0
- cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +323 -0
- cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +322 -0
- cuda/cccl/headers/include/cuda/std/__complex/roots.h +64 -0
- cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
- cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
- cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +131 -0
- cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
- cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
- cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
- cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +46 -0
- cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
- cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
- cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +273 -0
- cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +107 -0
- cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +71 -0
- cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +57 -0
- cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
- cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
- cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +100 -0
- cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
- cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
- cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
- cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
- cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
- cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +42 -0
- cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
- cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
- cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
- cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
- cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +69 -0
- cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +62 -0
- cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +72 -0
- cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +142 -0
- cuda/cccl/headers/include/cuda/std/__exception/terminate.h +73 -0
- cuda/cccl/headers/include/cuda/std/__execution/env.h +436 -0
- cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
- cuda/cccl/headers/include/cuda/std/__expected/expected.h +2001 -0
- cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1080 -0
- cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
- cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +175 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +809 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +172 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +103 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +39 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +64 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/nvfp_types.h +58 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
- cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
- cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +128 -0
- cuda/cccl/headers/include/cuda/std/__format_ +28 -0
- cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
- cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
- cuda/cccl/headers/include/cuda/std/__functional/bind.h +337 -0
- cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +80 -0
- cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +73 -0
- cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
- cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
- cuda/cccl/headers/include/cuda/std/__functional/compose.h +68 -0
- cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +75 -0
- cuda/cccl/headers/include/cuda/std/__functional/function.h +1277 -0
- cuda/cccl/headers/include/cuda/std/__functional/hash.h +650 -0
- cuda/cccl/headers/include/cuda/std/__functional/identity.h +61 -0
- cuda/cccl/headers/include/cuda/std/__functional/invoke.h +558 -0
- cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +43 -0
- cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +67 -0
- cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +213 -0
- cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
- cuda/cccl/headers/include/cuda/std/__functional/operations.h +534 -0
- cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +127 -0
- cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +65 -0
- cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +64 -0
- cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
- cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +113 -0
- cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
- cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
- cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
- cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +277 -0
- cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +35 -0
- cuda/cccl/headers/include/cuda/std/__fwd/array.h +36 -0
- cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +49 -0
- cuda/cccl/headers/include/cuda/std/__fwd/complex.h +34 -0
- cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
- cuda/cccl/headers/include/cuda/std/__fwd/get.h +123 -0
- cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
- cuda/cccl/headers/include/cuda/std/__fwd/iterator_traits.h +40 -0
- cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +73 -0
- cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
- cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
- cuda/cccl/headers/include/cuda/std/__fwd/pair.h +34 -0
- cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +34 -0
- cuda/cccl/headers/include/cuda/std/__fwd/span.h +38 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string.h +83 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +59 -0
- cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
- cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +34 -0
- cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
- cuda/cccl/headers/include/cuda/std/__internal/features.h +71 -0
- cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +102 -0
- cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
- cuda/cccl/headers/include/cuda/std/__iterator/advance.h +228 -0
- cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +100 -0
- cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +254 -0
- cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
- cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +464 -0
- cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
- cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
- cuda/cccl/headers/include/cuda/std/__iterator/distance.h +126 -0
- cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
- cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
- cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +95 -0
- cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +152 -0
- cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
- cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +102 -0
- cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +140 -0
- cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +160 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +163 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +932 -0
- cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
- cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +400 -0
- cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
- cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
- cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +98 -0
- cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
- cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
- cuda/cccl/headers/include/cuda/std/__iterator/prev.h +91 -0
- cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
- cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +185 -0
- cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
- cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +605 -0
- cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
- cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
- cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
- cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +247 -0
- cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
- cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +617 -0
- cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +781 -0
- cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +78 -0
- cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +54 -0
- cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +139 -0
- cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +132 -0
- cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +322 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +98 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +139 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +358 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +757 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +315 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +308 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +605 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +507 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +193 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +189 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +344 -0
- cuda/cccl/headers/include/cuda/std/__memory/addressof.h +64 -0
- cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +81 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator.h +320 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +84 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +555 -0
- cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +60 -0
- cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
- cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +230 -0
- cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +248 -0
- cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
- cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +43 -0
- cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +261 -0
- cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
- cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +92 -0
- cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +683 -0
- cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +768 -0
- cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +55 -0
- cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
- cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
- cuda/cccl/headers/include/cuda/std/__new/allocate.h +126 -0
- cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
- cuda/cccl/headers/include/cuda/std/__new/launder.h +49 -0
- cuda/cccl/headers/include/cuda/std/__new_ +29 -0
- cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
- cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
- cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
- cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +80 -0
- cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
- cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
- cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
- cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +100 -0
- cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
- cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
- cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
- cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
- cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
- cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +75 -0
- cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
- cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
- cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
- cuda/cccl/headers/include/cuda/std/__optional/optional.h +900 -0
- cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +430 -0
- cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
- cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
- cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
- cuda/cccl/headers/include/cuda/std/__random/is_valid.h +106 -0
- cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +397 -0
- cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +335 -0
- cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +183 -0
- cuda/cccl/headers/include/cuda/std/__random_ +29 -0
- cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
- cuda/cccl/headers/include/cuda/std/__ranges/all.h +97 -0
- cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +313 -0
- cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
- cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
- cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
- cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
- cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
- cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
- cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +77 -0
- cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
- cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +266 -0
- cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +410 -0
- cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +161 -0
- cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +113 -0
- cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +174 -0
- cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
- cuda/cccl/headers/include/cuda/std/__ranges/rend.h +181 -0
- cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +345 -0
- cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
- cuda/cccl/headers/include/cuda/std/__ranges/size.h +199 -0
- cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +513 -0
- cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +475 -0
- cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
- cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
- cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +53 -0
- cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +181 -0
- cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
- cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +233 -0
- cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
- cuda/cccl/headers/include/cuda/std/__string/char_traits.h +191 -0
- cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +581 -0
- cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
- cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
- cuda/cccl/headers/include/cuda/std/__string_ +29 -0
- cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
- cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +105 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +142 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +98 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +269 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +216 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +70 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +90 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +73 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +277 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +149 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +69 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +174 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +51 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +44 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +83 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +68 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +54 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +51 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +214 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +73 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +68 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +81 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +84 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +87 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +63 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +119 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_reference_wrapper.h +50 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +84 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +201 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +70 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +82 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +61 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +55 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +73 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +58 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +123 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +132 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +66 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
- cuda/cccl/headers/include/cuda/std/__utility/as_const.h +52 -0
- cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +32 -0
- cuda/cccl/headers/include/cuda/std/__utility/cmp.h +116 -0
- cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
- cuda/cccl/headers/include/cuda/std/__utility/declval.h +63 -0
- cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +161 -0
- cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
- cuda/cccl/headers/include/cuda/std/__utility/forward.h +59 -0
- cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +55 -0
- cuda/cccl/headers/include/cuda/std/__utility/in_place.h +77 -0
- cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +251 -0
- cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
- cuda/cccl/headers/include/cuda/std/__utility/move.h +74 -0
- cuda/cccl/headers/include/cuda/std/__utility/pair.h +802 -0
- cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
- cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +510 -0
- cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
- cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +63 -0
- cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
- cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
- cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
- cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
- cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
- cuda/cccl/headers/include/cuda/std/array +520 -0
- cuda/cccl/headers/include/cuda/std/atomic +818 -0
- cuda/cccl/headers/include/cuda/std/barrier +43 -0
- cuda/cccl/headers/include/cuda/std/bit +35 -0
- cuda/cccl/headers/include/cuda/std/bitset +994 -0
- cuda/cccl/headers/include/cuda/std/cassert +28 -0
- cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
- cuda/cccl/headers/include/cuda/std/cfloat +59 -0
- cuda/cccl/headers/include/cuda/std/chrono +26 -0
- cuda/cccl/headers/include/cuda/std/climits +61 -0
- cuda/cccl/headers/include/cuda/std/cmath +25 -0
- cuda/cccl/headers/include/cuda/std/complex +50 -0
- cuda/cccl/headers/include/cuda/std/concepts +48 -0
- cuda/cccl/headers/include/cuda/std/cstddef +28 -0
- cuda/cccl/headers/include/cuda/std/cstdint +178 -0
- cuda/cccl/headers/include/cuda/std/cstdlib +30 -0
- cuda/cccl/headers/include/cuda/std/cstring +110 -0
- cuda/cccl/headers/include/cuda/std/ctime +152 -0
- cuda/cccl/headers/include/cuda/std/detail/__config +45 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +235 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1720 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +3628 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/cmath +667 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/iosfwd +128 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/stdexcept +120 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +1367 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +2154 -0
- cuda/cccl/headers/include/cuda/std/execution +27 -0
- cuda/cccl/headers/include/cuda/std/expected +30 -0
- cuda/cccl/headers/include/cuda/std/functional +56 -0
- cuda/cccl/headers/include/cuda/std/initializer_list +36 -0
- cuda/cccl/headers/include/cuda/std/inplace_vector +2163 -0
- cuda/cccl/headers/include/cuda/std/iterator +70 -0
- cuda/cccl/headers/include/cuda/std/latch +34 -0
- cuda/cccl/headers/include/cuda/std/limits +28 -0
- cuda/cccl/headers/include/cuda/std/linalg +30 -0
- cuda/cccl/headers/include/cuda/std/mdspan +38 -0
- cuda/cccl/headers/include/cuda/std/memory +39 -0
- cuda/cccl/headers/include/cuda/std/numbers +335 -0
- cuda/cccl/headers/include/cuda/std/numeric +41 -0
- cuda/cccl/headers/include/cuda/std/optional +31 -0
- cuda/cccl/headers/include/cuda/std/ranges +69 -0
- cuda/cccl/headers/include/cuda/std/ratio +417 -0
- cuda/cccl/headers/include/cuda/std/semaphore +31 -0
- cuda/cccl/headers/include/cuda/std/source_location +83 -0
- cuda/cccl/headers/include/cuda/std/span +640 -0
- cuda/cccl/headers/include/cuda/std/string_view +788 -0
- cuda/cccl/headers/include/cuda/std/tuple +26 -0
- cuda/cccl/headers/include/cuda/std/type_traits +176 -0
- cuda/cccl/headers/include/cuda/std/utility +70 -0
- cuda/cccl/headers/include/cuda/std/variant +25 -0
- cuda/cccl/headers/include/cuda/std/version +245 -0
- cuda/cccl/headers/include/cuda/stream_ref +54 -0
- cuda/cccl/headers/include/cuda/type_traits +27 -0
- cuda/cccl/headers/include/cuda/utility +27 -0
- cuda/cccl/headers/include/cuda/version +16 -0
- cuda/cccl/headers/include/cuda/warp +28 -0
- cuda/cccl/headers/include/cuda/work_stealing +26 -0
- cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
- cuda/cccl/headers/include/nv/detail/__target_macros +641 -0
- cuda/cccl/headers/include/nv/target +240 -0
- cuda/cccl/headers/include/thrust/addressof.h +22 -0
- cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
- cuda/cccl/headers/include/thrust/advance.h +59 -0
- cuda/cccl/headers/include/thrust/allocate_unique.h +299 -0
- cuda/cccl/headers/include/thrust/binary_search.h +1910 -0
- cuda/cccl/headers/include/thrust/complex.h +859 -0
- cuda/cccl/headers/include/thrust/copy.h +506 -0
- cuda/cccl/headers/include/thrust/count.h +245 -0
- cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +95 -0
- cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +37 -0
- cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +350 -0
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +371 -0
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +45 -0
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +242 -0
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +39 -0
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +137 -0
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +39 -0
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +99 -0
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +53 -0
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +68 -0
- cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +76 -0
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +102 -0
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +79 -0
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +81 -0
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +39 -0
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +98 -0
- cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +99 -0
- cuda/cccl/headers/include/thrust/detail/binary_search.inl +525 -0
- cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
- cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +255 -0
- cuda/cccl/headers/include/thrust/detail/complex/c99math.h +64 -0
- cuda/cccl/headers/include/thrust/detail/complex/catrig.h +875 -0
- cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +589 -0
- cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +233 -0
- cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +161 -0
- cuda/cccl/headers/include/thrust/detail/complex/cexp.h +195 -0
- cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +173 -0
- cuda/cccl/headers/include/thrust/detail/complex/clog.h +223 -0
- cuda/cccl/headers/include/thrust/detail/complex/clogf.h +210 -0
- cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
- cuda/cccl/headers/include/thrust/detail/complex/cpow.h +50 -0
- cuda/cccl/headers/include/thrust/detail/complex/cproj.h +81 -0
- cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
- cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +168 -0
- cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
- cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
- cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +208 -0
- cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +133 -0
- cuda/cccl/headers/include/thrust/detail/complex/math_private.h +138 -0
- cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
- cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
- cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
- cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
- cuda/cccl/headers/include/thrust/detail/config/device_system.h +55 -0
- cuda/cccl/headers/include/thrust/detail/config/host_system.h +48 -0
- cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
- cuda/cccl/headers/include/thrust/detail/config/namespace.h +162 -0
- cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
- cuda/cccl/headers/include/thrust/detail/config.h +36 -0
- cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +228 -0
- cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +273 -0
- cuda/cccl/headers/include/thrust/detail/copy.h +72 -0
- cuda/cccl/headers/include/thrust/detail/copy.inl +129 -0
- cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
- cuda/cccl/headers/include/thrust/detail/copy_if.inl +102 -0
- cuda/cccl/headers/include/thrust/detail/count.h +55 -0
- cuda/cccl/headers/include/thrust/detail/count.inl +89 -0
- cuda/cccl/headers/include/thrust/detail/device_delete.inl +52 -0
- cuda/cccl/headers/include/thrust/detail/device_free.inl +47 -0
- cuda/cccl/headers/include/thrust/detail/device_malloc.inl +60 -0
- cuda/cccl/headers/include/thrust/detail/device_new.inl +61 -0
- cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
- cuda/cccl/headers/include/thrust/detail/equal.inl +93 -0
- cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
- cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +80 -0
- cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
- cuda/cccl/headers/include/thrust/detail/execution_policy.h +80 -0
- cuda/cccl/headers/include/thrust/detail/extrema.inl +184 -0
- cuda/cccl/headers/include/thrust/detail/fill.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/find.inl +113 -0
- cuda/cccl/headers/include/thrust/detail/for_each.inl +84 -0
- cuda/cccl/headers/include/thrust/detail/function.h +49 -0
- cuda/cccl/headers/include/thrust/detail/functional/actor.h +214 -0
- cuda/cccl/headers/include/thrust/detail/functional/operators.h +386 -0
- cuda/cccl/headers/include/thrust/detail/gather.inl +173 -0
- cuda/cccl/headers/include/thrust/detail/generate.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +62 -0
- cuda/cccl/headers/include/thrust/detail/inner_product.inl +118 -0
- cuda/cccl/headers/include/thrust/detail/integer_math.h +130 -0
- cuda/cccl/headers/include/thrust/detail/internal_functional.h +289 -0
- cuda/cccl/headers/include/thrust/detail/logical.inl +113 -0
- cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +77 -0
- cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
- cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +209 -0
- cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +40 -0
- cuda/cccl/headers/include/thrust/detail/merge.inl +276 -0
- cuda/cccl/headers/include/thrust/detail/mismatch.inl +94 -0
- cuda/cccl/headers/include/thrust/detail/mpl/math.h +164 -0
- cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +37 -0
- cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +124 -0
- cuda/cccl/headers/include/thrust/detail/partition.inl +378 -0
- cuda/cccl/headers/include/thrust/detail/pointer.h +217 -0
- cuda/cccl/headers/include/thrust/detail/pointer.inl +172 -0
- cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
- cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
- cuda/cccl/headers/include/thrust/detail/range/head_flags.h +116 -0
- cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +130 -0
- cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
- cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +189 -0
- cuda/cccl/headers/include/thrust/detail/reduce.inl +377 -0
- cuda/cccl/headers/include/thrust/detail/reference.h +500 -0
- cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
- cuda/cccl/headers/include/thrust/detail/remove.inl +213 -0
- cuda/cccl/headers/include/thrust/detail/replace.inl +231 -0
- cuda/cccl/headers/include/thrust/detail/reverse.inl +88 -0
- cuda/cccl/headers/include/thrust/detail/scan.inl +518 -0
- cuda/cccl/headers/include/thrust/detail/scatter.inl +157 -0
- cuda/cccl/headers/include/thrust/detail/seq.h +54 -0
- cuda/cccl/headers/include/thrust/detail/sequence.inl +109 -0
- cuda/cccl/headers/include/thrust/detail/set_operations.inl +981 -0
- cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/sort.inl +373 -0
- cuda/cccl/headers/include/thrust/detail/static_assert.h +58 -0
- cuda/cccl/headers/include/thrust/detail/static_map.h +167 -0
- cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +65 -0
- cuda/cccl/headers/include/thrust/detail/tabulate.inl +62 -0
- cuda/cccl/headers/include/thrust/detail/temporary_array.h +153 -0
- cuda/cccl/headers/include/thrust/detail/temporary_array.inl +120 -0
- cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +81 -0
- cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +69 -0
- cuda/cccl/headers/include/thrust/detail/transform_scan.inl +161 -0
- cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +130 -0
- cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +61 -0
- cuda/cccl/headers/include/thrust/detail/type_deduction.h +62 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +47 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +167 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +69 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +59 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_discard_iterator.h +44 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +46 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +89 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
- cuda/cccl/headers/include/thrust/detail/type_traits.h +136 -0
- cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +90 -0
- cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/unique.inl +373 -0
- cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
- cuda/cccl/headers/include/thrust/detail/util/align.h +59 -0
- cuda/cccl/headers/include/thrust/detail/vector_base.h +615 -0
- cuda/cccl/headers/include/thrust/detail/vector_base.inl +1212 -0
- cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
- cuda/cccl/headers/include/thrust/device_delete.h +59 -0
- cuda/cccl/headers/include/thrust/device_free.h +72 -0
- cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
- cuda/cccl/headers/include/thrust/device_malloc.h +108 -0
- cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
- cuda/cccl/headers/include/thrust/device_new.h +91 -0
- cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
- cuda/cccl/headers/include/thrust/device_ptr.h +202 -0
- cuda/cccl/headers/include/thrust/device_reference.h +986 -0
- cuda/cccl/headers/include/thrust/device_vector.h +574 -0
- cuda/cccl/headers/include/thrust/distance.h +43 -0
- cuda/cccl/headers/include/thrust/equal.h +247 -0
- cuda/cccl/headers/include/thrust/execution_policy.h +384 -0
- cuda/cccl/headers/include/thrust/extrema.h +657 -0
- cuda/cccl/headers/include/thrust/fill.h +201 -0
- cuda/cccl/headers/include/thrust/find.h +382 -0
- cuda/cccl/headers/include/thrust/for_each.h +261 -0
- cuda/cccl/headers/include/thrust/functional.h +396 -0
- cuda/cccl/headers/include/thrust/gather.h +464 -0
- cuda/cccl/headers/include/thrust/generate.h +193 -0
- cuda/cccl/headers/include/thrust/host_vector.h +576 -0
- cuda/cccl/headers/include/thrust/inner_product.h +264 -0
- cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +219 -0
- cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +335 -0
- cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
- cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
- cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +38 -0
- cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +38 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +81 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +51 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +62 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +57 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +199 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_traversal_tags.h +50 -0
- cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +53 -0
- cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
- cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
- cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +81 -0
- cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +174 -0
- cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +215 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +660 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +275 -0
- cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +192 -0
- cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
- cuda/cccl/headers/include/thrust/iterator/retag.h +74 -0
- cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +221 -0
- cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +184 -0
- cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
- cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +149 -0
- cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +226 -0
- cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +351 -0
- cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
- cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +357 -0
- cuda/cccl/headers/include/thrust/logical.h +290 -0
- cuda/cccl/headers/include/thrust/memory.h +395 -0
- cuda/cccl/headers/include/thrust/merge.h +725 -0
- cuda/cccl/headers/include/thrust/mismatch.h +261 -0
- cuda/cccl/headers/include/thrust/mr/allocator.h +229 -0
- cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +41 -0
- cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +526 -0
- cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +118 -0
- cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +68 -0
- cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +67 -0
- cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +38 -0
- cuda/cccl/headers/include/thrust/mr/memory_resource.h +217 -0
- cuda/cccl/headers/include/thrust/mr/new.h +100 -0
- cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +63 -0
- cuda/cccl/headers/include/thrust/mr/pool.h +526 -0
- cuda/cccl/headers/include/thrust/mr/pool_options.h +174 -0
- cuda/cccl/headers/include/thrust/mr/sync_pool.h +114 -0
- cuda/cccl/headers/include/thrust/mr/tls_pool.h +65 -0
- cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
- cuda/cccl/headers/include/thrust/mr/validator.h +56 -0
- cuda/cccl/headers/include/thrust/pair.h +102 -0
- cuda/cccl/headers/include/thrust/partition.h +1383 -0
- cuda/cccl/headers/include/thrust/per_device_resource.h +98 -0
- cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +184 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +155 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +104 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +151 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +53 -0
- cuda/cccl/headers/include/thrust/random/detail/mod.h +101 -0
- cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +187 -0
- cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +160 -0
- cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +63 -0
- cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +201 -0
- cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +198 -0
- cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +198 -0
- cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +183 -0
- cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +217 -0
- cuda/cccl/headers/include/thrust/random/discard_block_engine.h +240 -0
- cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +289 -0
- cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +217 -0
- cuda/cccl/headers/include/thrust/random/normal_distribution.h +257 -0
- cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +247 -0
- cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +261 -0
- cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +258 -0
- cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +255 -0
- cuda/cccl/headers/include/thrust/random.h +120 -0
- cuda/cccl/headers/include/thrust/reduce.h +1112 -0
- cuda/cccl/headers/include/thrust/remove.h +768 -0
- cuda/cccl/headers/include/thrust/replace.h +827 -0
- cuda/cccl/headers/include/thrust/reverse.h +213 -0
- cuda/cccl/headers/include/thrust/scan.h +1671 -0
- cuda/cccl/headers/include/thrust/scatter.h +446 -0
- cuda/cccl/headers/include/thrust/sequence.h +277 -0
- cuda/cccl/headers/include/thrust/set_operations.h +3026 -0
- cuda/cccl/headers/include/thrust/shuffle.h +182 -0
- cuda/cccl/headers/include/thrust/sort.h +1320 -0
- cuda/cccl/headers/include/thrust/swap.h +147 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +90 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +60 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/par.h +62 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/vector.inl +130 -0
- cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +161 -0
- cuda/cccl/headers/include/thrust/system/cpp/memory.h +109 -0
- cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +75 -0
- cuda/cccl/headers/include/thrust/system/cpp/pointer.h +123 -0
- cuda/cccl/headers/include/thrust/system/cpp/vector.h +99 -0
- cuda/cccl/headers/include/thrust/system/cuda/config.h +123 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +219 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +129 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +255 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +289 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/load_iterator.h +58 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/make_load_iterator.h +53 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +191 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +611 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +243 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +210 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +64 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +96 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +113 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +476 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +82 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +272 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +91 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +85 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +75 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_cross_system.h +204 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_device_to_device.h +89 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +69 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +61 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +121 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +57 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +228 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +217 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/par.h +237 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/par_to_seq.h +95 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +81 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +405 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +72 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +781 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +1000 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +107 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +152 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +88 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +342 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +415 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +79 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1736 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +482 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +75 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +75 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +403 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +143 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +119 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +94 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +91 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +646 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +311 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +251 -0
- cuda/cccl/headers/include/thrust/system/cuda/error.h +168 -0
- cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +39 -0
- cuda/cccl/headers/include/thrust/system/cuda/memory.h +122 -0
- cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +122 -0
- cuda/cccl/headers/include/thrust/system/cuda/pointer.h +160 -0
- cuda/cccl/headers/include/thrust/system/cuda/vector.h +108 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +52 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/count.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/find.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +48 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +64 -0
- cuda/cccl/headers/include/thrust/system/detail/errno.h +125 -0
- cuda/cccl/headers/include/thrust/system/detail/error_category.inl +302 -0
- cuda/cccl/headers/include/thrust/system/detail/error_code.inl +173 -0
- cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +121 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +59 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +85 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +167 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +391 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +70 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +64 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +152 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/count.h +54 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +90 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +55 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +66 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +72 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +258 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +60 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/find.h +55 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +143 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +64 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +79 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +102 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +63 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +66 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +78 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +65 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +70 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +83 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +105 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +154 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +55 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +74 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +135 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +213 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +49 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +77 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +106 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +89 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +192 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +92 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +127 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +101 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +181 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +54 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +72 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +78 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +141 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +78 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +91 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +132 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +238 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +79 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +91 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +96 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +288 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +482 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +60 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +131 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +119 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +181 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +50 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +82 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +47 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +60 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +53 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +60 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +88 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +395 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +56 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +62 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +86 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +119 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +172 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +121 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +77 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +119 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +87 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +132 -0
- cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +123 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +76 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +48 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +142 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +55 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +125 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +55 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +77 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +78 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +116 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +68 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +80 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +129 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +49 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +147 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +56 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +81 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +151 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +309 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +70 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +104 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +185 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +160 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +151 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +212 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +65 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +116 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +61 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +362 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +54 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +130 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +54 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +592 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +64 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +121 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +112 -0
- cuda/cccl/headers/include/thrust/system/detail/system_error.inl +108 -0
- cuda/cccl/headers/include/thrust/system/error_code.h +512 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +54 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +77 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +50 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +74 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +56 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +59 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/count.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +50 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +65 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +113 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +66 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/find.h +53 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +56 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +87 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +93 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/par.h +62 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +88 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +102 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +54 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +54 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +78 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +64 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +65 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +59 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +103 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +72 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +87 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +60 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +265 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +60 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +71 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +67 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +75 -0
- cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +160 -0
- cuda/cccl/headers/include/thrust/system/omp/memory.h +111 -0
- cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +75 -0
- cuda/cccl/headers/include/thrust/system/omp/pointer.h +124 -0
- cuda/cccl/headers/include/thrust/system/omp/vector.h +99 -0
- cuda/cccl/headers/include/thrust/system/system_error.h +184 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +54 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +50 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +73 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +47 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +136 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +92 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +66 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +49 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +51 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +91 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +94 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +77 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +327 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/par.h +62 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +84 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +98 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +54 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +137 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +61 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +400 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +140 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +76 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +87 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +59 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +312 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +60 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +295 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +60 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +71 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +67 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +75 -0
- cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +160 -0
- cuda/cccl/headers/include/thrust/system/tbb/memory.h +111 -0
- cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +75 -0
- cuda/cccl/headers/include/thrust/system/tbb/pointer.h +124 -0
- cuda/cccl/headers/include/thrust/system/tbb/vector.h +99 -0
- cuda/cccl/headers/include/thrust/system_error.h +57 -0
- cuda/cccl/headers/include/thrust/tabulate.h +125 -0
- cuda/cccl/headers/include/thrust/transform.h +1045 -0
- cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
- cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
- cuda/cccl/headers/include/thrust/tuple.h +142 -0
- cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +261 -0
- cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +154 -0
- cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
- cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +184 -0
- cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +116 -0
- cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +336 -0
- cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
- cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +96 -0
- cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
- cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
- cuda/cccl/headers/include/thrust/unique.h +1090 -0
- cuda/cccl/headers/include/thrust/universal_allocator.h +93 -0
- cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
- cuda/cccl/headers/include/thrust/universal_vector.h +71 -0
- cuda/cccl/headers/include/thrust/version.h +93 -0
- cuda/cccl/headers/include/thrust/zip_function.h +176 -0
- cuda/cccl/headers/include_paths.py +72 -0
- cuda/cccl/parallel/__init__.py +9 -0
- cuda/cccl/parallel/experimental/__init__.py +47 -0
- cuda/cccl/parallel/experimental/_bindings.py +24 -0
- cuda/cccl/parallel/experimental/_bindings.pyi +388 -0
- cuda/cccl/parallel/experimental/_bindings_impl.cpython-313-x86_64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/_bindings_impl.pyx +2158 -0
- cuda/cccl/parallel/experimental/_caching.py +71 -0
- cuda/cccl/parallel/experimental/_cccl_interop.py +382 -0
- cuda/cccl/parallel/experimental/_utils/__init__.py +0 -0
- cuda/cccl/parallel/experimental/_utils/protocols.py +132 -0
- cuda/cccl/parallel/experimental/algorithms/__init__.py +28 -0
- cuda/cccl/parallel/experimental/algorithms/_merge_sort.py +172 -0
- cuda/cccl/parallel/experimental/algorithms/_radix_sort.py +244 -0
- cuda/cccl/parallel/experimental/algorithms/_reduce.py +136 -0
- cuda/cccl/parallel/experimental/algorithms/_scan.py +179 -0
- cuda/cccl/parallel/experimental/algorithms/_segmented_reduce.py +183 -0
- cuda/cccl/parallel/experimental/algorithms/_transform.py +213 -0
- cuda/cccl/parallel/experimental/algorithms/_unique_by_key.py +179 -0
- cuda/cccl/parallel/experimental/cccl/.gitkeep +0 -0
- cuda/cccl/parallel/experimental/cccl/libcccl.c.parallel.so +0 -0
- cuda/cccl/parallel/experimental/iterators/__init__.py +17 -0
- cuda/cccl/parallel/experimental/iterators/_factories.py +157 -0
- cuda/cccl/parallel/experimental/iterators/_iterators.py +650 -0
- cuda/cccl/parallel/experimental/numba_utils.py +6 -0
- cuda/cccl/parallel/experimental/struct.py +150 -0
- cuda/cccl/parallel/experimental/typing.py +27 -0
- cuda/cccl/py.typed +0 -0
- cuda_cccl-0.1.3.1.0.dev1678.dist-info/METADATA +28 -0
- cuda_cccl-0.1.3.1.0.dev1678.dist-info/RECORD +1860 -0
- cuda_cccl-0.1.3.1.0.dev1678.dist-info/WHEEL +6 -0
- cuda_cccl-0.1.3.1.0.dev1678.dist-info/licenses/LICENSE +1 -0
|
@@ -0,0 +1,936 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
|
2
|
+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
3
|
+
|
|
4
|
+
#pragma once
|
|
5
|
+
|
|
6
|
+
#include <cub/config.cuh>
|
|
7
|
+
|
|
8
|
+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
|
|
9
|
+
# pragma GCC system_header
|
|
10
|
+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
|
|
11
|
+
# pragma clang system_header
|
|
12
|
+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
|
|
13
|
+
# pragma system_header
|
|
14
|
+
#endif // no system header
|
|
15
|
+
|
|
16
|
+
#include <cub/device/dispatch/tuning/tuning_transform.cuh>
|
|
17
|
+
#include <cub/util_type.cuh>
|
|
18
|
+
#include <cub/util_vsmem.cuh>
|
|
19
|
+
|
|
20
|
+
#include <thrust/detail/raw_reference_cast.h>
|
|
21
|
+
#include <thrust/system/cuda/detail/core/util.h>
|
|
22
|
+
#include <thrust/type_traits/is_contiguous_iterator.h>
|
|
23
|
+
|
|
24
|
+
#include <cuda/__barrier/aligned_size.h> // cannot include <cuda/barrier> directly on CUDA_ARCH < 700
|
|
25
|
+
#include <cuda/cmath>
|
|
26
|
+
#include <cuda/ptx>
|
|
27
|
+
#include <cuda/std/bit>
|
|
28
|
+
#include <cuda/std/cstdint>
|
|
29
|
+
#include <cuda/std/expected>
|
|
30
|
+
|
|
31
|
+
#include <cuda_pipeline_primitives.h>
|
|
32
|
+
|
|
33
|
+
CUB_NAMESPACE_BEGIN
|
|
34
|
+
|
|
35
|
+
namespace detail::transform
|
|
36
|
+
{
|
|
37
|
+
|
|
38
|
+
// Rounds `ptr` down to the closest address that is a multiple of `alignment` and returns it as a byte pointer.
// `alignment` has to be a power of two (checked by assertion).
template <typename T>
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE const char* round_down_ptr(const T* ptr, unsigned alignment)
{
  _CCCL_ASSERT(::cuda::std::has_single_bit(alignment), "");
  // widen (alignment - 1) to pointer width *before* inverting it, so the mask keeps all upper address bits
  const ::cuda::std::uintptr_t low_bits = ::cuda::std::uintptr_t{alignment - 1};
  const auto address                    = reinterpret_cast<::cuda::std::uintptr_t>(ptr);
  return reinterpret_cast<const char*>(address & ~low_bits);
}
|
|
45
|
+
|
|
46
|
+
// Prefetches (at least on Hopper) a 128 byte cache line. Prefetching out-of-bounds addresses has no side effects
|
|
47
|
+
// TODO(bgruber): there is also the cp.async.bulk.prefetch instruction available on Hopper. May improve perf a tiny bit
|
|
48
|
+
// as we need to create fewer instructions to prefetch the same amount of data.
|
|
49
|
+
// Issues a single `prefetch.global.L2` PTX instruction for the global-space address corresponding to `addr`.
template <typename T>
_CCCL_DEVICE _CCCL_FORCEINLINE void prefetch(const T* addr)
{
  // the instruction operates on a global state space address, so convert the generic pointer first
  const auto global_addr = __cvta_generic_to_global(addr);
  // TODO(bgruber): prefetch to L1 may be even better
  asm volatile("prefetch.global.L2 [%0];" : : "l"(global_addr) : "memory");
}
|
|
55
|
+
|
|
56
|
+
// Prefetches the `items` elements starting at `begin`, one prefetch per 128 bytes, with the prefetches distributed
// over the BlockDim threads of the block. Does nothing when `It` is not a contiguous iterator.
template <int BlockDim, typename It>
_CCCL_DEVICE _CCCL_FORCEINLINE void prefetch_tile(It begin, int items)
{
  if constexpr (THRUST_NS_QUALIFIER::is_contiguous_iterator_v<It>)
  {
    // TODO(bgruber): should correspond to cache line size. Does this need to be architecture dependent?
    constexpr int prefetch_byte_stride = 128;
    const int items_bytes              = items * sizeof(it_value_t<It>);
    const char* const tile_bytes       = reinterpret_cast<const char*>(::cuda::std::to_address(begin));

    // prefetch does not stall and unrolling just generates a lot of unnecessary computations and predicate handling
    _CCCL_PRAGMA_NOUNROLL()
    for (int byte_offset = threadIdx.x * prefetch_byte_stride; byte_offset < items_bytes;
         byte_offset += BlockDim * prefetch_byte_stride)
    {
      prefetch(tile_bytes + byte_offset);
    }
  }
}
|
|
74
|
+
|
|
75
|
+
// This kernel guarantees that objects passed as arguments to the user-provided transformation function f reside in
|
|
76
|
+
// global memory. No intermediate copies are taken. If the parameter type of f is a reference, taking the address of the
|
|
77
|
+
// parameter yields a global memory address.
|
|
78
|
+
// Processes one tile per thread block. A tile holds PrefetchPolicy::block_threads * num_elem_per_thread items and the
// block with index blockIdx.x starts at item blockIdx.x * tile_size; the last tile is clamped to num_items.
// Contiguous inputs are prefetched first. Afterwards each thread visits the tile-local indices
// j * block_threads + threadIdx.x for j in [0, num_elem_per_thread) and assigns f(ins[idx]...) to out[idx].
template <typename PrefetchPolicy,
          typename Offset,
          typename F,
          typename RandomAccessIteratorOut,
          typename... RandomAccessIteratorIn>
_CCCL_DEVICE void transform_kernel_prefetch(
  Offset num_items, int num_elem_per_thread, F f, RandomAccessIteratorOut out, RandomAccessIteratorIn... ins)
{
  constexpr int block_threads = PrefetchPolicy::block_threads;
  const int tile_size         = block_threads * num_elem_per_thread;
  const Offset tile_offset    = static_cast<Offset>(blockIdx.x) * tile_size;
  // number of items this block actually owns (smaller than tile_size only for a partial tile)
  const int items_in_tile = static_cast<int>((::cuda::std::min) (num_items - tile_offset, Offset{tile_size}));

  // move index and iterator domain to the block/thread index, to reduce arithmetic in the loops below
  {
    (..., (ins += tile_offset));
    out += tile_offset;
  }

  (..., prefetch_tile<block_threads>(ins, items_in_tile));

  // is_full_tile is a true_type/false_type tag, so the bounds check can be dropped at compile time for full tiles
  auto process_tile = [&](auto is_full_tile,
                          auto... tile_ins /* nvcc fails to compile when just using the captured ins */) {
    // ahendriksen: various unrolling yields less <1% gains at much higher compile-time cost
    // bgruber: but A6000 and H100 show small gains without pragma
    // _CCCL_PRAGMA_NOUNROLL()
    for (int round = 0; round < num_elem_per_thread; ++round)
    {
      const int item = round * block_threads + threadIdx.x;
      if (is_full_tile || item < items_in_tile)
      {
        // we have to unwrap Thrust's proxy references here for backward compatibility (try zip_iterator.cu test)
        out[item] = f(THRUST_NS_QUALIFIER::raw_reference_cast(tile_ins[item])...);
      }
    }
  };

  if (tile_size == items_in_tile)
  {
    process_tile(::cuda::std::true_type{}, ins...);
  }
  else
  {
    process_tile(::cuda::std::false_type{}, ins...);
  }
}
|
|
122
|
+
|
|
123
|
+
#if _CCCL_CTK_BELOW(13, 0)
// 32-byte aligned wrapper around longlong4. Only defined for CUDA toolkits below 13.0; load_store_type() returns
// longlong4_32a instead of this type on newer toolkits.
struct alignas(32) aligned32_t
{
  longlong4 data;
};
#endif // _CCCL_CTK_BELOW(13, 0)
|
|
129
|
+
|
|
130
|
+
// Returns a value-initialized object of the type that is used to move `Bytes` bytes with a single load/store. Only the
// return type matters; it is meant to be queried with decltype. `Bytes` has to be a power of two.
template <int Bytes>
_CCCL_HOST_DEVICE _CCCL_CONSTEVAL auto load_store_type()
{
  static_assert(::cuda::is_power_of_two(Bytes));

  // Every branch returns a different type, so this has to remain one `if constexpr` cascade (the return type is
  // deduced from the single branch that survives).
  if constexpr (Bytes == 1)
  {
    return ::cuda::std::int8_t{};
  }
  else if constexpr (Bytes == 2)
  {
    return ::cuda::std::int16_t{};
  }
  else if constexpr (Bytes == 4)
  {
    return ::cuda::std::int32_t{};
  }
  else if constexpr (Bytes == 8)
  {
    return ::cuda::std::int64_t{};
  }
  else if constexpr (Bytes == 16)
  {
    static_assert(alignof(int4) == 16);
    return int4{};
  }
  else if constexpr (Bytes == 32)
  {
    // the 32-byte aligned vector type depends on the CUDA toolkit version
#if _CCCL_CTK_BELOW(13, 0)
    static_assert(alignof(aligned32_t) == 32);
    return aligned32_t{};
#else // ^^^ _CCCL_CTK_BELOW(13, 0) ^^^ / vvv _CCCL_CTK_AT_LEAST(13, 0) vvv
    return longlong4_32a{};
#endif // _CCCL_CTK_AT_LEAST(13, 0)
  }
  else
  {
    // all remaining sizes: an array of ints covering Bytes bytes
    return ::cuda::std::array<int, Bytes / sizeof(int)>{};
  }
}
|
|
169
|
+
|
|
170
|
+
// sizeof(T) as a variable template, extended to `void`, for which it yields 0 instead of being ill-formed.
template <typename T>
inline constexpr size_t size_of = sizeof(T);

// `void` has no size; report 0 so that size comparisons against real types simply fail
template <>
inline constexpr size_t size_of<void> = 0;
|
|
175
|
+
|
|
176
|
+
// Processes one tile of VectorizedPolicy::block_threads * VectorizedPolicy::items_per_thread_vectorized items per
// thread block using loads of VectorizedPolicy::load_store_word_size bytes. Inputs are raw pointers. If
// `can_vectorize` is false or the block's tile is not full, the work is forwarded to transform_kernel_prefetch with
// `num_elem_per_thread_prefetch` items per thread. The output is stored with vector stores when the output iterator
// is contiguous and its value type is trivially relocatable and as large as the first input type; otherwise it is
// stored element by element.
template <typename VectorizedPolicy, typename Offset, typename F, typename RandomAccessIteratorOut, typename... InputT>
_CCCL_DEVICE void transform_kernel_vectorized(
  Offset num_items,
  int num_elem_per_thread_prefetch,
  bool can_vectorize,
  F f,
  RandomAccessIteratorOut out,
  const InputT*... ins)
{
  constexpr int block_dim        = VectorizedPolicy::block_threads;
  constexpr int items_per_thread = VectorizedPolicy::items_per_thread_vectorized;
  _CCCL_ASSERT(!can_vectorize || (items_per_thread == num_elem_per_thread_prefetch), "");
  constexpr int tile_size  = block_dim * items_per_thread;
  const Offset tile_offset = static_cast<Offset>(blockIdx.x) * tile_size;
  const int items_in_tile  = static_cast<int>((::cuda::std::min) (num_items - tile_offset, Offset{tile_size}));

  // if we cannot vectorize or don't have a full tile, fall back to prefetch kernel
  const bool take_vector_path = can_vectorize && items_in_tile == tile_size;
  if (!take_vector_path)
  {
    transform_kernel_prefetch<VectorizedPolicy>(
      num_items, num_elem_per_thread_prefetch, ::cuda::std::move(f), ::cuda::std::move(out), ins...);
    return;
  }

  // move index and iterator domain to the block/thread index, to reduce arithmetic in the loops below
  {
    (..., (ins += tile_offset));
    out += tile_offset;
  }

  constexpr int load_store_size = VectorizedPolicy::load_store_word_size;
  using load_store_t            = decltype(load_store_type<load_store_size>());
  using result_t                = ::cuda::std::invoke_result_t<F, const InputT&...>;
  using output_t                = it_value_t<RandomAccessIteratorOut>;
  // the size of the first input type determines how many items fit into one load/store word
  constexpr int input_type_size  = int{first_item(sizeof(InputT)...)};
  constexpr int load_store_count = (items_per_thread * input_type_size) / load_store_size;
  static_assert((items_per_thread * input_type_size) % load_store_size == 0);
  static_assert(load_store_size % input_type_size == 0);

  constexpr bool can_vectorize_store =
    THRUST_NS_QUALIFIER::is_contiguous_iterator_v<RandomAccessIteratorOut>
    && THRUST_NS_QUALIFIER::is_trivially_relocatable_v<output_t> && size_of<output_t> == input_type_size;

  // if we can vectorize, we convert f's return type to the output type right away, so we can reinterpret later
  using THRUST_NS_QUALIFIER::cuda_cub::core::detail::uninitialized_array;
  uninitialized_array<::cuda::std::conditional_t<can_vectorize_store, output_t, result_t>, items_per_thread> results;

  // receives one per-thread staging array for each input, fills them and evaluates f on the staged items
  auto load_and_process = [&](auto... inputs) {
    // load inputs
    // TODO(bgruber): we could support fancy iterators for loading here as well (and only vectorize some inputs)
    [[maybe_unused]] auto load_words = [&](auto* in, auto& staged) {
      auto gmem_words   = reinterpret_cast<const load_store_t*>(in);
      auto staged_words = reinterpret_cast<load_store_t*>(staged.data());
      _CCCL_PRAGMA_UNROLL_FULL()
      for (int w = 0; w < load_store_count; ++w)
      {
        staged_words[w] = gmem_words[w * block_dim + threadIdx.x];
      }
    };
    (load_words(ins, inputs), ...);

    // process
    _CCCL_PRAGMA_UNROLL_FULL()
    for (int k = 0; k < items_per_thread; ++k)
    {
      results[k] = f(inputs[k]...);
    }
  };
  load_and_process(uninitialized_array<InputT, items_per_thread>{}...);

  // write output
  if constexpr (can_vectorize_store)
  {
    // vector path: mirrors the word layout used for loading
    auto result_words = reinterpret_cast<const load_store_t*>(results.data());
    auto out_words    = reinterpret_cast<load_store_t*>(out) + threadIdx.x;
    _CCCL_PRAGMA_UNROLL_FULL()
    for (int w = 0; w < load_store_count; ++w)
    {
      out_words[w * block_dim] = result_words[w];
    }
  }
  else
  {
    // serial path: write the items of each loaded word one by one, at the positions the word was loaded from
    constexpr int items_per_word = load_store_size / input_type_size;
    out += threadIdx.x * items_per_word;
    _CCCL_PRAGMA_UNROLL_FULL()
    for (int w = 0; w < load_store_count; ++w)
    {
      _CCCL_PRAGMA_UNROLL_FULL()
      for (int e = 0; e < items_per_word; ++e)
      {
        out[w * items_per_word * block_dim + e] = results[w * items_per_word + e];
      }
    }
  }
}
|
|
274
|
+
|
|
275
|
+
// Implementation notes on memcpy_async and UBLKCP kernels regarding copy alignment and padding
|
|
276
|
+
//
|
|
277
|
+
// For performance considerations of memcpy_async:
|
|
278
|
+
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#performance-guidance-for-memcpy-async
|
|
279
|
+
//
|
|
280
|
+
// We basically have to align the base pointer to 16 bytes, and copy a multiple of 16 bytes. To achieve this, when we
|
|
281
|
+
// copy a tile of data from an input buffer, we round down the pointer to the start of the tile to the next lower
|
|
282
|
+
// address that is a multiple of 16 bytes. This introduces head padding. We also round up the total number of bytes to
|
|
283
|
+
// copy (including head padding) to a multiple of 16 bytes, which introduces tail padding. For the bulk copy kernel, we
|
|
284
|
+
// have to align to 128 bytes instead of 16 on Hopper.
|
|
285
|
+
//
|
|
286
|
+
// However, padding memory copies like that may access the input buffer out-of-bounds. Here are some thoughts:
|
|
287
|
+
// * According to the CUDA programming guide
|
|
288
|
+
// (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses), "any address of a variable
|
|
289
|
+
// residing in global memory or returned by one of the memory allocation routines from the driver or runtime API is
|
|
290
|
+
// always aligned to at least 256 bytes."
|
|
291
|
+
// * Memory protection is usually done on memory page level, which is even larger than 256 bytes for CUDA and 4KiB on
|
|
292
|
+
// Intel x86 and 4KiB+ ARM. Front and tail padding thus never leaves the memory page of the input buffer.
|
|
293
|
+
// * This should hold for device memory, and also for device-accessible memory living on the host.
|
|
294
|
+
// * The base pointer alignment and size rounding also never leaves the size of a cache line.
|
|
295
|
+
//
|
|
296
|
+
// Copying larger data blocks with head and tail padding should thus be legal. Nevertheless, an out-of-bounds read is
|
|
297
|
+
// still technically undefined behavior in C++. Also, compute-sanitizer flags at least such reads after the end of a
|
|
298
|
+
// buffer. Therefore, we lean on the safer side and protect against out of bounds reads at the beginning and end.
|
|
299
|
+
|
|
300
|
+
// A note on size and alignment: The size of a type is at least as large as its alignment. We rely on this fact in some
|
|
301
|
+
// conditions.
|
|
302
|
+
// This is guaranteed by the C++ standard, and follows from the definition of arrays: the difference between neighboring
|
|
303
|
+
// array element addresses is sizeof element type and each array element needs to fulfill the alignment requirement of
|
|
304
|
+
// the element type.
|
|
305
|
+
|
|
306
|
+
// Pointer with metadata to describe readonly input memory for memcpy_async and UBLKCP kernels.
|
|
307
|
+
// LDGSTS is most efficient when the data is 16-byte aligned and the size a multiple of 16 bytes
|
|
308
|
+
// UBLKCP is most efficient when the data is 128/16-byte aligned (Hopper/Blackwell) and the size a multiple of 16 bytes
|
|
309
|
+
template <typename T> // Cannot add alignment to signature, because we need a uniform kernel template instantiation
struct aligned_base_ptr
{
  using value_type = T;

  // Aligned address located at or before the original pointer (16-byte or 128-byte). It may not be aligned to
  // alignof(T). E.g.: array of int3 starting at address 4, ptr == 0
  const char* ptr;
  // Number of bytes between ptr and the original pointer. Value inside [0;15] or [0;127].
  int head_padding;

  // Recovers the original (unpadded) pointer to the first element.
  _CCCL_HOST_DEVICE const T* ptr_to_elements() const
  {
    const char* const first_element = ptr + head_padding;
    return reinterpret_cast<const T*>(first_element);
  }

  // Two aligned pointers are equal when both the aligned base and the head padding agree.
  _CCCL_HOST_DEVICE friend bool operator==(const aligned_base_ptr& lhs, const aligned_base_ptr& rhs)
  {
    if (lhs.ptr != rhs.ptr)
    {
      return false;
    }
    return lhs.head_padding == rhs.head_padding;
  }
};
|
|
328
|
+
|
|
329
|
+
// Builds an aligned_base_ptr for `ptr`: the base is `ptr` rounded down to a multiple of `alignment`, and the head
// padding is the byte distance from that base back up to `ptr`.
template <typename T>
_CCCL_HOST_DEVICE auto make_aligned_base_ptr(const T* ptr, int alignment) -> aligned_base_ptr<T>
{
  const char* const original = reinterpret_cast<const char*>(ptr);
  const char* const base     = round_down_ptr(ptr, alignment);
  const int head_padding     = static_cast<int>(original - base);
  return aligned_base_ptr<T>{base, head_padding};
}
|
|
335
|
+
|
|
336
|
+
// Copies `bytes_to_copy` bytes from `src` to `dst` with __pipeline_memcpy_async in chunks of ldgsts_size_and_align
// bytes, distributing the chunks over the BlockThreads threads of the block, and commits the issued copies.
// `src`, `dst` and `bytes_to_copy` all have to be multiples of ldgsts_size_and_align (checked by assertions).
// The function does not wait for the copies to complete.
template <int BlockThreads>
_CCCL_DEVICE void memcpy_async_aligned(void* dst, const void* src, unsigned int bytes_to_copy)
{
  _CCCL_ASSERT(::cuda::std::bit_cast<uintptr_t>(src) % ldgsts_size_and_align == 0, "");
  _CCCL_ASSERT(::cuda::std::bit_cast<uintptr_t>(dst) % ldgsts_size_and_align == 0, "");
  _CCCL_ASSERT(bytes_to_copy % ldgsts_size_and_align == 0, "");

  char* const dst_bytes       = static_cast<char*>(dst);
  const char* const src_bytes = static_cast<const char*>(src);

  // allowing unrolling generates a LOT more instructions and is usually slower (confirmed by benchmark)
  _CCCL_PRAGMA_NOUNROLL()
  for (unsigned int chunk = threadIdx.x * ldgsts_size_and_align; chunk < bytes_to_copy;
       chunk += BlockThreads * ldgsts_size_and_align)
  {
    __pipeline_memcpy_async(dst_bytes + chunk, src_bytes + chunk, ldgsts_size_and_align);
  }
  __pipeline_commit();
}
|
|
353
|
+
|
|
354
|
+
// Copies `bytes_to_copy` bytes from `src` to `dst` where neither the start nor the size needs to be a multiple of
// ldgsts_size_and_align. `head_padding` is the byte distance of `src`/`dst` from the preceding aligned address.
// The aligned middle part is copied with memcpy_async_aligned; the unaligned head and tail bytes are copied
// synchronously, one byte per thread. Copies smaller than ldgsts_size_and_align are done entirely byte-wise.
template <int BlockThreads>
_CCCL_DEVICE void memcpy_async_maybe_unaligned(void* dst, const void* src, unsigned int bytes_to_copy, int head_padding)
{
  // early exiting if (head_padding == 0 && bytes_to_copy % ldgsts_size_and_align == 0) does not yield a benefit

  const char* const src_bytes = static_cast<const char*>(src);
  char* const dst_bytes       = static_cast<char*>(dst);

  // handle tiny copies to simplify head/tail bytes computations below
  if (bytes_to_copy < ldgsts_size_and_align)
  {
    if (threadIdx.x < bytes_to_copy)
    {
      dst_bytes[threadIdx.x] = src_bytes[threadIdx.x];
    }
    return;
  }

  // bytes before the first aligned address and bytes after the last full aligned chunk
  const unsigned int head_bytes = (ldgsts_size_and_align - head_padding) % ldgsts_size_and_align;
  const unsigned int tail_bytes = (bytes_to_copy - head_bytes) % ldgsts_size_and_align;

  // pipeline the async copies before loading the head and tail elements
  _CCCL_ASSERT(bytes_to_copy >= (head_bytes + tail_bytes), "");
  const unsigned int aligned_bytes_to_copy = bytes_to_copy - head_bytes - tail_bytes;
  if (aligned_bytes_to_copy > 0)
  {
    char* const aligned_dst       = dst_bytes + head_bytes;
    const char* const aligned_src = src_bytes + head_bytes;
    _CCCL_ASSERT(::cuda::std::bit_cast<uintptr_t>(aligned_dst) % ldgsts_size_and_align == 0, "");
    _CCCL_ASSERT(::cuda::std::bit_cast<uintptr_t>(aligned_src) % ldgsts_size_and_align == 0, "");
    _CCCL_ASSERT(aligned_bytes_to_copy % ldgsts_size_and_align == 0, "");
    memcpy_async_aligned<BlockThreads>(aligned_dst, aligned_src, aligned_bytes_to_copy);
  }

  const unsigned int tail_begin = bytes_to_copy - tail_bytes;
  const bool copies_head        = threadIdx.x < head_bytes;
  const bool copies_tail        = threadIdx.x < tail_bytes;

  // TODO(bgruber): ahendriksen suggested to copy elements instead of bytes, but it generates about 20 instructions more
  // ahendriksen: we perform both loads first and then both writes. this reduces the total latency
  char head_byte, tail_byte;
  if (copies_head)
  {
    head_byte = src_bytes[threadIdx.x];
  }
  if (copies_tail)
  {
    tail_byte = src_bytes[tail_begin + threadIdx.x];
  }
  if (copies_head)
  {
    dst_bytes[threadIdx.x] = head_byte;
  }
  if (copies_tail)
  {
    dst_bytes[tail_begin + threadIdx.x] = tail_byte;
  }
}
|
|
406
|
+
|
|
407
|
+
// Turning this function into a lambda will make nvcc generate it once for each iterator instead of for each distinct
|
|
408
|
+
// value type (which may be less).
|
|
409
|
+
// Starts the asynchronous copy of one input's tile into shared memory and returns a pointer to the first element of
// the tile inside shared memory. The copy starts at the aligned base address, so for types whose alignment is smaller
// than ldgsts_size_and_align the head padding is copied along and the size is rounded up to a multiple of
// ldgsts_size_and_align. `smem_offset` is advanced past the copied bytes. The copy has not completed on return.
template <int BlockThreads, typename AlignedPtr, typename Offset>
_CCCL_DEVICE auto
copy_and_return_smem_dst(AlignedPtr aligned_ptr, int& smem_offset, Offset offset, char* smem, int valid_items)
{
  using T = typename AlignedPtr::value_type;
  // only types with an alignment below the copy granularity can start in the middle of an aligned chunk
  constexpr bool has_head_padding = alignof(T) < ldgsts_size_and_align;

  // because SMEM base pointer and bytes_to_copy are always multiples of ldgsts_size_and_align, we only need to align
  // the SMEM start for types with larger alignment
  _CCCL_ASSERT(smem_offset % ldgsts_size_and_align == 0, "");
  if constexpr (alignof(T) > ldgsts_size_and_align)
  {
    smem_offset = ::cuda::round_up(smem_offset, int{alignof(T)});
  }

  const char* const src = aligned_ptr.ptr + offset * unsigned{sizeof(T)}; // compute expression in U32 if Offset==I32
  char* const dst       = smem + smem_offset;
  _CCCL_ASSERT(reinterpret_cast<uintptr_t>(src) % ldgsts_size_and_align == 0, "");
  _CCCL_ASSERT(reinterpret_cast<uintptr_t>(dst) % ldgsts_size_and_align == 0, "");

  int bytes_to_copy;
  if constexpr (has_head_padding)
  {
    const int padded_bytes = aligned_ptr.head_padding + int{sizeof(T)} * valid_items;
    bytes_to_copy          = ::cuda::round_up(padded_bytes, ldgsts_size_and_align);
  }
  else
  {
    _CCCL_ASSERT(aligned_ptr.head_padding == 0, "");
    bytes_to_copy = int{sizeof(T)} * valid_items;
  }

  smem_offset += bytes_to_copy; // leaves aligned address for follow-up copy
  memcpy_async_aligned<BlockThreads>(dst, src, bytes_to_copy);

  // the elements start behind the head padding that was copied along
  const char* const dst_start_of_data = dst + (has_head_padding ? aligned_ptr.head_padding : 0);
  _CCCL_ASSERT(reinterpret_cast<uintptr_t>(dst_start_of_data) % alignof(T) == 0, "");
  return reinterpret_cast<const T*>(dst_start_of_data);
}
|
|
443
|
+
|
|
444
|
+
// Variant of copy_and_return_smem_dst that only touches the `valid_items` elements of the tile: the copy starts at the
// first element (not at the aligned base) and goes through memcpy_async_maybe_unaligned. The shared memory
// destination is shifted by the same head padding as the source. `smem_offset` is advanced by a full tile plus
// ldgsts_size_and_align bytes. Returns a pointer to the first element of the tile in shared memory.
template <int BlockThreads, typename AlignedPtr, typename Offset>
_CCCL_DEVICE auto copy_and_return_smem_dst_fallback(
  AlignedPtr aligned_ptr, int& smem_offset, Offset offset, char* smem, int valid_items, int tile_size)
{
  // TODO(bgruber): drop handling of head bytes and just read OOB, since gmem buffers are always sufficiently aligned

  using T = typename AlignedPtr::value_type;
  constexpr bool has_head_padding = alignof(T) < ldgsts_size_and_align;

  // because SMEM base pointer and tile_size are always multiples of 16-byte, we only need to align the SMEM start
  // for types with larger alignment
  _CCCL_ASSERT(tile_size % ldgsts_size_and_align == 0, "");
  _CCCL_ASSERT(smem_offset % ldgsts_size_and_align == 0, "");
  if constexpr (alignof(T) > ldgsts_size_and_align)
  {
    smem_offset = ::cuda::round_up(smem_offset, int{alignof(T)});
  }
  _CCCL_ASSERT(has_head_padding || aligned_ptr.head_padding == 0, "");
  const int head_padding = has_head_padding ? aligned_ptr.head_padding : 0;

  // compute expression in U32 if Offset==I32
  const char* src = aligned_ptr.ptr + offset * unsigned{sizeof(T)} + head_padding;
  char* dst       = smem + smem_offset + head_padding;
  _CCCL_ASSERT(::cuda::std::bit_cast<uintptr_t>(src) % alignof(T) == 0, "");
  _CCCL_ASSERT(::cuda::std::bit_cast<uintptr_t>(dst) % alignof(T) == 0, "");

  const int bytes_to_copy = int{sizeof(T)} * valid_items;
  memcpy_async_maybe_unaligned<BlockThreads>(dst, src, bytes_to_copy, head_padding);

  // add ldgsts_size_and_align to account for this tile's head padding
  smem_offset += ldgsts_size_and_align + int{sizeof(T)} * tile_size;

  return reinterpret_cast<T*>(dst);
}
|
|
475
|
+
|
|
476
|
+
// Processes one tile per thread block by first staging every input's tile in dynamic shared memory through
// asynchronous copies and then evaluating f on the staged elements. A tile holds
// LdgstsPolicy::block_threads * num_elem_per_thread items; the block with index blockIdx.x starts at item
// blockIdx.x * tile_size and the last tile is clamped to num_items. Inputs are described by aligned_base_ptr, i.e.
// an aligned base address plus head padding. Requires dynamic shared memory large enough for all staged tiles
// (the required size is not computed in this function).
template <typename LdgstsPolicy, typename Offset, typename F, typename RandomAccessIteratorOut, typename... InTs>
_CCCL_DEVICE void transform_kernel_ldgsts(
  Offset num_items, int num_elem_per_thread, F f, RandomAccessIteratorOut out, aligned_base_ptr<InTs>... aligned_ptrs)
{
  // SMEM is 16-byte aligned by default
  extern __shared__ char smem[];
  static_assert(ldgsts_size_and_align <= 16);
  _CCCL_ASSERT(reinterpret_cast<uintptr_t>(smem) % ldgsts_size_and_align == 0, "");

  constexpr int block_threads = LdgstsPolicy::block_threads;
  const int tile_size         = block_threads * num_elem_per_thread;
  const Offset offset         = static_cast<Offset>(blockIdx.x) * tile_size;
  // the function name is parenthesized to suppress expansion of a function-like `min` macro, consistent with the
  // other transform kernels in this file
  const int valid_items = static_cast<int>((::cuda::std::min) (num_items - offset, Offset{tile_size}));

  [[maybe_unused]] int smem_offset = 0;
  // TODO(bgruber): drop checking first block, since gmem buffers are always sufficiently aligned. But this would not
  // work for inputs in host stack memory ...
  // only blocks away from both ends of the grid use the padded copy, which reads before/after the tile's elements
  const bool inner_blocks = 0 < blockIdx.x && blockIdx.x + 2 < gridDim.x;
  // TODO(bgruber): if we used SMEM offsets instead of pointers, we need less registers (but no perf increase)
  // Braced initialization evaluates the pack expansion left to right, which is required here because every copy
  // reads and advances smem_offset.
  [[maybe_unused]] const auto smem_ptrs = ::cuda::std::tuple<const InTs*...>{
    (inner_blocks ? copy_and_return_smem_dst<block_threads>(aligned_ptrs, smem_offset, offset, smem, valid_items)
                  : copy_and_return_smem_dst_fallback<block_threads>(
                      aligned_ptrs, smem_offset, offset, smem, valid_items, tile_size))...};
  // wait for this thread's async copies, then make all threads' copies visible to the whole block
  __pipeline_wait_prior(0);
  __syncthreads();

  // move the whole index and iterator to the block/thread index, to reduce arithmetic in the loops below
  out += offset;

  // TODO(bgruber): fbusato suggests to move the valid_items and smem_base_ptrs by threadIdx.x before the loop below

  // full_tile is a true_type/false_type tag, so the bounds check can be dropped at compile time for full tiles
  auto process_tile = [&](auto full_tile) {
    // Unroll 1 tends to improve performance, especially for smaller data types (confirmed by benchmark)
    _CCCL_PRAGMA_NOUNROLL()
    for (int j = 0; j < num_elem_per_thread; ++j)
    {
      const int idx = j * block_threads + threadIdx.x;
      if (full_tile || idx < valid_items)
      {
        out[idx] = ::cuda::std::apply(
          [&](const auto* __restrict__... smem_base_ptrs) {
            return f(smem_base_ptrs[idx]...);
          },
          smem_ptrs);
      }
    }
  };

  // explicitly calling the lambda on literal true/false lets the compiler emit the lambda twice
  if (tile_size == valid_items)
  {
    process_tile(::cuda::std::true_type{});
  }
  else
  {
    process_tile(::cuda::std::false_type{});
  }
}
|
|
534
|
+
|
|
535
|
+
// Returns true only for a thread that is both selected by ::cuda::ptx::elect_sync over the full lane mask and has
// threadIdx.x < 32. elect_sync is executed first, by every calling thread, before the thread index is examined.
_CCCL_DEVICE _CCCL_FORCEINLINE static bool elect_one()
{
  const bool elected_in_warp = ::cuda::ptx::elect_sync(~0);
  const bool in_first_warp   = threadIdx.x < 32;
  return elected_in_warp && in_first_warp;
}
|
|
539
|
+
|
|
540
|
+
// Copies bytes_to_copy bytes from src to dst when the range is not necessarily aligned for a bulk copy.
//
// The range is split into three parts:
//   * head: the bytes in front of the first BulkCopyAlignment boundary,
//   * body: a span that is a multiple of bulk_copy_size_multiple, transferred with cp_async_bulk,
//   * tail: whatever remains after the body.
// Head and tail bytes are moved with ordinary per-thread loads/stores (one byte per thread), the body is launched
// only by the thread for which `elected` is true.
//
// dst, src       destination / source of the copy
// bytes_to_copy  total number of bytes to transfer
// head_padding   used to derive the head size as (BulkCopyAlignment - head_padding) % BulkCopyAlignment; the
//                asserts below require src + head and dst + head to be BulkCopyAlignment-aligned
// bar            barrier handed to cp_async_bulk for the body copy
// total_copied   in/out: incremented by the number of bytes given to cp_async_bulk (not by head/tail bytes)
// elected        whether the calling thread launches the bulk copy
//
// NOTE(review): head/tail/tiny copies move one byte per thread indexed by threadIdx.x, so this presumably relies on
// the block having at least BulkCopyAlignment threads - confirm against the launch configuration.
template <int BulkCopyAlignment>
_CCCL_DEVICE void bulk_copy_maybe_unaligned(
  void* dst,
  const void* src,
  unsigned int bytes_to_copy,
  int head_padding,
  uint64_t& bar,
  /* inout */ ::cuda::std::uint32_t& total_copied,
  bool elected)
{
  const char* const from = static_cast<const char*>(src);
  char* const to         = static_cast<char*>(dst);
  const unsigned int tid = threadIdx.x;

  // Ranges shorter than the alignment are copied bytewise; this keeps the head/tail math below simple.
  // Neither bar nor total_copied is touched on this path.
  if (bytes_to_copy < BulkCopyAlignment)
  {
    if (tid < bytes_to_copy)
    {
      to[tid] = from[tid];
    }
    return;
  }

  const unsigned int head_bytes = (BulkCopyAlignment - head_padding) % BulkCopyAlignment;
  const unsigned int tail_bytes = (bytes_to_copy - head_bytes) % bulk_copy_size_multiple;

  // Only the elected thread starts the asynchronous copy of the aligned body.
  if (elected)
  {
    _CCCL_ASSERT(bytes_to_copy >= (head_bytes + tail_bytes), "");
    const unsigned int body_bytes = bytes_to_copy - head_bytes - tail_bytes;
    if (body_bytes > 0)
    {
      char* const body_dst       = to + head_bytes;
      const char* const body_src = from + head_bytes;
      _CCCL_ASSERT(::cuda::std::bit_cast<uintptr_t>(body_dst) % BulkCopyAlignment == 0, "");
      _CCCL_ASSERT(::cuda::std::bit_cast<uintptr_t>(body_src) % BulkCopyAlignment == 0, "");
      _CCCL_ASSERT(body_bytes % bulk_copy_size_multiple == 0, "");

      ::cuda::ptx::cp_async_bulk(
        ::cuda::ptx::space_cluster, ::cuda::ptx::space_global, body_dst, body_src, body_bytes, &bar);
      total_copied += body_bytes;
    }
  }

  // Each thread handles at most one head byte and one tail byte. Both loads are issued before both stores, which
  // reduces the total latency (ahendriksen).
  const bool has_head_byte    = tid < head_bytes;
  const bool has_tail_byte    = tid < tail_bytes;
  const unsigned int tail_pos = bytes_to_copy - tail_bytes + tid;

  char head_byte, tail_byte;
  if (has_head_byte)
  {
    head_byte = from[tid];
  }
  if (has_tail_byte)
  {
    tail_byte = from[tail_pos];
  }
  if (has_head_byte)
  {
    to[tid] = head_byte;
  }
  if (has_tail_byte)
  {
    to[tail_pos] = tail_byte;
  }
}
|
|
607
|
+
|
|
608
|
+
// Rounds the pointer p up to the next multiple of Alignment. The rounding is done on the 32-bit value obtained from
// __cvta_generic_to_shared, and the result is converted back with __cvta_shared_to_generic. The returned pointer
// carries an assume-aligned hint for the compiler.
template <int Alignment>
_CCCL_DEVICE auto round_up_smem_ptr(char* p) -> char*
{
  const uint32_t shared_addr  = __cvta_generic_to_shared(p);
  const uint32_t aligned_addr = ::cuda::round_up(shared_addr, Alignment);
  void* const generic_ptr     = __cvta_shared_to_generic(aligned_addr);
  return static_cast<char*>(_CCCL_BUILTIN_ASSUME_ALIGNED(generic_ptr, Alignment));
}
|
|
615
|
+
|
|
616
|
+
// Transform kernel body that stages every input tile in dynamic shared memory via cp_async_bulk and then applies f.
//
// Each block handles one tile of BulkCopyPolicy::block_threads * num_elem_per_thread items, starting at item
// blockIdx.x * tile size. The tiles of the individual inputs are laid out one after another in shared memory, each
// followed by some padding. Blocks classified as "inner" let a single elected thread issue one bulk copy per input;
// all other blocks use bulk_copy_maybe_unaligned, where every thread helps with the unaligned head/tail bytes.
// After the copies have landed, thread t writes out[idx] = f(inputs[idx]...) for idx = j * block_threads + t.
//
// num_items            total number of items to transform
// num_elem_per_thread  items processed by each thread (runtime value)
// f                    the transformation, called with one value per input
// out                  output iterator, indexed relative to the start of the whole range
// aligned_ptrs         one aligned_base_ptr per input (members used here: ptr, head_padding, value_type)
template <typename BulkCopyPolicy, typename Offset, typename F, typename RandomAccessIteratorOut, typename... InTs>
_CCCL_DEVICE void transform_kernel_ublkcp(
  Offset num_items, int num_elem_per_thread, F f, RandomAccessIteratorOut out, aligned_base_ptr<InTs>... aligned_ptrs)
{
  namespace ptx = ::cuda::ptx;

  constexpr int threads_per_block = BulkCopyPolicy::block_threads;
  constexpr int copy_alignment    = BulkCopyPolicy::bulk_copy_alignment;

  // If the smallest per-iteration footprint (smallest value type times the block size) is at least as large as the
  // strictest input alignment, consecutive tiles keep that alignment without any per-tile realignment.
  constexpr int retained_alignment    = ::cuda::std::min({sizeof(InTs)...}) * threads_per_block;
  constexpr int widest_alignment      = ::cuda::std::max({alignof(InTs)...});
  constexpr bool tiles_keep_alignment = widest_alignment <= retained_alignment;

  // Padding that follows each tile in shared memory: it makes room for the next tile's head padding and retains the
  // alignment.
  constexpr int padding_per_tile =
    tiles_keep_alignment ? ::cuda::std::max(copy_alignment, widest_alignment) : copy_alignment;

  __shared__ uint64_t bar;

  // Shared memory alignment notes:
  //  * The alignment attribute below is honoured by the compiler (it even shows up in PTX), but not by all drivers:
  //    the start address provided at runtime is sometimes insufficiently aligned. See NVBug 5093902, NVBug 5329745
  //    and the discussion in PR #5122.
  //  * Always aligning the start address by hand costs about 7 extra SASS instructions at kernel start, which
  //    outweighs the gain from a faster bulk copy. It would also need a fake read of the address so NVVM does not
  //    sink the alignment code deeper into the kernel.
  //  * One could exploit that static shared memory is aligned to 1KiB and dynamic shared memory follows it with
  //    16-byte alignment (i.e. add bulk_copy_alignment - 16 to the address), but CUDA does not guarantee this.
  //  * Asserting sufficient alignment is not possible either, since it fails on some systems (e.g. driver 565.57.01
  //    on RTX 2080 when cub::DeviceTransform is called from another kernel via CDP, see
  //    thrust.cpp.cuda.cpp20.test.cuda.transform.cdp_1). A misaligned start slightly slows the bulk copy down, but
  //    the result stays correct.
  extern __shared__ char __align__(padding_per_tile) smem_base_unaligned[];

  // Because the attribute cannot be trusted, align explicitly whenever more than the default dynamic shared memory
  // alignment (16) is required. When tiles do not keep their alignment, every tile is aligned individually further
  // down, so nothing has to be done here.
  char* tiles_begin = smem_base_unaligned;
  if constexpr (tiles_keep_alignment && widest_alignment > 16)
  {
    tiles_begin = round_up_smem_ptr<padding_per_tile>(tiles_begin);
    asm("" : "+l"(tiles_begin)); // stops the compiler from moving the alignment code deeper into the kernel
  }

  const int items_per_tile = threads_per_block * num_elem_per_thread;
  const Offset tile_offset = static_cast<Offset>(blockIdx.x) * items_per_tile;
  const int items_in_tile  = (::cuda::std::min) (num_items - tile_offset, Offset{items_per_tile});

  // The first block and the last two blocks of the grid take the fallback path.
  const bool is_inner_block = 0 < blockIdx.x && blockIdx.x + 2 < gridDim.x;
  if (is_inner_block)
  {
    // A single thread sets up all bulk copies of this block.
    if (elect_one())
    {
      ptx::mbarrier_init(&bar, 1);
      // No ptx::fence_proxy_async(ptx::space_shared) here: an update to the CUDA memory model allows skipping it.

      char* tile_dst                       = tiles_begin;
      ::cuda::std::uint32_t expected_bytes = 0;

      // Starts the bulk copy of one input's tile and advances tile_dst past it.
      // (Turning this lambda into a function does not change SASS.)
      auto issue_tile_copy = [&](auto aligned_ptr) {
        using T              = typename decltype(aligned_ptr)::value_type;
        const int tile_bytes = int{sizeof(T)} * items_per_tile;

        // the multiplication is evaluated in U32 if Offset==I32
        const char* const gmem_src = aligned_ptr.ptr + tile_offset * unsigned{sizeof(T)};
        if constexpr (!tiles_keep_alignment)
        {
          tile_dst = round_up_smem_ptr<alignof(T)>(tile_dst);
        }
        char* const smem_dst = tile_dst;
        _CCCL_ASSERT(reinterpret_cast<uintptr_t>(gmem_src) % copy_alignment == 0, "");
        _CCCL_ASSERT(reinterpret_cast<uintptr_t>(smem_dst) % copy_alignment == 0, "");

        // TODO(bgruber): the byte count could be precomputed on the host
        int copy_bytes;
        if constexpr (alignof(T) < bulk_copy_size_multiple)
        {
          // include the head padding and round up to the granularity required for the copy size
          copy_bytes = ::cuda::round_up(aligned_ptr.head_padding + tile_bytes, bulk_copy_size_multiple);
        }
        else
        {
          _CCCL_ASSERT(aligned_ptr.head_padding == 0, "");
          copy_bytes = tile_bytes;
        }

        ptx::cp_async_bulk(ptx::space_cluster, ptx::space_global, smem_dst, gmem_src, copy_bytes, &bar);
        expected_bytes += copy_bytes;

        tile_dst += padding_per_tile + tile_bytes;
        _CCCL_ASSERT(copy_bytes <= tile_bytes + copy_alignment, "");
      };

      // The fold over the comma operator evaluates left-to-right.
      (..., issue_tile_copy(aligned_ptrs));

      // TODO(ahendriksen): ptx::sem_relaxed would suffice, but it is not available yet
      ptx::mbarrier_arrive_expect_tx(ptx::sem_release, ptx::scope_cta, ptx::space_shared, &bar, expected_bytes);
    }
  }
  else
  {
    const bool elected = elect_one();
    if (elected)
    {
      ptx::mbarrier_init(&bar, 1);
      // No ptx::fence_proxy_async(ptx::space_shared) here: an update to the CUDA memory model allows skipping it.
    }

    // Every thread takes part: all threads copy head and tail bytes, the elected thread launches the bulk copy.
    char* tile_dst                       = tiles_begin;
    ::cuda::std::uint32_t expected_bytes = 0;

    // Copies the valid part of one input's tile and advances tile_dst past the full tile slot.
    // (Turning this lambda into a function does not change SASS.)
    auto copy_tile_cooperatively = [&](auto aligned_ptr) {
      using T = typename decltype(aligned_ptr)::value_type;

      _CCCL_ASSERT(alignof(T) < copy_alignment || aligned_ptr.head_padding == 0, "");
      const int head_padding = alignof(T) < copy_alignment ? aligned_ptr.head_padding : 0;

      // the multiplication is evaluated in U32 if Offset==I32
      const char* const gmem_src = aligned_ptr.ptr + tile_offset * unsigned{sizeof(T)} + head_padding;
      if constexpr (!tiles_keep_alignment)
      {
        tile_dst = round_up_smem_ptr<alignof(T)>(tile_dst);
      }
      char* const smem_dst = tile_dst + head_padding;
      _CCCL_ASSERT(reinterpret_cast<uintptr_t>(gmem_src) % alignof(T) == 0, "");
      _CCCL_ASSERT(reinterpret_cast<uintptr_t>(smem_dst) % alignof(T) == 0, "");

      const int copy_bytes = int{sizeof(T)} * items_in_tile;
      bulk_copy_maybe_unaligned<copy_alignment>(
        smem_dst, gmem_src, copy_bytes, aligned_ptr.head_padding, bar, expected_bytes, elected);

      // the padding accounts for this tile's head padding
      tile_dst += padding_per_tile + int{sizeof(T)} * items_per_tile;
    };

    // The fold over the comma operator evaluates left-to-right.
    (..., copy_tile_cooperatively(aligned_ptrs));

    if (elected)
    {
      // TODO(ahendriksen): ptx::sem_relaxed would suffice, but it is not available yet
      ptx::mbarrier_arrive_expect_tx(ptx::sem_release, ptx::scope_cta, ptx::space_shared, &bar, expected_bytes);
    }
  }

  // Every thread waits until the bulk copies have completed.
  __syncthreads(); // TODO: ahendriksen said this is not needed, but compute-sanitizer disagrees
  while (!ptx::mbarrier_try_wait_parity(&bar, 0))
  {
  }

  // Advance the output iterator to this block's tile once, so the loop below needs less index arithmetic.
  out += tile_offset;

  auto process_tile = [&](auto full_tile) {
    // Unroll 1 tends to improve performance, especially for smaller data types (confirmed by benchmark)
    _CCCL_PRAGMA_NOUNROLL()
    for (int iter = 0; iter < num_elem_per_thread; ++iter)
    {
      // TODO(bgruber): fbusato suggests to hoist threadIdx.x out of this loop
      const int idx = iter * threads_per_block + threadIdx.x;
      if (full_tile || idx < items_in_tile)
      {
        // Walk the shared memory layout exactly like the copy phase did to locate each input's tile.
        char* tile_src    = tiles_begin;
        auto load_operand = [&](auto aligned_ptr) {
          using T                = typename decltype(aligned_ptr)::value_type;
          const int head_padding = alignof(T) < copy_alignment ? aligned_ptr.head_padding : 0;
          if constexpr (!tiles_keep_alignment)
          {
            tile_src = round_up_smem_ptr<alignof(T)>(tile_src);
          }
          const T* const elements = reinterpret_cast<const T*>(tile_src + head_padding);
          tile_src += padding_per_tile + int{sizeof(T)} * items_per_tile;
          return elements[idx];
        };

        // The operands are gathered in a braced tuple initializer to guarantee their order of evaluation.
        out[idx] = ::cuda::std::apply(
          [&](auto... values) {
            return f(values...);
          },
          ::cuda::std::tuple<InTs...>{load_operand(aligned_ptrs)...});
      }
    }
  };

  // Calling the lambda with a literal true/false type lets the compiler emit it twice, once per case.
  if (items_in_tile == items_per_tile)
  {
    process_tile(::cuda::std::true_type{});
  }
  else
  {
    process_tile(::cuda::std::false_type{});
  }
}
|
|
830
|
+
|
|
831
|
+
// Kernel parameter that holds either an aligned_base_ptr to It's value type or the iterator It itself.
// Which member is active is decided by the factory function used to create the argument.
template <typename It>
union kernel_arg
{
  // The first member is required to be trivial.
  aligned_base_ptr<it_value_t<It>> aligned_ptr;
  static_assert(::cuda::std::is_trivial_v<aligned_base_ptr<it_value_t<It>>>, "");

  // The iterator may not be trivially [default|copy]-constructible (e.g.
  // thrust::normal_iterator<thrust::device_pointer<T>>).
  It iterator;

  // With such an iterator member the union's special members would be deleted, see
  // https://eel.is/c++draft/class.union#general-note-3. They are therefore spelled out explicitly.
  _CCCL_HOST_DEVICE kernel_arg() noexcept {}

  _CCCL_HOST_DEVICE kernel_arg(const kernel_arg& other)
  {
    // kernel_arg is only used to pass data to the device, so its content is semantically trivially copyable, even
    // if the type system says otherwise: copy the object representation.
    ::cuda::std::memcpy(static_cast<void*>(this), static_cast<const void*>(&other), sizeof(*this));
  }

  _CCCL_HOST_DEVICE ~kernel_arg() noexcept {}
};
|
|
852
|
+
|
|
853
|
+
// Creates a kernel_arg whose `iterator` member holds a copy of `it`.
template <typename It>
_CCCL_HOST_DEVICE auto make_iterator_kernel_arg(It it) -> kernel_arg<It>
{
  kernel_arg<It> result;
  // Making `iterator` the active union member requires placement new / construct_at. Going through It's copy
  // constructor also works in more cases than assignment would (e.g. thrust::transform_iterator with a
  // non-copy-assignable functor, as seen in the merge sort tests).
  ::cuda::std::__construct_at(&result.iterator, it);
  return result;
}
|
|
863
|
+
|
|
864
|
+
// Creates a kernel_arg whose `aligned_ptr` member is set to make_aligned_base_ptr(ptr, alignment).
template <typename It>
_CCCL_HOST_DEVICE auto make_aligned_base_ptr_kernel_arg(It ptr, int alignment) -> kernel_arg<It>
{
  kernel_arg<It> result;
  result.aligned_ptr = make_aligned_base_ptr(ptr, alignment);
  return result;
}
|
|
871
|
+
|
|
872
|
+
// Single kernel entry point shared by all transform algorithms. It has to be instantiated with the same arguments
// for every algorithm; the implementation is picked from the active policy, so only the device compiler selects the
// code that is actually generated. This saves some compile-time and binary size.
//
// The inputs arrive as kernel_arg unions: the prefetch and vectorized algorithms read the `iterator` member, the
// memcpy_async and ublkcp algorithms read the `aligned_ptr` member. Only 1D thread blocks are supported.
template <typename MaxPolicy,
          typename Offset,
          typename F,
          typename RandomAccessIteratorOut,
          typename... RandomAccessIteratorsIn>
__launch_bounds__(MaxPolicy::ActivePolicy::algo_policy::block_threads)
  CUB_DETAIL_KERNEL_ATTRIBUTES void transform_kernel(
    Offset num_items,
    int num_elem_per_thread,
    [[maybe_unused]] bool can_vectorize,
    F f,
    RandomAccessIteratorOut out,
    kernel_arg<RandomAccessIteratorsIn>... ins)
{
  _CCCL_ASSERT(blockDim.y == 1 && blockDim.z == 1, "transform_kernel only supports 1D blocks");

  using active_policy      = typename MaxPolicy::ActivePolicy;
  using algo_policy        = typename active_policy::algo_policy;
  constexpr auto algorithm = active_policy::algorithm;

  if constexpr (algorithm == Algorithm::prefetch)
  {
    transform_kernel_prefetch<algo_policy>(
      num_items, num_elem_per_thread, ::cuda::std::move(f), ::cuda::std::move(out), ::cuda::std::move(ins.iterator)...);
  }
  else if constexpr (algorithm == Algorithm::vectorized)
  {
    transform_kernel_vectorized<algo_policy>(
      num_items,
      num_elem_per_thread,
      can_vectorize,
      ::cuda::std::move(f),
      ::cuda::std::move(out),
      ::cuda::std::move(ins.iterator)...);
  }
  else if constexpr (algorithm == Algorithm::memcpy_async)
  {
    // the call is only emitted for targets providing SM 80
    NV_IF_TARGET(
      NV_PROVIDES_SM_80,
      (transform_kernel_ldgsts<algo_policy>(
         num_items,
         num_elem_per_thread,
         ::cuda::std::move(f),
         ::cuda::std::move(out),
         ::cuda::std::move(ins.aligned_ptr)...);));
  }
  else if constexpr (algorithm == Algorithm::ublkcp)
  {
    // the call is only emitted for targets providing SM 90
    NV_IF_TARGET(
      NV_PROVIDES_SM_90,
      (transform_kernel_ublkcp<algo_policy>(
         num_items,
         num_elem_per_thread,
         ::cuda::std::move(f),
         ::cuda::std::move(out),
         ::cuda::std::move(ins.aligned_ptr)...);));
  }
  else
  {
    // dependent condition, so the assertion only fires when this branch is instantiated
    static_assert(!sizeof(Offset), "Algorithm not implemented");
  }
}
|
|
933
|
+
|
|
934
|
+
} // namespace detail::transform
|
|
935
|
+
|
|
936
|
+
CUB_NAMESPACE_END
|