cuda-cccl 0.1.3.1.0.dev1678__cp313-cp313-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/__init__.py +14 -0
- cuda/cccl/cooperative/__init__.py +3 -0
- cuda/cccl/cooperative/experimental/__init__.py +8 -0
- cuda/cccl/cooperative/experimental/_caching.py +48 -0
- cuda/cccl/cooperative/experimental/_common.py +273 -0
- cuda/cccl/cooperative/experimental/_nvrtc.py +91 -0
- cuda/cccl/cooperative/experimental/_scan_op.py +181 -0
- cuda/cccl/cooperative/experimental/_types.py +935 -0
- cuda/cccl/cooperative/experimental/_typing.py +107 -0
- cuda/cccl/cooperative/experimental/block/__init__.py +33 -0
- cuda/cccl/cooperative/experimental/block/_block_load_store.py +215 -0
- cuda/cccl/cooperative/experimental/block/_block_merge_sort.py +125 -0
- cuda/cccl/cooperative/experimental/block/_block_radix_sort.py +214 -0
- cuda/cccl/cooperative/experimental/block/_block_reduce.py +294 -0
- cuda/cccl/cooperative/experimental/block/_block_scan.py +983 -0
- cuda/cccl/cooperative/experimental/warp/__init__.py +9 -0
- cuda/cccl/cooperative/experimental/warp/_warp_merge_sort.py +98 -0
- cuda/cccl/cooperative/experimental/warp/_warp_reduce.py +153 -0
- cuda/cccl/cooperative/experimental/warp/_warp_scan.py +78 -0
- cuda/cccl/headers/__init__.py +7 -0
- cuda/cccl/headers/include/__init__.py +1 -0
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +261 -0
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1181 -0
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +84 -0
- cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +925 -0
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +227 -0
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +753 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +766 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +286 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +704 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +557 -0
- cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +678 -0
- cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +804 -0
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +997 -0
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +561 -0
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +473 -0
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +292 -0
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1032 -0
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +342 -0
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +592 -0
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +614 -0
- cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1346 -0
- cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +965 -0
- cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1217 -0
- cuda/cccl/headers/include/cub/block/block_exchange.cuh +1306 -0
- cuda/cccl/headers/include/cub/block/block_histogram.cuh +420 -0
- cuda/cccl/headers/include/cub/block/block_load.cuh +1259 -0
- cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +787 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1218 -0
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2193 -0
- cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +150 -0
- cuda/cccl/headers/include/cub/block/block_reduce.cuh +629 -0
- cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +437 -0
- cuda/cccl/headers/include/cub/block/block_scan.cuh +2583 -0
- cuda/cccl/headers/include/cub/block/block_shuffle.cuh +346 -0
- cuda/cccl/headers/include/cub/block/block_store.cuh +1246 -0
- cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +620 -0
- cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +86 -0
- cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +240 -0
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +252 -0
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +238 -0
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +259 -0
- cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +790 -0
- cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +538 -0
- cuda/cccl/headers/include/cub/config.cuh +60 -0
- cuda/cccl/headers/include/cub/cub.cuh +112 -0
- cuda/cccl/headers/include/cub/detail/array_utils.cuh +77 -0
- cuda/cccl/headers/include/cub/detail/choose_offset.cuh +155 -0
- cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +93 -0
- cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +96 -0
- cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +246 -0
- cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +120 -0
- cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +74 -0
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +118 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/README.md +71 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/array.h +68 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/json.h +61 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/object.h +100 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/string.h +71 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/value.h +93 -0
- cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +63 -0
- cuda/cccl/headers/include/cub/detail/rfa.cuh +724 -0
- cuda/cccl/headers/include/cub/detail/strong_load.cuh +189 -0
- cuda/cccl/headers/include/cub/detail/strong_store.cuh +220 -0
- cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +384 -0
- cuda/cccl/headers/include/cub/detail/type_traits.cuh +206 -0
- cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +72 -0
- cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +56 -0
- cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +596 -0
- cuda/cccl/headers/include/cub/device/device_copy.cuh +187 -0
- cuda/cccl/headers/include/cub/device/device_for.cuh +990 -0
- cuda/cccl/headers/include/cub/device/device_histogram.cuh +1507 -0
- cuda/cccl/headers/include/cub/device/device_memcpy.cuh +195 -0
- cuda/cccl/headers/include/cub/device/device_merge.cuh +202 -0
- cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +979 -0
- cuda/cccl/headers/include/cub/device/device_partition.cuh +664 -0
- cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3435 -0
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +1815 -0
- cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +368 -0
- cuda/cccl/headers/include/cub/device/device_scan.cuh +1901 -0
- cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1496 -0
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1512 -0
- cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2811 -0
- cuda/cccl/headers/include/cub/device/device_select.cuh +1224 -0
- cuda/cccl/headers/include/cub/device/device_transform.cuh +313 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +314 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +109 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +718 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +45 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +197 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1051 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +305 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +473 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1748 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +1316 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +625 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +497 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +548 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +497 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +598 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +1374 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +838 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +341 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +439 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +552 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +467 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +543 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +218 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/histogram.cuh +505 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/merge_sort.cuh +338 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/radix_sort.cuh +799 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +525 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +194 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +330 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +936 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/unique_by_key.cuh +176 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +70 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +121 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +63 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +278 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +91 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +118 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1068 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +397 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +945 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +675 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +555 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +1013 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +249 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1587 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +407 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +353 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +874 -0
- cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +215 -0
- cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +106 -0
- cuda/cccl/headers/include/cub/grid/grid_queue.cuh +202 -0
- cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +256 -0
- cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +238 -0
- cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +252 -0
- cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +322 -0
- cuda/cccl/headers/include/cub/thread/thread_load.cuh +347 -0
- cuda/cccl/headers/include/cub/thread/thread_operators.cuh +629 -0
- cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +504 -0
- cuda/cccl/headers/include/cub/thread/thread_scan.cuh +340 -0
- cuda/cccl/headers/include/cub/thread/thread_search.cuh +198 -0
- cuda/cccl/headers/include/cub/thread/thread_simd.cuh +406 -0
- cuda/cccl/headers/include/cub/thread/thread_sort.cuh +101 -0
- cuda/cccl/headers/include/cub/thread/thread_store.cuh +364 -0
- cuda/cccl/headers/include/cub/util_allocator.cuh +921 -0
- cuda/cccl/headers/include/cub/util_arch.cuh +167 -0
- cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +95 -0
- cuda/cccl/headers/include/cub/util_debug.cuh +207 -0
- cuda/cccl/headers/include/cub/util_device.cuh +779 -0
- cuda/cccl/headers/include/cub/util_macro.cuh +91 -0
- cuda/cccl/headers/include/cub/util_math.cuh +115 -0
- cuda/cccl/headers/include/cub/util_namespace.cuh +176 -0
- cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +55 -0
- cuda/cccl/headers/include/cub/util_ptx.cuh +513 -0
- cuda/cccl/headers/include/cub/util_temporary_storage.cuh +122 -0
- cuda/cccl/headers/include/cub/util_type.cuh +1136 -0
- cuda/cccl/headers/include/cub/util_vsmem.cuh +251 -0
- cuda/cccl/headers/include/cub/version.cuh +89 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +329 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +177 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +729 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +405 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +688 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +437 -0
- cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +405 -0
- cuda/cccl/headers/include/cub/warp/warp_load.cuh +614 -0
- cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +169 -0
- cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +822 -0
- cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1156 -0
- cuda/cccl/headers/include/cub/warp/warp_store.cuh +520 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +172 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +210 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +84 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +127 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +209 -0
- cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
- cuda/cccl/headers/include/cuda/__barrier/aligned_size.h +61 -0
- cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +100 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +454 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +72 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_native_handle.h +45 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
- cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +171 -0
- cuda/cccl/headers/include/cuda/__bit/bitfield.h +121 -0
- cuda/cccl/headers/include/cuda/__bit/bitmask.h +88 -0
- cuda/cccl/headers/include/cuda/__cccl_config +36 -0
- cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +124 -0
- cuda/cccl/headers/include/cuda/__cmath/ilog.h +195 -0
- cuda/cccl/headers/include/cuda/__cmath/ipow.h +107 -0
- cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
- cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
- cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
- cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
- cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
- cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
- cuda/cccl/headers/include/cuda/__execution/determinism.h +90 -0
- cuda/cccl/headers/include/cuda/__execution/require.h +74 -0
- cuda/cccl/headers/include/cuda/__execution/tune.h +69 -0
- cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
- cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +276 -0
- cuda/cccl/headers/include/cuda/__functional/get_device_address.h +58 -0
- cuda/cccl/headers/include/cuda/__functional/maximum.h +58 -0
- cuda/cccl/headers/include/cuda/__functional/minimum.h +58 -0
- cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +108 -0
- cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
- cuda/cccl/headers/include/cuda/__fwd/barrier_native_handle.h +42 -0
- cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
- cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
- cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +257 -0
- cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +460 -0
- cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +314 -0
- cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +421 -0
- cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +321 -0
- cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +333 -0
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +465 -0
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +456 -0
- cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +462 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +63 -0
- cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +122 -0
- cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +51 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +60 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +98 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +162 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +49 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +99 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +170 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +59 -0
- cuda/cccl/headers/include/cuda/__memory/address_space.h +86 -0
- cuda/cccl/headers/include/cuda/__memory/align_down.h +56 -0
- cuda/cccl/headers/include/cuda/__memory/align_up.h +56 -0
- cuda/cccl/headers/include/cuda/__memory/is_aligned.h +47 -0
- cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +94 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +157 -0
- cuda/cccl/headers/include/cuda/__memory_resource/properties.h +73 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource.h +129 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +653 -0
- cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
- cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +57 -0
- cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
- cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +101 -0
- cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2982 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +240 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +193 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +52 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +957 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +288 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +596 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1445 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +117 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +62 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +101 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +62 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +15074 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +385 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +176 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +94 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +137 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +138 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +280 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +282 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2148 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1272 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +228 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +430 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1830 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +105 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +81 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +612 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +44 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +4446 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +4061 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +6438 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +36 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +4582 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +44 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +67 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +750 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
- cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +151 -0
- cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
- cuda/cccl/headers/include/cuda/__stream/get_stream.h +110 -0
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +163 -0
- cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
- cuda/cccl/headers/include/cuda/__utility/static_for.h +74 -0
- cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
- cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
- cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +249 -0
- cuda/cccl/headers/include/cuda/access_property +26 -0
- cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
- cuda/cccl/headers/include/cuda/atomic +27 -0
- cuda/cccl/headers/include/cuda/barrier +262 -0
- cuda/cccl/headers/include/cuda/bit +29 -0
- cuda/cccl/headers/include/cuda/cmath +35 -0
- cuda/cccl/headers/include/cuda/discard_memory +60 -0
- cuda/cccl/headers/include/cuda/functional +31 -0
- cuda/cccl/headers/include/cuda/iterator +34 -0
- cuda/cccl/headers/include/cuda/latch +27 -0
- cuda/cccl/headers/include/cuda/mdspan +28 -0
- cuda/cccl/headers/include/cuda/memory +32 -0
- cuda/cccl/headers/include/cuda/memory_resource +41 -0
- cuda/cccl/headers/include/cuda/numeric +28 -0
- cuda/cccl/headers/include/cuda/pipeline +577 -0
- cuda/cccl/headers/include/cuda/ptx +124 -0
- cuda/cccl/headers/include/cuda/semaphore +31 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +52 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +64 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +142 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +92 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +96 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +140 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/move.h +87 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +94 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +101 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +98 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +155 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
- cuda/cccl/headers/include/cuda/std/__algorithm_ +26 -0
- cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
- cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +138 -0
- cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +218 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +401 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +3971 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +177 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +211 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
- cuda/cccl/headers/include/cuda/std/__atomic/order.h +159 -0
- cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
- cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
- cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +250 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +105 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +73 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +228 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types.h +52 -0
- cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
- cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
- cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
- cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
- cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
- cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +77 -0
- cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +183 -0
- cuda/cccl/headers/include/cuda/std/__bit/countl.h +167 -0
- cuda/cccl/headers/include/cuda/std/__bit/countr.h +185 -0
- cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
- cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
- cuda/cccl/headers/include/cuda/std/__bit/integral.h +124 -0
- cuda/cccl/headers/include/cuda/std/__bit/popcount.h +154 -0
- cuda/cccl/headers/include/cuda/std/__bit/reference.h +1270 -0
- cuda/cccl/headers/include/cuda/std/__bit/rotate.h +94 -0
- cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
- cuda/cccl/headers/include/cuda/std/__cccl/assert.h +150 -0
- cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +207 -0
- cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +787 -0
- cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +217 -0
- cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +43 -0
- cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +53 -0
- cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +69 -0
- cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +128 -0
- cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +126 -0
- cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +326 -0
- cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +79 -0
- cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +68 -0
- cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +133 -0
- cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +73 -0
- cuda/cccl/headers/include/cuda/std/__cccl/os.h +48 -0
- cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1276 -0
- cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +267 -0
- cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +176 -0
- cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
- cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
- cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
- cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
- cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
- cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +115 -0
- cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
- cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
- cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +148 -0
- cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
- cuda/cccl/headers/include/cuda/std/__charconv_ +30 -0
- cuda/cccl/headers/include/cuda/std/__cmath/abs.h +246 -0
- cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +193 -0
- cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +724 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fma.h +125 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +216 -0
- cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +205 -0
- cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +286 -0
- cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +224 -0
- cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +286 -0
- cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +371 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +167 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +205 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +180 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
- cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +104 -0
- cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +582 -0
- cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +248 -0
- cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
- cuda/cccl/headers/include/cuda/std/__cmath/nvbf16.h +58 -0
- cuda/cccl/headers/include/cuda/std/__cmath/nvfp16.h +58 -0
- cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +206 -0
- cuda/cccl/headers/include/cuda/std/__cmath/roots.h +199 -0
- cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +984 -0
- cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +155 -0
- cuda/cccl/headers/include/cuda/std/__cmath/traits.h +170 -0
- cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +328 -0
- cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
- cuda/cccl/headers/include/cuda/std/__complex/complex.h +676 -0
- cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +388 -0
- cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +117 -0
- cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +215 -0
- cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
- cuda/cccl/headers/include/cuda/std/__complex/literals.h +106 -0
- cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +53 -0
- cuda/cccl/headers/include/cuda/std/__complex/math.h +159 -0
- cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +323 -0
- cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +322 -0
- cuda/cccl/headers/include/cuda/std/__complex/roots.h +64 -0
- cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
- cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
- cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +131 -0
- cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
- cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
- cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
- cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +46 -0
- cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
- cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
- cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +273 -0
- cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +107 -0
- cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +71 -0
- cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +57 -0
- cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
- cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
- cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +100 -0
- cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
- cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
- cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
- cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
- cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
- cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +42 -0
- cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
- cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
- cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
- cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
- cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +69 -0
- cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +62 -0
- cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +72 -0
- cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +142 -0
- cuda/cccl/headers/include/cuda/std/__exception/terminate.h +73 -0
- cuda/cccl/headers/include/cuda/std/__execution/env.h +436 -0
- cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
- cuda/cccl/headers/include/cuda/std/__expected/expected.h +2001 -0
- cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1080 -0
- cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
- cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +175 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +809 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +172 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +103 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +39 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +64 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/nvfp_types.h +58 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
- cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
- cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +128 -0
- cuda/cccl/headers/include/cuda/std/__format_ +28 -0
- cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
- cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
- cuda/cccl/headers/include/cuda/std/__functional/bind.h +337 -0
- cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +80 -0
- cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +73 -0
- cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
- cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
- cuda/cccl/headers/include/cuda/std/__functional/compose.h +68 -0
- cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +75 -0
- cuda/cccl/headers/include/cuda/std/__functional/function.h +1277 -0
- cuda/cccl/headers/include/cuda/std/__functional/hash.h +650 -0
- cuda/cccl/headers/include/cuda/std/__functional/identity.h +61 -0
- cuda/cccl/headers/include/cuda/std/__functional/invoke.h +558 -0
- cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +43 -0
- cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +67 -0
- cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +213 -0
- cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
- cuda/cccl/headers/include/cuda/std/__functional/operations.h +534 -0
- cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +127 -0
- cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +65 -0
- cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +64 -0
- cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
- cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +113 -0
- cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
- cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
- cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
- cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +277 -0
- cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +35 -0
- cuda/cccl/headers/include/cuda/std/__fwd/array.h +36 -0
- cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +49 -0
- cuda/cccl/headers/include/cuda/std/__fwd/complex.h +34 -0
- cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
- cuda/cccl/headers/include/cuda/std/__fwd/get.h +123 -0
- cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
- cuda/cccl/headers/include/cuda/std/__fwd/iterator_traits.h +40 -0
- cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +73 -0
- cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
- cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
- cuda/cccl/headers/include/cuda/std/__fwd/pair.h +34 -0
- cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +34 -0
- cuda/cccl/headers/include/cuda/std/__fwd/span.h +38 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string.h +83 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +59 -0
- cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
- cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +34 -0
- cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
- cuda/cccl/headers/include/cuda/std/__internal/features.h +71 -0
- cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +102 -0
- cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
- cuda/cccl/headers/include/cuda/std/__iterator/advance.h +228 -0
- cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +100 -0
- cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +254 -0
- cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
- cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +464 -0
- cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
- cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
- cuda/cccl/headers/include/cuda/std/__iterator/distance.h +126 -0
- cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
- cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
- cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +95 -0
- cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +152 -0
- cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
- cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +102 -0
- cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +140 -0
- cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +160 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +163 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +932 -0
- cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
- cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +400 -0
- cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
- cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
- cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +98 -0
- cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
- cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
- cuda/cccl/headers/include/cuda/std/__iterator/prev.h +91 -0
- cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
- cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +185 -0
- cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
- cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +605 -0
- cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
- cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
- cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
- cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +247 -0
- cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
- cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +617 -0
- cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +781 -0
- cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +78 -0
- cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +54 -0
- cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +139 -0
- cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +132 -0
- cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +322 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +98 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +139 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +358 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +757 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +315 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +308 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +605 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +507 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +193 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +189 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +344 -0
- cuda/cccl/headers/include/cuda/std/__memory/addressof.h +64 -0
- cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +81 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator.h +320 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +84 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +555 -0
- cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +60 -0
- cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
- cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +230 -0
- cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +248 -0
- cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
- cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +43 -0
- cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +261 -0
- cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
- cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +92 -0
- cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +683 -0
- cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +768 -0
- cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +55 -0
- cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
- cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
- cuda/cccl/headers/include/cuda/std/__new/allocate.h +126 -0
- cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
- cuda/cccl/headers/include/cuda/std/__new/launder.h +49 -0
- cuda/cccl/headers/include/cuda/std/__new_ +29 -0
- cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
- cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
- cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
- cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +80 -0
- cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
- cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
- cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
- cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +100 -0
- cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
- cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
- cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
- cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
- cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
- cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +75 -0
- cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
- cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
- cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
- cuda/cccl/headers/include/cuda/std/__optional/optional.h +900 -0
- cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +430 -0
- cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
- cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
- cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
- cuda/cccl/headers/include/cuda/std/__random/is_valid.h +106 -0
- cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +397 -0
- cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +335 -0
- cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +183 -0
- cuda/cccl/headers/include/cuda/std/__random_ +29 -0
- cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
- cuda/cccl/headers/include/cuda/std/__ranges/all.h +97 -0
- cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +313 -0
- cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
- cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
- cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
- cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
- cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
- cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
- cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +77 -0
- cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
- cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +266 -0
- cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +410 -0
- cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +161 -0
- cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +113 -0
- cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +174 -0
- cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
- cuda/cccl/headers/include/cuda/std/__ranges/rend.h +181 -0
- cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +345 -0
- cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
- cuda/cccl/headers/include/cuda/std/__ranges/size.h +199 -0
- cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +513 -0
- cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +475 -0
- cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
- cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
- cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +53 -0
- cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +181 -0
- cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
- cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +233 -0
- cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
- cuda/cccl/headers/include/cuda/std/__string/char_traits.h +191 -0
- cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +581 -0
- cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
- cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
- cuda/cccl/headers/include/cuda/std/__string_ +29 -0
- cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
- cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +105 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +142 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +98 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +269 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +216 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +70 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +90 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +73 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +277 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +149 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +69 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +174 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +51 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +44 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +83 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +68 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +54 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +51 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +214 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +73 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +68 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +81 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +84 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +87 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +63 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +119 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_reference_wrapper.h +50 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +84 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +201 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +70 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +82 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +61 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +55 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +73 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +58 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +123 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +132 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +66 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
- cuda/cccl/headers/include/cuda/std/__utility/as_const.h +52 -0
- cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +32 -0
- cuda/cccl/headers/include/cuda/std/__utility/cmp.h +116 -0
- cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
- cuda/cccl/headers/include/cuda/std/__utility/declval.h +63 -0
- cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +161 -0
- cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
- cuda/cccl/headers/include/cuda/std/__utility/forward.h +59 -0
- cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +55 -0
- cuda/cccl/headers/include/cuda/std/__utility/in_place.h +77 -0
- cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +251 -0
- cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
- cuda/cccl/headers/include/cuda/std/__utility/move.h +74 -0
- cuda/cccl/headers/include/cuda/std/__utility/pair.h +802 -0
- cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
- cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +510 -0
- cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
- cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +63 -0
- cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
- cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
- cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
- cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
- cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
- cuda/cccl/headers/include/cuda/std/array +520 -0
- cuda/cccl/headers/include/cuda/std/atomic +818 -0
- cuda/cccl/headers/include/cuda/std/barrier +43 -0
- cuda/cccl/headers/include/cuda/std/bit +35 -0
- cuda/cccl/headers/include/cuda/std/bitset +994 -0
- cuda/cccl/headers/include/cuda/std/cassert +28 -0
- cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
- cuda/cccl/headers/include/cuda/std/cfloat +59 -0
- cuda/cccl/headers/include/cuda/std/chrono +26 -0
- cuda/cccl/headers/include/cuda/std/climits +61 -0
- cuda/cccl/headers/include/cuda/std/cmath +25 -0
- cuda/cccl/headers/include/cuda/std/complex +50 -0
- cuda/cccl/headers/include/cuda/std/concepts +48 -0
- cuda/cccl/headers/include/cuda/std/cstddef +28 -0
- cuda/cccl/headers/include/cuda/std/cstdint +178 -0
- cuda/cccl/headers/include/cuda/std/cstdlib +30 -0
- cuda/cccl/headers/include/cuda/std/cstring +110 -0
- cuda/cccl/headers/include/cuda/std/ctime +152 -0
- cuda/cccl/headers/include/cuda/std/detail/__config +45 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +235 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1720 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +3628 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/cmath +667 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/iosfwd +128 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/stdexcept +120 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +1367 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +2154 -0
- cuda/cccl/headers/include/cuda/std/execution +27 -0
- cuda/cccl/headers/include/cuda/std/expected +30 -0
- cuda/cccl/headers/include/cuda/std/functional +56 -0
- cuda/cccl/headers/include/cuda/std/initializer_list +36 -0
- cuda/cccl/headers/include/cuda/std/inplace_vector +2163 -0
- cuda/cccl/headers/include/cuda/std/iterator +70 -0
- cuda/cccl/headers/include/cuda/std/latch +34 -0
- cuda/cccl/headers/include/cuda/std/limits +28 -0
- cuda/cccl/headers/include/cuda/std/linalg +30 -0
- cuda/cccl/headers/include/cuda/std/mdspan +38 -0
- cuda/cccl/headers/include/cuda/std/memory +39 -0
- cuda/cccl/headers/include/cuda/std/numbers +335 -0
- cuda/cccl/headers/include/cuda/std/numeric +41 -0
- cuda/cccl/headers/include/cuda/std/optional +31 -0
- cuda/cccl/headers/include/cuda/std/ranges +69 -0
- cuda/cccl/headers/include/cuda/std/ratio +417 -0
- cuda/cccl/headers/include/cuda/std/semaphore +31 -0
- cuda/cccl/headers/include/cuda/std/source_location +83 -0
- cuda/cccl/headers/include/cuda/std/span +640 -0
- cuda/cccl/headers/include/cuda/std/string_view +788 -0
- cuda/cccl/headers/include/cuda/std/tuple +26 -0
- cuda/cccl/headers/include/cuda/std/type_traits +176 -0
- cuda/cccl/headers/include/cuda/std/utility +70 -0
- cuda/cccl/headers/include/cuda/std/variant +25 -0
- cuda/cccl/headers/include/cuda/std/version +245 -0
- cuda/cccl/headers/include/cuda/stream_ref +54 -0
- cuda/cccl/headers/include/cuda/type_traits +27 -0
- cuda/cccl/headers/include/cuda/utility +27 -0
- cuda/cccl/headers/include/cuda/version +16 -0
- cuda/cccl/headers/include/cuda/warp +28 -0
- cuda/cccl/headers/include/cuda/work_stealing +26 -0
- cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
- cuda/cccl/headers/include/nv/detail/__target_macros +641 -0
- cuda/cccl/headers/include/nv/target +240 -0
- cuda/cccl/headers/include/thrust/addressof.h +22 -0
- cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
- cuda/cccl/headers/include/thrust/advance.h +59 -0
- cuda/cccl/headers/include/thrust/allocate_unique.h +299 -0
- cuda/cccl/headers/include/thrust/binary_search.h +1910 -0
- cuda/cccl/headers/include/thrust/complex.h +859 -0
- cuda/cccl/headers/include/thrust/copy.h +506 -0
- cuda/cccl/headers/include/thrust/count.h +245 -0
- cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +95 -0
- cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +37 -0
- cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +350 -0
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +371 -0
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +45 -0
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +242 -0
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +39 -0
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +137 -0
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +39 -0
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +99 -0
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +53 -0
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +68 -0
- cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +76 -0
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +102 -0
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +79 -0
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +81 -0
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +39 -0
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +98 -0
- cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +99 -0
- cuda/cccl/headers/include/thrust/detail/binary_search.inl +525 -0
- cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
- cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +255 -0
- cuda/cccl/headers/include/thrust/detail/complex/c99math.h +64 -0
- cuda/cccl/headers/include/thrust/detail/complex/catrig.h +875 -0
- cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +589 -0
- cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +233 -0
- cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +161 -0
- cuda/cccl/headers/include/thrust/detail/complex/cexp.h +195 -0
- cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +173 -0
- cuda/cccl/headers/include/thrust/detail/complex/clog.h +223 -0
- cuda/cccl/headers/include/thrust/detail/complex/clogf.h +210 -0
- cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
- cuda/cccl/headers/include/thrust/detail/complex/cpow.h +50 -0
- cuda/cccl/headers/include/thrust/detail/complex/cproj.h +81 -0
- cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
- cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +168 -0
- cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
- cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
- cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +208 -0
- cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +133 -0
- cuda/cccl/headers/include/thrust/detail/complex/math_private.h +138 -0
- cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
- cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
- cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
- cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
- cuda/cccl/headers/include/thrust/detail/config/device_system.h +55 -0
- cuda/cccl/headers/include/thrust/detail/config/host_system.h +48 -0
- cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
- cuda/cccl/headers/include/thrust/detail/config/namespace.h +162 -0
- cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
- cuda/cccl/headers/include/thrust/detail/config.h +36 -0
- cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +228 -0
- cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +273 -0
- cuda/cccl/headers/include/thrust/detail/copy.h +72 -0
- cuda/cccl/headers/include/thrust/detail/copy.inl +129 -0
- cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
- cuda/cccl/headers/include/thrust/detail/copy_if.inl +102 -0
- cuda/cccl/headers/include/thrust/detail/count.h +55 -0
- cuda/cccl/headers/include/thrust/detail/count.inl +89 -0
- cuda/cccl/headers/include/thrust/detail/device_delete.inl +52 -0
- cuda/cccl/headers/include/thrust/detail/device_free.inl +47 -0
- cuda/cccl/headers/include/thrust/detail/device_malloc.inl +60 -0
- cuda/cccl/headers/include/thrust/detail/device_new.inl +61 -0
- cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
- cuda/cccl/headers/include/thrust/detail/equal.inl +93 -0
- cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
- cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +80 -0
- cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
- cuda/cccl/headers/include/thrust/detail/execution_policy.h +80 -0
- cuda/cccl/headers/include/thrust/detail/extrema.inl +184 -0
- cuda/cccl/headers/include/thrust/detail/fill.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/find.inl +113 -0
- cuda/cccl/headers/include/thrust/detail/for_each.inl +84 -0
- cuda/cccl/headers/include/thrust/detail/function.h +49 -0
- cuda/cccl/headers/include/thrust/detail/functional/actor.h +214 -0
- cuda/cccl/headers/include/thrust/detail/functional/operators.h +386 -0
- cuda/cccl/headers/include/thrust/detail/gather.inl +173 -0
- cuda/cccl/headers/include/thrust/detail/generate.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +62 -0
- cuda/cccl/headers/include/thrust/detail/inner_product.inl +118 -0
- cuda/cccl/headers/include/thrust/detail/integer_math.h +130 -0
- cuda/cccl/headers/include/thrust/detail/internal_functional.h +289 -0
- cuda/cccl/headers/include/thrust/detail/logical.inl +113 -0
- cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +77 -0
- cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
- cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +209 -0
- cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +40 -0
- cuda/cccl/headers/include/thrust/detail/merge.inl +276 -0
- cuda/cccl/headers/include/thrust/detail/mismatch.inl +94 -0
- cuda/cccl/headers/include/thrust/detail/mpl/math.h +164 -0
- cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +37 -0
- cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +124 -0
- cuda/cccl/headers/include/thrust/detail/partition.inl +378 -0
- cuda/cccl/headers/include/thrust/detail/pointer.h +217 -0
- cuda/cccl/headers/include/thrust/detail/pointer.inl +172 -0
- cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
- cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
- cuda/cccl/headers/include/thrust/detail/range/head_flags.h +116 -0
- cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +130 -0
- cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
- cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +189 -0
- cuda/cccl/headers/include/thrust/detail/reduce.inl +377 -0
- cuda/cccl/headers/include/thrust/detail/reference.h +500 -0
- cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
- cuda/cccl/headers/include/thrust/detail/remove.inl +213 -0
- cuda/cccl/headers/include/thrust/detail/replace.inl +231 -0
- cuda/cccl/headers/include/thrust/detail/reverse.inl +88 -0
- cuda/cccl/headers/include/thrust/detail/scan.inl +518 -0
- cuda/cccl/headers/include/thrust/detail/scatter.inl +157 -0
- cuda/cccl/headers/include/thrust/detail/seq.h +54 -0
- cuda/cccl/headers/include/thrust/detail/sequence.inl +109 -0
- cuda/cccl/headers/include/thrust/detail/set_operations.inl +981 -0
- cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/sort.inl +373 -0
- cuda/cccl/headers/include/thrust/detail/static_assert.h +58 -0
- cuda/cccl/headers/include/thrust/detail/static_map.h +167 -0
- cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +65 -0
- cuda/cccl/headers/include/thrust/detail/tabulate.inl +62 -0
- cuda/cccl/headers/include/thrust/detail/temporary_array.h +153 -0
- cuda/cccl/headers/include/thrust/detail/temporary_array.inl +120 -0
- cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +81 -0
- cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +69 -0
- cuda/cccl/headers/include/thrust/detail/transform_scan.inl +161 -0
- cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +130 -0
- cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +61 -0
- cuda/cccl/headers/include/thrust/detail/type_deduction.h +62 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +47 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +167 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +69 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +59 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_discard_iterator.h +44 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +46 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +89 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
- cuda/cccl/headers/include/thrust/detail/type_traits.h +136 -0
- cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +90 -0
- cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/unique.inl +373 -0
- cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
- cuda/cccl/headers/include/thrust/detail/util/align.h +59 -0
- cuda/cccl/headers/include/thrust/detail/vector_base.h +615 -0
- cuda/cccl/headers/include/thrust/detail/vector_base.inl +1212 -0
- cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
- cuda/cccl/headers/include/thrust/device_delete.h +59 -0
- cuda/cccl/headers/include/thrust/device_free.h +72 -0
- cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
- cuda/cccl/headers/include/thrust/device_malloc.h +108 -0
- cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
- cuda/cccl/headers/include/thrust/device_new.h +91 -0
- cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
- cuda/cccl/headers/include/thrust/device_ptr.h +202 -0
- cuda/cccl/headers/include/thrust/device_reference.h +986 -0
- cuda/cccl/headers/include/thrust/device_vector.h +574 -0
- cuda/cccl/headers/include/thrust/distance.h +43 -0
- cuda/cccl/headers/include/thrust/equal.h +247 -0
- cuda/cccl/headers/include/thrust/execution_policy.h +384 -0
- cuda/cccl/headers/include/thrust/extrema.h +657 -0
- cuda/cccl/headers/include/thrust/fill.h +201 -0
- cuda/cccl/headers/include/thrust/find.h +382 -0
- cuda/cccl/headers/include/thrust/for_each.h +261 -0
- cuda/cccl/headers/include/thrust/functional.h +396 -0
- cuda/cccl/headers/include/thrust/gather.h +464 -0
- cuda/cccl/headers/include/thrust/generate.h +193 -0
- cuda/cccl/headers/include/thrust/host_vector.h +576 -0
- cuda/cccl/headers/include/thrust/inner_product.h +264 -0
- cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +219 -0
- cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +335 -0
- cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
- cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
- cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +38 -0
- cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +38 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +81 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +51 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +62 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +57 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +199 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_traversal_tags.h +50 -0
- cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +53 -0
- cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
- cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
- cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +81 -0
- cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +174 -0
- cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +215 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +660 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +275 -0
- cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +192 -0
- cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
- cuda/cccl/headers/include/thrust/iterator/retag.h +74 -0
- cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +221 -0
- cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +184 -0
- cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
- cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +149 -0
- cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +226 -0
- cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +351 -0
- cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
- cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +357 -0
- cuda/cccl/headers/include/thrust/logical.h +290 -0
- cuda/cccl/headers/include/thrust/memory.h +395 -0
- cuda/cccl/headers/include/thrust/merge.h +725 -0
- cuda/cccl/headers/include/thrust/mismatch.h +261 -0
- cuda/cccl/headers/include/thrust/mr/allocator.h +229 -0
- cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +41 -0
- cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +526 -0
- cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +118 -0
- cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +68 -0
- cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +67 -0
- cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +38 -0
- cuda/cccl/headers/include/thrust/mr/memory_resource.h +217 -0
- cuda/cccl/headers/include/thrust/mr/new.h +100 -0
- cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +63 -0
- cuda/cccl/headers/include/thrust/mr/pool.h +526 -0
- cuda/cccl/headers/include/thrust/mr/pool_options.h +174 -0
- cuda/cccl/headers/include/thrust/mr/sync_pool.h +114 -0
- cuda/cccl/headers/include/thrust/mr/tls_pool.h +65 -0
- cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
- cuda/cccl/headers/include/thrust/mr/validator.h +56 -0
- cuda/cccl/headers/include/thrust/pair.h +102 -0
- cuda/cccl/headers/include/thrust/partition.h +1383 -0
- cuda/cccl/headers/include/thrust/per_device_resource.h +98 -0
- cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +184 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +155 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +104 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +151 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +53 -0
- cuda/cccl/headers/include/thrust/random/detail/mod.h +101 -0
- cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +187 -0
- cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +160 -0
- cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +63 -0
- cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +201 -0
- cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +198 -0
- cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +198 -0
- cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +183 -0
- cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +217 -0
- cuda/cccl/headers/include/thrust/random/discard_block_engine.h +240 -0
- cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +289 -0
- cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +217 -0
- cuda/cccl/headers/include/thrust/random/normal_distribution.h +257 -0
- cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +247 -0
- cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +261 -0
- cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +258 -0
- cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +255 -0
- cuda/cccl/headers/include/thrust/random.h +120 -0
- cuda/cccl/headers/include/thrust/reduce.h +1112 -0
- cuda/cccl/headers/include/thrust/remove.h +768 -0
- cuda/cccl/headers/include/thrust/replace.h +827 -0
- cuda/cccl/headers/include/thrust/reverse.h +213 -0
- cuda/cccl/headers/include/thrust/scan.h +1671 -0
- cuda/cccl/headers/include/thrust/scatter.h +446 -0
- cuda/cccl/headers/include/thrust/sequence.h +277 -0
- cuda/cccl/headers/include/thrust/set_operations.h +3026 -0
- cuda/cccl/headers/include/thrust/shuffle.h +182 -0
- cuda/cccl/headers/include/thrust/sort.h +1320 -0
- cuda/cccl/headers/include/thrust/swap.h +147 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +90 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +60 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/par.h +62 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/vector.inl +130 -0
- cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +161 -0
- cuda/cccl/headers/include/thrust/system/cpp/memory.h +109 -0
- cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +75 -0
- cuda/cccl/headers/include/thrust/system/cpp/pointer.h +123 -0
- cuda/cccl/headers/include/thrust/system/cpp/vector.h +99 -0
- cuda/cccl/headers/include/thrust/system/cuda/config.h +123 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +219 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +129 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +255 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +289 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/load_iterator.h +58 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/make_load_iterator.h +53 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +191 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +611 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +243 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +210 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +64 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +96 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +113 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +476 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +82 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +272 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +91 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +85 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +75 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_cross_system.h +204 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_device_to_device.h +89 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +69 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +61 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +121 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +57 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +228 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +217 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/par.h +237 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/par_to_seq.h +95 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +81 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +405 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +72 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +781 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +1000 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +107 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +152 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +88 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +342 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +415 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +79 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1736 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +482 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +75 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +75 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +403 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +143 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +119 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +94 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +91 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +646 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +311 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +251 -0
- cuda/cccl/headers/include/thrust/system/cuda/error.h +168 -0
- cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +39 -0
- cuda/cccl/headers/include/thrust/system/cuda/memory.h +122 -0
- cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +122 -0
- cuda/cccl/headers/include/thrust/system/cuda/pointer.h +160 -0
- cuda/cccl/headers/include/thrust/system/cuda/vector.h +108 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +52 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/count.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/find.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +48 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +64 -0
- cuda/cccl/headers/include/thrust/system/detail/errno.h +125 -0
- cuda/cccl/headers/include/thrust/system/detail/error_category.inl +302 -0
- cuda/cccl/headers/include/thrust/system/detail/error_code.inl +173 -0
- cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +121 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +59 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +85 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +167 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +391 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +70 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +64 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +152 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/count.h +54 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +90 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +55 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +66 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +72 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +258 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +60 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/find.h +55 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +143 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +64 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +79 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +102 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +63 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +66 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +78 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +65 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +70 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +83 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +105 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +154 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +55 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +74 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +135 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +213 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +49 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +77 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +106 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +89 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +192 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +92 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +127 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +101 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +181 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +54 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +72 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +78 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +141 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +78 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +91 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +132 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +238 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +79 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +91 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +96 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +288 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +482 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +60 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +131 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +119 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +181 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +50 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +82 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +47 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +60 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +53 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +60 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +88 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +395 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +56 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +62 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +86 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +119 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +172 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +121 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +77 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +119 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +87 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +132 -0
- cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +123 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +76 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +48 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +142 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +55 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +125 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +55 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +77 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +78 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +116 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +68 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +80 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +129 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +49 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +147 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +56 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +81 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +151 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +309 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +70 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +104 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +185 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +160 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +151 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +212 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +65 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +116 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +61 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +362 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +54 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +130 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +54 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +592 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +64 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +121 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +112 -0
- cuda/cccl/headers/include/thrust/system/detail/system_error.inl +108 -0
- cuda/cccl/headers/include/thrust/system/error_code.h +512 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +54 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +77 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +50 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +74 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +56 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +59 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/count.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +50 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +65 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +113 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +66 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/find.h +53 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +56 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +87 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +93 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/par.h +62 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +88 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +102 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +54 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +54 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +78 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +64 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +65 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +59 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +103 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +72 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +87 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +60 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +265 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +60 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +71 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +67 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +75 -0
- cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +160 -0
- cuda/cccl/headers/include/thrust/system/omp/memory.h +111 -0
- cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +75 -0
- cuda/cccl/headers/include/thrust/system/omp/pointer.h +124 -0
- cuda/cccl/headers/include/thrust/system/omp/vector.h +99 -0
- cuda/cccl/headers/include/thrust/system/system_error.h +184 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +54 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +50 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +73 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +47 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +136 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +92 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +66 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +49 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +51 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +91 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +94 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +77 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +327 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/par.h +62 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +84 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +98 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +54 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +137 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +61 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +400 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +140 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +76 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +87 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +59 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +312 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +60 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +295 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +60 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +71 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +67 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +75 -0
- cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +160 -0
- cuda/cccl/headers/include/thrust/system/tbb/memory.h +111 -0
- cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +75 -0
- cuda/cccl/headers/include/thrust/system/tbb/pointer.h +124 -0
- cuda/cccl/headers/include/thrust/system/tbb/vector.h +99 -0
- cuda/cccl/headers/include/thrust/system_error.h +57 -0
- cuda/cccl/headers/include/thrust/tabulate.h +125 -0
- cuda/cccl/headers/include/thrust/transform.h +1045 -0
- cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
- cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
- cuda/cccl/headers/include/thrust/tuple.h +142 -0
- cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +261 -0
- cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +154 -0
- cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
- cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +184 -0
- cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +116 -0
- cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +336 -0
- cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
- cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +96 -0
- cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
- cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
- cuda/cccl/headers/include/thrust/unique.h +1090 -0
- cuda/cccl/headers/include/thrust/universal_allocator.h +93 -0
- cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
- cuda/cccl/headers/include/thrust/universal_vector.h +71 -0
- cuda/cccl/headers/include/thrust/version.h +93 -0
- cuda/cccl/headers/include/thrust/zip_function.h +176 -0
- cuda/cccl/headers/include_paths.py +72 -0
- cuda/cccl/parallel/__init__.py +9 -0
- cuda/cccl/parallel/experimental/__init__.py +47 -0
- cuda/cccl/parallel/experimental/_bindings.py +24 -0
- cuda/cccl/parallel/experimental/_bindings.pyi +388 -0
- cuda/cccl/parallel/experimental/_bindings_impl.cpython-313-x86_64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/_bindings_impl.pyx +2158 -0
- cuda/cccl/parallel/experimental/_caching.py +71 -0
- cuda/cccl/parallel/experimental/_cccl_interop.py +382 -0
- cuda/cccl/parallel/experimental/_utils/__init__.py +0 -0
- cuda/cccl/parallel/experimental/_utils/protocols.py +132 -0
- cuda/cccl/parallel/experimental/algorithms/__init__.py +28 -0
- cuda/cccl/parallel/experimental/algorithms/_merge_sort.py +172 -0
- cuda/cccl/parallel/experimental/algorithms/_radix_sort.py +244 -0
- cuda/cccl/parallel/experimental/algorithms/_reduce.py +136 -0
- cuda/cccl/parallel/experimental/algorithms/_scan.py +179 -0
- cuda/cccl/parallel/experimental/algorithms/_segmented_reduce.py +183 -0
- cuda/cccl/parallel/experimental/algorithms/_transform.py +213 -0
- cuda/cccl/parallel/experimental/algorithms/_unique_by_key.py +179 -0
- cuda/cccl/parallel/experimental/cccl/.gitkeep +0 -0
- cuda/cccl/parallel/experimental/cccl/libcccl.c.parallel.so +0 -0
- cuda/cccl/parallel/experimental/iterators/__init__.py +17 -0
- cuda/cccl/parallel/experimental/iterators/_factories.py +157 -0
- cuda/cccl/parallel/experimental/iterators/_iterators.py +650 -0
- cuda/cccl/parallel/experimental/numba_utils.py +6 -0
- cuda/cccl/parallel/experimental/struct.py +150 -0
- cuda/cccl/parallel/experimental/typing.py +27 -0
- cuda/cccl/py.typed +0 -0
- cuda_cccl-0.1.3.1.0.dev1678.dist-info/METADATA +28 -0
- cuda_cccl-0.1.3.1.0.dev1678.dist-info/RECORD +1860 -0
- cuda_cccl-0.1.3.1.0.dev1678.dist-info/WHEEL +6 -0
- cuda_cccl-0.1.3.1.0.dev1678.dist-info/licenses/LICENSE +1 -0
|
@@ -0,0 +1,1346 @@
|
|
|
1
|
+
/******************************************************************************
|
|
2
|
+
* Copyright (c) 2011, Duane Merrill. All rights reserved.
|
|
3
|
+
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
|
|
4
|
+
*
|
|
5
|
+
* Redistribution and use in source and binary forms, with or without
|
|
6
|
+
* modification, are permitted provided that the following conditions are met:
|
|
7
|
+
* * Redistributions of source code must retain the above copyright
|
|
8
|
+
* notice, this list of conditions and the following disclaimer.
|
|
9
|
+
* * Redistributions in binary form must reproduce the above copyright
|
|
10
|
+
* notice, this list of conditions and the following disclaimer in the
|
|
11
|
+
* documentation and/or other materials provided with the distribution.
|
|
12
|
+
* * Neither the name of the NVIDIA CORPORATION nor the
|
|
13
|
+
* names of its contributors may be used to endorse or promote products
|
|
14
|
+
* derived from this software without specific prior written permission.
|
|
15
|
+
*
|
|
16
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
|
17
|
+
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
18
|
+
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
19
|
+
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
|
|
20
|
+
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
|
21
|
+
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
22
|
+
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
|
23
|
+
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
24
|
+
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
25
|
+
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
26
|
+
*
|
|
27
|
+
******************************************************************************/
|
|
28
|
+
|
|
29
|
+
/**
|
|
30
|
+
* \file
|
|
31
|
+
* Callback operator types for supplying BlockScan prefixes
|
|
32
|
+
*/
|
|
33
|
+
|
|
34
|
+
#pragma once
|
|
35
|
+
|
|
36
|
+
#include <cub/config.cuh>
|
|
37
|
+
|
|
38
|
+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
|
|
39
|
+
# pragma GCC system_header
|
|
40
|
+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
|
|
41
|
+
# pragma clang system_header
|
|
42
|
+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
|
|
43
|
+
# pragma system_header
|
|
44
|
+
#endif // no system header
|
|
45
|
+
|
|
46
|
+
#include <cub/detail/strong_load.cuh>
|
|
47
|
+
#include <cub/detail/strong_store.cuh>
|
|
48
|
+
#include <cub/detail/uninitialized_copy.cuh>
|
|
49
|
+
#include <cub/thread/thread_load.cuh>
|
|
50
|
+
#include <cub/thread/thread_store.cuh>
|
|
51
|
+
#include <cub/util_temporary_storage.cuh>
|
|
52
|
+
#include <cub/warp/warp_reduce.cuh>
|
|
53
|
+
|
|
54
|
+
#include <cuda/std/type_traits>
|
|
55
|
+
|
|
56
|
+
#include <nv/target>
|
|
57
|
+
|
|
58
|
+
CUB_NAMESPACE_BEGIN
|
|
59
|
+
|
|
60
|
+
/******************************************************************************
|
|
61
|
+
* Prefix functor type for maintaining a running prefix while scanning a
|
|
62
|
+
* region independent of other thread blocks
|
|
63
|
+
******************************************************************************/
|
|
64
|
+
|
|
65
|
+
/**
|
|
66
|
+
* Stateful callback operator type for supplying BlockScan prefixes.
|
|
67
|
+
* Maintains a running prefix that can be applied to consecutive
|
|
68
|
+
* BlockScan operations.
|
|
69
|
+
*
|
|
70
|
+
* @tparam T
|
|
71
|
+
* BlockScan value type
|
|
72
|
+
*
|
|
73
|
+
* @tparam ScanOpT
|
|
74
|
+
* Wrapped scan operator type
|
|
75
|
+
*/
|
|
76
|
+
template <typename T, typename ScanOpT>
|
|
77
|
+
struct BlockScanRunningPrefixOp
|
|
78
|
+
{
|
|
79
|
+
/// Wrapped scan operator
|
|
80
|
+
ScanOpT op;
|
|
81
|
+
|
|
82
|
+
/// Running block-wide prefix
|
|
83
|
+
T running_total;
|
|
84
|
+
|
|
85
|
+
/// Constructor
|
|
86
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE BlockScanRunningPrefixOp(ScanOpT op)
|
|
87
|
+
: op(op)
|
|
88
|
+
{}
|
|
89
|
+
|
|
90
|
+
/// Constructor
|
|
91
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE BlockScanRunningPrefixOp(T starting_prefix, ScanOpT op)
|
|
92
|
+
: op(op)
|
|
93
|
+
, running_total(starting_prefix)
|
|
94
|
+
{}
|
|
95
|
+
|
|
96
|
+
/**
|
|
97
|
+
* Prefix callback operator. Returns the block-wide running_total in thread-0.
|
|
98
|
+
*
|
|
99
|
+
* @param block_aggregate
|
|
100
|
+
* The aggregate sum of the BlockScan inputs
|
|
101
|
+
*/
|
|
102
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE T operator()(const T& block_aggregate)
|
|
103
|
+
{
|
|
104
|
+
T retval = running_total;
|
|
105
|
+
running_total = op(running_total, block_aggregate);
|
|
106
|
+
return retval;
|
|
107
|
+
}
|
|
108
|
+
};
|
|
109
|
+
|
|
110
|
+
/******************************************************************************
|
|
111
|
+
* Generic tile status interface types for block-cooperative scans
|
|
112
|
+
******************************************************************************/
|
|
113
|
+
|
|
114
|
+
/**
|
|
115
|
+
* Enumerations of tile status
|
|
116
|
+
*/
|
|
117
|
+
enum ScanTileStatus
{
  SCAN_TILE_OOB       = 0,   // Out-of-bounds slot (e.g., padding)
  SCAN_TILE_INVALID   = 99,  // Tile has not been processed yet
  SCAN_TILE_PARTIAL   = 100, // Tile aggregate is available
  SCAN_TILE_INCLUSIVE = 101, // Inclusive tile prefix is available
};
|
|
124
|
+
|
|
125
|
+
/**
|
|
126
|
+
* Enum class used for specifying the memory order that shall be enforced while reading and writing the tile status.
|
|
127
|
+
*/
|
|
128
|
+
enum class MemoryOrder
{
  // Tile status is read with relaxed loads and updated with relaxed stores
  relaxed = 0,
  // Tile status is read with load-acquire and updated with store-release
  acquire_release = 1
};
|
|
135
|
+
|
|
136
|
+
namespace detail
|
|
137
|
+
{
|
|
138
|
+
template <int Delay, unsigned int GridThreshold = 500>
|
|
139
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void delay()
|
|
140
|
+
{
|
|
141
|
+
NV_IF_TARGET(NV_PROVIDES_SM_70, (if (Delay > 0) {
|
|
142
|
+
if (gridDim.x < GridThreshold)
|
|
143
|
+
{
|
|
144
|
+
__threadfence_block();
|
|
145
|
+
}
|
|
146
|
+
else
|
|
147
|
+
{
|
|
148
|
+
__nanosleep(Delay);
|
|
149
|
+
}
|
|
150
|
+
}));
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
template <unsigned int GridThreshold = 500>
|
|
154
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void delay(int ns)
|
|
155
|
+
{
|
|
156
|
+
NV_IF_TARGET(NV_PROVIDES_SM_70, (if (ns > 0) {
|
|
157
|
+
if (gridDim.x < GridThreshold)
|
|
158
|
+
{
|
|
159
|
+
__threadfence_block();
|
|
160
|
+
}
|
|
161
|
+
else
|
|
162
|
+
{
|
|
163
|
+
__nanosleep(ns);
|
|
164
|
+
}
|
|
165
|
+
}));
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
template <int Delay>
|
|
169
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void always_delay()
|
|
170
|
+
{
|
|
171
|
+
NV_IF_TARGET(NV_PROVIDES_SM_70, (__nanosleep(Delay);));
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
/// Unconditionally calls `__nanosleep(ns)` on targets providing SM70; `ns` is unused elsewhere.
_CCCL_DEVICE _CCCL_FORCEINLINE void always_delay([[maybe_unused]] int ns)
{
  NV_IF_TARGET(
    NV_PROVIDES_SM_70,
    (__nanosleep(ns);));
}
|
|
178
|
+
|
|
179
|
+
template <unsigned int Delay = 350, unsigned int GridThreshold = 500>
|
|
180
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void delay_or_prevent_hoisting()
|
|
181
|
+
{
|
|
182
|
+
NV_IF_TARGET(NV_PROVIDES_SM_70, (delay<Delay, GridThreshold>();), (__threadfence_block();));
|
|
183
|
+
}
|
|
184
|
+
|
|
185
|
+
template <unsigned int GridThreshold = 500>
|
|
186
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void delay_or_prevent_hoisting([[maybe_unused]] int ns)
|
|
187
|
+
{
|
|
188
|
+
NV_IF_TARGET(NV_PROVIDES_SM_70, (delay<GridThreshold>(ns);), (__threadfence_block();));
|
|
189
|
+
}
|
|
190
|
+
|
|
191
|
+
template <unsigned int Delay = 350>
|
|
192
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void always_delay_or_prevent_hoisting()
|
|
193
|
+
{
|
|
194
|
+
NV_IF_TARGET(NV_PROVIDES_SM_70, (always_delay(Delay);), (__threadfence_block();));
|
|
195
|
+
}
|
|
196
|
+
|
|
197
|
+
/**
 * On targets providing SM70 calls `always_delay(ns)`;
 * on every other target issues `__threadfence_block()` and ignores `ns`.
 */
_CCCL_DEVICE _CCCL_FORCEINLINE void always_delay_or_prevent_hoisting([[maybe_unused]] int ns)
{
  NV_IF_TARGET(
    NV_PROVIDES_SM_70,
    (always_delay(ns);),
    (__threadfence_block();));
}
|
|
201
|
+
|
|
202
|
+
template <unsigned int L2WriteLatency>
|
|
203
|
+
struct no_delay_constructor_t
|
|
204
|
+
{
|
|
205
|
+
struct delay_t
|
|
206
|
+
{
|
|
207
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void operator()()
|
|
208
|
+
{
|
|
209
|
+
NV_IF_TARGET(NV_PROVIDES_SM_70, (), (__threadfence_block();));
|
|
210
|
+
}
|
|
211
|
+
};
|
|
212
|
+
|
|
213
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE no_delay_constructor_t(unsigned int /* seed */)
|
|
214
|
+
{
|
|
215
|
+
delay<L2WriteLatency>();
|
|
216
|
+
}
|
|
217
|
+
|
|
218
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE delay_t operator()()
|
|
219
|
+
{
|
|
220
|
+
return {};
|
|
221
|
+
}
|
|
222
|
+
};
|
|
223
|
+
|
|
224
|
+
template <unsigned int Delay, unsigned int L2WriteLatency, unsigned int GridThreshold = 500>
|
|
225
|
+
struct reduce_by_key_delay_constructor_t
|
|
226
|
+
{
|
|
227
|
+
struct delay_t
|
|
228
|
+
{
|
|
229
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void operator()()
|
|
230
|
+
{
|
|
231
|
+
NV_DISPATCH_TARGET(
|
|
232
|
+
NV_IS_EXACTLY_SM_80,
|
|
233
|
+
(delay<Delay, GridThreshold>();),
|
|
234
|
+
NV_PROVIDES_SM_70,
|
|
235
|
+
(delay<0, GridThreshold>();),
|
|
236
|
+
NV_IS_DEVICE,
|
|
237
|
+
(__threadfence_block();));
|
|
238
|
+
}
|
|
239
|
+
};
|
|
240
|
+
|
|
241
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE reduce_by_key_delay_constructor_t(unsigned int /* seed */)
|
|
242
|
+
{
|
|
243
|
+
delay<L2WriteLatency>();
|
|
244
|
+
}
|
|
245
|
+
|
|
246
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE delay_t operator()()
|
|
247
|
+
{
|
|
248
|
+
return {};
|
|
249
|
+
}
|
|
250
|
+
};
|
|
251
|
+
|
|
252
|
+
template <unsigned int Delay, unsigned int L2WriteLatency>
|
|
253
|
+
struct fixed_delay_constructor_t
|
|
254
|
+
{
|
|
255
|
+
struct delay_t
|
|
256
|
+
{
|
|
257
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void operator()()
|
|
258
|
+
{
|
|
259
|
+
delay_or_prevent_hoisting<Delay>();
|
|
260
|
+
}
|
|
261
|
+
};
|
|
262
|
+
|
|
263
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE fixed_delay_constructor_t(unsigned int /* seed */)
|
|
264
|
+
{
|
|
265
|
+
delay<L2WriteLatency>();
|
|
266
|
+
}
|
|
267
|
+
|
|
268
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE delay_t operator()()
|
|
269
|
+
{
|
|
270
|
+
return {};
|
|
271
|
+
}
|
|
272
|
+
};
|
|
273
|
+
|
|
274
|
+
template <unsigned int InitialDelay, unsigned int L2WriteLatency>
|
|
275
|
+
struct exponential_backoff_constructor_t
|
|
276
|
+
{
|
|
277
|
+
struct delay_t
|
|
278
|
+
{
|
|
279
|
+
int delay;
|
|
280
|
+
|
|
281
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void operator()()
|
|
282
|
+
{
|
|
283
|
+
always_delay_or_prevent_hoisting(delay);
|
|
284
|
+
delay <<= 1;
|
|
285
|
+
}
|
|
286
|
+
};
|
|
287
|
+
|
|
288
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE exponential_backoff_constructor_t(unsigned int /* seed */)
|
|
289
|
+
{
|
|
290
|
+
always_delay<L2WriteLatency>();
|
|
291
|
+
}
|
|
292
|
+
|
|
293
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE delay_t operator()()
|
|
294
|
+
{
|
|
295
|
+
return {InitialDelay};
|
|
296
|
+
}
|
|
297
|
+
};
|
|
298
|
+
|
|
299
|
+
template <unsigned int InitialDelay, unsigned int L2WriteLatency>
|
|
300
|
+
struct exponential_backoff_jitter_constructor_t
|
|
301
|
+
{
|
|
302
|
+
struct delay_t
|
|
303
|
+
{
|
|
304
|
+
static constexpr unsigned int a = 16807;
|
|
305
|
+
static constexpr unsigned int c = 0;
|
|
306
|
+
static constexpr unsigned int m = 1u << 31;
|
|
307
|
+
|
|
308
|
+
unsigned int max_delay;
|
|
309
|
+
unsigned int& seed;
|
|
310
|
+
|
|
311
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE unsigned int next(unsigned int min, unsigned int max)
|
|
312
|
+
{
|
|
313
|
+
return (seed = (a * seed + c) % m) % (max + 1 - min) + min;
|
|
314
|
+
}
|
|
315
|
+
|
|
316
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void operator()()
|
|
317
|
+
{
|
|
318
|
+
always_delay_or_prevent_hoisting(next(0, max_delay));
|
|
319
|
+
max_delay <<= 1;
|
|
320
|
+
}
|
|
321
|
+
};
|
|
322
|
+
|
|
323
|
+
unsigned int seed;
|
|
324
|
+
|
|
325
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE exponential_backoff_jitter_constructor_t(unsigned int seed)
|
|
326
|
+
: seed(seed)
|
|
327
|
+
{
|
|
328
|
+
always_delay<L2WriteLatency>();
|
|
329
|
+
}
|
|
330
|
+
|
|
331
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE delay_t operator()()
|
|
332
|
+
{
|
|
333
|
+
return {InitialDelay, seed};
|
|
334
|
+
}
|
|
335
|
+
};
|
|
336
|
+
|
|
337
|
+
template <unsigned int InitialDelay, unsigned int L2WriteLatency>
|
|
338
|
+
struct exponential_backoff_jitter_window_constructor_t
|
|
339
|
+
{
|
|
340
|
+
struct delay_t
|
|
341
|
+
{
|
|
342
|
+
static constexpr unsigned int a = 16807;
|
|
343
|
+
static constexpr unsigned int c = 0;
|
|
344
|
+
static constexpr unsigned int m = 1u << 31;
|
|
345
|
+
|
|
346
|
+
unsigned int max_delay;
|
|
347
|
+
unsigned int& seed;
|
|
348
|
+
|
|
349
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE unsigned int next(unsigned int min, unsigned int max)
|
|
350
|
+
{
|
|
351
|
+
return (seed = (a * seed + c) % m) % (max + 1 - min) + min;
|
|
352
|
+
}
|
|
353
|
+
|
|
354
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void operator()()
|
|
355
|
+
{
|
|
356
|
+
unsigned int next_max_delay = max_delay << 1;
|
|
357
|
+
always_delay_or_prevent_hoisting(next(max_delay, next_max_delay));
|
|
358
|
+
max_delay = next_max_delay;
|
|
359
|
+
}
|
|
360
|
+
};
|
|
361
|
+
|
|
362
|
+
unsigned int seed;
|
|
363
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE exponential_backoff_jitter_window_constructor_t(unsigned int seed)
|
|
364
|
+
: seed(seed)
|
|
365
|
+
{
|
|
366
|
+
always_delay<L2WriteLatency>();
|
|
367
|
+
}
|
|
368
|
+
|
|
369
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE delay_t operator()()
|
|
370
|
+
{
|
|
371
|
+
return {InitialDelay, seed};
|
|
372
|
+
}
|
|
373
|
+
};
|
|
374
|
+
|
|
375
|
+
template <unsigned int InitialDelay, unsigned int L2WriteLatency>
|
|
376
|
+
struct exponential_backon_jitter_window_constructor_t
|
|
377
|
+
{
|
|
378
|
+
struct delay_t
|
|
379
|
+
{
|
|
380
|
+
static constexpr unsigned int a = 16807;
|
|
381
|
+
static constexpr unsigned int c = 0;
|
|
382
|
+
static constexpr unsigned int m = 1u << 31;
|
|
383
|
+
|
|
384
|
+
unsigned int max_delay;
|
|
385
|
+
unsigned int& seed;
|
|
386
|
+
|
|
387
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE unsigned int next(unsigned int min, unsigned int max)
|
|
388
|
+
{
|
|
389
|
+
return (seed = (a * seed + c) % m) % (max + 1 - min) + min;
|
|
390
|
+
}
|
|
391
|
+
|
|
392
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void operator()()
|
|
393
|
+
{
|
|
394
|
+
int prev_delay = max_delay >> 1;
|
|
395
|
+
always_delay_or_prevent_hoisting(next(prev_delay, max_delay));
|
|
396
|
+
max_delay = prev_delay;
|
|
397
|
+
}
|
|
398
|
+
};
|
|
399
|
+
|
|
400
|
+
unsigned int seed;
|
|
401
|
+
unsigned int max_delay = InitialDelay;
|
|
402
|
+
|
|
403
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE exponential_backon_jitter_window_constructor_t(unsigned int seed)
|
|
404
|
+
: seed(seed)
|
|
405
|
+
{
|
|
406
|
+
always_delay<L2WriteLatency>();
|
|
407
|
+
}
|
|
408
|
+
|
|
409
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE delay_t operator()()
|
|
410
|
+
{
|
|
411
|
+
max_delay >>= 1;
|
|
412
|
+
return {max_delay, seed};
|
|
413
|
+
}
|
|
414
|
+
};
|
|
415
|
+
|
|
416
|
+
template <unsigned int InitialDelay, unsigned int L2WriteLatency>
|
|
417
|
+
struct exponential_backon_jitter_constructor_t
|
|
418
|
+
{
|
|
419
|
+
struct delay_t
|
|
420
|
+
{
|
|
421
|
+
static constexpr unsigned int a = 16807;
|
|
422
|
+
static constexpr unsigned int c = 0;
|
|
423
|
+
static constexpr unsigned int m = 1u << 31;
|
|
424
|
+
|
|
425
|
+
unsigned int max_delay;
|
|
426
|
+
unsigned int& seed;
|
|
427
|
+
|
|
428
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE unsigned int next(unsigned int min, unsigned int max)
|
|
429
|
+
{
|
|
430
|
+
return (seed = (a * seed + c) % m) % (max + 1 - min) + min;
|
|
431
|
+
}
|
|
432
|
+
|
|
433
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void operator()()
|
|
434
|
+
{
|
|
435
|
+
always_delay_or_prevent_hoisting(next(0, max_delay));
|
|
436
|
+
max_delay >>= 1;
|
|
437
|
+
}
|
|
438
|
+
};
|
|
439
|
+
|
|
440
|
+
unsigned int seed;
|
|
441
|
+
unsigned int max_delay = InitialDelay;
|
|
442
|
+
|
|
443
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE exponential_backon_jitter_constructor_t(unsigned int seed)
|
|
444
|
+
: seed(seed)
|
|
445
|
+
{
|
|
446
|
+
always_delay<L2WriteLatency>();
|
|
447
|
+
}
|
|
448
|
+
|
|
449
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE delay_t operator()()
|
|
450
|
+
{
|
|
451
|
+
max_delay >>= 1;
|
|
452
|
+
return {max_delay, seed};
|
|
453
|
+
}
|
|
454
|
+
};
|
|
455
|
+
|
|
456
|
+
template <unsigned int InitialDelay, unsigned int L2WriteLatency>
|
|
457
|
+
struct exponential_backon_constructor_t
|
|
458
|
+
{
|
|
459
|
+
struct delay_t
|
|
460
|
+
{
|
|
461
|
+
unsigned int delay;
|
|
462
|
+
|
|
463
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void operator()()
|
|
464
|
+
{
|
|
465
|
+
always_delay_or_prevent_hoisting(delay);
|
|
466
|
+
delay >>= 1;
|
|
467
|
+
}
|
|
468
|
+
};
|
|
469
|
+
|
|
470
|
+
unsigned int max_delay = InitialDelay;
|
|
471
|
+
|
|
472
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE exponential_backon_constructor_t(unsigned int /* seed */)
|
|
473
|
+
{
|
|
474
|
+
always_delay<L2WriteLatency>();
|
|
475
|
+
}
|
|
476
|
+
|
|
477
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE delay_t operator()()
|
|
478
|
+
{
|
|
479
|
+
max_delay >>= 1;
|
|
480
|
+
return {max_delay};
|
|
481
|
+
}
|
|
482
|
+
};
|
|
483
|
+
|
|
484
|
+
using default_no_delay_constructor_t = no_delay_constructor_t<450>;
|
|
485
|
+
using default_no_delay_t = default_no_delay_constructor_t::delay_t;
|
|
486
|
+
|
|
487
|
+
template <class T>
|
|
488
|
+
using default_delay_constructor_t =
|
|
489
|
+
::cuda::std::_If<is_primitive<T>::value, fixed_delay_constructor_t<350, 450>, default_no_delay_constructor_t>;
|
|
490
|
+
|
|
491
|
+
template <class T>
|
|
492
|
+
using default_delay_t = typename default_delay_constructor_t<T>::delay_t;
|
|
493
|
+
|
|
494
|
+
template <class KeyT, class ValueT>
|
|
495
|
+
using default_reduce_by_key_delay_constructor_t =
|
|
496
|
+
::cuda::std::_If<is_primitive<ValueT>::value && (sizeof(ValueT) + sizeof(KeyT) < 16),
|
|
497
|
+
reduce_by_key_delay_constructor_t<350, 450>,
|
|
498
|
+
default_delay_constructor_t<KeyValuePair<KeyT, ValueT>>>;
|
|
499
|
+
|
|
500
|
+
/**
|
|
501
|
+
* @brief Alias template for a ScanTileState specialized for a given value type, `T`, and memory order `Order`.
|
|
502
|
+
*
|
|
503
|
+
* @tparam T The ScanTileState's value type
|
|
504
|
+
* @tparam Order The memory order to be implemented by the ScanTileState
|
|
505
|
+
*/
|
|
506
|
+
template <typename ScanTileStateT, MemoryOrder Order>
|
|
507
|
+
struct tile_state_with_memory_order
|
|
508
|
+
{
|
|
509
|
+
ScanTileStateT& tile_state;
|
|
510
|
+
using T = typename ScanTileStateT::StatusValueT;
|
|
511
|
+
using StatusWord = typename ScanTileStateT::StatusWord;
|
|
512
|
+
|
|
513
|
+
/**
|
|
514
|
+
* Update the specified tile's inclusive value and corresponding status
|
|
515
|
+
*/
|
|
516
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void SetInclusive(int tile_idx, T tile_inclusive)
|
|
517
|
+
{
|
|
518
|
+
tile_state.template SetInclusive<Order>(tile_idx, tile_inclusive);
|
|
519
|
+
}
|
|
520
|
+
|
|
521
|
+
/**
|
|
522
|
+
* Update the specified tile's partial value and corresponding status
|
|
523
|
+
*/
|
|
524
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void SetPartial(int tile_idx, T tile_partial)
|
|
525
|
+
{
|
|
526
|
+
tile_state.template SetPartial<Order>(tile_idx, tile_partial);
|
|
527
|
+
}
|
|
528
|
+
|
|
529
|
+
/**
|
|
530
|
+
* Wait for the corresponding tile to become non-invalid
|
|
531
|
+
*/
|
|
532
|
+
template <class DelayT = detail::default_no_delay_t>
|
|
533
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void WaitForValid(int tile_idx, StatusWord& status, T& value, DelayT delay = {})
|
|
534
|
+
{
|
|
535
|
+
tile_state.template WaitForValid<DelayT, Order>(tile_idx, status, value, delay);
|
|
536
|
+
}
|
|
537
|
+
|
|
538
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE T LoadValid(int tile_idx)
|
|
539
|
+
{
|
|
540
|
+
return tile_state.template LoadValid<Order>(tile_idx);
|
|
541
|
+
}
|
|
542
|
+
};
|
|
543
|
+
|
|
544
|
+
/// Number of tile-state slots needed for `num_tiles` tiles: one per tile plus `warp_threads` extra slots.
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE constexpr int num_tiles_to_num_tile_states(int num_tiles)
{
  return num_tiles + warp_threads;
}
|
|
548
|
+
|
|
549
|
+
/**
 * Computes the number of temporary-storage bytes needed to hold the tile descriptors,
 * partials and inclusives for `num_tiles` tiles.
 *
 * @param bytes_per_description Bytes occupied by one tile status descriptor
 * @param bytes_per_payload     Bytes occupied by one partial / one inclusive value
 * @param num_tiles             Number of tiles (padding slots are added internally)
 *
 * @return Total size reported by `AliasTemporaries` for the three sub-allocations
 */
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE size_t
tile_state_allocation_size(int bytes_per_description, int bytes_per_payload, int num_tiles)
{
  // Widen to size_t BEFORE multiplying: the product of two ints can exceed INT_MAX for
  // large tile counts, which would overflow (undefined behavior) prior to the conversion.
  const size_t num_tile_states  = static_cast<size_t>(num_tiles_to_num_tile_states(num_tiles));
  const size_t descriptor_bytes = num_tile_states * static_cast<size_t>(bytes_per_description);
  const size_t payload_bytes    = num_tile_states * static_cast<size_t>(bytes_per_payload);

  size_t allocation_sizes[]{
    // bytes needed for tile status descriptors
    descriptor_bytes,
    // bytes needed for partials
    payload_bytes,
    // bytes needed for inclusives
    payload_bytes};

  // A null storage pointer makes AliasTemporaries only report the required blob size
  size_t temp_storage_bytes = 0;
  void* allocations[3]      = {};
  AliasTemporaries(nullptr, temp_storage_bytes, allocations, allocation_sizes);

  return temp_storage_bytes;
}
|
|
567
|
+
|
|
568
|
+
/**
 * Carves the caller-provided temporary storage into the three sub-allocations used by a
 * tile state (descriptors, partials, inclusives).
 *
 * @param bytes_per_description Bytes occupied by one tile status descriptor
 * @param bytes_per_payload     Bytes occupied by one partial / one inclusive value
 * @param num_tiles             Number of tiles (padding slots are added internally)
 * @param d_temp_storage        Temporary storage blob to partition
 * @param temp_storage_bytes    Size in bytes of \p d_temp_storage
 * @param allocations           Receives the three sub-allocation pointers
 *
 * @return The status returned by `AliasTemporaries`
 */
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE cudaError_t tile_state_init(
  int bytes_per_description,
  int bytes_per_payload,
  int num_tiles,
  void* d_temp_storage,
  size_t temp_storage_bytes,
  void* (&allocations)[3])
{
  // Widen to size_t BEFORE multiplying: the product of two ints can exceed INT_MAX for
  // large tile counts, which would overflow (undefined behavior) prior to the conversion.
  const size_t num_tile_states  = static_cast<size_t>(num_tiles_to_num_tile_states(num_tiles));
  const size_t descriptor_bytes = num_tile_states * static_cast<size_t>(bytes_per_description);
  const size_t payload_bytes    = num_tile_states * static_cast<size_t>(bytes_per_payload);

  size_t allocation_sizes[]{
    // bytes needed for tile status descriptors
    descriptor_bytes,
    // bytes needed for partials
    payload_bytes,
    // bytes needed for inclusives
    payload_bytes};

  // Partition the blob into the three sub-allocations
  return AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);
}
|
|
588
|
+
|
|
589
|
+
} // namespace detail
|
|
590
|
+
|
|
591
|
+
/**
|
|
592
|
+
* Tile status interface.
|
|
593
|
+
*/
|
|
594
|
+
template <typename T, bool SINGLE_WORD = detail::is_primitive<T>::value>
|
|
595
|
+
struct ScanTileState;
|
|
596
|
+
|
|
597
|
+
/**
|
|
598
|
+
* Tile status interface specialized for scan status and value types
|
|
599
|
+
* that can be combined into one machine word that can be
|
|
600
|
+
* read/written coherently in a single access.
|
|
601
|
+
*/
|
|
602
|
+
template <typename T>
|
|
603
|
+
struct ScanTileState<T, true>
|
|
604
|
+
{
|
|
605
|
+
using StatusValueT = T;
|
|
606
|
+
|
|
607
|
+
// Status word type
|
|
608
|
+
using StatusWord = ::cuda::std::_If<
|
|
609
|
+
sizeof(T) == 8,
|
|
610
|
+
unsigned long long,
|
|
611
|
+
::cuda::std::_If<sizeof(T) == 4, unsigned int, ::cuda::std::_If<sizeof(T) == 2, unsigned short, unsigned char>>>;
|
|
612
|
+
|
|
613
|
+
// Unit word type
|
|
614
|
+
using TxnWord = ::cuda::std::_If<sizeof(T) == 8, ulonglong2, ::cuda::std::_If<sizeof(T) == 4, uint2, unsigned int>>;
|
|
615
|
+
|
|
616
|
+
// Device word type
|
|
617
|
+
struct TileDescriptor
|
|
618
|
+
{
|
|
619
|
+
StatusWord status;
|
|
620
|
+
T value;
|
|
621
|
+
};
|
|
622
|
+
|
|
623
|
+
// Constants
|
|
624
|
+
enum
|
|
625
|
+
{
|
|
626
|
+
TILE_STATUS_PADDING = detail::warp_threads,
|
|
627
|
+
};
|
|
628
|
+
|
|
629
|
+
// Device storage
|
|
630
|
+
TxnWord* d_tile_descriptors;
|
|
631
|
+
|
|
632
|
+
static constexpr size_t description_bytes_per_tile = sizeof(TxnWord);
|
|
633
|
+
static constexpr size_t payload_bytes_per_tile = 0;
|
|
634
|
+
|
|
635
|
+
/// Constructor
|
|
636
|
+
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE ScanTileState()
|
|
637
|
+
: d_tile_descriptors(nullptr)
|
|
638
|
+
{}
|
|
639
|
+
|
|
640
|
+
/**
|
|
641
|
+
* @brief Initializer
|
|
642
|
+
*
|
|
643
|
+
* @param[in] num_tiles
|
|
644
|
+
* Number of tiles
|
|
645
|
+
*
|
|
646
|
+
* @param[in] d_temp_storage
|
|
647
|
+
* Device-accessible allocation of temporary storage.
|
|
648
|
+
* When nullptr, the required allocation size is written to \p temp_storage_bytes and no work is
|
|
649
|
+
* done.
|
|
650
|
+
*
|
|
651
|
+
* @param[in] temp_storage_bytes
|
|
652
|
+
* Size in bytes of \t d_temp_storage allocation
|
|
653
|
+
*/
|
|
654
|
+
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE cudaError_t
|
|
655
|
+
Init(int /*num_tiles*/, void* d_temp_storage, size_t /*temp_storage_bytes*/)
|
|
656
|
+
{
|
|
657
|
+
d_tile_descriptors = reinterpret_cast<TxnWord*>(d_temp_storage);
|
|
658
|
+
return cudaSuccess;
|
|
659
|
+
}
|
|
660
|
+
|
|
661
|
+
/**
|
|
662
|
+
* @brief Compute device memory needed for tile status
|
|
663
|
+
*
|
|
664
|
+
* @param[in] num_tiles
|
|
665
|
+
* Number of tiles
|
|
666
|
+
*
|
|
667
|
+
* @param[out] temp_storage_bytes
|
|
668
|
+
* Size in bytes of \t d_temp_storage allocation
|
|
669
|
+
*/
|
|
670
|
+
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE static constexpr cudaError_t
|
|
671
|
+
AllocationSize(int num_tiles, size_t& temp_storage_bytes)
|
|
672
|
+
{
|
|
673
|
+
temp_storage_bytes =
|
|
674
|
+
detail::tile_state_allocation_size(description_bytes_per_tile, payload_bytes_per_tile, num_tiles);
|
|
675
|
+
return cudaSuccess;
|
|
676
|
+
}
|
|
677
|
+
|
|
678
|
+
/**
|
|
679
|
+
* Initialize (from device)
|
|
680
|
+
*/
|
|
681
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void InitializeStatus(int num_tiles)
|
|
682
|
+
{
|
|
683
|
+
int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
|
|
684
|
+
|
|
685
|
+
TxnWord val = TxnWord();
|
|
686
|
+
TileDescriptor* descriptor = reinterpret_cast<TileDescriptor*>(&val);
|
|
687
|
+
|
|
688
|
+
if (tile_idx < num_tiles)
|
|
689
|
+
{
|
|
690
|
+
// Not-yet-set
|
|
691
|
+
descriptor->status = StatusWord(SCAN_TILE_INVALID);
|
|
692
|
+
d_tile_descriptors[TILE_STATUS_PADDING + tile_idx] = val;
|
|
693
|
+
}
|
|
694
|
+
|
|
695
|
+
if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING))
|
|
696
|
+
{
|
|
697
|
+
// Padding
|
|
698
|
+
descriptor->status = StatusWord(SCAN_TILE_OOB);
|
|
699
|
+
d_tile_descriptors[threadIdx.x] = val;
|
|
700
|
+
}
|
|
701
|
+
}
|
|
702
|
+
|
|
703
|
+
private:
|
|
704
|
+
template <MemoryOrder Order>
|
|
705
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE ::cuda::std::enable_if_t<(Order == MemoryOrder::relaxed), void>
|
|
706
|
+
StoreStatus(TxnWord* ptr, TxnWord alias)
|
|
707
|
+
{
|
|
708
|
+
detail::store_relaxed(ptr, alias);
|
|
709
|
+
}
|
|
710
|
+
|
|
711
|
+
template <MemoryOrder Order>
|
|
712
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE ::cuda::std::enable_if_t<(Order == MemoryOrder::acquire_release), void>
|
|
713
|
+
StoreStatus(TxnWord* ptr, TxnWord alias)
|
|
714
|
+
{
|
|
715
|
+
detail::store_release(ptr, alias);
|
|
716
|
+
}
|
|
717
|
+
|
|
718
|
+
template <MemoryOrder Order>
|
|
719
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE ::cuda::std::enable_if_t<(Order == MemoryOrder::relaxed), TxnWord>
|
|
720
|
+
LoadStatus(TxnWord* ptr)
|
|
721
|
+
{
|
|
722
|
+
return detail::load_relaxed(ptr);
|
|
723
|
+
}
|
|
724
|
+
|
|
725
|
+
template <MemoryOrder Order>
|
|
726
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE ::cuda::std::enable_if_t<(Order == MemoryOrder::acquire_release), TxnWord>
|
|
727
|
+
LoadStatus(TxnWord* ptr)
|
|
728
|
+
{
|
|
729
|
+
// For pre-volta we hoist the memory barrier to outside the loop, i.e., after reading a valid state
|
|
730
|
+
NV_IF_TARGET(NV_PROVIDES_SM_70, (return detail::load_acquire(ptr);), (return detail::load_relaxed(ptr);));
|
|
731
|
+
}
|
|
732
|
+
|
|
733
|
+
template <MemoryOrder Order>
|
|
734
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE ::cuda::std::enable_if_t<(Order == MemoryOrder::relaxed), void>
|
|
735
|
+
ThreadfenceForLoadAcqPreVolta()
|
|
736
|
+
{}
|
|
737
|
+
|
|
738
|
+
template <MemoryOrder Order>
|
|
739
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE ::cuda::std::enable_if_t<(Order == MemoryOrder::acquire_release), void>
|
|
740
|
+
ThreadfenceForLoadAcqPreVolta()
|
|
741
|
+
{
|
|
742
|
+
NV_IF_TARGET(NV_PROVIDES_SM_70, (), (__threadfence();));
|
|
743
|
+
}
|
|
744
|
+
|
|
745
|
+
public:
|
|
746
|
+
template <MemoryOrder Order = MemoryOrder::relaxed>
|
|
747
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void SetInclusive(int tile_idx, T tile_inclusive)
|
|
748
|
+
{
|
|
749
|
+
TileDescriptor tile_descriptor;
|
|
750
|
+
tile_descriptor.status = SCAN_TILE_INCLUSIVE;
|
|
751
|
+
tile_descriptor.value = tile_inclusive;
|
|
752
|
+
|
|
753
|
+
TxnWord alias;
|
|
754
|
+
*reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
|
|
755
|
+
|
|
756
|
+
StoreStatus<Order>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
|
|
757
|
+
}
|
|
758
|
+
|
|
759
|
+
template <MemoryOrder Order = MemoryOrder::relaxed>
|
|
760
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void SetPartial(int tile_idx, T tile_partial)
|
|
761
|
+
{
|
|
762
|
+
TileDescriptor tile_descriptor;
|
|
763
|
+
tile_descriptor.status = SCAN_TILE_PARTIAL;
|
|
764
|
+
tile_descriptor.value = tile_partial;
|
|
765
|
+
|
|
766
|
+
TxnWord alias;
|
|
767
|
+
*reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
|
|
768
|
+
|
|
769
|
+
StoreStatus<Order>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
|
|
770
|
+
}
|
|
771
|
+
|
|
772
|
+
/**
|
|
773
|
+
* Wait for the corresponding tile to become non-invalid
|
|
774
|
+
*/
|
|
775
|
+
template <class DelayT = detail::default_delay_t<T>, MemoryOrder Order = MemoryOrder::relaxed>
|
|
776
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
777
|
+
WaitForValid(int tile_idx, StatusWord& status, T& value, DelayT delay_or_prevent_hoisting = {})
|
|
778
|
+
{
|
|
779
|
+
TileDescriptor tile_descriptor;
|
|
780
|
+
|
|
781
|
+
{
|
|
782
|
+
TxnWord alias = LoadStatus<Order>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx);
|
|
783
|
+
tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
|
|
784
|
+
}
|
|
785
|
+
|
|
786
|
+
while (__any_sync(0xffffffff, (tile_descriptor.status == SCAN_TILE_INVALID)))
|
|
787
|
+
{
|
|
788
|
+
delay_or_prevent_hoisting();
|
|
789
|
+
TxnWord alias = LoadStatus<Order>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx);
|
|
790
|
+
tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
|
|
791
|
+
}
|
|
792
|
+
|
|
793
|
+
// For pre-Volta and load acquire we emit relaxed loads in LoadStatus and hoist the threadfence here
|
|
794
|
+
ThreadfenceForLoadAcqPreVolta<Order>();
|
|
795
|
+
|
|
796
|
+
status = tile_descriptor.status;
|
|
797
|
+
value = tile_descriptor.value;
|
|
798
|
+
}
|
|
799
|
+
|
|
800
|
+
/**
|
|
801
|
+
* Loads and returns the tile's value. The returned value is undefined if either (a) the tile's status is invalid or
|
|
802
|
+
* (b) there is no memory fence between reading a non-invalid status and the call to LoadValid.
|
|
803
|
+
*/
|
|
804
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE T LoadValid(int tile_idx)
|
|
805
|
+
{
|
|
806
|
+
TxnWord alias = d_tile_descriptors[TILE_STATUS_PADDING + tile_idx];
|
|
807
|
+
TileDescriptor tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
|
|
808
|
+
return tile_descriptor.value;
|
|
809
|
+
}
|
|
810
|
+
};
|
|
811
|
+
|
|
812
|
+
/**
|
|
813
|
+
* Tile status interface specialized for scan status and value types that
|
|
814
|
+
* cannot be combined into one machine word.
|
|
815
|
+
*/
|
|
816
|
+
template <typename T>
|
|
817
|
+
struct ScanTileState<T, false>
|
|
818
|
+
{
|
|
819
|
+
using StatusValueT = T;
|
|
820
|
+
|
|
821
|
+
// Status word type
|
|
822
|
+
using StatusWord = unsigned int;
|
|
823
|
+
|
|
824
|
+
// Constants
|
|
825
|
+
enum
|
|
826
|
+
{
|
|
827
|
+
TILE_STATUS_PADDING = detail::warp_threads,
|
|
828
|
+
};
|
|
829
|
+
|
|
830
|
+
// Device storage
|
|
831
|
+
StatusWord* d_tile_status;
|
|
832
|
+
T* d_tile_partial;
|
|
833
|
+
T* d_tile_inclusive;
|
|
834
|
+
|
|
835
|
+
static constexpr size_t description_bytes_per_tile = sizeof(StatusWord);
|
|
836
|
+
static constexpr size_t payload_bytes_per_tile = sizeof(Uninitialized<T>);
|
|
837
|
+
|
|
838
|
+
/// Constructor
|
|
839
|
+
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE ScanTileState()
|
|
840
|
+
: d_tile_status(nullptr)
|
|
841
|
+
, d_tile_partial(nullptr)
|
|
842
|
+
, d_tile_inclusive(nullptr)
|
|
843
|
+
{}
|
|
844
|
+
|
|
845
|
+
/**
|
|
846
|
+
* @brief Initializer
|
|
847
|
+
*
|
|
848
|
+
* @param[in] num_tiles
|
|
849
|
+
* Number of tiles
|
|
850
|
+
*
|
|
851
|
+
* @param[in] d_temp_storage
|
|
852
|
+
* Device-accessible allocation of temporary storage.
|
|
853
|
+
* When nullptr, the required allocation size is written to \p temp_storage_bytes and no work is
|
|
854
|
+
* done.
|
|
855
|
+
*
|
|
856
|
+
* @param[in] temp_storage_bytes
|
|
857
|
+
* Size in bytes of \p d_temp_storage allocation
|
|
858
|
+
*/
|
|
859
|
+
/// Initializer
|
|
860
|
+
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE cudaError_t Init(int num_tiles, void* d_temp_storage, size_t temp_storage_bytes)
|
|
861
|
+
{
|
|
862
|
+
cudaError_t error = cudaSuccess;
|
|
863
|
+
do
|
|
864
|
+
{
|
|
865
|
+
void* allocations[3] = {};
|
|
866
|
+
error = detail::tile_state_init(
|
|
867
|
+
description_bytes_per_tile, payload_bytes_per_tile, num_tiles, d_temp_storage, temp_storage_bytes, allocations);
|
|
868
|
+
if (cudaSuccess != error)
|
|
869
|
+
{
|
|
870
|
+
break;
|
|
871
|
+
}
|
|
872
|
+
// Alias the offsets
|
|
873
|
+
d_tile_status = reinterpret_cast<StatusWord*>(allocations[0]);
|
|
874
|
+
d_tile_partial = reinterpret_cast<T*>(allocations[1]);
|
|
875
|
+
d_tile_inclusive = reinterpret_cast<T*>(allocations[2]);
|
|
876
|
+
} while (0);
|
|
877
|
+
|
|
878
|
+
return error;
|
|
879
|
+
}
|
|
880
|
+
|
|
881
|
+
/**
|
|
882
|
+
* @brief Compute device memory needed for tile status
|
|
883
|
+
*
|
|
884
|
+
* @param[in] num_tiles
|
|
885
|
+
* Number of tiles
|
|
886
|
+
*
|
|
887
|
+
* @param[out] temp_storage_bytes
|
|
888
|
+
* Size in bytes of \p d_temp_storage allocation
|
|
889
|
+
*/
|
|
890
|
+
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE static constexpr cudaError_t
|
|
891
|
+
AllocationSize(int num_tiles, size_t& temp_storage_bytes)
|
|
892
|
+
{
|
|
893
|
+
temp_storage_bytes =
|
|
894
|
+
detail::tile_state_allocation_size(description_bytes_per_tile, payload_bytes_per_tile, num_tiles);
|
|
895
|
+
return cudaSuccess;
|
|
896
|
+
}
|
|
897
|
+
/**
|
|
898
|
+
* Initialize (from device)
|
|
899
|
+
*/
|
|
900
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void InitializeStatus(int num_tiles)
|
|
901
|
+
{
|
|
902
|
+
int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
|
|
903
|
+
if (tile_idx < num_tiles)
|
|
904
|
+
{
|
|
905
|
+
// Not-yet-set
|
|
906
|
+
d_tile_status[TILE_STATUS_PADDING + tile_idx] = StatusWord(SCAN_TILE_INVALID);
|
|
907
|
+
}
|
|
908
|
+
|
|
909
|
+
if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING))
|
|
910
|
+
{
|
|
911
|
+
// Padding
|
|
912
|
+
d_tile_status[threadIdx.x] = StatusWord(SCAN_TILE_OOB);
|
|
913
|
+
}
|
|
914
|
+
}
|
|
915
|
+
|
|
916
|
+
/**
|
|
917
|
+
* Update the specified tile's inclusive value and corresponding status
|
|
918
|
+
*/
|
|
919
|
+
template <MemoryOrder Order = MemoryOrder::relaxed>
|
|
920
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void SetInclusive(int tile_idx, T tile_inclusive)
|
|
921
|
+
{
|
|
922
|
+
// Update tile inclusive value
|
|
923
|
+
ThreadStore<STORE_CG>(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx, tile_inclusive);
|
|
924
|
+
detail::store_release(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_INCLUSIVE));
|
|
925
|
+
}
|
|
926
|
+
|
|
927
|
+
/**
|
|
928
|
+
* Update the specified tile's partial value and corresponding status
|
|
929
|
+
*/
|
|
930
|
+
template <MemoryOrder Order = MemoryOrder::relaxed>
|
|
931
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void SetPartial(int tile_idx, T tile_partial)
|
|
932
|
+
{
|
|
933
|
+
// Update tile partial value
|
|
934
|
+
ThreadStore<STORE_CG>(d_tile_partial + TILE_STATUS_PADDING + tile_idx, tile_partial);
|
|
935
|
+
detail::store_release(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_PARTIAL));
|
|
936
|
+
}
|
|
937
|
+
|
|
938
|
+
/**
|
|
939
|
+
* Wait for the corresponding tile to become non-invalid
|
|
940
|
+
*/
|
|
941
|
+
template <class DelayT = detail::default_no_delay_t, MemoryOrder Order = MemoryOrder::relaxed>
|
|
942
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void WaitForValid(int tile_idx, StatusWord& status, T& value, DelayT delay = {})
|
|
943
|
+
{
|
|
944
|
+
do
|
|
945
|
+
{
|
|
946
|
+
delay();
|
|
947
|
+
status = detail::load_relaxed(d_tile_status + TILE_STATUS_PADDING + tile_idx);
|
|
948
|
+
__threadfence();
|
|
949
|
+
} while (__any_sync(0xffffffff, (status == SCAN_TILE_INVALID)));
|
|
950
|
+
|
|
951
|
+
if (status == StatusWord(SCAN_TILE_PARTIAL))
|
|
952
|
+
{
|
|
953
|
+
value = ThreadLoad<LOAD_CG>(d_tile_partial + TILE_STATUS_PADDING + tile_idx);
|
|
954
|
+
}
|
|
955
|
+
else if (status == StatusWord(SCAN_TILE_INCLUSIVE))
|
|
956
|
+
{
|
|
957
|
+
value = ThreadLoad<LOAD_CG>(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx);
|
|
958
|
+
}
|
|
959
|
+
}
|
|
960
|
+
|
|
961
|
+
/**
|
|
962
|
+
* Loads and returns the tile's value. The returned value is undefined if either (a) the tile's status is invalid or
|
|
963
|
+
* (b) there is no memory fence between reading a non-invalid status and the call to LoadValid.
|
|
964
|
+
*/
|
|
965
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE T LoadValid(int tile_idx)
|
|
966
|
+
{
|
|
967
|
+
return d_tile_inclusive[TILE_STATUS_PADDING + tile_idx];
|
|
968
|
+
}
|
|
969
|
+
};
|
|
970
|
+
|
|
971
|
+
/******************************************************************************
|
|
972
|
+
* ReduceByKey tile status interface types for block-cooperative scans
|
|
973
|
+
******************************************************************************/
|
|
974
|
+
|
|
975
|
+
/**
|
|
976
|
+
* Tile status interface for reduction by key.
|
|
977
|
+
*
|
|
978
|
+
*/
|
|
979
|
+
template <typename ValueT,
|
|
980
|
+
typename KeyT,
|
|
981
|
+
bool SINGLE_WORD = detail::is_primitive<ValueT>::value && (sizeof(ValueT) + sizeof(KeyT) < 16)>
|
|
982
|
+
struct ReduceByKeyScanTileState;
|
|
983
|
+
|
|
984
|
+
/**
|
|
985
|
+
* Tile status interface for reduction by key, specialized for scan status and value types that
|
|
986
|
+
* cannot be combined into one machine word.
|
|
987
|
+
*/
|
|
988
|
+
template <typename ValueT, typename KeyT>
|
|
989
|
+
struct ReduceByKeyScanTileState<ValueT, KeyT, false> : ScanTileState<KeyValuePair<KeyT, ValueT>>
|
|
990
|
+
{
|
|
991
|
+
using SuperClass = ScanTileState<KeyValuePair<KeyT, ValueT>>;
|
|
992
|
+
|
|
993
|
+
/// Constructor
|
|
994
|
+
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE ReduceByKeyScanTileState()
|
|
995
|
+
: SuperClass()
|
|
996
|
+
{}
|
|
997
|
+
};
|
|
998
|
+
|
|
999
|
+
/**
|
|
1000
|
+
* Tile status interface for reduction by key, specialized for scan status and value types that
|
|
1001
|
+
* can be combined into one machine word that can be read/written coherently in a single access.
|
|
1002
|
+
*/
|
|
1003
|
+
template <typename ValueT, typename KeyT>
|
|
1004
|
+
struct ReduceByKeyScanTileState<ValueT, KeyT, true>
|
|
1005
|
+
{
|
|
1006
|
+
using KeyValuePairT = KeyValuePair<KeyT, ValueT>;
|
|
1007
|
+
|
|
1008
|
+
// Constants
|
|
1009
|
+
enum
|
|
1010
|
+
{
|
|
1011
|
+
PAIR_SIZE = static_cast<int>(sizeof(ValueT) + sizeof(KeyT)),
|
|
1012
|
+
TXN_WORD_SIZE = 1 << Log2<PAIR_SIZE + 1>::VALUE,
|
|
1013
|
+
STATUS_WORD_SIZE = TXN_WORD_SIZE - PAIR_SIZE,
|
|
1014
|
+
|
|
1015
|
+
TILE_STATUS_PADDING = detail::warp_threads,
|
|
1016
|
+
};
|
|
1017
|
+
|
|
1018
|
+
// Status word type
|
|
1019
|
+
using StatusWord = ::cuda::std::_If<
|
|
1020
|
+
STATUS_WORD_SIZE == 8,
|
|
1021
|
+
unsigned long long,
|
|
1022
|
+
::cuda::std::
|
|
1023
|
+
_If<STATUS_WORD_SIZE == 4, unsigned int, ::cuda::std::_If<STATUS_WORD_SIZE == 2, unsigned short, unsigned char>>>;
|
|
1024
|
+
|
|
1025
|
+
// Transaction word type
|
|
1026
|
+
using TxnWord = ::cuda::std::
|
|
1027
|
+
_If<TXN_WORD_SIZE == 16, ulonglong2, ::cuda::std::_If<TXN_WORD_SIZE == 8, unsigned long long, unsigned int>>;
|
|
1028
|
+
|
|
1029
|
+
// Device word type (for when sizeof(ValueT) == sizeof(KeyT))
|
|
1030
|
+
struct TileDescriptorBigStatus
|
|
1031
|
+
{
|
|
1032
|
+
KeyT key;
|
|
1033
|
+
ValueT value;
|
|
1034
|
+
StatusWord status;
|
|
1035
|
+
};
|
|
1036
|
+
|
|
1037
|
+
// Device word type (for when sizeof(ValueT) != sizeof(KeyT))
|
|
1038
|
+
struct TileDescriptorLittleStatus
|
|
1039
|
+
{
|
|
1040
|
+
ValueT value;
|
|
1041
|
+
StatusWord status;
|
|
1042
|
+
KeyT key;
|
|
1043
|
+
};
|
|
1044
|
+
|
|
1045
|
+
// Device word type
|
|
1046
|
+
using TileDescriptor =
|
|
1047
|
+
::cuda::std::_If<sizeof(ValueT) == sizeof(KeyT), TileDescriptorBigStatus, TileDescriptorLittleStatus>;
|
|
1048
|
+
|
|
1049
|
+
// Device storage
|
|
1050
|
+
TxnWord* d_tile_descriptors;
|
|
1051
|
+
|
|
1052
|
+
/// Constructor
|
|
1053
|
+
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE ReduceByKeyScanTileState()
|
|
1054
|
+
: d_tile_descriptors(nullptr)
|
|
1055
|
+
{}
|
|
1056
|
+
|
|
1057
|
+
/**
|
|
1058
|
+
* @brief Initializer
|
|
1059
|
+
*
|
|
1060
|
+
* @param[in] num_tiles
|
|
1061
|
+
* Number of tiles
|
|
1062
|
+
*
|
|
1063
|
+
* @param[in] d_temp_storage
|
|
1064
|
+
* Device-accessible allocation of temporary storage. When nullptr, the required allocation size
|
|
1065
|
+
* is written to \p temp_storage_bytes and no work is done.
|
|
1066
|
+
*
|
|
1067
|
+
* @param[in] temp_storage_bytes
|
|
1068
|
+
* Size in bytes of \p d_temp_storage allocation
|
|
1069
|
+
*/
|
|
1070
|
+
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE cudaError_t
|
|
1071
|
+
Init(int /*num_tiles*/, void* d_temp_storage, size_t /*temp_storage_bytes*/)
|
|
1072
|
+
{
|
|
1073
|
+
d_tile_descriptors = reinterpret_cast<TxnWord*>(d_temp_storage);
|
|
1074
|
+
return cudaSuccess;
|
|
1075
|
+
}
|
|
1076
|
+
|
|
1077
|
+
/**
|
|
1078
|
+
* @brief Compute device memory needed for tile status
|
|
1079
|
+
*
|
|
1080
|
+
* @param[in] num_tiles
|
|
1081
|
+
* Number of tiles
|
|
1082
|
+
*
|
|
1083
|
+
* @param[out] temp_storage_bytes
|
|
1084
|
+
* Size in bytes of \p d_temp_storage allocation
|
|
1085
|
+
*/
|
|
1086
|
+
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE static cudaError_t AllocationSize(int num_tiles, size_t& temp_storage_bytes)
|
|
1087
|
+
{
|
|
1088
|
+
// bytes needed for tile status descriptors
|
|
1089
|
+
temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TxnWord);
|
|
1090
|
+
return cudaSuccess;
|
|
1091
|
+
}
|
|
1092
|
+
|
|
1093
|
+
/**
|
|
1094
|
+
* Initialize (from device)
|
|
1095
|
+
*/
|
|
1096
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void InitializeStatus(int num_tiles)
|
|
1097
|
+
{
|
|
1098
|
+
int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
|
|
1099
|
+
TxnWord val = TxnWord();
|
|
1100
|
+
TileDescriptor* descriptor = reinterpret_cast<TileDescriptor*>(&val);
|
|
1101
|
+
|
|
1102
|
+
if (tile_idx < num_tiles)
|
|
1103
|
+
{
|
|
1104
|
+
// Not-yet-set
|
|
1105
|
+
descriptor->status = StatusWord(SCAN_TILE_INVALID);
|
|
1106
|
+
d_tile_descriptors[TILE_STATUS_PADDING + tile_idx] = val;
|
|
1107
|
+
}
|
|
1108
|
+
|
|
1109
|
+
if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING))
|
|
1110
|
+
{
|
|
1111
|
+
// Padding
|
|
1112
|
+
descriptor->status = StatusWord(SCAN_TILE_OOB);
|
|
1113
|
+
d_tile_descriptors[threadIdx.x] = val;
|
|
1114
|
+
}
|
|
1115
|
+
}
|
|
1116
|
+
|
|
1117
|
+
/**
|
|
1118
|
+
* Update the specified tile's inclusive value and corresponding status
|
|
1119
|
+
*/
|
|
1120
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void SetInclusive(int tile_idx, KeyValuePairT tile_inclusive)
|
|
1121
|
+
{
|
|
1122
|
+
TileDescriptor tile_descriptor;
|
|
1123
|
+
tile_descriptor.status = SCAN_TILE_INCLUSIVE;
|
|
1124
|
+
tile_descriptor.value = tile_inclusive.value;
|
|
1125
|
+
tile_descriptor.key = tile_inclusive.key;
|
|
1126
|
+
|
|
1127
|
+
TxnWord alias;
|
|
1128
|
+
*reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
|
|
1129
|
+
|
|
1130
|
+
detail::store_relaxed(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
|
|
1131
|
+
}
|
|
1132
|
+
|
|
1133
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void SetPartial(int tile_idx, KeyValuePairT tile_partial)
|
|
1134
|
+
{
|
|
1135
|
+
TileDescriptor tile_descriptor;
|
|
1136
|
+
tile_descriptor.status = SCAN_TILE_PARTIAL;
|
|
1137
|
+
tile_descriptor.value = tile_partial.value;
|
|
1138
|
+
tile_descriptor.key = tile_partial.key;
|
|
1139
|
+
|
|
1140
|
+
TxnWord alias;
|
|
1141
|
+
*reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
|
|
1142
|
+
|
|
1143
|
+
detail::store_relaxed(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
|
|
1144
|
+
}
|
|
1145
|
+
|
|
1146
|
+
/**
|
|
1147
|
+
* Wait for the corresponding tile to become non-invalid
|
|
1148
|
+
*/
|
|
1149
|
+
template <class DelayT = detail::fixed_delay_constructor_t<350, 450>::delay_t>
|
|
1150
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
1151
|
+
WaitForValid(int tile_idx, StatusWord& status, KeyValuePairT& value, DelayT delay_or_prevent_hoisting = {})
|
|
1152
|
+
{
|
|
1153
|
+
// TxnWord alias = ThreadLoad<LOAD_CG>(d_tile_descriptors + TILE_STATUS_PADDING +
|
|
1154
|
+
// tile_idx); TileDescriptor tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
|
|
1155
|
+
//
|
|
1156
|
+
// while (tile_descriptor.status == SCAN_TILE_INVALID)
|
|
1157
|
+
// {
|
|
1158
|
+
// __threadfence_block(); // prevent hoisting loads from loop
|
|
1159
|
+
//
|
|
1160
|
+
// alias = ThreadLoad<LOAD_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx);
|
|
1161
|
+
// tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
|
|
1162
|
+
// }
|
|
1163
|
+
//
|
|
1164
|
+
// status = tile_descriptor.status;
|
|
1165
|
+
// value.value = tile_descriptor.value;
|
|
1166
|
+
// value.key = tile_descriptor.key;
|
|
1167
|
+
|
|
1168
|
+
TileDescriptor tile_descriptor;
|
|
1169
|
+
|
|
1170
|
+
do
|
|
1171
|
+
{
|
|
1172
|
+
delay_or_prevent_hoisting();
|
|
1173
|
+
TxnWord alias = detail::load_relaxed(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx);
|
|
1174
|
+
tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
|
|
1175
|
+
|
|
1176
|
+
} while (__any_sync(0xffffffff, (tile_descriptor.status == SCAN_TILE_INVALID)));
|
|
1177
|
+
|
|
1178
|
+
status = tile_descriptor.status;
|
|
1179
|
+
value.value = tile_descriptor.value;
|
|
1180
|
+
value.key = tile_descriptor.key;
|
|
1181
|
+
}
|
|
1182
|
+
};
|
|
1183
|
+
|
|
1184
|
+
/******************************************************************************
|
|
1185
|
+
* Prefix call-back operator for coupling local block scan within a
|
|
1186
|
+
* block-cooperative scan
|
|
1187
|
+
******************************************************************************/
|
|
1188
|
+
|
|
1189
|
+
/**
|
|
1190
|
+
* Stateful block-scan prefix functor. Provides the running prefix for
|
|
1191
|
+
* the current tile by using the call-back warp to wait on
|
|
1192
|
+
* aggregates/prefixes from predecessor tiles to become available.
|
|
1193
|
+
*
|
|
1194
|
+
* @tparam DelayConstructorT
|
|
1195
|
+
* Implementation detail, do not specify directly, requirements on the
|
|
1196
|
+
* content of this type are subject to breaking change.
|
|
1197
|
+
*/
|
|
1198
|
+
template <typename T,
|
|
1199
|
+
typename ScanOpT,
|
|
1200
|
+
typename ScanTileStateT,
|
|
1201
|
+
typename DelayConstructorT = detail::default_delay_constructor_t<T>>
|
|
1202
|
+
struct TilePrefixCallbackOp
|
|
1203
|
+
{
|
|
1204
|
+
// Parameterized warp reduce
|
|
1205
|
+
using WarpReduceT = WarpReduce<T, (1 << (5))>;
|
|
1206
|
+
|
|
1207
|
+
// Temporary storage type
|
|
1208
|
+
struct _TempStorage
|
|
1209
|
+
{
|
|
1210
|
+
typename WarpReduceT::TempStorage warp_reduce;
|
|
1211
|
+
T exclusive_prefix;
|
|
1212
|
+
T inclusive_prefix;
|
|
1213
|
+
T block_aggregate;
|
|
1214
|
+
};
|
|
1215
|
+
|
|
1216
|
+
// Alias wrapper allowing temporary storage to be unioned
|
|
1217
|
+
struct TempStorage : Uninitialized<_TempStorage>
|
|
1218
|
+
{};
|
|
1219
|
+
|
|
1220
|
+
// Type of status word
|
|
1221
|
+
using StatusWord = typename ScanTileStateT::StatusWord;
|
|
1222
|
+
|
|
1223
|
+
// Fields
|
|
1224
|
+
_TempStorage& temp_storage; ///< Reference to a warp-reduction instance
|
|
1225
|
+
ScanTileStateT& tile_status; ///< Interface to tile status
|
|
1226
|
+
ScanOpT scan_op; ///< Binary scan operator
|
|
1227
|
+
int tile_idx; ///< The current tile index
|
|
1228
|
+
T exclusive_prefix; ///< Exclusive prefix for the tile
|
|
1229
|
+
T inclusive_prefix; ///< Inclusive prefix for the tile
|
|
1230
|
+
|
|
1231
|
+
// Constructs prefix functor for a given tile index.
|
|
1232
|
+
// Precondition: thread blocks processing all of the predecessor tiles were scheduled.
|
|
1233
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE
|
|
1234
|
+
TilePrefixCallbackOp(ScanTileStateT& tile_status, TempStorage& temp_storage, ScanOpT scan_op, int tile_idx)
|
|
1235
|
+
: temp_storage(temp_storage.Alias())
|
|
1236
|
+
, tile_status(tile_status)
|
|
1237
|
+
, scan_op(scan_op)
|
|
1238
|
+
, tile_idx(tile_idx)
|
|
1239
|
+
{}
|
|
1240
|
+
|
|
1241
|
+
// Computes the tile index and constructs prefix functor with it.
|
|
1242
|
+
// Precondition: thread block per tile assignment.
|
|
1243
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE
|
|
1244
|
+
TilePrefixCallbackOp(ScanTileStateT& tile_status, TempStorage& temp_storage, ScanOpT scan_op)
|
|
1245
|
+
: TilePrefixCallbackOp(tile_status, temp_storage, scan_op, blockIdx.x)
|
|
1246
|
+
{}
|
|
1247
|
+
|
|
1248
|
+
/**
|
|
1249
|
+
* @brief Block until all predecessors within the warp-wide window have non-invalid status
|
|
1250
|
+
*
|
|
1251
|
+
* @param predecessor_idx
|
|
1252
|
+
* Preceding tile index to inspect
|
|
1253
|
+
*
|
|
1254
|
+
* @param[out] predecessor_status
|
|
1255
|
+
* Preceding tile status
|
|
1256
|
+
*
|
|
1257
|
+
* @param[out] window_aggregate
|
|
1258
|
+
* Relevant partial reduction from this window of preceding tiles
|
|
1259
|
+
*/
|
|
1260
|
+
template <class DelayT = detail::default_delay_t<T>>
|
|
1261
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
1262
|
+
ProcessWindow(int predecessor_idx, StatusWord& predecessor_status, T& window_aggregate, DelayT delay = {})
|
|
1263
|
+
{
|
|
1264
|
+
T value;
|
|
1265
|
+
tile_status.WaitForValid(predecessor_idx, predecessor_status, value, delay);
|
|
1266
|
+
|
|
1267
|
+
// Perform a segmented reduction to get the prefix for the current window.
|
|
1268
|
+
// Use the swizzled scan operator because we are now scanning *down* towards thread0.
|
|
1269
|
+
|
|
1270
|
+
int tail_flag = (predecessor_status == StatusWord(SCAN_TILE_INCLUSIVE));
|
|
1271
|
+
window_aggregate =
|
|
1272
|
+
WarpReduceT(temp_storage.warp_reduce).TailSegmentedReduce(value, tail_flag, SwizzleScanOp<ScanOpT>(scan_op));
|
|
1273
|
+
}
|
|
1274
|
+
|
|
1275
|
+
// BlockScan prefix callback functor (called by the first warp)
|
|
1276
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE T operator()(T block_aggregate)
|
|
1277
|
+
{
|
|
1278
|
+
// Update our status with our tile-aggregate
|
|
1279
|
+
if (threadIdx.x == 0)
|
|
1280
|
+
{
|
|
1281
|
+
detail::uninitialized_copy_single(&temp_storage.block_aggregate, block_aggregate);
|
|
1282
|
+
|
|
1283
|
+
tile_status.SetPartial(tile_idx, block_aggregate);
|
|
1284
|
+
}
|
|
1285
|
+
|
|
1286
|
+
int predecessor_idx = tile_idx - threadIdx.x - 1;
|
|
1287
|
+
StatusWord predecessor_status;
|
|
1288
|
+
T window_aggregate;
|
|
1289
|
+
|
|
1290
|
+
// Wait for the warp-wide window of predecessor tiles to become valid
|
|
1291
|
+
DelayConstructorT construct_delay(tile_idx);
|
|
1292
|
+
ProcessWindow(predecessor_idx, predecessor_status, window_aggregate, construct_delay());
|
|
1293
|
+
|
|
1294
|
+
// The exclusive tile prefix starts out as the current window aggregate
|
|
1295
|
+
exclusive_prefix = window_aggregate;
|
|
1296
|
+
|
|
1297
|
+
// Keep sliding the window back until we come across a tile whose inclusive prefix is known
|
|
1298
|
+
while (__all_sync(0xffffffff, (predecessor_status != StatusWord(SCAN_TILE_INCLUSIVE))))
|
|
1299
|
+
{
|
|
1300
|
+
predecessor_idx -= detail::warp_threads;
|
|
1301
|
+
|
|
1302
|
+
// Update exclusive tile prefix with the window prefix
|
|
1303
|
+
ProcessWindow(predecessor_idx, predecessor_status, window_aggregate, construct_delay());
|
|
1304
|
+
exclusive_prefix = scan_op(window_aggregate, exclusive_prefix);
|
|
1305
|
+
}
|
|
1306
|
+
|
|
1307
|
+
// Compute the inclusive tile prefix and update the status for this tile
|
|
1308
|
+
if (threadIdx.x == 0)
|
|
1309
|
+
{
|
|
1310
|
+
inclusive_prefix = scan_op(exclusive_prefix, block_aggregate);
|
|
1311
|
+
tile_status.SetInclusive(tile_idx, inclusive_prefix);
|
|
1312
|
+
|
|
1313
|
+
detail::uninitialized_copy_single(&temp_storage.exclusive_prefix, exclusive_prefix);
|
|
1314
|
+
|
|
1315
|
+
detail::uninitialized_copy_single(&temp_storage.inclusive_prefix, inclusive_prefix);
|
|
1316
|
+
}
|
|
1317
|
+
|
|
1318
|
+
// Return exclusive_prefix
|
|
1319
|
+
return exclusive_prefix;
|
|
1320
|
+
}
|
|
1321
|
+
|
|
1322
|
+
// Get the exclusive prefix stored in temporary storage
|
|
1323
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE T GetExclusivePrefix()
|
|
1324
|
+
{
|
|
1325
|
+
return temp_storage.exclusive_prefix;
|
|
1326
|
+
}
|
|
1327
|
+
|
|
1328
|
+
// Get the inclusive prefix stored in temporary storage
|
|
1329
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE T GetInclusivePrefix()
|
|
1330
|
+
{
|
|
1331
|
+
return temp_storage.inclusive_prefix;
|
|
1332
|
+
}
|
|
1333
|
+
|
|
1334
|
+
// Get the block aggregate stored in temporary storage
|
|
1335
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE T GetBlockAggregate()
|
|
1336
|
+
{
|
|
1337
|
+
return temp_storage.block_aggregate;
|
|
1338
|
+
}
|
|
1339
|
+
|
|
1340
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE int GetTileIdx() const
|
|
1341
|
+
{
|
|
1342
|
+
return tile_idx;
|
|
1343
|
+
}
|
|
1344
|
+
};
|
|
1345
|
+
|
|
1346
|
+
CUB_NAMESPACE_END
|