cuda-cccl 0.3.3__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/__init__.py +27 -0
- cuda/cccl/_cuda_version_utils.py +24 -0
- cuda/cccl/cooperative/__init__.py +9 -0
- cuda/cccl/cooperative/experimental/__init__.py +24 -0
- cuda/cccl/headers/__init__.py +7 -0
- cuda/cccl/headers/include/__init__.py +1 -0
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +259 -0
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1182 -0
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +81 -0
- cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +709 -0
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +234 -0
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +748 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +786 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +286 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +703 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +555 -0
- cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +619 -0
- cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +806 -0
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +1124 -0
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +589 -0
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +474 -0
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +289 -0
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1117 -0
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +346 -0
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +606 -0
- cuda/cccl/headers/include/cub/agent/agent_topk.cuh +764 -0
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +631 -0
- cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1424 -0
- cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +963 -0
- cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1227 -0
- cuda/cccl/headers/include/cub/block/block_exchange.cuh +1313 -0
- cuda/cccl/headers/include/cub/block/block_histogram.cuh +424 -0
- cuda/cccl/headers/include/cub/block/block_load.cuh +1264 -0
- cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
- cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +800 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1225 -0
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2196 -0
- cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +150 -0
- cuda/cccl/headers/include/cub/block/block_reduce.cuh +667 -0
- cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +434 -0
- cuda/cccl/headers/include/cub/block/block_scan.cuh +2315 -0
- cuda/cccl/headers/include/cub/block/block_shuffle.cuh +346 -0
- cuda/cccl/headers/include/cub/block/block_store.cuh +1247 -0
- cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +624 -0
- cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +86 -0
- cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +240 -0
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +252 -0
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +238 -0
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +281 -0
- cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +790 -0
- cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +538 -0
- cuda/cccl/headers/include/cub/config.cuh +53 -0
- cuda/cccl/headers/include/cub/cub.cuh +120 -0
- cuda/cccl/headers/include/cub/detail/array_utils.cuh +78 -0
- cuda/cccl/headers/include/cub/detail/choose_offset.cuh +161 -0
- cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +74 -0
- cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +96 -0
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +62 -0
- cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +253 -0
- cuda/cccl/headers/include/cub/detail/integer_utils.cuh +88 -0
- cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +142 -0
- cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +100 -0
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +114 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/README.md +71 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/array.h +68 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/json.h +62 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/object.h +100 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/string.h +53 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/value.h +95 -0
- cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +63 -0
- cuda/cccl/headers/include/cub/detail/rfa.cuh +731 -0
- cuda/cccl/headers/include/cub/detail/strong_load.cuh +189 -0
- cuda/cccl/headers/include/cub/detail/strong_store.cuh +220 -0
- cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +384 -0
- cuda/cccl/headers/include/cub/detail/type_traits.cuh +187 -0
- cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +73 -0
- cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +56 -0
- cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +596 -0
- cuda/cccl/headers/include/cub/device/device_copy.cuh +276 -0
- cuda/cccl/headers/include/cub/device/device_for.cuh +1063 -0
- cuda/cccl/headers/include/cub/device/device_histogram.cuh +1509 -0
- cuda/cccl/headers/include/cub/device/device_memcpy.cuh +195 -0
- cuda/cccl/headers/include/cub/device/device_merge.cuh +203 -0
- cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +979 -0
- cuda/cccl/headers/include/cub/device/device_partition.cuh +668 -0
- cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3437 -0
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +2518 -0
- cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +370 -0
- cuda/cccl/headers/include/cub/device/device_scan.cuh +2212 -0
- cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1496 -0
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1430 -0
- cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2811 -0
- cuda/cccl/headers/include/cub/device/device_select.cuh +1228 -0
- cuda/cccl/headers/include/cub/device/device_topk.cuh +511 -0
- cuda/cccl/headers/include/cub/device/device_transform.cuh +668 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +315 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +719 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +43 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +198 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1046 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +303 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +473 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1744 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +1310 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +655 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +531 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +313 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +615 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +517 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +602 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +975 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +842 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +341 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +440 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +389 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +627 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +569 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +545 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +261 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/histogram.cuh +505 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/merge_sort.cuh +334 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/radix_sort.cuh +803 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +583 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +189 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +321 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +522 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/three_way_partition.cuh +201 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +1028 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/unique_by_key.cuh +176 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +67 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +118 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +60 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +275 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +76 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +126 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1065 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +493 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +942 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +673 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +618 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +1010 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +398 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1588 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +440 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +85 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +481 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +884 -0
- cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +227 -0
- cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +106 -0
- cuda/cccl/headers/include/cub/grid/grid_queue.cuh +202 -0
- cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +254 -0
- cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +259 -0
- cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +250 -0
- cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +320 -0
- cuda/cccl/headers/include/cub/thread/thread_load.cuh +349 -0
- cuda/cccl/headers/include/cub/thread/thread_operators.cuh +688 -0
- cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +548 -0
- cuda/cccl/headers/include/cub/thread/thread_scan.cuh +498 -0
- cuda/cccl/headers/include/cub/thread/thread_search.cuh +199 -0
- cuda/cccl/headers/include/cub/thread/thread_simd.cuh +458 -0
- cuda/cccl/headers/include/cub/thread/thread_sort.cuh +102 -0
- cuda/cccl/headers/include/cub/thread/thread_store.cuh +365 -0
- cuda/cccl/headers/include/cub/util_allocator.cuh +921 -0
- cuda/cccl/headers/include/cub/util_arch.cuh +167 -0
- cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +95 -0
- cuda/cccl/headers/include/cub/util_debug.cuh +207 -0
- cuda/cccl/headers/include/cub/util_device.cuh +800 -0
- cuda/cccl/headers/include/cub/util_macro.cuh +97 -0
- cuda/cccl/headers/include/cub/util_math.cuh +118 -0
- cuda/cccl/headers/include/cub/util_namespace.cuh +176 -0
- cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +55 -0
- cuda/cccl/headers/include/cub/util_ptx.cuh +513 -0
- cuda/cccl/headers/include/cub/util_temporary_storage.cuh +122 -0
- cuda/cccl/headers/include/cub/util_type.cuh +1120 -0
- cuda/cccl/headers/include/cub/util_vsmem.cuh +253 -0
- cuda/cccl/headers/include/cub/version.cuh +89 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +329 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +177 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +737 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +408 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +952 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +715 -0
- cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +405 -0
- cuda/cccl/headers/include/cub/warp/warp_load.cuh +614 -0
- cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +169 -0
- cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +829 -0
- cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1890 -0
- cuda/cccl/headers/include/cub/warp/warp_store.cuh +521 -0
- cuda/cccl/headers/include/cub/warp/warp_utils.cuh +61 -0
- cuda/cccl/headers/include/cuda/__algorithm/common.h +68 -0
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +196 -0
- cuda/cccl/headers/include/cuda/__algorithm/fill.h +107 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +172 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +217 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +83 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +128 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +210 -0
- cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
- cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +102 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +487 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +74 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_native_handle.h +45 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
- cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +171 -0
- cuda/cccl/headers/include/cuda/__bit/bitfield.h +122 -0
- cuda/cccl/headers/include/cuda/__bit/bitmask.h +90 -0
- cuda/cccl/headers/include/cuda/__cccl_config +37 -0
- cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +124 -0
- cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +178 -0
- cuda/cccl/headers/include/cuda/__cmath/ilog.h +195 -0
- cuda/cccl/headers/include/cuda/__cmath/ipow.h +107 -0
- cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
- cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
- cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
- cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
- cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
- cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
- cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
- cuda/cccl/headers/include/cuda/__complex/complex.h +238 -0
- cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +89 -0
- cuda/cccl/headers/include/cuda/__complex/traits.h +64 -0
- cuda/cccl/headers/include/cuda/__complex_ +28 -0
- cuda/cccl/headers/include/cuda/__device/all_devices.h +140 -0
- cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +537 -0
- cuda/cccl/headers/include/cuda/__device/attributes.h +772 -0
- cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
- cuda/cccl/headers/include/cuda/__device/device_ref.h +156 -0
- cuda/cccl/headers/include/cuda/__device/physical_device.h +172 -0
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +835 -0
- cuda/cccl/headers/include/cuda/__event/event.h +171 -0
- cuda/cccl/headers/include/cuda/__event/event_ref.h +157 -0
- cuda/cccl/headers/include/cuda/__event/timed_event.h +120 -0
- cuda/cccl/headers/include/cuda/__execution/determinism.h +91 -0
- cuda/cccl/headers/include/cuda/__execution/output_ordering.h +89 -0
- cuda/cccl/headers/include/cuda/__execution/require.h +75 -0
- cuda/cccl/headers/include/cuda/__execution/tune.h +70 -0
- cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
- cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +321 -0
- cuda/cccl/headers/include/cuda/__functional/maximum.h +58 -0
- cuda/cccl/headers/include/cuda/__functional/minimum.h +58 -0
- cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +108 -0
- cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
- cuda/cccl/headers/include/cuda/__fwd/barrier_native_handle.h +42 -0
- cuda/cccl/headers/include/cuda/__fwd/complex.h +48 -0
- cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
- cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
- cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
- cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +58 -0
- cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +315 -0
- cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +483 -0
- cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +324 -0
- cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +456 -0
- cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +334 -0
- cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +418 -0
- cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +367 -0
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +528 -0
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +527 -0
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +486 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_function.h +112 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +557 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
- cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +533 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +238 -0
- cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +152 -0
- cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +117 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +60 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +148 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +165 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +53 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +104 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +170 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +59 -0
- cuda/cccl/headers/include/cuda/__memory/address_space.h +227 -0
- cuda/cccl/headers/include/cuda/__memory/align_down.h +56 -0
- cuda/cccl/headers/include/cuda/__memory/align_up.h +56 -0
- cuda/cccl/headers/include/cuda/__memory/aligned_size.h +61 -0
- cuda/cccl/headers/include/cuda/__memory/check_address.h +111 -0
- cuda/cccl/headers/include/cuda/__memory/discard_memory.h +64 -0
- cuda/cccl/headers/include/cuda/__memory/get_device_address.h +58 -0
- cuda/cccl/headers/include/cuda/__memory/is_aligned.h +47 -0
- cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
- cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +82 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +153 -0
- cuda/cccl/headers/include/cuda/__memory_resource/properties.h +113 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource.h +125 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +652 -0
- cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +306 -0
- cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
- cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +59 -0
- cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
- cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +120 -0
- cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2983 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +242 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +197 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +54 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +997 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +318 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +671 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1470 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +132 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +64 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +102 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +64 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +5542 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +399 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +184 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +102 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +143 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +144 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +286 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +290 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2202 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1362 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +236 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +460 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1490 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +132 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +99 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +765 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +58 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +4927 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +4291 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +7110 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +42 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +5063 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +56 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +71 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +1030 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/pragmas/enable_smem_spilling.h +47 -0
- cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
- cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +176 -0
- cuda/cccl/headers/include/cuda/__random/feistel_bijection.h +105 -0
- cuda/cccl/headers/include/cuda/__random/random_bijection.h +88 -0
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +99 -0
- cuda/cccl/headers/include/cuda/__runtime/types.h +41 -0
- cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
- cuda/cccl/headers/include/cuda/__stream/get_stream.h +110 -0
- cuda/cccl/headers/include/cuda/__stream/stream.h +141 -0
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +303 -0
- cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
- cuda/cccl/headers/include/cuda/__type_traits/is_specialization_of.h +37 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/access.h +88 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/any_cast.h +83 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_base.h +148 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_from.h +96 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_fwd.h +128 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +304 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ref.h +337 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +590 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/conversions.h +169 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/dynamic_any_cast.h +107 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/interfaces.h +359 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/iset.h +142 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/overrides.h +64 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/rtti.h +257 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +322 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/storage.h +79 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/tagged_ptr.h +58 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +162 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_functions.h +184 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_ptrs.h +80 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +155 -0
- cuda/cccl/headers/include/cuda/__utility/basic_any.h +507 -0
- cuda/cccl/headers/include/cuda/__utility/immovable.h +50 -0
- cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
- cuda/cccl/headers/include/cuda/__utility/inherit.h +36 -0
- cuda/cccl/headers/include/cuda/__utility/no_init.h +29 -0
- cuda/cccl/headers/include/cuda/__utility/static_for.h +79 -0
- cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
- cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
- cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +251 -0
- cuda/cccl/headers/include/cuda/access_property +26 -0
- cuda/cccl/headers/include/cuda/algorithm +27 -0
- cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
- cuda/cccl/headers/include/cuda/atomic +27 -0
- cuda/cccl/headers/include/cuda/barrier +267 -0
- cuda/cccl/headers/include/cuda/bit +29 -0
- cuda/cccl/headers/include/cuda/cmath +37 -0
- cuda/cccl/headers/include/cuda/devices +33 -0
- cuda/cccl/headers/include/cuda/discard_memory +32 -0
- cuda/cccl/headers/include/cuda/functional +32 -0
- cuda/cccl/headers/include/cuda/iterator +39 -0
- cuda/cccl/headers/include/cuda/latch +27 -0
- cuda/cccl/headers/include/cuda/mdspan +28 -0
- cuda/cccl/headers/include/cuda/memory +35 -0
- cuda/cccl/headers/include/cuda/memory_resource +35 -0
- cuda/cccl/headers/include/cuda/numeric +29 -0
- cuda/cccl/headers/include/cuda/pipeline +579 -0
- cuda/cccl/headers/include/cuda/ptx +129 -0
- cuda/cccl/headers/include/cuda/semaphore +31 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +53 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +58 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +142 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/in_fun_result.h +55 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +90 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +88 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +67 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +139 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/move.h +86 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +93 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +100 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each.h +84 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each_n.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +98 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +155 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
- cuda/cccl/headers/include/cuda/std/__algorithm_ +26 -0
- cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
- cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +136 -0
- cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +208 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +401 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +3971 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +177 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +211 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
- cuda/cccl/headers/include/cuda/std/__atomic/order.h +159 -0
- cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
- cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
- cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +249 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +104 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +72 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +228 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types.h +52 -0
- cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
- cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
- cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
- cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
- cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
- cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +76 -0
- cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +185 -0
- cuda/cccl/headers/include/cuda/std/__bit/countl.h +174 -0
- cuda/cccl/headers/include/cuda/std/__bit/countr.h +185 -0
- cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
- cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
- cuda/cccl/headers/include/cuda/std/__bit/integral.h +126 -0
- cuda/cccl/headers/include/cuda/std/__bit/popcount.h +154 -0
- cuda/cccl/headers/include/cuda/std/__bit/reference.h +1272 -0
- cuda/cccl/headers/include/cuda/std/__bit/rotate.h +94 -0
- cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
- cuda/cccl/headers/include/cuda/std/__cccl/assert.h +161 -0
- cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +206 -0
- cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +673 -0
- cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +217 -0
- cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +51 -0
- cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +56 -0
- cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +88 -0
- cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +131 -0
- cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +123 -0
- cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +344 -0
- cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +91 -0
- cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +74 -0
- cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +160 -0
- cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
- cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +73 -0
- cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/os.h +54 -0
- cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1286 -0
- cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +281 -0
- cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +253 -0
- cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
- cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
- cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
- cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
- cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
- cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +171 -0
- cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
- cuda/cccl/headers/include/cuda/std/__charconv/from_chars.h +154 -0
- cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
- cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +148 -0
- cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
- cuda/cccl/headers/include/cuda/std/__charconv_ +31 -0
- cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +54 -0
- cuda/cccl/headers/include/cuda/std/__chrono/day.h +162 -0
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +503 -0
- cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +55 -0
- cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +46 -0
- cuda/cccl/headers/include/cuda/std/__chrono/month.h +187 -0
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +60 -0
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +80 -0
- cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +259 -0
- cuda/cccl/headers/include/cuda/std/__chrono/year.h +186 -0
- cuda/cccl/headers/include/cuda/std/__cmath/abs.h +127 -0
- cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +88 -0
- cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +200 -0
- cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +784 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +118 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fma.h +125 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +231 -0
- cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +205 -0
- cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +286 -0
- cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +221 -0
- cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +286 -0
- cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +371 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +167 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +205 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +186 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
- cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +101 -0
- cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +534 -0
- cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +287 -0
- cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
- cuda/cccl/headers/include/cuda/std/__cmath/nan.h +54 -0
- cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +206 -0
- cuda/cccl/headers/include/cuda/std/__cmath/roots.h +199 -0
- cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +984 -0
- cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +56 -0
- cuda/cccl/headers/include/cuda/std/__cmath/traits.h +238 -0
- cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +328 -0
- cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
- cuda/cccl/headers/include/cuda/std/__complex/complex.h +669 -0
- cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +411 -0
- cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +117 -0
- cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +216 -0
- cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
- cuda/cccl/headers/include/cuda/std/__complex/literals.h +86 -0
- cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +303 -0
- cuda/cccl/headers/include/cuda/std/__complex/math.h +159 -0
- cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +323 -0
- cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +322 -0
- cuda/cccl/headers/include/cuda/std/__complex/roots.h +214 -0
- cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
- cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
- cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +130 -0
- cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
- cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
- cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
- cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +45 -0
- cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
- cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
- cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +341 -0
- cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +174 -0
- cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +70 -0
- cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +56 -0
- cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
- cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
- cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +100 -0
- cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
- cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
- cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
- cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
- cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
- cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +39 -0
- cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
- cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
- cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
- cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
- cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +70 -0
- cuda/cccl/headers/include/cuda/std/__cstring/memcpy.h +61 -0
- cuda/cccl/headers/include/cuda/std/__cstring/memset.h +46 -0
- cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +62 -0
- cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +139 -0
- cuda/cccl/headers/include/cuda/std/__exception/terminate.h +73 -0
- cuda/cccl/headers/include/cuda/std/__execution/env.h +455 -0
- cuda/cccl/headers/include/cuda/std/__execution/policy.h +88 -0
- cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
- cuda/cccl/headers/include/cuda/std/__expected/expected.h +1941 -0
- cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1050 -0
- cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
- cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +165 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +812 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +376 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +124 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +116 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/decompose.h +69 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +40 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +78 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
- cuda/cccl/headers/include/cuda/std/__format/buffer.h +48 -0
- cuda/cccl/headers/include/cuda/std/__format/concepts.h +69 -0
- cuda/cccl/headers/include/cuda/std/__format/format_arg.h +282 -0
- cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +279 -0
- cuda/cccl/headers/include/cuda/std/__format/format_args.h +122 -0
- cuda/cccl/headers/include/cuda/std/__format/format_context.h +92 -0
- cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
- cuda/cccl/headers/include/cuda/std/__format/format_integral.h +237 -0
- cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +124 -0
- cuda/cccl/headers/include/cuda/std/__format/format_spec_parser.h +1230 -0
- cuda/cccl/headers/include/cuda/std/__format/formatter.h +59 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/bool.h +101 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/char.h +124 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/fp.h +101 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/int.h +174 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/ptr.h +104 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/str.h +178 -0
- cuda/cccl/headers/include/cuda/std/__format/output_utils.h +272 -0
- cuda/cccl/headers/include/cuda/std/__format/parse_arg_id.h +138 -0
- cuda/cccl/headers/include/cuda/std/__format_ +45 -0
- cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
- cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
- cuda/cccl/headers/include/cuda/std/__functional/bind.h +334 -0
- cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +80 -0
- cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +73 -0
- cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
- cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
- cuda/cccl/headers/include/cuda/std/__functional/compose.h +68 -0
- cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +75 -0
- cuda/cccl/headers/include/cuda/std/__functional/function.h +1275 -0
- cuda/cccl/headers/include/cuda/std/__functional/hash.h +649 -0
- cuda/cccl/headers/include/cuda/std/__functional/identity.h +57 -0
- cuda/cccl/headers/include/cuda/std/__functional/invoke.h +296 -0
- cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +41 -0
- cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +66 -0
- cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +211 -0
- cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
- cuda/cccl/headers/include/cuda/std/__functional/operations.h +534 -0
- cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +128 -0
- cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +64 -0
- cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +63 -0
- cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
- cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +113 -0
- cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
- cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
- cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
- cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +262 -0
- cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +53 -0
- cuda/cccl/headers/include/cuda/std/__fwd/array.h +42 -0
- cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +74 -0
- cuda/cccl/headers/include/cuda/std/__fwd/complex.h +75 -0
- cuda/cccl/headers/include/cuda/std/__fwd/expected.h +46 -0
- cuda/cccl/headers/include/cuda/std/__fwd/format.h +84 -0
- cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
- cuda/cccl/headers/include/cuda/std/__fwd/get.h +123 -0
- cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
- cuda/cccl/headers/include/cuda/std/__fwd/iterator.h +43 -0
- cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +122 -0
- cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
- cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
- cuda/cccl/headers/include/cuda/std/__fwd/pair.h +47 -0
- cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +34 -0
- cuda/cccl/headers/include/cuda/std/__fwd/span.h +45 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string.h +112 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +91 -0
- cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
- cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +34 -0
- cuda/cccl/headers/include/cuda/std/__fwd/unexpected.h +40 -0
- cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
- cuda/cccl/headers/include/cuda/std/__internal/features.h +72 -0
- cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +143 -0
- cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
- cuda/cccl/headers/include/cuda/std/__iterator/advance.h +228 -0
- cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +163 -0
- cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +253 -0
- cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
- cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +464 -0
- cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
- cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
- cuda/cccl/headers/include/cuda/std/__iterator/distance.h +126 -0
- cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
- cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
- cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +99 -0
- cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +143 -0
- cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
- cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +107 -0
- cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +146 -0
- cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +161 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +163 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +847 -0
- cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
- cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +432 -0
- cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
- cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
- cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +95 -0
- cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
- cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
- cuda/cccl/headers/include/cuda/std/__iterator/prev.h +90 -0
- cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
- cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +156 -0
- cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
- cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +371 -0
- cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
- cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
- cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
- cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +245 -0
- cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
- cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +617 -0
- cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +753 -0
- cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +78 -0
- cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +54 -0
- cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +139 -0
- cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +132 -0
- cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +321 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +97 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +139 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +352 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +759 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +314 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +307 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +605 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +512 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +193 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +189 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +344 -0
- cuda/cccl/headers/include/cuda/std/__memory/addressof.h +67 -0
- cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +81 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator.h +320 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +84 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +525 -0
- cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +60 -0
- cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
- cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +225 -0
- cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +246 -0
- cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
- cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +46 -0
- cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +246 -0
- cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
- cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +92 -0
- cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +678 -0
- cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +765 -0
- cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +54 -0
- cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
- cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
- cuda/cccl/headers/include/cuda/std/__new/allocate.h +126 -0
- cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
- cuda/cccl/headers/include/cuda/std/__new/launder.h +53 -0
- cuda/cccl/headers/include/cuda/std/__new_ +29 -0
- cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
- cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
- cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
- cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +78 -0
- cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
- cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
- cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
- cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +97 -0
- cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
- cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
- cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
- cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
- cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
- cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +74 -0
- cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
- cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
- cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
- cuda/cccl/headers/include/cuda/std/__optional/optional.h +859 -0
- cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +433 -0
- cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
- cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
- cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
- cuda/cccl/headers/include/cuda/std/__random/is_valid.h +106 -0
- cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +398 -0
- cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +335 -0
- cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +183 -0
- cuda/cccl/headers/include/cuda/std/__random_ +29 -0
- cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
- cuda/cccl/headers/include/cuda/std/__ranges/all.h +98 -0
- cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
- cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +302 -0
- cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
- cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
- cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
- cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
- cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
- cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
- cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +78 -0
- cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
- cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +266 -0
- cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +410 -0
- cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +162 -0
- cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +110 -0
- cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +175 -0
- cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
- cuda/cccl/headers/include/cuda/std/__ranges/rend.h +182 -0
- cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +345 -0
- cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
- cuda/cccl/headers/include/cuda/std/__ranges/size.h +201 -0
- cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +513 -0
- cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +476 -0
- cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
- cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
- cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +53 -0
- cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +183 -0
- cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
- cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +234 -0
- cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
- cuda/cccl/headers/include/cuda/std/__string/char_traits.h +191 -0
- cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +581 -0
- cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
- cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
- cuda/cccl/headers/include/cuda/std/__string_ +29 -0
- cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
- cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +106 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +143 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +120 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +260 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +212 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +70 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +84 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +68 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +290 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +149 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +68 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +173 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +46 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +58 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +51 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +211 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_execution_policy.h +81 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +79 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +84 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +84 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +121 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_reference_wrapper.h +50 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +88 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +202 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +46 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +45 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +58 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +45 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +44 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +122 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +131 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +52 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
- cuda/cccl/headers/include/cuda/std/__utility/as_const.h +52 -0
- cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +34 -0
- cuda/cccl/headers/include/cuda/std/__utility/cmp.h +116 -0
- cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
- cuda/cccl/headers/include/cuda/std/__utility/declval.h +76 -0
- cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +161 -0
- cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
- cuda/cccl/headers/include/cuda/std/__utility/forward.h +59 -0
- cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +55 -0
- cuda/cccl/headers/include/cuda/std/__utility/in_place.h +86 -0
- cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +251 -0
- cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
- cuda/cccl/headers/include/cuda/std/__utility/move.h +74 -0
- cuda/cccl/headers/include/cuda/std/__utility/pair.h +791 -0
- cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
- cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +527 -0
- cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
- cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +63 -0
- cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
- cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
- cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
- cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
- cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
- cuda/cccl/headers/include/cuda/std/array +518 -0
- cuda/cccl/headers/include/cuda/std/atomic +810 -0
- cuda/cccl/headers/include/cuda/std/barrier +42 -0
- cuda/cccl/headers/include/cuda/std/bit +35 -0
- cuda/cccl/headers/include/cuda/std/bitset +994 -0
- cuda/cccl/headers/include/cuda/std/cassert +28 -0
- cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
- cuda/cccl/headers/include/cuda/std/cfloat +59 -0
- cuda/cccl/headers/include/cuda/std/chrono +26 -0
- cuda/cccl/headers/include/cuda/std/climits +61 -0
- cuda/cccl/headers/include/cuda/std/cmath +87 -0
- cuda/cccl/headers/include/cuda/std/complex +50 -0
- cuda/cccl/headers/include/cuda/std/concepts +48 -0
- cuda/cccl/headers/include/cuda/std/cstddef +28 -0
- cuda/cccl/headers/include/cuda/std/cstdint +178 -0
- cuda/cccl/headers/include/cuda/std/cstdlib +30 -0
- cuda/cccl/headers/include/cuda/std/cstring +110 -0
- cuda/cccl/headers/include/cuda/std/ctime +154 -0
- cuda/cccl/headers/include/cuda/std/detail/__config +45 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +207 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1721 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +2509 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/iosfwd +128 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/stdexcept +120 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +1365 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +2144 -0
- cuda/cccl/headers/include/cuda/std/execution +29 -0
- cuda/cccl/headers/include/cuda/std/expected +30 -0
- cuda/cccl/headers/include/cuda/std/functional +56 -0
- cuda/cccl/headers/include/cuda/std/initializer_list +44 -0
- cuda/cccl/headers/include/cuda/std/inplace_vector +2170 -0
- cuda/cccl/headers/include/cuda/std/iterator +70 -0
- cuda/cccl/headers/include/cuda/std/latch +34 -0
- cuda/cccl/headers/include/cuda/std/limits +28 -0
- cuda/cccl/headers/include/cuda/std/linalg +30 -0
- cuda/cccl/headers/include/cuda/std/mdspan +38 -0
- cuda/cccl/headers/include/cuda/std/memory +39 -0
- cuda/cccl/headers/include/cuda/std/numbers +346 -0
- cuda/cccl/headers/include/cuda/std/numeric +41 -0
- cuda/cccl/headers/include/cuda/std/optional +31 -0
- cuda/cccl/headers/include/cuda/std/ranges +69 -0
- cuda/cccl/headers/include/cuda/std/ratio +416 -0
- cuda/cccl/headers/include/cuda/std/semaphore +31 -0
- cuda/cccl/headers/include/cuda/std/source_location +83 -0
- cuda/cccl/headers/include/cuda/std/span +628 -0
- cuda/cccl/headers/include/cuda/std/string_view +925 -0
- cuda/cccl/headers/include/cuda/std/tuple +26 -0
- cuda/cccl/headers/include/cuda/std/type_traits +177 -0
- cuda/cccl/headers/include/cuda/std/utility +70 -0
- cuda/cccl/headers/include/cuda/std/variant +25 -0
- cuda/cccl/headers/include/cuda/std/version +240 -0
- cuda/cccl/headers/include/cuda/stream +31 -0
- cuda/cccl/headers/include/cuda/stream_ref +59 -0
- cuda/cccl/headers/include/cuda/type_traits +27 -0
- cuda/cccl/headers/include/cuda/utility +28 -0
- cuda/cccl/headers/include/cuda/version +16 -0
- cuda/cccl/headers/include/cuda/warp +28 -0
- cuda/cccl/headers/include/cuda/work_stealing +26 -0
- cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
- cuda/cccl/headers/include/nv/detail/__target_macros +718 -0
- cuda/cccl/headers/include/nv/target +240 -0
- cuda/cccl/headers/include/thrust/addressof.h +22 -0
- cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
- cuda/cccl/headers/include/thrust/advance.h +57 -0
- cuda/cccl/headers/include/thrust/allocate_unique.h +299 -0
- cuda/cccl/headers/include/thrust/binary_search.h +1910 -0
- cuda/cccl/headers/include/thrust/complex.h +858 -0
- cuda/cccl/headers/include/thrust/copy.h +506 -0
- cuda/cccl/headers/include/thrust/count.h +245 -0
- cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +95 -0
- cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +626 -0
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +192 -0
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +96 -0
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +81 -0
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +78 -0
- cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +76 -0
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +115 -0
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +116 -0
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +77 -0
- cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +99 -0
- cuda/cccl/headers/include/thrust/detail/binary_search.inl +525 -0
- cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
- cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +255 -0
- cuda/cccl/headers/include/thrust/detail/complex/c99math.h +64 -0
- cuda/cccl/headers/include/thrust/detail/complex/catrig.h +875 -0
- cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +589 -0
- cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +233 -0
- cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +161 -0
- cuda/cccl/headers/include/thrust/detail/complex/cexp.h +195 -0
- cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +173 -0
- cuda/cccl/headers/include/thrust/detail/complex/clog.h +223 -0
- cuda/cccl/headers/include/thrust/detail/complex/clogf.h +210 -0
- cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
- cuda/cccl/headers/include/thrust/detail/complex/cpow.h +50 -0
- cuda/cccl/headers/include/thrust/detail/complex/cproj.h +81 -0
- cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
- cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +168 -0
- cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
- cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
- cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +208 -0
- cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +133 -0
- cuda/cccl/headers/include/thrust/detail/complex/math_private.h +138 -0
- cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
- cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
- cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
- cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
- cuda/cccl/headers/include/thrust/detail/config/device_system.h +55 -0
- cuda/cccl/headers/include/thrust/detail/config/host_system.h +48 -0
- cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
- cuda/cccl/headers/include/thrust/detail/config/namespace.h +162 -0
- cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
- cuda/cccl/headers/include/thrust/detail/config.h +36 -0
- cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +228 -0
- cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +273 -0
- cuda/cccl/headers/include/thrust/detail/copy.h +72 -0
- cuda/cccl/headers/include/thrust/detail/copy.inl +129 -0
- cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
- cuda/cccl/headers/include/thrust/detail/copy_if.inl +102 -0
- cuda/cccl/headers/include/thrust/detail/count.h +55 -0
- cuda/cccl/headers/include/thrust/detail/count.inl +89 -0
- cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
- cuda/cccl/headers/include/thrust/detail/equal.inl +93 -0
- cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
- cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +81 -0
- cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
- cuda/cccl/headers/include/thrust/detail/execution_policy.h +120 -0
- cuda/cccl/headers/include/thrust/detail/extrema.inl +184 -0
- cuda/cccl/headers/include/thrust/detail/fill.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/find.inl +113 -0
- cuda/cccl/headers/include/thrust/detail/for_each.inl +84 -0
- cuda/cccl/headers/include/thrust/detail/function.h +49 -0
- cuda/cccl/headers/include/thrust/detail/functional/actor.h +214 -0
- cuda/cccl/headers/include/thrust/detail/functional/operators.h +386 -0
- cuda/cccl/headers/include/thrust/detail/gather.inl +173 -0
- cuda/cccl/headers/include/thrust/detail/generate.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +62 -0
- cuda/cccl/headers/include/thrust/detail/inner_product.inl +118 -0
- cuda/cccl/headers/include/thrust/detail/internal_functional.h +328 -0
- cuda/cccl/headers/include/thrust/detail/logical.inl +113 -0
- cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +77 -0
- cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
- cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +209 -0
- cuda/cccl/headers/include/thrust/detail/merge.inl +276 -0
- cuda/cccl/headers/include/thrust/detail/mismatch.inl +94 -0
- cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +124 -0
- cuda/cccl/headers/include/thrust/detail/partition.inl +378 -0
- cuda/cccl/headers/include/thrust/detail/pointer.h +309 -0
- cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
- cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
- cuda/cccl/headers/include/thrust/detail/range/head_flags.h +116 -0
- cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +130 -0
- cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
- cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +192 -0
- cuda/cccl/headers/include/thrust/detail/reduce.inl +377 -0
- cuda/cccl/headers/include/thrust/detail/reference.h +494 -0
- cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
- cuda/cccl/headers/include/thrust/detail/remove.inl +213 -0
- cuda/cccl/headers/include/thrust/detail/replace.inl +231 -0
- cuda/cccl/headers/include/thrust/detail/reverse.inl +88 -0
- cuda/cccl/headers/include/thrust/detail/scan.inl +518 -0
- cuda/cccl/headers/include/thrust/detail/scatter.inl +157 -0
- cuda/cccl/headers/include/thrust/detail/seq.h +66 -0
- cuda/cccl/headers/include/thrust/detail/sequence.inl +109 -0
- cuda/cccl/headers/include/thrust/detail/set_operations.inl +981 -0
- cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/sort.inl +373 -0
- cuda/cccl/headers/include/thrust/detail/static_assert.h +58 -0
- cuda/cccl/headers/include/thrust/detail/static_map.h +167 -0
- cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +65 -0
- cuda/cccl/headers/include/thrust/detail/tabulate.inl +62 -0
- cuda/cccl/headers/include/thrust/detail/temporary_array.h +153 -0
- cuda/cccl/headers/include/thrust/detail/temporary_array.inl +120 -0
- cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +81 -0
- cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +69 -0
- cuda/cccl/headers/include/thrust/detail/transform_scan.inl +161 -0
- cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +130 -0
- cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +61 -0
- cuda/cccl/headers/include/thrust/detail/type_deduction.h +62 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +47 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +167 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +69 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +59 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +46 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +89 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
- cuda/cccl/headers/include/thrust/detail/type_traits.h +136 -0
- cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +90 -0
- cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/unique.inl +373 -0
- cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
- cuda/cccl/headers/include/thrust/detail/vector_base.h +613 -0
- cuda/cccl/headers/include/thrust/detail/vector_base.inl +1210 -0
- cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
- cuda/cccl/headers/include/thrust/device_delete.h +74 -0
- cuda/cccl/headers/include/thrust/device_free.h +85 -0
- cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
- cuda/cccl/headers/include/thrust/device_malloc.h +84 -0
- cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
- cuda/cccl/headers/include/thrust/device_new.h +112 -0
- cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
- cuda/cccl/headers/include/thrust/device_ptr.h +196 -0
- cuda/cccl/headers/include/thrust/device_reference.h +983 -0
- cuda/cccl/headers/include/thrust/device_vector.h +576 -0
- cuda/cccl/headers/include/thrust/distance.h +43 -0
- cuda/cccl/headers/include/thrust/equal.h +247 -0
- cuda/cccl/headers/include/thrust/execution_policy.h +251 -0
- cuda/cccl/headers/include/thrust/extrema.h +657 -0
- cuda/cccl/headers/include/thrust/fill.h +200 -0
- cuda/cccl/headers/include/thrust/find.h +382 -0
- cuda/cccl/headers/include/thrust/for_each.h +261 -0
- cuda/cccl/headers/include/thrust/functional.h +395 -0
- cuda/cccl/headers/include/thrust/gather.h +464 -0
- cuda/cccl/headers/include/thrust/generate.h +193 -0
- cuda/cccl/headers/include/thrust/host_vector.h +576 -0
- cuda/cccl/headers/include/thrust/inner_product.h +264 -0
- cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +221 -0
- cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +335 -0
- cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
- cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
- cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +38 -0
- cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +38 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +81 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +60 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +65 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +57 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +182 -0
- cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +58 -0
- cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
- cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
- cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +81 -0
- cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +174 -0
- cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +211 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +659 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +334 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_traversal_tags.h +64 -0
- cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +194 -0
- cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
- cuda/cccl/headers/include/thrust/iterator/retag.h +72 -0
- cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +51 -0
- cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +185 -0
- cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
- cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +152 -0
- cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +226 -0
- cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +351 -0
- cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
- cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +359 -0
- cuda/cccl/headers/include/thrust/logical.h +290 -0
- cuda/cccl/headers/include/thrust/memory.h +299 -0
- cuda/cccl/headers/include/thrust/merge.h +725 -0
- cuda/cccl/headers/include/thrust/mismatch.h +261 -0
- cuda/cccl/headers/include/thrust/mr/allocator.h +229 -0
- cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +41 -0
- cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +528 -0
- cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +118 -0
- cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +67 -0
- cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +67 -0
- cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +38 -0
- cuda/cccl/headers/include/thrust/mr/memory_resource.h +217 -0
- cuda/cccl/headers/include/thrust/mr/new.h +100 -0
- cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +63 -0
- cuda/cccl/headers/include/thrust/mr/pool.h +528 -0
- cuda/cccl/headers/include/thrust/mr/pool_options.h +174 -0
- cuda/cccl/headers/include/thrust/mr/sync_pool.h +114 -0
- cuda/cccl/headers/include/thrust/mr/tls_pool.h +64 -0
- cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
- cuda/cccl/headers/include/thrust/mr/validator.h +56 -0
- cuda/cccl/headers/include/thrust/pair.h +99 -0
- cuda/cccl/headers/include/thrust/partition.h +1391 -0
- cuda/cccl/headers/include/thrust/per_device_resource.h +98 -0
- cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +184 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +155 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +104 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +151 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +53 -0
- cuda/cccl/headers/include/thrust/random/detail/mod.h +101 -0
- cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +187 -0
- cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +160 -0
- cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +63 -0
- cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +201 -0
- cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +198 -0
- cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +200 -0
- cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +183 -0
- cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +187 -0
- cuda/cccl/headers/include/thrust/random/discard_block_engine.h +240 -0
- cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +289 -0
- cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +217 -0
- cuda/cccl/headers/include/thrust/random/normal_distribution.h +257 -0
- cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +247 -0
- cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +261 -0
- cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +258 -0
- cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +255 -0
- cuda/cccl/headers/include/thrust/random.h +120 -0
- cuda/cccl/headers/include/thrust/reduce.h +1113 -0
- cuda/cccl/headers/include/thrust/remove.h +768 -0
- cuda/cccl/headers/include/thrust/replace.h +826 -0
- cuda/cccl/headers/include/thrust/reverse.h +215 -0
- cuda/cccl/headers/include/thrust/scan.h +1671 -0
- cuda/cccl/headers/include/thrust/scatter.h +446 -0
- cuda/cccl/headers/include/thrust/sequence.h +277 -0
- cuda/cccl/headers/include/thrust/set_operations.h +3026 -0
- cuda/cccl/headers/include/thrust/shuffle.h +182 -0
- cuda/cccl/headers/include/thrust/sort.h +1320 -0
- cuda/cccl/headers/include/thrust/swap.h +147 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +109 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +60 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +63 -0
- cuda/cccl/headers/include/thrust/system/cpp/memory.h +106 -0
- cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +72 -0
- cuda/cccl/headers/include/thrust/system/cpp/pointer.h +120 -0
- cuda/cccl/headers/include/thrust/system/cpp/vector.h +96 -0
- cuda/cccl/headers/include/thrust/system/cuda/config.h +126 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +219 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +273 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +255 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +289 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +191 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +593 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +243 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +233 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +64 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +96 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +264 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +476 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +100 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +170 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +91 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +60 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +75 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +80 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +61 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +121 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +57 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +228 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +223 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +81 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +405 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +72 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +785 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +1001 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +107 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +122 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +87 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +341 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +414 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +91 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1734 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +469 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +98 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +61 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +429 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +143 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +119 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +117 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +105 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +289 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +310 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +253 -0
- cuda/cccl/headers/include/thrust/system/cuda/error.h +168 -0
- cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +15 -0
- cuda/cccl/headers/include/thrust/system/cuda/memory.h +122 -0
- cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +122 -0
- cuda/cccl/headers/include/thrust/system/cuda/pointer.h +160 -0
- cuda/cccl/headers/include/thrust/system/cuda/vector.h +108 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +52 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/count.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/find.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +61 -0
- cuda/cccl/headers/include/thrust/system/detail/errno.h +120 -0
- cuda/cccl/headers/include/thrust/system/detail/error_category.inl +302 -0
- cuda/cccl/headers/include/thrust/system/detail/error_code.inl +173 -0
- cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +121 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +53 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +79 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +161 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +384 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +45 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +64 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +58 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +146 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/count.h +48 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +84 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +49 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +60 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +66 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +252 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +54 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/find.h +49 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +137 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +58 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +73 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +96 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +45 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +63 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +60 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +72 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +59 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +64 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +86 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +99 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +148 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +49 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +68 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +129 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +207 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +43 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +71 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +100 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +83 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +186 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +86 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +121 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +95 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +175 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +48 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +67 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +63 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +126 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +72 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +85 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +126 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +232 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +73 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +85 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +104 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +282 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +476 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +54 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +125 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +113 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +175 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +44 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +76 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +41 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +54 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +47 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +54 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +82 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +395 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +50 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +56 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +80 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +113 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +45 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +166 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +45 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +115 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +71 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +113 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +81 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +126 -0
- cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +117 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +70 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +42 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +136 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +49 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +119 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +49 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +71 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +52 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +110 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +62 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +74 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +123 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +43 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +141 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +45 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +50 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +75 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +145 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +301 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +64 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +98 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +179 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +154 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +145 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +206 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +59 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +116 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +55 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +356 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +48 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +124 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +48 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +586 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +58 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +115 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +106 -0
- cuda/cccl/headers/include/thrust/system/detail/system_error.inl +108 -0
- cuda/cccl/headers/include/thrust/system/error_code.h +512 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +54 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +77 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +50 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +74 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +56 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +59 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/count.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +50 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +65 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +127 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +66 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/find.h +53 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +56 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +87 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +93 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +88 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +102 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +54 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +54 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +78 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +64 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +65 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +59 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +103 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +72 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +87 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +73 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scan.inl +172 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +36 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +60 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +265 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +60 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +71 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +67 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +75 -0
- cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +62 -0
- cuda/cccl/headers/include/thrust/system/omp/memory.h +111 -0
- cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +75 -0
- cuda/cccl/headers/include/thrust/system/omp/pointer.h +124 -0
- cuda/cccl/headers/include/thrust/system/omp/vector.h +99 -0
- cuda/cccl/headers/include/thrust/system/system_error.h +185 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +54 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +50 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +73 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +47 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +136 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +109 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +66 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +49 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +51 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +91 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +94 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +77 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +327 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +84 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +98 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +54 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +137 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +61 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +400 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +140 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +76 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +87 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +59 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +312 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +33 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +60 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +295 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +60 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +71 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +67 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +75 -0
- cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +62 -0
- cuda/cccl/headers/include/thrust/system/tbb/memory.h +111 -0
- cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +75 -0
- cuda/cccl/headers/include/thrust/system/tbb/pointer.h +124 -0
- cuda/cccl/headers/include/thrust/system/tbb/vector.h +99 -0
- cuda/cccl/headers/include/thrust/system_error.h +57 -0
- cuda/cccl/headers/include/thrust/tabulate.h +125 -0
- cuda/cccl/headers/include/thrust/transform.h +1045 -0
- cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
- cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
- cuda/cccl/headers/include/thrust/tuple.h +139 -0
- cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +261 -0
- cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +154 -0
- cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
- cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +184 -0
- cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +116 -0
- cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +336 -0
- cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
- cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +63 -0
- cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
- cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
- cuda/cccl/headers/include/thrust/unique.h +1088 -0
- cuda/cccl/headers/include/thrust/universal_allocator.h +93 -0
- cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
- cuda/cccl/headers/include/thrust/universal_vector.h +71 -0
- cuda/cccl/headers/include/thrust/version.h +93 -0
- cuda/cccl/headers/include/thrust/zip_function.h +176 -0
- cuda/cccl/headers/include_paths.py +51 -0
- cuda/cccl/parallel/__init__.py +9 -0
- cuda/cccl/parallel/experimental/__init__.py +24 -0
- cuda/cccl/py.typed +0 -0
- cuda/compute/__init__.py +79 -0
- cuda/compute/_bindings.py +79 -0
- cuda/compute/_bindings.pyi +475 -0
- cuda/compute/_bindings_impl.pyx +2273 -0
- cuda/compute/_caching.py +71 -0
- cuda/compute/_cccl_interop.py +422 -0
- cuda/compute/_utils/__init__.py +0 -0
- cuda/compute/_utils/protocols.py +132 -0
- cuda/compute/_utils/temp_storage_buffer.py +86 -0
- cuda/compute/algorithms/__init__.py +54 -0
- cuda/compute/algorithms/_histogram.py +243 -0
- cuda/compute/algorithms/_merge_sort.py +225 -0
- cuda/compute/algorithms/_radix_sort.py +312 -0
- cuda/compute/algorithms/_reduce.py +182 -0
- cuda/compute/algorithms/_scan.py +331 -0
- cuda/compute/algorithms/_segmented_reduce.py +257 -0
- cuda/compute/algorithms/_three_way_partition.py +261 -0
- cuda/compute/algorithms/_transform.py +329 -0
- cuda/compute/algorithms/_unique_by_key.py +252 -0
- cuda/compute/cccl/.gitkeep +0 -0
- cuda/compute/cu12/_bindings_impl.cp313-win_amd64.pyd +0 -0
- cuda/compute/cu12/cccl/cccl.c.parallel.dll +0 -0
- cuda/compute/cu12/cccl/cccl.c.parallel.lib +0 -0
- cuda/compute/cu13/_bindings_impl.cp313-win_amd64.pyd +0 -0
- cuda/compute/cu13/cccl/cccl.c.parallel.dll +0 -0
- cuda/compute/cu13/cccl/cccl.c.parallel.lib +0 -0
- cuda/compute/iterators/__init__.py +21 -0
- cuda/compute/iterators/_factories.py +219 -0
- cuda/compute/iterators/_iterators.py +817 -0
- cuda/compute/iterators/_zip_iterator.py +199 -0
- cuda/compute/numba_utils.py +53 -0
- cuda/compute/op.py +3 -0
- cuda/compute/struct.py +272 -0
- cuda/compute/typing.py +37 -0
- cuda/coop/__init__.py +8 -0
- cuda/coop/_caching.py +48 -0
- cuda/coop/_common.py +275 -0
- cuda/coop/_nvrtc.py +92 -0
- cuda/coop/_scan_op.py +181 -0
- cuda/coop/_types.py +937 -0
- cuda/coop/_typing.py +107 -0
- cuda/coop/block/__init__.py +39 -0
- cuda/coop/block/_block_exchange.py +251 -0
- cuda/coop/block/_block_load_store.py +215 -0
- cuda/coop/block/_block_merge_sort.py +125 -0
- cuda/coop/block/_block_radix_sort.py +214 -0
- cuda/coop/block/_block_reduce.py +294 -0
- cuda/coop/block/_block_scan.py +983 -0
- cuda/coop/warp/__init__.py +9 -0
- cuda/coop/warp/_warp_merge_sort.py +92 -0
- cuda/coop/warp/_warp_reduce.py +153 -0
- cuda/coop/warp/_warp_scan.py +78 -0
- cuda_cccl-0.3.3.dist-info/METADATA +41 -0
- cuda_cccl-0.3.3.dist-info/RECORD +1968 -0
- cuda_cccl-0.3.3.dist-info/WHEEL +5 -0
- cuda_cccl-0.3.3.dist-info/licenses/LICENSE +1 -0
|
@@ -0,0 +1,1264 @@
|
|
|
1
|
+
/******************************************************************************
|
|
2
|
+
* Copyright (c) 2011, Duane Merrill. All rights reserved.
|
|
3
|
+
* Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
|
|
4
|
+
*
|
|
5
|
+
* Redistribution and use in source and binary forms, with or without
|
|
6
|
+
* modification, are permitted provided that the following conditions are met:
|
|
7
|
+
* * Redistributions of source code must retain the above copyright
|
|
8
|
+
* notice, this list of conditions and the following disclaimer.
|
|
9
|
+
* * Redistributions in binary form must reproduce the above copyright
|
|
10
|
+
* notice, this list of conditions and the following disclaimer in the
|
|
11
|
+
* documentation and/or other materials provided with the distribution.
|
|
12
|
+
* * Neither the name of the NVIDIA CORPORATION nor the
|
|
13
|
+
* names of its contributors may be used to endorse or promote products
|
|
14
|
+
* derived from this software without specific prior written permission.
|
|
15
|
+
*
|
|
16
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
|
17
|
+
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
18
|
+
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
19
|
+
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
|
|
20
|
+
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
|
21
|
+
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
22
|
+
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
|
23
|
+
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
24
|
+
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
25
|
+
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
26
|
+
*
|
|
27
|
+
******************************************************************************/
|
|
28
|
+
|
|
29
|
+
//! @file
|
|
30
|
+
//! block_load.cuh Operations for reading linear tiles of data into the CUDA thread block.
|
|
31
|
+
|
|
32
|
+
#pragma once
|
|
33
|
+
|
|
34
|
+
#include <cub/config.cuh>
|
|
35
|
+
|
|
36
|
+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
|
|
37
|
+
# pragma GCC system_header
|
|
38
|
+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
|
|
39
|
+
# pragma clang system_header
|
|
40
|
+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
|
|
41
|
+
# pragma system_header
|
|
42
|
+
#endif // no system header
|
|
43
|
+
|
|
44
|
+
#include <cub/block/block_exchange.cuh>
|
|
45
|
+
#include <cub/iterator/cache_modified_input_iterator.cuh>
|
|
46
|
+
#include <cub/util_ptx.cuh>
|
|
47
|
+
#include <cub/util_type.cuh>
|
|
48
|
+
|
|
49
|
+
CUB_NAMESPACE_BEGIN
|
|
50
|
+
|
|
51
|
+
//! @name Blocked arrangement I/O (direct)
|
|
52
|
+
//! @{
|
|
53
|
+
|
|
54
|
+
//! @rst
|
|
55
|
+
//! Load a linear segment of items into a blocked arrangement across the thread block.
|
|
56
|
+
//!
|
|
57
|
+
//! @blocked
|
|
58
|
+
//!
|
|
59
|
+
//! @endrst
|
|
60
|
+
//!
|
|
61
|
+
//! @tparam T
|
|
62
|
+
//! **[inferred]** The data type to load.
|
|
63
|
+
//!
|
|
64
|
+
//! @tparam ITEMS_PER_THREAD
|
|
65
|
+
//! **[inferred]** The number of consecutive items partitioned onto each thread.
|
|
66
|
+
//!
|
|
67
|
+
//! @tparam RandomAccessIterator
|
|
68
|
+
//! **[inferred]** The random-access iterator type for input iterator.
|
|
69
|
+
//!
|
|
70
|
+
//! @param[in] linear_tid
|
|
71
|
+
//! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D
|
|
72
|
+
//! thread blocks)
|
|
73
|
+
//!
|
|
74
|
+
//! @param[in] block_src_it
|
|
75
|
+
//! The thread block's base input iterator for loading from
|
|
76
|
+
//!
|
|
77
|
+
//! @param[out] dst_items
|
|
78
|
+
//! Destination to load data into
|
|
79
|
+
template <typename T, int ITEMS_PER_THREAD, typename RandomAccessIterator>
|
|
80
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
81
|
+
LoadDirectBlocked(int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD])
|
|
82
|
+
{
|
|
83
|
+
// Load directly in thread-blocked order
|
|
84
|
+
_CCCL_PRAGMA_UNROLL_FULL()
|
|
85
|
+
for (int i = 0; i < ITEMS_PER_THREAD; i++)
|
|
86
|
+
{
|
|
87
|
+
dst_items[i] = block_src_it[linear_tid * ITEMS_PER_THREAD + i];
|
|
88
|
+
}
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
//! @rst
|
|
92
|
+
//! Load a linear segment of items into a blocked arrangement across the thread block, guarded by range.
|
|
93
|
+
//!
|
|
94
|
+
//! @blocked
|
|
95
|
+
//!
|
|
96
|
+
//! @endrst
|
|
97
|
+
//!
|
|
98
|
+
//! @tparam T
|
|
99
|
+
//! **[inferred]** The data type to load.
|
|
100
|
+
//!
|
|
101
|
+
//! @tparam ITEMS_PER_THREAD
|
|
102
|
+
//! **[inferred]** The number of consecutive items partitioned onto each thread.
|
|
103
|
+
//!
|
|
104
|
+
//! @tparam RandomAccessIterator
|
|
105
|
+
//! **[inferred]** The random-access iterator type for input iterator.
|
|
106
|
+
//!
|
|
107
|
+
//! @param[in] linear_tid
|
|
108
|
+
//! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D
|
|
109
|
+
//! thread blocks)
|
|
110
|
+
//!
|
|
111
|
+
//! @param[in] block_src_it
|
|
112
|
+
//! The thread block's base iterator for loading from
|
|
113
|
+
//!
|
|
114
|
+
//! @param[out] dst_items
|
|
115
|
+
//! Destination to load data into
|
|
116
|
+
//!
|
|
117
|
+
//! @param[in] block_items_end
|
|
118
|
+
//! First out-of-bounds index when loading from block_src_it
|
|
119
|
+
template <typename T, int ITEMS_PER_THREAD, typename RandomAccessIterator>
|
|
120
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectBlocked(
|
|
121
|
+
int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end)
|
|
122
|
+
{
|
|
123
|
+
_CCCL_PRAGMA_UNROLL_FULL()
|
|
124
|
+
for (int i = 0; i < ITEMS_PER_THREAD; i++)
|
|
125
|
+
{
|
|
126
|
+
const auto src_pos = linear_tid * ITEMS_PER_THREAD + i;
|
|
127
|
+
if (src_pos < block_items_end)
|
|
128
|
+
{
|
|
129
|
+
dst_items[i] = block_src_it[src_pos];
|
|
130
|
+
}
|
|
131
|
+
}
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
//! @rst
|
|
135
|
+
//! Load a linear segment of items into a blocked arrangement across the thread block, guarded
|
|
136
|
+
//! by range, with a fall-back assignment of out-of-bound elements.
|
|
137
|
+
//!
|
|
138
|
+
//! @blocked
|
|
139
|
+
//!
|
|
140
|
+
//! @endrst
|
|
141
|
+
//!
|
|
142
|
+
//! @tparam T
|
|
143
|
+
//! **[inferred]** The data type to load.
|
|
144
|
+
//!
|
|
145
|
+
//! @tparam ITEMS_PER_THREAD
|
|
146
|
+
//! **[inferred]** The number of consecutive items partitioned onto each thread.
|
|
147
|
+
//!
|
|
148
|
+
//! @tparam RandomAccessIterator
|
|
149
|
+
//! **[inferred]** The random-access iterator type for input \iterator.
|
|
150
|
+
//!
|
|
151
|
+
//! @param[in] linear_tid
|
|
152
|
+
//! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D
|
|
153
|
+
//! thread blocks)
|
|
154
|
+
//!
|
|
155
|
+
//! @param[in] block_src_it
|
|
156
|
+
//! The thread block's base input iterator for loading from
|
|
157
|
+
//!
|
|
158
|
+
//! @param[out] dst_items
|
|
159
|
+
//! Destination to load data into
|
|
160
|
+
//!
|
|
161
|
+
//! @param[in] block_items_end
|
|
162
|
+
//! First out-of-bounds index when loading from block_src_it
|
|
163
|
+
//!
|
|
164
|
+
//! @param[in] oob_default
|
|
165
|
+
//! Default value to assign out-of-bound items
|
|
166
|
+
template <typename T, typename DefaultT, int ITEMS_PER_THREAD, typename RandomAccessIterator>
|
|
167
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectBlocked(
|
|
168
|
+
int linear_tid,
|
|
169
|
+
RandomAccessIterator block_src_it,
|
|
170
|
+
T (&dst_items)[ITEMS_PER_THREAD],
|
|
171
|
+
int block_items_end,
|
|
172
|
+
DefaultT oob_default)
|
|
173
|
+
{
|
|
174
|
+
_CCCL_PRAGMA_UNROLL_FULL()
|
|
175
|
+
for (int i = 0; i < ITEMS_PER_THREAD; i++)
|
|
176
|
+
{
|
|
177
|
+
dst_items[i] = oob_default;
|
|
178
|
+
}
|
|
179
|
+
|
|
180
|
+
LoadDirectBlocked(linear_tid, block_src_it, dst_items, block_items_end);
|
|
181
|
+
}
|
|
182
|
+
|
|
183
|
+
#ifndef _CCCL_DOXYGEN_INVOKED // Do not document
|
|
184
|
+
|
|
185
|
+
//! @brief Internal implementation for load vectorization
|
|
186
|
+
//!
|
|
187
|
+
//! @param[in] linear_tid
|
|
188
|
+
//! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D
|
|
189
|
+
//! thread blocks)
|
|
190
|
+
//!
|
|
191
|
+
//! @param[in] block_src_ptr
|
|
192
|
+
//! Input pointer for loading from
|
|
193
|
+
//!
|
|
194
|
+
//! @param[out] dst_items
|
|
195
|
+
//! Destination to load data into
|
|
196
|
+
template <CacheLoadModifier MODIFIER, typename T, int ITEMS_PER_THREAD>
|
|
197
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
198
|
+
InternalLoadDirectBlockedVectorized(int linear_tid, const T* block_src_ptr, T (&dst_items)[ITEMS_PER_THREAD])
|
|
199
|
+
{
|
|
200
|
+
// Find biggest memory access word that T is a whole multiple of
|
|
201
|
+
using device_word_t = typename UnitWord<T>::DeviceWord;
|
|
202
|
+
_CCCL_DIAG_PUSH
|
|
203
|
+
# if _CCCL_COMPILER(CLANG, >=, 10)
|
|
204
|
+
_CCCL_DIAG_SUPPRESS_CLANG("-Wsizeof-array-div")
|
|
205
|
+
# endif // _CCCL_COMPILER(CLANG, >=, 10)
|
|
206
|
+
constexpr int total_words = static_cast<int>(sizeof(dst_items) / sizeof(device_word_t));
|
|
207
|
+
_CCCL_DIAG_POP
|
|
208
|
+
constexpr int vector_size = (total_words % 4 == 0) ? 4 : (total_words % 2 == 0) ? 2 : 1;
|
|
209
|
+
constexpr int vectors_per_thread = total_words / vector_size;
|
|
210
|
+
|
|
211
|
+
// Load into an array of vectors in thread-blocked order
|
|
212
|
+
using vector_t = typename CubVector<device_word_t, vector_size>::Type;
|
|
213
|
+
|
|
214
|
+
// Add the alignment check to ensure the vectorized loading can proceed.
|
|
215
|
+
if (reinterpret_cast<uintptr_t>(block_src_ptr) % (alignof(vector_t)) == 0)
|
|
216
|
+
{
|
|
217
|
+
vector_t vec_items[vectors_per_thread];
|
|
218
|
+
// Load into an array of vectors in thread-blocked order
|
|
219
|
+
const vector_t* vec_ptr = reinterpret_cast<const vector_t*>(block_src_ptr) + linear_tid * vectors_per_thread;
|
|
220
|
+
|
|
221
|
+
_CCCL_PRAGMA_UNROLL_FULL()
|
|
222
|
+
for (int i = 0; i < vectors_per_thread; i++)
|
|
223
|
+
{
|
|
224
|
+
vec_items[i] = ThreadLoad<MODIFIER>(vec_ptr + i);
|
|
225
|
+
}
|
|
226
|
+
|
|
227
|
+
// Copy to destination
|
|
228
|
+
_CCCL_PRAGMA_UNROLL_FULL()
|
|
229
|
+
for (int i = 0; i < ITEMS_PER_THREAD; i++)
|
|
230
|
+
{
|
|
231
|
+
dst_items[i] = *(reinterpret_cast<T*>(vec_items) + i);
|
|
232
|
+
}
|
|
233
|
+
}
|
|
234
|
+
else
|
|
235
|
+
{
|
|
236
|
+
LoadDirectBlocked(linear_tid, block_src_ptr, dst_items);
|
|
237
|
+
}
|
|
238
|
+
}
|
|
239
|
+
|
|
240
|
+
#endif // _CCCL_DOXYGEN_INVOKED
|
|
241
|
+
|
|
242
|
+
//! @rst
|
|
243
|
+
//! Load a linear segment of items into a blocked arrangement across the thread block.
|
|
244
|
+
//!
|
|
245
|
+
//! @blocked
|
|
246
|
+
//!
|
|
247
|
+
//! The input offset (``block_ptr + block_offset``) must be quad-item aligned
|
|
248
|
+
//!
|
|
249
|
+
//! The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT:
|
|
250
|
+
//!
|
|
251
|
+
//! - ``ITEMS_PER_THREAD`` is odd
|
|
252
|
+
//! - The data type ``T`` is not a built-in primitive or CUDA vector type
|
|
253
|
+
//! (e.g., ``short``, ``int2``, ``double``, ``float2``, etc.)
|
|
254
|
+
//!
|
|
255
|
+
//! @endrst
|
|
256
|
+
//!
|
|
257
|
+
//! @tparam T
|
|
258
|
+
//! **[inferred]** The data type to load.
|
|
259
|
+
//!
|
|
260
|
+
//! @tparam ITEMS_PER_THREAD
|
|
261
|
+
//! **[inferred]** The number of consecutive items partitioned onto each thread.
|
|
262
|
+
//!
|
|
263
|
+
//! @param[in] linear_tid
|
|
264
|
+
//! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) +
|
|
265
|
+
//! linear_tid` for 2D thread blocks)
|
|
266
|
+
//!
|
|
267
|
+
//! @param[in] block_src_ptr
|
|
268
|
+
//! The thread block's base pointer for loading from
|
|
269
|
+
//!
|
|
270
|
+
//! @param[out] dst_items
|
|
271
|
+
//! destination to load data into
|
|
272
|
+
template <typename T, int ITEMS_PER_THREAD>
|
|
273
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
274
|
+
LoadDirectBlockedVectorized(int linear_tid, T* block_src_ptr, T (&dst_items)[ITEMS_PER_THREAD])
|
|
275
|
+
{
|
|
276
|
+
InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, block_src_ptr, dst_items);
|
|
277
|
+
}
|
|
278
|
+
|
|
279
|
+
//! @} end member group
|
|
280
|
+
//! @name Striped arrangement I/O (direct)
|
|
281
|
+
//! @{
|
|
282
|
+
|
|
283
|
+
//! @rst
|
|
284
|
+
//! Load a linear segment of items into a striped arrangement across the thread block.
|
|
285
|
+
//!
|
|
286
|
+
//! @striped
|
|
287
|
+
//!
|
|
288
|
+
//! @endrst
|
|
289
|
+
//!
|
|
290
|
+
//! @tparam BLOCK_THREADS
|
|
291
|
+
//! The thread block size in threads
|
|
292
|
+
//!
|
|
293
|
+
//! @tparam T
|
|
294
|
+
//! **[inferred]** The data type to load.
|
|
295
|
+
//!
|
|
296
|
+
//! @tparam ITEMS_PER_THREAD
|
|
297
|
+
//! **[inferred]** The number of consecutive items partitioned onto each thread.
|
|
298
|
+
//!
|
|
299
|
+
//! @tparam RandomAccessIterator
|
|
300
|
+
//! **[inferred]** The random-access iterator type for input iterator.
|
|
301
|
+
//!
|
|
302
|
+
//! @param[in] linear_tid
|
|
303
|
+
//! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) + linear_tid` for 2D
|
|
304
|
+
//! thread blocks)
|
|
305
|
+
//!
|
|
306
|
+
//! @param[in] block_src_it
|
|
307
|
+
//! The thread block's base iterator for loading from
|
|
308
|
+
//!
|
|
309
|
+
//! @param[out] dst_items
|
|
310
|
+
//! Destination to load data into
|
|
311
|
+
template <int BLOCK_THREADS, typename T, int ITEMS_PER_THREAD, typename RandomAccessIterator>
|
|
312
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
313
|
+
LoadDirectStriped(int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD])
|
|
314
|
+
{
|
|
315
|
+
_CCCL_PRAGMA_UNROLL_FULL()
|
|
316
|
+
for (int i = 0; i < ITEMS_PER_THREAD; i++)
|
|
317
|
+
{
|
|
318
|
+
dst_items[i] = block_src_it[linear_tid + i * BLOCK_THREADS];
|
|
319
|
+
}
|
|
320
|
+
}
|
|
321
|
+
|
|
322
|
+
namespace detail
|
|
323
|
+
{
|
|
324
|
+
template <int BLOCK_THREADS, typename T, int ITEMS_PER_THREAD, typename RandomAccessIterator, typename TransformOpT>
|
|
325
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void load_transform_direct_striped(
|
|
326
|
+
int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], TransformOpT transform_op)
|
|
327
|
+
{
|
|
328
|
+
_CCCL_PRAGMA_UNROLL_FULL()
|
|
329
|
+
for (int i = 0; i < ITEMS_PER_THREAD; i++)
|
|
330
|
+
{
|
|
331
|
+
dst_items[i] = transform_op(block_src_it[linear_tid + i * BLOCK_THREADS]);
|
|
332
|
+
}
|
|
333
|
+
}
|
|
334
|
+
} // namespace detail
|
|
335
|
+
|
|
336
|
+
//! @rst
|
|
337
|
+
//! Load a linear segment of items into a striped arrangement across the thread block, guarded by range
|
|
338
|
+
//!
|
|
339
|
+
//! @striped
|
|
340
|
+
//!
|
|
341
|
+
//! @endrst
|
|
342
|
+
//!
|
|
343
|
+
//! @tparam BLOCK_THREADS
|
|
344
|
+
//! The thread block size in threads
|
|
345
|
+
//!
|
|
346
|
+
//! @tparam T
|
|
347
|
+
//! **inferred** The data type to load.
|
|
348
|
+
//!
|
|
349
|
+
//! @tparam ITEMS_PER_THREAD
|
|
350
|
+
//! **inferred** The number of consecutive items partitioned onto each thread.
|
|
351
|
+
//!
|
|
352
|
+
//! @tparam RandomAccessIterator
|
|
353
|
+
//! **inferred** The random-access iterator type for input iterator.
|
|
354
|
+
//!
|
|
355
|
+
//! @param[in] linear_tid
|
|
356
|
+
//! A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) +
|
|
357
|
+
//! linear_tid</tt> for 2D thread blocks)
|
|
358
|
+
//!
|
|
359
|
+
//! @param[in] block_src_it
|
|
360
|
+
//! The thread block's base iterator for loading from
|
|
361
|
+
//!
|
|
362
|
+
//! @param[out] dst_items
|
|
363
|
+
//! Destination to load data into
|
|
364
|
+
//!
|
|
365
|
+
//! @param[in] block_items_end
|
|
366
|
+
//! Number of valid items to load
|
|
367
|
+
template <int BLOCK_THREADS, typename T, int ITEMS_PER_THREAD, typename RandomAccessIterator>
|
|
368
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectStriped(
|
|
369
|
+
int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end)
|
|
370
|
+
{
|
|
371
|
+
_CCCL_PRAGMA_UNROLL_FULL()
|
|
372
|
+
for (int i = 0; i < ITEMS_PER_THREAD; i++)
|
|
373
|
+
{
|
|
374
|
+
const auto src_pos = linear_tid + i * BLOCK_THREADS;
|
|
375
|
+
if (src_pos < block_items_end)
|
|
376
|
+
{
|
|
377
|
+
dst_items[i] = block_src_it[src_pos];
|
|
378
|
+
}
|
|
379
|
+
}
|
|
380
|
+
}
|
|
381
|
+
|
|
382
|
+
//! @rst
|
|
383
|
+
//! Load a linear segment of items into a striped arrangement across the thread block, guarded
|
|
384
|
+
//! by range, with a fall-back assignment of out-of-bound elements.
|
|
385
|
+
//!
|
|
386
|
+
//! @striped
|
|
387
|
+
//!
|
|
388
|
+
//! @endrst
|
|
389
|
+
//!
|
|
390
|
+
//! @tparam BLOCK_THREADS
|
|
391
|
+
//! The thread block size in threads
|
|
392
|
+
//!
|
|
393
|
+
//! @tparam T
|
|
394
|
+
//! **inferred** The data type to load.
|
|
395
|
+
//!
|
|
396
|
+
//! @tparam ITEMS_PER_THREAD
|
|
397
|
+
//! **inferred** The number of consecutive items partitioned onto each thread.
|
|
398
|
+
//!
|
|
399
|
+
//! @tparam RandomAccessIterator
|
|
400
|
+
//! **inferred** The random-access iterator type for input \iterator.
|
|
401
|
+
//!
|
|
402
|
+
//! @param[in] linear_tid
|
|
403
|
+
//! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) +
|
|
404
|
+
//! linear_tid` for 2D thread blocks)
|
|
405
|
+
//!
|
|
406
|
+
//! @param[in] block_src_it
|
|
407
|
+
//! The thread block's base iterator for loading from
|
|
408
|
+
//!
|
|
409
|
+
//! @param[out] dst_items
|
|
410
|
+
//! Destination to load data into
|
|
411
|
+
//!
|
|
412
|
+
//! @param[in] block_items_end
|
|
413
|
+
//! Number of valid items to load
|
|
414
|
+
//!
|
|
415
|
+
//! @param[in] oob_default
|
|
416
|
+
//! Default value to assign out-of-bound items
|
|
417
|
+
template <int BLOCK_THREADS, typename T, typename DefaultT, int ITEMS_PER_THREAD, typename RandomAccessIterator>
|
|
418
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectStriped(
|
|
419
|
+
int linear_tid,
|
|
420
|
+
RandomAccessIterator block_src_it,
|
|
421
|
+
T (&dst_items)[ITEMS_PER_THREAD],
|
|
422
|
+
int block_items_end,
|
|
423
|
+
DefaultT oob_default)
|
|
424
|
+
{
|
|
425
|
+
_CCCL_PRAGMA_UNROLL_FULL()
|
|
426
|
+
for (int i = 0; i < ITEMS_PER_THREAD; i++)
|
|
427
|
+
{
|
|
428
|
+
dst_items[i] = oob_default;
|
|
429
|
+
}
|
|
430
|
+
|
|
431
|
+
LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_src_it, dst_items, block_items_end);
|
|
432
|
+
}
|
|
433
|
+
|
|
434
|
+
//! @} end member group
|
|
435
|
+
//! @name Warp-striped arrangement I/O (direct)
|
|
436
|
+
//! @{
|
|
437
|
+
|
|
438
|
+
//! @rst
|
|
439
|
+
//! Load a linear segment of items into a warp-striped arrangement across the thread block.
|
|
440
|
+
//!
|
|
441
|
+
//! @warpstriped
|
|
442
|
+
//!
|
|
443
|
+
//! Usage Considerations
|
|
444
|
+
//! ++++++++++++++++++++
|
|
445
|
+
//!
|
|
446
|
+
//! The number of threads in the thread block must be a multiple of the architecture's warp size.
|
|
447
|
+
//!
|
|
448
|
+
//! @endrst
|
|
449
|
+
//!
|
|
450
|
+
//! @tparam T
|
|
451
|
+
//! **inferred** The data type to load.
|
|
452
|
+
//!
|
|
453
|
+
//! @tparam ITEMS_PER_THREAD
|
|
454
|
+
//! **inferred** The number of consecutive items partitioned onto each thread.
|
|
455
|
+
//!
|
|
456
|
+
//! @tparam RandomAccessIterator
|
|
457
|
+
//! **inferred** The random-access iterator type for input iterator.
|
|
458
|
+
//!
|
|
459
|
+
//! @param[in] linear_tid
|
|
460
|
+
//! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) +
|
|
461
|
+
//! linear_tid` for 2D thread blocks)
|
|
462
|
+
//!
|
|
463
|
+
//! @param[in] block_src_it
|
|
464
|
+
//! The thread block's base iterator for loading from
|
|
465
|
+
//!
|
|
466
|
+
//! @param[out] dst_items
|
|
467
|
+
//! Destination to load data into
|
|
468
|
+
template <typename T, int ITEMS_PER_THREAD, typename RandomAccessIterator>
|
|
469
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
470
|
+
LoadDirectWarpStriped(int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD])
|
|
471
|
+
{
|
|
472
|
+
const int tid = linear_tid & (detail::warp_threads - 1);
|
|
473
|
+
const int wid = linear_tid >> detail::log2_warp_threads;
|
|
474
|
+
const int warp_offset = wid * detail::warp_threads * ITEMS_PER_THREAD;
|
|
475
|
+
|
|
476
|
+
// Load directly in warp-striped order
|
|
477
|
+
_CCCL_PRAGMA_UNROLL_FULL()
|
|
478
|
+
for (int i = 0; i < ITEMS_PER_THREAD; i++)
|
|
479
|
+
{
|
|
480
|
+
new (&dst_items[i]) T(block_src_it[warp_offset + tid + (i * detail::warp_threads)]);
|
|
481
|
+
}
|
|
482
|
+
}
|
|
483
|
+
|
|
484
|
+
//! @rst
|
|
485
|
+
//! Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range
|
|
486
|
+
//!
|
|
487
|
+
//! @warpstriped
|
|
488
|
+
//!
|
|
489
|
+
//! Usage Considerations
|
|
490
|
+
//! ++++++++++++++++++++
|
|
491
|
+
//!
|
|
492
|
+
//! The number of threads in the thread block must be a multiple of the architecture's warp size.
|
|
493
|
+
//!
|
|
494
|
+
//! @endrst
|
|
495
|
+
//!
|
|
496
|
+
//! @tparam T
|
|
497
|
+
//! **inferred** The data type to load.
|
|
498
|
+
//!
|
|
499
|
+
//! @tparam ITEMS_PER_THREAD
|
|
500
|
+
//! **inferred** The number of consecutive items partitioned onto each thread.
|
|
501
|
+
//!
|
|
502
|
+
//! @tparam RandomAccessIterator
|
|
503
|
+
//! **inferred** The random-access iterator type for input \iterator.
|
|
504
|
+
//!
|
|
505
|
+
//! @param[in] linear_tid
|
|
506
|
+
//! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) +
|
|
507
|
+
//! linear_tid` for 2D thread blocks)
|
|
508
|
+
//!
|
|
509
|
+
//! @param[in] block_src_it
|
|
510
|
+
//! The thread block's base iterator for loading from
|
|
511
|
+
//!
|
|
512
|
+
//! @param[out] dst_items
|
|
513
|
+
//! Destination to load data into
|
|
514
|
+
//!
|
|
515
|
+
//! @param[in] block_items_end
|
|
516
|
+
//! Number of valid items to load
|
|
517
|
+
template <typename T, int ITEMS_PER_THREAD, typename RandomAccessIterator>
|
|
518
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectWarpStriped(
|
|
519
|
+
int linear_tid, RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end)
|
|
520
|
+
{
|
|
521
|
+
const int tid = linear_tid & (detail::warp_threads - 1);
|
|
522
|
+
const int wid = linear_tid >> detail::log2_warp_threads;
|
|
523
|
+
const int warp_offset = wid * detail::warp_threads * ITEMS_PER_THREAD;
|
|
524
|
+
|
|
525
|
+
// Load directly in warp-striped order
|
|
526
|
+
_CCCL_PRAGMA_UNROLL_FULL()
|
|
527
|
+
for (int i = 0; i < ITEMS_PER_THREAD; i++)
|
|
528
|
+
{
|
|
529
|
+
const auto src_pos = warp_offset + tid + (i * detail::warp_threads);
|
|
530
|
+
if (src_pos < block_items_end)
|
|
531
|
+
{
|
|
532
|
+
new (&dst_items[i]) T(block_src_it[src_pos]);
|
|
533
|
+
}
|
|
534
|
+
}
|
|
535
|
+
}
|
|
536
|
+
|
|
537
|
+
//! @rst
|
|
538
|
+
//! Load a linear segment of items into a warp-striped arrangement across the thread block,
|
|
539
|
+
//! guarded by range, with a fall-back assignment of out-of-bound elements.
|
|
540
|
+
//!
|
|
541
|
+
//! @warpstriped
|
|
542
|
+
//!
|
|
543
|
+
//! @endrst
|
|
544
|
+
//!
|
|
545
|
+
//! Usage Considerations
|
|
546
|
+
//! ++++++++++++++++++++
|
|
547
|
+
//!
|
|
548
|
+
//! The number of threads in the thread block must be a multiple of the architecture's warp size.
|
|
549
|
+
//!
|
|
550
|
+
//! @tparam T
|
|
551
|
+
//! **inferred** The data type to load.
|
|
552
|
+
//!
|
|
553
|
+
//! @tparam ITEMS_PER_THREAD
|
|
554
|
+
//! **inferred** The number of consecutive items partitioned onto each thread.
|
|
555
|
+
//!
|
|
556
|
+
//! @tparam RandomAccessIterator
|
|
557
|
+
//! **inferred** The random-access iterator type for input \iterator.
|
|
558
|
+
//!
|
|
559
|
+
//! @param[in] linear_tid
|
|
560
|
+
//! A suitable 1D thread-identifier for the calling thread (e.g., `(threadIdx.y * blockDim.x) +
|
|
561
|
+
//! linear_tid` for 2D thread blocks)
|
|
562
|
+
//!
|
|
563
|
+
//! @param[in] block_src_it
|
|
564
|
+
//! The thread block's base iterator for loading from
|
|
565
|
+
//!
|
|
566
|
+
//! @param[out] dst_items
|
|
567
|
+
//! Destination to load data into
|
|
568
|
+
//!
|
|
569
|
+
//! @param[in] block_items_end
|
|
570
|
+
//! Number of valid items to load
|
|
571
|
+
//!
|
|
572
|
+
//! @param[in] oob_default
|
|
573
|
+
//! Default value to assign out-of-bound items
|
|
574
|
+
template <typename T, typename DefaultT, int ITEMS_PER_THREAD, typename RandomAccessIterator>
|
|
575
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void LoadDirectWarpStriped(
|
|
576
|
+
int linear_tid,
|
|
577
|
+
RandomAccessIterator block_src_it,
|
|
578
|
+
T (&dst_items)[ITEMS_PER_THREAD],
|
|
579
|
+
int block_items_end,
|
|
580
|
+
DefaultT oob_default)
|
|
581
|
+
{
|
|
582
|
+
// Load directly in warp-striped order
|
|
583
|
+
_CCCL_PRAGMA_UNROLL_FULL()
|
|
584
|
+
for (int i = 0; i < ITEMS_PER_THREAD; i++)
|
|
585
|
+
{
|
|
586
|
+
dst_items[i] = oob_default;
|
|
587
|
+
}
|
|
588
|
+
|
|
589
|
+
LoadDirectWarpStriped(linear_tid, block_src_it, dst_items, block_items_end);
|
|
590
|
+
}
|
|
591
|
+
|
|
592
|
+
//! @} end member group
|
|
593
|
+
|
|
594
|
+
//! @brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data
|
|
595
|
+
//! from memory into a blocked arrangement across a CUDA thread block.
|
|
596
|
+
enum BlockLoadAlgorithm
|
|
597
|
+
{
|
|
598
|
+
//! @rst
|
|
599
|
+
//! Overview
|
|
600
|
+
//! ++++++++++++++++++++++++++
|
|
601
|
+
//!
|
|
602
|
+
//! A :ref:`blocked arrangement <flexible-data-arrangement>` of data is read directly from memory.
|
|
603
|
+
//!
|
|
604
|
+
//! Performance Considerations
|
|
605
|
+
//! ++++++++++++++++++++++++++
|
|
606
|
+
//!
|
|
607
|
+
//! The utilization of memory transactions (coalescing) decreases as the access stride between threads increases
|
|
608
|
+
//! (i.e., the number items per thread).
|
|
609
|
+
//! @endrst
|
|
610
|
+
BLOCK_LOAD_DIRECT,
|
|
611
|
+
|
|
612
|
+
//! @rst
|
|
613
|
+
//! Overview
|
|
614
|
+
//! ++++++++++++++++++++++++++
|
|
615
|
+
//!
|
|
616
|
+
//! A :ref:`striped arrangement <flexible-data-arrangement>` of data is read directly from memory.
|
|
617
|
+
//!
|
|
618
|
+
//! Performance Considerations
|
|
619
|
+
//! ++++++++++++++++++++++++++
|
|
620
|
+
//!
|
|
621
|
+
//! The utilization of memory transactions (coalescing) doesn't depend on the number of items per thread.
|
|
622
|
+
//!
|
|
623
|
+
//! @endrst
|
|
624
|
+
BLOCK_LOAD_STRIPED,
|
|
625
|
+
|
|
626
|
+
//! @rst
|
|
627
|
+
//! Overview
|
|
628
|
+
//! ++++++++++++++++++++++++++
|
|
629
|
+
//!
|
|
630
|
+
//! A :ref:`blocked arrangement <flexible-data-arrangement>` of data is read from memory using CUDA's built-in
|
|
631
|
+
//! vectorized loads as a coalescing optimization. For example, ``ld.global.v4.s32`` instructions will be generated
|
|
632
|
+
//! when ``T = int`` and ``ITEMS_PER_THREAD % 4 == 0``.
|
|
633
|
+
//!
|
|
634
|
+
//! Performance Considerations
|
|
635
|
+
//! ++++++++++++++++++++++++++
|
|
636
|
+
//!
|
|
637
|
+
//! - The utilization of memory transactions (coalescing) remains high until the the access stride between threads
|
|
638
|
+
//! (i.e., the number items per thread) exceeds the maximum vector load width (typically 4 items or 64B, whichever
|
|
639
|
+
//! is lower).
|
|
640
|
+
//! - The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT:
|
|
641
|
+
//!
|
|
642
|
+
//! - ``ITEMS_PER_THREAD`` is odd
|
|
643
|
+
//! - The ``RandomAccessIterator`` is not a simple pointer type
|
|
644
|
+
//! - The block input offset is not quadword-aligned
|
|
645
|
+
//! - The data type ``T`` is not a built-in primitive or CUDA vector type
|
|
646
|
+
//! (e.g., ``short``, ``int2``, ``double``, ``float2``, etc.)
|
|
647
|
+
//!
|
|
648
|
+
//! @endrst
|
|
649
|
+
BLOCK_LOAD_VECTORIZE,
|
|
650
|
+
|
|
651
|
+
//! @rst
|
|
652
|
+
//! Overview
|
|
653
|
+
//! ++++++++++++++++++++++++++
|
|
654
|
+
//!
|
|
655
|
+
//! A :ref:`striped arrangement <flexible-data-arrangement>` of data is read efficiently from memory and then locally
|
|
656
|
+
//! transposed into a :ref:`blocked arrangement <flexible-data-arrangement>`.
|
|
657
|
+
//!
|
|
658
|
+
//! Performance Considerations
|
|
659
|
+
//! ++++++++++++++++++++++++++
|
|
660
|
+
//!
|
|
661
|
+
//! - The utilization of memory transactions (coalescing) remains high regardless of items loaded per thread.
|
|
662
|
+
//! - The local reordering incurs slightly longer latencies and throughput than the direct cub::BLOCK_LOAD_DIRECT and
|
|
663
|
+
//! cub::BLOCK_LOAD_VECTORIZE alternatives.
|
|
664
|
+
//!
|
|
665
|
+
//! @endrst
|
|
666
|
+
BLOCK_LOAD_TRANSPOSE,
|
|
667
|
+
|
|
668
|
+
//! @rst
|
|
669
|
+
//! Overview
|
|
670
|
+
//! ++++++++++++++++++++++++++
|
|
671
|
+
//!
|
|
672
|
+
//! A :ref:`warp-striped arrangement <flexible-data-arrangement>` of data is read efficiently from memory and then
|
|
673
|
+
//! locally transposed into a :ref:`blocked arrangement <flexible-data-arrangement>`.
|
|
674
|
+
//!
|
|
675
|
+
//! Usage Considerations
|
|
676
|
+
//! ++++++++++++++++++++++++++
|
|
677
|
+
//!
|
|
678
|
+
//! - BLOCK_THREADS must be a multiple of WARP_THREADS
|
|
679
|
+
//!
|
|
680
|
+
//! Performance Considerations
|
|
681
|
+
//! ++++++++++++++++++++++++++
|
|
682
|
+
//!
|
|
683
|
+
//! - The utilization of memory transactions (coalescing) remains high regardless of items loaded per thread.
|
|
684
|
+
//! - The local reordering incurs slightly larger latencies than the direct cub::BLOCK_LOAD_DIRECT and
|
|
685
|
+
//! cub::BLOCK_LOAD_VECTORIZE alternatives.
|
|
686
|
+
//! - Provisions more shared storage, but incurs smaller latencies than the
|
|
687
|
+
//! BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED alternative.
|
|
688
|
+
//!
|
|
689
|
+
//! @endrst
|
|
690
|
+
BLOCK_LOAD_WARP_TRANSPOSE,
|
|
691
|
+
|
|
692
|
+
//! @rst
|
|
693
|
+
//! Overview
|
|
694
|
+
//! ++++++++++++++++++++++++++
|
|
695
|
+
//!
|
|
696
|
+
//! Like ``BLOCK_LOAD_WARP_TRANSPOSE``, a :ref:`warp-striped arrangement <flexible-data-arrangement>` of data is read
|
|
697
|
+
//! directly from memory and then is locally transposed into a :ref:`blocked arrangement <flexible-data-arrangement>`.
|
|
698
|
+
//! To reduce the shared memory requirement, only one warp's worth of shared memory is provisioned and is subsequently
|
|
699
|
+
//! time-sliced among warps.
|
|
700
|
+
//!
|
|
701
|
+
//! Usage Considerations
|
|
702
|
+
//! ++++++++++++++++++++++++++
|
|
703
|
+
//!
|
|
704
|
+
//! - BLOCK_THREADS must be a multiple of WARP_THREADS
|
|
705
|
+
//!
|
|
706
|
+
//! Performance Considerations
|
|
707
|
+
//! ++++++++++++++++++++++++++
|
|
708
|
+
//!
|
|
709
|
+
//! - The utilization of memory transactions (coalescing) remains high regardless of items loaded per thread.
|
|
710
|
+
//! - Provisions less shared memory temporary storage, but incurs larger latencies than the BLOCK_LOAD_WARP_TRANSPOSE
|
|
711
|
+
//! alternative.
|
|
712
|
+
//!
|
|
713
|
+
//! @endrst
|
|
714
|
+
BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED,
|
|
715
|
+
};
|
|
716
|
+
|
|
717
|
+
//! @rst
|
|
718
|
+
//! The BlockLoad class provides :ref:`collective <collective-primitives>` data movement methods for loading a linear
|
|
719
|
+
//! segment of items from memory into a :ref:`blocked arrangement <flexible-data-arrangement>` across a CUDA thread
|
|
720
|
+
//! block.
|
|
721
|
+
//!
|
|
722
|
+
//! Overview
|
|
723
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
724
|
+
//!
|
|
725
|
+
//! - The BlockLoad class provides a single data movement abstraction that can be specialized to implement different
|
|
726
|
+
//! cub::BlockLoadAlgorithm strategies. This facilitates different performance policies for different architectures,
|
|
727
|
+
//! data types, granularity sizes, etc.
|
|
728
|
+
//! - BlockLoad can be optionally specialized by different data movement strategies:
|
|
729
|
+
//!
|
|
730
|
+
//! #. :cpp:enumerator:`cub::BLOCK_LOAD_DIRECT`:
|
|
731
|
+
//! A :ref:`blocked arrangement <flexible-data-arrangement>` of data is read directly from memory.
|
|
732
|
+
//! #. :cpp:enumerator:`cub::BLOCK_LOAD_STRIPED`:
|
|
733
|
+
//! A :ref:`striped arrangement <flexible-data-arrangement>` of data is read directly from memory.
|
|
734
|
+
//! #. :cpp:enumerator:`cub::BLOCK_LOAD_VECTORIZE`:
|
|
735
|
+
//! A :ref:`blocked arrangement <flexible-data-arrangement>` of data is read directly from memory
|
|
736
|
+
//! using CUDA's built-in vectorized loads as a coalescing optimization.
|
|
737
|
+
//! #. :cpp:enumerator:`cub::BLOCK_LOAD_TRANSPOSE`:
|
|
738
|
+
//! A :ref:`striped arrangement <flexible-data-arrangement>` of data is read directly from memory and is then
|
|
739
|
+
//! locally transposed into a :ref:`blocked arrangement <flexible-data-arrangement>`.
|
|
740
|
+
//! #. :cpp:enumerator:`cub::BLOCK_LOAD_WARP_TRANSPOSE`:
|
|
741
|
+
//! A :ref:`warp-striped arrangement <flexible-data-arrangement>` of data is read directly from memory and is then
|
|
742
|
+
//! locally transposed into a :ref:`blocked arrangement <flexible-data-arrangement>`.
|
|
743
|
+
//! #. :cpp:enumerator:`cub::BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED`:
|
|
744
|
+
//! A :ref:`warp-striped arrangement <flexible-data-arrangement>` of data is read directly from memory and is then
|
|
745
|
+
//! locally transposed into a :ref:`blocked arrangement <flexible-data-arrangement>` one warp at a time.
|
|
746
|
+
//!
|
|
747
|
+
//! - @rowmajor
|
|
748
|
+
//!
|
|
749
|
+
//! A Simple Example
|
|
750
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
751
|
+
//!
|
|
752
|
+
//! @blockcollective{BlockLoad}
|
|
753
|
+
//!
|
|
754
|
+
//! The code snippet below illustrates the loading of a linear segment of 512 integers into a "blocked" arrangement
|
|
755
|
+
//! across 128 threads where each thread owns 4 consecutive items. The load is specialized for
|
|
756
|
+
//! ``BLOCK_LOAD_WARP_TRANSPOSE``, meaning memory references are efficiently coalesced using a warp-striped access
|
|
757
|
+
//! pattern (after which items are locally reordered among threads).
|
|
758
|
+
//!
|
|
759
|
+
//! .. code-block:: c++
|
|
760
|
+
//!
|
|
761
|
+
//! #include <cub/cub.cuh> // or equivalently <cub/block/block_load.cuh>
|
|
762
|
+
//!
|
|
763
|
+
//! __global__ void ExampleKernel(int *d_data, ...)
|
|
764
|
+
//! {
|
|
765
|
+
//! // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
|
|
766
|
+
//! using BlockLoad = cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE>;
|
|
767
|
+
//!
|
|
768
|
+
//! // Allocate shared memory for BlockLoad
|
|
769
|
+
//! __shared__ typename BlockLoad::TempStorage temp_storage;
|
|
770
|
+
//!
|
|
771
|
+
//! // Load a segment of consecutive items that are blocked across threads
|
|
772
|
+
//! int thread_data[4];
|
|
773
|
+
//! BlockLoad(temp_storage).Load(d_data, thread_data);
|
|
774
|
+
//! }
|
|
775
|
+
//!
|
|
776
|
+
//! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``. The set of ``thread_data`` across the block of threads in
|
|
777
|
+
//! those threads will be ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``.
|
|
778
|
+
//!
|
|
779
|
+
//! Re-using dynamically allocating shared memory
|
|
780
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
781
|
+
//!
|
|
782
|
+
//! The ``block/example_block_reduce_dyn_smem.cu`` example illustrates usage of dynamically shared memory with
|
|
783
|
+
//! BlockReduce and how to re-purpose the same memory region. This example can be easily adapted to the storage required
|
|
784
|
+
//! by BlockLoad.
|
|
785
|
+
//!
|
|
786
|
+
//! @endrst
|
|
787
|
+
//!
|
|
788
|
+
//! @tparam T
|
|
789
|
+
//! The data type to read into (which must be convertible from the input iterator's value type).
|
|
790
|
+
//!
|
|
791
|
+
//! @tparam BLOCK_DIM_X
|
|
792
|
+
//! The thread block length in threads along the X dimension
|
|
793
|
+
//!
|
|
794
|
+
//! @tparam ITEMS_PER_THREAD
|
|
795
|
+
//! The number of consecutive items partitioned onto each thread.
|
|
796
|
+
//!
|
|
797
|
+
//! @tparam ALGORITHM
|
|
798
|
+
//! **[optional]** cub::BlockLoadAlgorithm tuning policy. default: ``cub::BLOCK_LOAD_DIRECT``.
|
|
799
|
+
//!
|
|
800
|
+
//! @tparam BLOCK_DIM_Y
|
|
801
|
+
//! **[optional]** The thread block length in threads along the Y dimension (default: 1)
|
|
802
|
+
//!
|
|
803
|
+
//! @tparam BLOCK_DIM_Z
|
|
804
|
+
//! **[optional]** The thread block length in threads along the Z dimension (default: 1)
|
|
805
|
+
//!
|
|
806
|
+
template <typename T,
|
|
807
|
+
int BLOCK_DIM_X,
|
|
808
|
+
int ITEMS_PER_THREAD,
|
|
809
|
+
BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT,
|
|
810
|
+
int BLOCK_DIM_Y = 1,
|
|
811
|
+
int BLOCK_DIM_Z = 1>
|
|
812
|
+
class BlockLoad
|
|
813
|
+
{
|
|
814
|
+
static constexpr int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z; // total threads in the block
|
|
815
|
+
|
|
816
|
+
template <BlockLoadAlgorithm _POLICY, int DUMMY>
|
|
817
|
+
struct LoadInternal; // helper to dispatch the load algorithm
|
|
818
|
+
|
|
819
|
+
template <int DUMMY>
|
|
820
|
+
struct LoadInternal<BLOCK_LOAD_DIRECT, DUMMY>
|
|
821
|
+
{
|
|
822
|
+
using TempStorage = NullType;
|
|
823
|
+
int linear_tid;
|
|
824
|
+
|
|
825
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& /*temp_storage*/, int linear_tid)
|
|
826
|
+
: linear_tid(linear_tid)
|
|
827
|
+
{}
|
|
828
|
+
|
|
829
|
+
template <typename RandomAccessIterator>
|
|
830
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD])
|
|
831
|
+
{
|
|
832
|
+
LoadDirectBlocked(linear_tid, block_src_it, dst_items);
|
|
833
|
+
}
|
|
834
|
+
|
|
835
|
+
template <typename RandomAccessIterator>
|
|
836
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
837
|
+
Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end)
|
|
838
|
+
{
|
|
839
|
+
LoadDirectBlocked(linear_tid, block_src_it, dst_items, block_items_end);
|
|
840
|
+
}
|
|
841
|
+
|
|
842
|
+
template <typename RandomAccessIterator, typename DefaultT>
|
|
843
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
844
|
+
Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default)
|
|
845
|
+
{
|
|
846
|
+
LoadDirectBlocked(linear_tid, block_src_it, dst_items, block_items_end, oob_default);
|
|
847
|
+
}
|
|
848
|
+
};
|
|
849
|
+
|
|
850
|
+
template <int DUMMY>
|
|
851
|
+
struct LoadInternal<BLOCK_LOAD_STRIPED, DUMMY>
|
|
852
|
+
{
|
|
853
|
+
using TempStorage = NullType;
|
|
854
|
+
int linear_tid;
|
|
855
|
+
|
|
856
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& /*temp_storage*/, int linear_tid)
|
|
857
|
+
: linear_tid(linear_tid)
|
|
858
|
+
{}
|
|
859
|
+
|
|
860
|
+
template <typename RandomAccessIterator>
|
|
861
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD])
|
|
862
|
+
{
|
|
863
|
+
LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_src_it, dst_items);
|
|
864
|
+
}
|
|
865
|
+
|
|
866
|
+
template <typename RandomAccessIterator>
|
|
867
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
868
|
+
Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end)
|
|
869
|
+
{
|
|
870
|
+
LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_src_it, dst_items, block_items_end);
|
|
871
|
+
}
|
|
872
|
+
|
|
873
|
+
template <typename RandomAccessIterator, typename DefaultT>
|
|
874
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
875
|
+
Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default)
|
|
876
|
+
{
|
|
877
|
+
LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_src_it, dst_items, block_items_end, oob_default);
|
|
878
|
+
}
|
|
879
|
+
};
|
|
880
|
+
|
|
881
|
+
template <int DUMMY>
|
|
882
|
+
struct LoadInternal<BLOCK_LOAD_VECTORIZE, DUMMY>
|
|
883
|
+
{
|
|
884
|
+
using TempStorage = NullType;
|
|
885
|
+
int linear_tid;
|
|
886
|
+
|
|
887
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& /*temp_storage*/, int linear_tid)
|
|
888
|
+
: linear_tid(linear_tid)
|
|
889
|
+
{}
|
|
890
|
+
|
|
891
|
+
// attempts vectorization (pointer)
|
|
892
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void Load(const T* block_ptr, T (&dst_items)[ITEMS_PER_THREAD])
|
|
893
|
+
{
|
|
894
|
+
InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, block_ptr, dst_items);
|
|
895
|
+
}
|
|
896
|
+
// NOTE: This function is necessary for pointers to non-const types.
|
|
897
|
+
// The core reason is that the compiler will not deduce 'T*' to 'const T*' automatically.
|
|
898
|
+
// Otherwise, when the pointer type is 'T*', the compiler will prefer the overloaded version
|
|
899
|
+
// Load(RandomAccessIterator...) over Load(const T*...), which means it will never perform vectorized loading for
|
|
900
|
+
// pointers to non-const types.
|
|
901
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void Load(T* block_ptr, T (&dst_items)[ITEMS_PER_THREAD])
|
|
902
|
+
{
|
|
903
|
+
InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, block_ptr, dst_items);
|
|
904
|
+
}
|
|
905
|
+
|
|
906
|
+
// any other iterator, no vectorization
|
|
907
|
+
template <typename RandomAccessIterator>
|
|
908
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD])
|
|
909
|
+
{
|
|
910
|
+
LoadDirectBlocked(linear_tid, block_src_it, dst_items);
|
|
911
|
+
}
|
|
912
|
+
|
|
913
|
+
// attempts vectorization (cache modified iterator)
|
|
914
|
+
template <CacheLoadModifier MODIFIER, typename ValueType, typename OffsetT>
|
|
915
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
916
|
+
Load(CacheModifiedInputIterator<MODIFIER, ValueType, OffsetT> block_src_it, T (&dst_items)[ITEMS_PER_THREAD])
|
|
917
|
+
{
|
|
918
|
+
InternalLoadDirectBlockedVectorized<MODIFIER>(linear_tid, block_src_it.ptr, dst_items);
|
|
919
|
+
}
|
|
920
|
+
|
|
921
|
+
// skips vectorization
|
|
922
|
+
template <typename RandomAccessIterator>
|
|
923
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
924
|
+
Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end)
|
|
925
|
+
{
|
|
926
|
+
LoadDirectBlocked(linear_tid, block_src_it, dst_items, block_items_end);
|
|
927
|
+
}
|
|
928
|
+
|
|
929
|
+
// skips vectorization
|
|
930
|
+
template <typename RandomAccessIterator, typename DefaultT>
|
|
931
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
932
|
+
Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default)
|
|
933
|
+
{
|
|
934
|
+
LoadDirectBlocked(linear_tid, block_src_it, dst_items, block_items_end, oob_default);
|
|
935
|
+
}
|
|
936
|
+
};
|
|
937
|
+
|
|
938
|
+
template <int DUMMY>
|
|
939
|
+
struct LoadInternal<BLOCK_LOAD_TRANSPOSE, DUMMY>
|
|
940
|
+
{
|
|
941
|
+
using BlockExchange = BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z>;
|
|
942
|
+
using _TempStorage = typename BlockExchange::TempStorage;
|
|
943
|
+
using TempStorage = Uninitialized<_TempStorage>;
|
|
944
|
+
|
|
945
|
+
_TempStorage& temp_storage;
|
|
946
|
+
int linear_tid;
|
|
947
|
+
|
|
948
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& temp_storage, int linear_tid)
|
|
949
|
+
: temp_storage(temp_storage.Alias())
|
|
950
|
+
, linear_tid(linear_tid)
|
|
951
|
+
{}
|
|
952
|
+
|
|
953
|
+
template <typename RandomAccessIterator>
|
|
954
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD])
|
|
955
|
+
{
|
|
956
|
+
LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_src_it, dst_items);
|
|
957
|
+
BlockExchange(temp_storage).StripedToBlocked(dst_items, dst_items);
|
|
958
|
+
}
|
|
959
|
+
|
|
960
|
+
template <typename RandomAccessIterator>
|
|
961
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
962
|
+
Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end)
|
|
963
|
+
{
|
|
964
|
+
LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_src_it, dst_items, block_items_end);
|
|
965
|
+
BlockExchange(temp_storage).StripedToBlocked(dst_items, dst_items);
|
|
966
|
+
}
|
|
967
|
+
|
|
968
|
+
template <typename RandomAccessIterator, typename DefaultT>
|
|
969
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
970
|
+
Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default)
|
|
971
|
+
{
|
|
972
|
+
LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_src_it, dst_items, block_items_end, oob_default);
|
|
973
|
+
BlockExchange(temp_storage).StripedToBlocked(dst_items, dst_items);
|
|
974
|
+
}
|
|
975
|
+
};
|
|
976
|
+
|
|
977
|
+
template <int DUMMY>
|
|
978
|
+
struct LoadInternal<BLOCK_LOAD_WARP_TRANSPOSE, DUMMY>
|
|
979
|
+
{
|
|
980
|
+
static constexpr int WARP_THREADS = detail::warp_threads;
|
|
981
|
+
static_assert(BLOCK_THREADS % WARP_THREADS == 0, "BLOCK_THREADS must be a multiple of WARP_THREADS");
|
|
982
|
+
|
|
983
|
+
using BlockExchange = BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z>;
|
|
984
|
+
using _TempStorage = typename BlockExchange::TempStorage;
|
|
985
|
+
using TempStorage = Uninitialized<_TempStorage>;
|
|
986
|
+
|
|
987
|
+
_TempStorage& temp_storage;
|
|
988
|
+
int linear_tid;
|
|
989
|
+
|
|
990
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& temp_storage, int linear_tid)
|
|
991
|
+
: temp_storage(temp_storage.Alias())
|
|
992
|
+
, linear_tid(linear_tid)
|
|
993
|
+
{}
|
|
994
|
+
|
|
995
|
+
template <typename RandomAccessIterator>
|
|
996
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD])
|
|
997
|
+
{
|
|
998
|
+
LoadDirectWarpStriped(linear_tid, block_src_it, dst_items);
|
|
999
|
+
BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items);
|
|
1000
|
+
}
|
|
1001
|
+
|
|
1002
|
+
template <typename RandomAccessIterator>
|
|
1003
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
1004
|
+
Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end)
|
|
1005
|
+
{
|
|
1006
|
+
LoadDirectWarpStriped(linear_tid, block_src_it, dst_items, block_items_end);
|
|
1007
|
+
BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items);
|
|
1008
|
+
}
|
|
1009
|
+
|
|
1010
|
+
template <typename RandomAccessIterator, typename DefaultT>
|
|
1011
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
1012
|
+
Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default)
|
|
1013
|
+
{
|
|
1014
|
+
LoadDirectWarpStriped(linear_tid, block_src_it, dst_items, block_items_end, oob_default);
|
|
1015
|
+
BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items);
|
|
1016
|
+
}
|
|
1017
|
+
};
|
|
1018
|
+
|
|
1019
|
+
template <int DUMMY>
|
|
1020
|
+
struct LoadInternal<BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY>
|
|
1021
|
+
{
|
|
1022
|
+
static constexpr int WARP_THREADS = detail::warp_threads;
|
|
1023
|
+
static_assert(BLOCK_THREADS % WARP_THREADS == 0, "BLOCK_THREADS must be a multiple of WARP_THREADS");
|
|
1024
|
+
|
|
1025
|
+
using BlockExchange = BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, true, BLOCK_DIM_Y, BLOCK_DIM_Z>;
|
|
1026
|
+
using _TempStorage = typename BlockExchange::TempStorage;
|
|
1027
|
+
using TempStorage = Uninitialized<_TempStorage>;
|
|
1028
|
+
|
|
1029
|
+
_TempStorage& temp_storage;
|
|
1030
|
+
int linear_tid;
|
|
1031
|
+
|
|
1032
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE LoadInternal(TempStorage& temp_storage, int linear_tid)
|
|
1033
|
+
: temp_storage(temp_storage.Alias())
|
|
1034
|
+
, linear_tid(linear_tid)
|
|
1035
|
+
{}
|
|
1036
|
+
|
|
1037
|
+
template <typename RandomAccessIterator>
|
|
1038
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD])
|
|
1039
|
+
{
|
|
1040
|
+
LoadDirectWarpStriped(linear_tid, block_src_it, dst_items);
|
|
1041
|
+
BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items);
|
|
1042
|
+
}
|
|
1043
|
+
|
|
1044
|
+
template <typename RandomAccessIterator>
|
|
1045
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
1046
|
+
Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end)
|
|
1047
|
+
{
|
|
1048
|
+
LoadDirectWarpStriped(linear_tid, block_src_it, dst_items, block_items_end);
|
|
1049
|
+
BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items);
|
|
1050
|
+
}
|
|
1051
|
+
|
|
1052
|
+
template <typename RandomAccessIterator, typename DefaultT>
|
|
1053
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
1054
|
+
Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default)
|
|
1055
|
+
{
|
|
1056
|
+
LoadDirectWarpStriped(linear_tid, block_src_it, dst_items, block_items_end, oob_default);
|
|
1057
|
+
BlockExchange(temp_storage).WarpStripedToBlocked(dst_items, dst_items);
|
|
1058
|
+
}
|
|
1059
|
+
};
|
|
1060
|
+
|
|
1061
|
+
using InternalLoad = LoadInternal<ALGORITHM, 0>; // load implementation to use
|
|
1062
|
+
using _TempStorage = typename InternalLoad::TempStorage;
|
|
1063
|
+
|
|
1064
|
+
// Internal storage allocator
|
|
1065
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE _TempStorage& PrivateStorage()
|
|
1066
|
+
{
|
|
1067
|
+
__shared__ _TempStorage private_storage;
|
|
1068
|
+
return private_storage;
|
|
1069
|
+
}
|
|
1070
|
+
|
|
1071
|
+
_TempStorage& temp_storage;
|
|
1072
|
+
int linear_tid;
|
|
1073
|
+
|
|
1074
|
+
public:
|
|
1075
|
+
/// @smemstorage{BlockLoad}
|
|
1076
|
+
using TempStorage = Uninitialized<_TempStorage>;
|
|
1077
|
+
|
|
1078
|
+
//! @name Collective constructors
|
|
1079
|
+
//! @{
|
|
1080
|
+
|
|
1081
|
+
/// @brief Collective constructor using a private static allocation of shared memory as temporary storage.
|
|
1082
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE BlockLoad()
|
|
1083
|
+
: temp_storage(PrivateStorage())
|
|
1084
|
+
, linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
|
|
1085
|
+
{}
|
|
1086
|
+
|
|
1087
|
+
/// @brief Collective constructor using the specified memory allocation as temporary storage.
|
|
1088
|
+
/// @param[in] temp_storage Reference to memory allocation having layout type TempStorage
|
|
1089
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE BlockLoad(TempStorage& temp_storage)
|
|
1090
|
+
: temp_storage(temp_storage.Alias())
|
|
1091
|
+
, linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
|
|
1092
|
+
{}
|
|
1093
|
+
|
|
1094
|
+
//! @} end member group
|
|
1095
|
+
//! @name Data movement
|
|
1096
|
+
//! @{
|
|
1097
|
+
|
|
1098
|
+
//! @rst
|
|
1099
|
+
//! Load a linear segment of items from memory.
|
|
1100
|
+
//!
|
|
1101
|
+
//! - @blocked
|
|
1102
|
+
//! - @smemreuse
|
|
1103
|
+
//!
|
|
1104
|
+
//! Snippet
|
|
1105
|
+
//! +++++++
|
|
1106
|
+
//!
|
|
1107
|
+
//! The code snippet below illustrates the loading of a linear segment of 512 integers into a "blocked" arrangement
|
|
1108
|
+
//! across 128 threads where each thread owns 4 consecutive items. The load is specialized for
|
|
1109
|
+
//! ``BLOCK_LOAD_WARP_TRANSPOSE``, meaning memory references are efficiently coalesced using a warp-striped access
|
|
1110
|
+
//! pattern (after which items are locally reordered among threads).
|
|
1111
|
+
//!
|
|
1112
|
+
//! .. code-block:: c++
|
|
1113
|
+
//!
|
|
1114
|
+
//! #include <cub/cub.cuh> // or equivalently <cub/block/block_load.cuh>
|
|
1115
|
+
//!
|
|
1116
|
+
//! __global__ void ExampleKernel(int *d_data, ...)
|
|
1117
|
+
//! {
|
|
1118
|
+
//! // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
|
|
1119
|
+
//! using BlockLoad = cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE>;
|
|
1120
|
+
//!
|
|
1121
|
+
//! // Allocate shared memory for BlockLoad
|
|
1122
|
+
//! __shared__ typename BlockLoad::TempStorage temp_storage;
|
|
1123
|
+
//!
|
|
1124
|
+
//! // Load a segment of consecutive items that are blocked across threads
|
|
1125
|
+
//! int thread_data[4];
|
|
1126
|
+
//! BlockLoad(temp_storage).Load(d_data, thread_data);
|
|
1127
|
+
//! }
|
|
1128
|
+
//!
|
|
1129
|
+
//! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``. The set of ``thread_data`` across the block of threads
|
|
1130
|
+
//! in those threads will be ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``.
|
|
1131
|
+
//!
|
|
1132
|
+
//! @endrst
|
|
1133
|
+
//!
|
|
1134
|
+
//! @param[in] block_src_it
|
|
1135
|
+
//! The thread block's base iterator for loading from
|
|
1136
|
+
//!
|
|
1137
|
+
//! @param[out] dst_items
|
|
1138
|
+
//! Destination to load data into
|
|
1139
|
+
template <typename RandomAccessIterator>
|
|
1140
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD])
|
|
1141
|
+
{
|
|
1142
|
+
InternalLoad(temp_storage, linear_tid).Load(block_src_it, dst_items);
|
|
1143
|
+
}
|
|
1144
|
+
|
|
1145
|
+
//! @rst
|
|
1146
|
+
//!
|
|
1147
|
+
//! Load a linear segment of items from memory, guarded by range.
|
|
1148
|
+
//!
|
|
1149
|
+
//! - @blocked
|
|
1150
|
+
//! - @smemreuse
|
|
1151
|
+
//!
|
|
1152
|
+
//! Snippet
|
|
1153
|
+
//! +++++++
|
|
1154
|
+
//!
|
|
1155
|
+
//! The code snippet below illustrates the guarded loading of a linear segment of 512 integers into a "blocked"
|
|
1156
|
+
//! arrangement across 128 threads where each thread owns 4 consecutive items. The load is specialized for
|
|
1157
|
+
//! ``BLOCK_LOAD_WARP_TRANSPOSE``, meaning memory references are efficiently coalesced using a warp-striped access
|
|
1158
|
+
//! pattern (after which items are locally reordered among threads).
|
|
1159
|
+
//!
|
|
1160
|
+
//! .. code-block:: c++
|
|
1161
|
+
//!
|
|
1162
|
+
//! #include <cub/cub.cuh> // or equivalently <cub/block/block_load.cuh>
|
|
1163
|
+
//!
|
|
1164
|
+
//! __global__ void ExampleKernel(int *d_data, int block_items_end, ...)
|
|
1165
|
+
//! {
|
|
1166
|
+
//! // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
|
|
1167
|
+
//! using BlockLoad = cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE>;
|
|
1168
|
+
//!
|
|
1169
|
+
//! // Allocate shared memory for BlockLoad
|
|
1170
|
+
//! __shared__ typename BlockLoad::TempStorage temp_storage;
|
|
1171
|
+
//!
|
|
1172
|
+
//! // Load a segment of consecutive items that are blocked across threads
|
|
1173
|
+
//! int thread_data[4];
|
|
1174
|
+
//! BlockLoad(temp_storage).Load(d_data, thread_data, block_items_end);
|
|
1175
|
+
//! }
|
|
1176
|
+
//!
|
|
1177
|
+
//! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, 6...`` and ``block_items_end`` is ``5``. The set of
|
|
1178
|
+
//! ``thread_data`` across the block of threads in those threads will be ``{ [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }``,
|
|
1179
|
+
//! with only the first two threads being unmasked to load portions of valid data (and other items remaining
|
|
1180
|
+
//! unassigned).
|
|
1181
|
+
//!
|
|
1182
|
+
//! @endrst
|
|
1183
|
+
//!
|
|
1184
|
+
//! @param[in] block_src_it
|
|
1185
|
+
//! The thread block's base iterator for loading from
|
|
1186
|
+
//!
|
|
1187
|
+
//! @param[out] dst_items
|
|
1188
|
+
//! Destination to load data into
|
|
1189
|
+
//!
|
|
1190
|
+
//! @param[in] block_items_end
|
|
1191
|
+
//! Number of valid items to load
|
|
1192
|
+
template <typename RandomAccessIterator>
|
|
1193
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
1194
|
+
Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end)
|
|
1195
|
+
{
|
|
1196
|
+
InternalLoad(temp_storage, linear_tid).Load(block_src_it, dst_items, block_items_end);
|
|
1197
|
+
}
|
|
1198
|
+
|
|
1199
|
+
//! @rst
|
|
1200
|
+
//! Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
|
|
1201
|
+
//!
|
|
1202
|
+
//! - @blocked
|
|
1203
|
+
//! - @smemreuse
|
|
1204
|
+
//!
|
|
1205
|
+
//! Snippet
|
|
1206
|
+
//! +++++++
|
|
1207
|
+
//!
|
|
1208
|
+
//! The code snippet below illustrates the guarded loading of a linear segment of 512 integers into a "blocked"
|
|
1209
|
+
//! arrangement across 128 threads where each thread owns 4 consecutive items. The load is specialized for
|
|
1210
|
+
//! ``BLOCK_LOAD_WARP_TRANSPOSE``, meaning memory references are efficiently coalesced using a warp-striped access
|
|
1211
|
+
//! pattern (after which items are locally reordered among threads).
|
|
1212
|
+
//!
|
|
1213
|
+
//! .. code-block:: c++
|
|
1214
|
+
//!
|
|
1215
|
+
//! #include <cub/cub.cuh> // or equivalently <cub/block/block_load.cuh>
|
|
1216
|
+
//!
|
|
1217
|
+
//! __global__ void ExampleKernel(int *d_data, int block_items_end, ...)
|
|
1218
|
+
//! {
|
|
1219
|
+
//! // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
|
|
1220
|
+
//! using BlockLoad = cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE>;
|
|
1221
|
+
//!
|
|
1222
|
+
//! // Allocate shared memory for BlockLoad
|
|
1223
|
+
//! __shared__ typename BlockLoad::TempStorage temp_storage;
|
|
1224
|
+
//!
|
|
1225
|
+
//! // Load a segment of consecutive items that are blocked across threads
|
|
1226
|
+
//! int thread_data[4];
|
|
1227
|
+
//! BlockLoad(temp_storage).Load(d_data, thread_data, block_items_end, -1);
|
|
1228
|
+
//! }
|
|
1229
|
+
//!
|
|
1230
|
+
//! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, 6...``, ``block_items_end`` is ``5``, and the out-of-bounds
|
|
1231
|
+
//! default is ``-1``. The set of ``thread_data`` across the block of threads in those threads will be
|
|
1232
|
+
//! ``{ [0,1,2,3], [4,-1,-1,-1], ..., [-1,-1,-1,-1] }``, with only the first two threads being unmasked to load
|
|
1233
|
+
//! portions of valid data (and other items are assigned ``-1``)
|
|
1234
|
+
//!
|
|
1235
|
+
//! @endrst
|
|
1236
|
+
//!
|
|
1237
|
+
//! @param[in] block_src_it
|
|
1238
|
+
//! The thread block's base iterator for loading from
|
|
1239
|
+
//!
|
|
1240
|
+
//! @param[out] dst_items
|
|
1241
|
+
//! Destination to load data into
|
|
1242
|
+
//!
|
|
1243
|
+
//! @param[in] block_items_end
|
|
1244
|
+
//! Number of valid items to load
|
|
1245
|
+
//!
|
|
1246
|
+
//! @param[in] oob_default
|
|
1247
|
+
//! Default value to assign out-of-bound items
|
|
1248
|
+
template <typename RandomAccessIterator, typename DefaultT>
|
|
1249
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
1250
|
+
Load(RandomAccessIterator block_src_it, T (&dst_items)[ITEMS_PER_THREAD], int block_items_end, DefaultT oob_default)
|
|
1251
|
+
{
|
|
1252
|
+
InternalLoad(temp_storage, linear_tid).Load(block_src_it, dst_items, block_items_end, oob_default);
|
|
1253
|
+
}
|
|
1254
|
+
|
|
1255
|
+
//! @} end member group
|
|
1256
|
+
};
|
|
1257
|
+
|
|
1258
|
+
template <class Policy, class It, class T = cub::detail::it_value_t<It>>
|
|
1259
|
+
struct BlockLoadType
|
|
1260
|
+
{
|
|
1261
|
+
using type = cub::BlockLoad<T, Policy::BLOCK_THREADS, Policy::ITEMS_PER_THREAD, Policy::LOAD_ALGORITHM>;
|
|
1262
|
+
};
|
|
1263
|
+
|
|
1264
|
+
CUB_NAMESPACE_END
|