cuda-cccl 0.3.3__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/__init__.py +27 -0
- cuda/cccl/_cuda_version_utils.py +24 -0
- cuda/cccl/cooperative/__init__.py +9 -0
- cuda/cccl/cooperative/experimental/__init__.py +24 -0
- cuda/cccl/headers/__init__.py +7 -0
- cuda/cccl/headers/include/__init__.py +1 -0
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +259 -0
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1182 -0
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +81 -0
- cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +709 -0
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +234 -0
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +748 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +786 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +286 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +703 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +555 -0
- cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +619 -0
- cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +806 -0
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +1124 -0
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +589 -0
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +474 -0
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +289 -0
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1117 -0
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +346 -0
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +606 -0
- cuda/cccl/headers/include/cub/agent/agent_topk.cuh +764 -0
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +631 -0
- cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1424 -0
- cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +963 -0
- cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1227 -0
- cuda/cccl/headers/include/cub/block/block_exchange.cuh +1313 -0
- cuda/cccl/headers/include/cub/block/block_histogram.cuh +424 -0
- cuda/cccl/headers/include/cub/block/block_load.cuh +1264 -0
- cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
- cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +800 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1225 -0
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2196 -0
- cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +150 -0
- cuda/cccl/headers/include/cub/block/block_reduce.cuh +667 -0
- cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +434 -0
- cuda/cccl/headers/include/cub/block/block_scan.cuh +2315 -0
- cuda/cccl/headers/include/cub/block/block_shuffle.cuh +346 -0
- cuda/cccl/headers/include/cub/block/block_store.cuh +1247 -0
- cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +624 -0
- cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +86 -0
- cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +240 -0
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +252 -0
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +238 -0
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +281 -0
- cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +790 -0
- cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +538 -0
- cuda/cccl/headers/include/cub/config.cuh +53 -0
- cuda/cccl/headers/include/cub/cub.cuh +120 -0
- cuda/cccl/headers/include/cub/detail/array_utils.cuh +78 -0
- cuda/cccl/headers/include/cub/detail/choose_offset.cuh +161 -0
- cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +74 -0
- cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +96 -0
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +62 -0
- cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +253 -0
- cuda/cccl/headers/include/cub/detail/integer_utils.cuh +88 -0
- cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +142 -0
- cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +100 -0
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +114 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/README.md +71 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/array.h +68 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/json.h +62 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/object.h +100 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/string.h +53 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/value.h +95 -0
- cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +63 -0
- cuda/cccl/headers/include/cub/detail/rfa.cuh +731 -0
- cuda/cccl/headers/include/cub/detail/strong_load.cuh +189 -0
- cuda/cccl/headers/include/cub/detail/strong_store.cuh +220 -0
- cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +384 -0
- cuda/cccl/headers/include/cub/detail/type_traits.cuh +187 -0
- cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +73 -0
- cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +56 -0
- cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +596 -0
- cuda/cccl/headers/include/cub/device/device_copy.cuh +276 -0
- cuda/cccl/headers/include/cub/device/device_for.cuh +1063 -0
- cuda/cccl/headers/include/cub/device/device_histogram.cuh +1509 -0
- cuda/cccl/headers/include/cub/device/device_memcpy.cuh +195 -0
- cuda/cccl/headers/include/cub/device/device_merge.cuh +203 -0
- cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +979 -0
- cuda/cccl/headers/include/cub/device/device_partition.cuh +668 -0
- cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3437 -0
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +2518 -0
- cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +370 -0
- cuda/cccl/headers/include/cub/device/device_scan.cuh +2212 -0
- cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1496 -0
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1430 -0
- cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2811 -0
- cuda/cccl/headers/include/cub/device/device_select.cuh +1228 -0
- cuda/cccl/headers/include/cub/device/device_topk.cuh +511 -0
- cuda/cccl/headers/include/cub/device/device_transform.cuh +668 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +315 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +719 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +43 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +198 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1046 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +303 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +473 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1744 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +1310 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +655 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +531 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +313 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +615 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +517 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +602 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +975 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +842 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +341 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +440 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +389 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +627 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +569 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +545 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +261 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/histogram.cuh +505 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/merge_sort.cuh +334 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/radix_sort.cuh +803 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +583 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +189 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +321 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +522 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/three_way_partition.cuh +201 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +1028 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/unique_by_key.cuh +176 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +67 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +118 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +60 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +275 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +76 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +126 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1065 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +493 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +942 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +673 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +618 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +1010 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +398 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1588 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +440 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +85 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +481 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +884 -0
- cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +227 -0
- cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +106 -0
- cuda/cccl/headers/include/cub/grid/grid_queue.cuh +202 -0
- cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +254 -0
- cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +259 -0
- cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +250 -0
- cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +320 -0
- cuda/cccl/headers/include/cub/thread/thread_load.cuh +349 -0
- cuda/cccl/headers/include/cub/thread/thread_operators.cuh +688 -0
- cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +548 -0
- cuda/cccl/headers/include/cub/thread/thread_scan.cuh +498 -0
- cuda/cccl/headers/include/cub/thread/thread_search.cuh +199 -0
- cuda/cccl/headers/include/cub/thread/thread_simd.cuh +458 -0
- cuda/cccl/headers/include/cub/thread/thread_sort.cuh +102 -0
- cuda/cccl/headers/include/cub/thread/thread_store.cuh +365 -0
- cuda/cccl/headers/include/cub/util_allocator.cuh +921 -0
- cuda/cccl/headers/include/cub/util_arch.cuh +167 -0
- cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +95 -0
- cuda/cccl/headers/include/cub/util_debug.cuh +207 -0
- cuda/cccl/headers/include/cub/util_device.cuh +800 -0
- cuda/cccl/headers/include/cub/util_macro.cuh +97 -0
- cuda/cccl/headers/include/cub/util_math.cuh +118 -0
- cuda/cccl/headers/include/cub/util_namespace.cuh +176 -0
- cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +55 -0
- cuda/cccl/headers/include/cub/util_ptx.cuh +513 -0
- cuda/cccl/headers/include/cub/util_temporary_storage.cuh +122 -0
- cuda/cccl/headers/include/cub/util_type.cuh +1120 -0
- cuda/cccl/headers/include/cub/util_vsmem.cuh +253 -0
- cuda/cccl/headers/include/cub/version.cuh +89 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +329 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +177 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +737 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +408 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +952 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +715 -0
- cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +405 -0
- cuda/cccl/headers/include/cub/warp/warp_load.cuh +614 -0
- cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +169 -0
- cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +829 -0
- cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1890 -0
- cuda/cccl/headers/include/cub/warp/warp_store.cuh +521 -0
- cuda/cccl/headers/include/cub/warp/warp_utils.cuh +61 -0
- cuda/cccl/headers/include/cuda/__algorithm/common.h +68 -0
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +196 -0
- cuda/cccl/headers/include/cuda/__algorithm/fill.h +107 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +172 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +217 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +83 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +128 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +210 -0
- cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
- cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +102 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +487 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +74 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_native_handle.h +45 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
- cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +171 -0
- cuda/cccl/headers/include/cuda/__bit/bitfield.h +122 -0
- cuda/cccl/headers/include/cuda/__bit/bitmask.h +90 -0
- cuda/cccl/headers/include/cuda/__cccl_config +37 -0
- cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +124 -0
- cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +178 -0
- cuda/cccl/headers/include/cuda/__cmath/ilog.h +195 -0
- cuda/cccl/headers/include/cuda/__cmath/ipow.h +107 -0
- cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
- cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
- cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
- cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
- cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
- cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
- cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
- cuda/cccl/headers/include/cuda/__complex/complex.h +238 -0
- cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +89 -0
- cuda/cccl/headers/include/cuda/__complex/traits.h +64 -0
- cuda/cccl/headers/include/cuda/__complex_ +28 -0
- cuda/cccl/headers/include/cuda/__device/all_devices.h +140 -0
- cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +537 -0
- cuda/cccl/headers/include/cuda/__device/attributes.h +772 -0
- cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
- cuda/cccl/headers/include/cuda/__device/device_ref.h +156 -0
- cuda/cccl/headers/include/cuda/__device/physical_device.h +172 -0
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +835 -0
- cuda/cccl/headers/include/cuda/__event/event.h +171 -0
- cuda/cccl/headers/include/cuda/__event/event_ref.h +157 -0
- cuda/cccl/headers/include/cuda/__event/timed_event.h +120 -0
- cuda/cccl/headers/include/cuda/__execution/determinism.h +91 -0
- cuda/cccl/headers/include/cuda/__execution/output_ordering.h +89 -0
- cuda/cccl/headers/include/cuda/__execution/require.h +75 -0
- cuda/cccl/headers/include/cuda/__execution/tune.h +70 -0
- cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
- cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +321 -0
- cuda/cccl/headers/include/cuda/__functional/maximum.h +58 -0
- cuda/cccl/headers/include/cuda/__functional/minimum.h +58 -0
- cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +108 -0
- cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
- cuda/cccl/headers/include/cuda/__fwd/barrier_native_handle.h +42 -0
- cuda/cccl/headers/include/cuda/__fwd/complex.h +48 -0
- cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
- cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
- cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
- cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +58 -0
- cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +315 -0
- cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +483 -0
- cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +324 -0
- cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +456 -0
- cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +334 -0
- cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +418 -0
- cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +367 -0
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +528 -0
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +527 -0
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +486 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_function.h +112 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +557 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
- cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +533 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +238 -0
- cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +152 -0
- cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +117 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +60 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +148 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +165 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +53 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +104 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +170 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +59 -0
- cuda/cccl/headers/include/cuda/__memory/address_space.h +227 -0
- cuda/cccl/headers/include/cuda/__memory/align_down.h +56 -0
- cuda/cccl/headers/include/cuda/__memory/align_up.h +56 -0
- cuda/cccl/headers/include/cuda/__memory/aligned_size.h +61 -0
- cuda/cccl/headers/include/cuda/__memory/check_address.h +111 -0
- cuda/cccl/headers/include/cuda/__memory/discard_memory.h +64 -0
- cuda/cccl/headers/include/cuda/__memory/get_device_address.h +58 -0
- cuda/cccl/headers/include/cuda/__memory/is_aligned.h +47 -0
- cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
- cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +82 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +153 -0
- cuda/cccl/headers/include/cuda/__memory_resource/properties.h +113 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource.h +125 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +652 -0
- cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +306 -0
- cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
- cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +59 -0
- cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
- cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +120 -0
- cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2983 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +242 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +197 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +54 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +997 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +318 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +671 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1470 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +132 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +64 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +102 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +64 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +5542 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +399 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +184 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +102 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +143 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +144 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +286 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +290 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2202 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1362 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +236 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +460 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1490 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +132 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +99 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +765 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +58 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +4927 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +4291 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +7110 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +42 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +5063 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +56 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +71 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +1030 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/pragmas/enable_smem_spilling.h +47 -0
- cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
- cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +176 -0
- cuda/cccl/headers/include/cuda/__random/feistel_bijection.h +105 -0
- cuda/cccl/headers/include/cuda/__random/random_bijection.h +88 -0
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +99 -0
- cuda/cccl/headers/include/cuda/__runtime/types.h +41 -0
- cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
- cuda/cccl/headers/include/cuda/__stream/get_stream.h +110 -0
- cuda/cccl/headers/include/cuda/__stream/stream.h +141 -0
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +303 -0
- cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
- cuda/cccl/headers/include/cuda/__type_traits/is_specialization_of.h +37 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/access.h +88 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/any_cast.h +83 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_base.h +148 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_from.h +96 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_fwd.h +128 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +304 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ref.h +337 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +590 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/conversions.h +169 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/dynamic_any_cast.h +107 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/interfaces.h +359 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/iset.h +142 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/overrides.h +64 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/rtti.h +257 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +322 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/storage.h +79 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/tagged_ptr.h +58 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +162 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_functions.h +184 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_ptrs.h +80 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +155 -0
- cuda/cccl/headers/include/cuda/__utility/basic_any.h +507 -0
- cuda/cccl/headers/include/cuda/__utility/immovable.h +50 -0
- cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
- cuda/cccl/headers/include/cuda/__utility/inherit.h +36 -0
- cuda/cccl/headers/include/cuda/__utility/no_init.h +29 -0
- cuda/cccl/headers/include/cuda/__utility/static_for.h +79 -0
- cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
- cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
- cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +251 -0
- cuda/cccl/headers/include/cuda/access_property +26 -0
- cuda/cccl/headers/include/cuda/algorithm +27 -0
- cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
- cuda/cccl/headers/include/cuda/atomic +27 -0
- cuda/cccl/headers/include/cuda/barrier +267 -0
- cuda/cccl/headers/include/cuda/bit +29 -0
- cuda/cccl/headers/include/cuda/cmath +37 -0
- cuda/cccl/headers/include/cuda/devices +33 -0
- cuda/cccl/headers/include/cuda/discard_memory +32 -0
- cuda/cccl/headers/include/cuda/functional +32 -0
- cuda/cccl/headers/include/cuda/iterator +39 -0
- cuda/cccl/headers/include/cuda/latch +27 -0
- cuda/cccl/headers/include/cuda/mdspan +28 -0
- cuda/cccl/headers/include/cuda/memory +35 -0
- cuda/cccl/headers/include/cuda/memory_resource +35 -0
- cuda/cccl/headers/include/cuda/numeric +29 -0
- cuda/cccl/headers/include/cuda/pipeline +579 -0
- cuda/cccl/headers/include/cuda/ptx +129 -0
- cuda/cccl/headers/include/cuda/semaphore +31 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +53 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +58 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +142 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/in_fun_result.h +55 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +90 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +88 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +67 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +139 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/move.h +86 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +93 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +100 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each.h +84 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each_n.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +98 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +155 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
- cuda/cccl/headers/include/cuda/std/__algorithm_ +26 -0
- cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
- cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +136 -0
- cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +208 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +401 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +3971 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +177 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +211 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
- cuda/cccl/headers/include/cuda/std/__atomic/order.h +159 -0
- cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
- cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
- cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +249 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +104 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +72 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +228 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types.h +52 -0
- cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
- cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
- cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
- cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
- cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
- cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +76 -0
- cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +185 -0
- cuda/cccl/headers/include/cuda/std/__bit/countl.h +174 -0
- cuda/cccl/headers/include/cuda/std/__bit/countr.h +185 -0
- cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
- cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
- cuda/cccl/headers/include/cuda/std/__bit/integral.h +126 -0
- cuda/cccl/headers/include/cuda/std/__bit/popcount.h +154 -0
- cuda/cccl/headers/include/cuda/std/__bit/reference.h +1272 -0
- cuda/cccl/headers/include/cuda/std/__bit/rotate.h +94 -0
- cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
- cuda/cccl/headers/include/cuda/std/__cccl/assert.h +161 -0
- cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +206 -0
- cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +673 -0
- cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +217 -0
- cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +51 -0
- cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +56 -0
- cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +88 -0
- cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +131 -0
- cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +123 -0
- cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +344 -0
- cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +91 -0
- cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +74 -0
- cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +160 -0
- cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
- cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +73 -0
- cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/os.h +54 -0
- cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1286 -0
- cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +281 -0
- cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +253 -0
- cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
- cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
- cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
- cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
- cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
- cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +171 -0
- cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
- cuda/cccl/headers/include/cuda/std/__charconv/from_chars.h +154 -0
- cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
- cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +148 -0
- cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
- cuda/cccl/headers/include/cuda/std/__charconv_ +31 -0
- cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +54 -0
- cuda/cccl/headers/include/cuda/std/__chrono/day.h +162 -0
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +503 -0
- cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +55 -0
- cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +46 -0
- cuda/cccl/headers/include/cuda/std/__chrono/month.h +187 -0
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +60 -0
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +80 -0
- cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +259 -0
- cuda/cccl/headers/include/cuda/std/__chrono/year.h +186 -0
- cuda/cccl/headers/include/cuda/std/__cmath/abs.h +127 -0
- cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +88 -0
- cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +200 -0
- cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +784 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +118 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fma.h +125 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +231 -0
- cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +205 -0
- cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +286 -0
- cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +221 -0
- cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +286 -0
- cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +371 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +167 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +205 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +186 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
- cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +101 -0
- cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +534 -0
- cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +287 -0
- cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
- cuda/cccl/headers/include/cuda/std/__cmath/nan.h +54 -0
- cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +206 -0
- cuda/cccl/headers/include/cuda/std/__cmath/roots.h +199 -0
- cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +984 -0
- cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +56 -0
- cuda/cccl/headers/include/cuda/std/__cmath/traits.h +238 -0
- cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +328 -0
- cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
- cuda/cccl/headers/include/cuda/std/__complex/complex.h +669 -0
- cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +411 -0
- cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +117 -0
- cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +216 -0
- cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
- cuda/cccl/headers/include/cuda/std/__complex/literals.h +86 -0
- cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +303 -0
- cuda/cccl/headers/include/cuda/std/__complex/math.h +159 -0
- cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +323 -0
- cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +322 -0
- cuda/cccl/headers/include/cuda/std/__complex/roots.h +214 -0
- cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
- cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
- cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +130 -0
- cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
- cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
- cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
- cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +45 -0
- cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
- cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
- cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +341 -0
- cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +174 -0
- cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +70 -0
- cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +56 -0
- cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
- cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
- cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +100 -0
- cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
- cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
- cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
- cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
- cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
- cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +39 -0
- cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
- cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
- cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
- cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
- cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +70 -0
- cuda/cccl/headers/include/cuda/std/__cstring/memcpy.h +61 -0
- cuda/cccl/headers/include/cuda/std/__cstring/memset.h +46 -0
- cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +62 -0
- cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +139 -0
- cuda/cccl/headers/include/cuda/std/__exception/terminate.h +73 -0
- cuda/cccl/headers/include/cuda/std/__execution/env.h +455 -0
- cuda/cccl/headers/include/cuda/std/__execution/policy.h +88 -0
- cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
- cuda/cccl/headers/include/cuda/std/__expected/expected.h +1941 -0
- cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1050 -0
- cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
- cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +165 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +812 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +376 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +124 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +116 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/decompose.h +69 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +40 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +78 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
- cuda/cccl/headers/include/cuda/std/__format/buffer.h +48 -0
- cuda/cccl/headers/include/cuda/std/__format/concepts.h +69 -0
- cuda/cccl/headers/include/cuda/std/__format/format_arg.h +282 -0
- cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +279 -0
- cuda/cccl/headers/include/cuda/std/__format/format_args.h +122 -0
- cuda/cccl/headers/include/cuda/std/__format/format_context.h +92 -0
- cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
- cuda/cccl/headers/include/cuda/std/__format/format_integral.h +237 -0
- cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +124 -0
- cuda/cccl/headers/include/cuda/std/__format/format_spec_parser.h +1230 -0
- cuda/cccl/headers/include/cuda/std/__format/formatter.h +59 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/bool.h +101 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/char.h +124 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/fp.h +101 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/int.h +174 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/ptr.h +104 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/str.h +178 -0
- cuda/cccl/headers/include/cuda/std/__format/output_utils.h +272 -0
- cuda/cccl/headers/include/cuda/std/__format/parse_arg_id.h +138 -0
- cuda/cccl/headers/include/cuda/std/__format_ +45 -0
- cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
- cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
- cuda/cccl/headers/include/cuda/std/__functional/bind.h +334 -0
- cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +80 -0
- cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +73 -0
- cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
- cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
- cuda/cccl/headers/include/cuda/std/__functional/compose.h +68 -0
- cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +75 -0
- cuda/cccl/headers/include/cuda/std/__functional/function.h +1275 -0
- cuda/cccl/headers/include/cuda/std/__functional/hash.h +649 -0
- cuda/cccl/headers/include/cuda/std/__functional/identity.h +57 -0
- cuda/cccl/headers/include/cuda/std/__functional/invoke.h +296 -0
- cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +41 -0
- cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +66 -0
- cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +211 -0
- cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
- cuda/cccl/headers/include/cuda/std/__functional/operations.h +534 -0
- cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +128 -0
- cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +64 -0
- cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +63 -0
- cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
- cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +113 -0
- cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
- cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
- cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
- cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +262 -0
- cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +53 -0
- cuda/cccl/headers/include/cuda/std/__fwd/array.h +42 -0
- cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +74 -0
- cuda/cccl/headers/include/cuda/std/__fwd/complex.h +75 -0
- cuda/cccl/headers/include/cuda/std/__fwd/expected.h +46 -0
- cuda/cccl/headers/include/cuda/std/__fwd/format.h +84 -0
- cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
- cuda/cccl/headers/include/cuda/std/__fwd/get.h +123 -0
- cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
- cuda/cccl/headers/include/cuda/std/__fwd/iterator.h +43 -0
- cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +122 -0
- cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
- cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
- cuda/cccl/headers/include/cuda/std/__fwd/pair.h +47 -0
- cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +34 -0
- cuda/cccl/headers/include/cuda/std/__fwd/span.h +45 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string.h +112 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +91 -0
- cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
- cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +34 -0
- cuda/cccl/headers/include/cuda/std/__fwd/unexpected.h +40 -0
- cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
- cuda/cccl/headers/include/cuda/std/__internal/features.h +72 -0
- cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +143 -0
- cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
- cuda/cccl/headers/include/cuda/std/__iterator/advance.h +228 -0
- cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +163 -0
- cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +253 -0
- cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
- cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +464 -0
- cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
- cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
- cuda/cccl/headers/include/cuda/std/__iterator/distance.h +126 -0
- cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
- cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
- cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +99 -0
- cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +143 -0
- cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
- cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +107 -0
- cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +146 -0
- cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +161 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +163 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +847 -0
- cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
- cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +432 -0
- cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
- cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
- cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +95 -0
- cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
- cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
- cuda/cccl/headers/include/cuda/std/__iterator/prev.h +90 -0
- cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
- cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +156 -0
- cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
- cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +371 -0
- cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
- cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
- cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
- cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +245 -0
- cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
- cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +617 -0
- cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +753 -0
- cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +78 -0
- cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +54 -0
- cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +139 -0
- cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +132 -0
- cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +321 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +97 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +139 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +352 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +759 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +314 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +307 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +605 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +512 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +193 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +189 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +344 -0
- cuda/cccl/headers/include/cuda/std/__memory/addressof.h +67 -0
- cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +81 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator.h +320 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +84 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +525 -0
- cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +60 -0
- cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
- cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +225 -0
- cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +246 -0
- cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
- cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +46 -0
- cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +246 -0
- cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
- cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +92 -0
- cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +678 -0
- cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +765 -0
- cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +54 -0
- cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
- cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
- cuda/cccl/headers/include/cuda/std/__new/allocate.h +126 -0
- cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
- cuda/cccl/headers/include/cuda/std/__new/launder.h +53 -0
- cuda/cccl/headers/include/cuda/std/__new_ +29 -0
- cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
- cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
- cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
- cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +78 -0
- cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
- cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
- cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
- cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +97 -0
- cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
- cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
- cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
- cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
- cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
- cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +74 -0
- cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
- cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
- cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
- cuda/cccl/headers/include/cuda/std/__optional/optional.h +859 -0
- cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +433 -0
- cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
- cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
- cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
- cuda/cccl/headers/include/cuda/std/__random/is_valid.h +106 -0
- cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +398 -0
- cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +335 -0
- cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +183 -0
- cuda/cccl/headers/include/cuda/std/__random_ +29 -0
- cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
- cuda/cccl/headers/include/cuda/std/__ranges/all.h +98 -0
- cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
- cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +302 -0
- cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
- cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
- cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
- cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
- cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
- cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
- cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +78 -0
- cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
- cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +266 -0
- cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +410 -0
- cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +162 -0
- cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +110 -0
- cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +175 -0
- cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
- cuda/cccl/headers/include/cuda/std/__ranges/rend.h +182 -0
- cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +345 -0
- cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
- cuda/cccl/headers/include/cuda/std/__ranges/size.h +201 -0
- cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +513 -0
- cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +476 -0
- cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
- cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
- cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +53 -0
- cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +183 -0
- cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
- cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +234 -0
- cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
- cuda/cccl/headers/include/cuda/std/__string/char_traits.h +191 -0
- cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +581 -0
- cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
- cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
- cuda/cccl/headers/include/cuda/std/__string_ +29 -0
- cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
- cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +106 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +143 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +120 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +260 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +212 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +70 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +84 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +68 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +290 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +149 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +68 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +173 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +46 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +58 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +51 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +211 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_execution_policy.h +81 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +79 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +84 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +84 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +121 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_reference_wrapper.h +50 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +88 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +202 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +46 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +45 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +58 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +45 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +44 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +122 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +131 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +52 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
- cuda/cccl/headers/include/cuda/std/__utility/as_const.h +52 -0
- cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +34 -0
- cuda/cccl/headers/include/cuda/std/__utility/cmp.h +116 -0
- cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
- cuda/cccl/headers/include/cuda/std/__utility/declval.h +76 -0
- cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +161 -0
- cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
- cuda/cccl/headers/include/cuda/std/__utility/forward.h +59 -0
- cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +55 -0
- cuda/cccl/headers/include/cuda/std/__utility/in_place.h +86 -0
- cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +251 -0
- cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
- cuda/cccl/headers/include/cuda/std/__utility/move.h +74 -0
- cuda/cccl/headers/include/cuda/std/__utility/pair.h +791 -0
- cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
- cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +527 -0
- cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
- cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +63 -0
- cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
- cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
- cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
- cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
- cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
- cuda/cccl/headers/include/cuda/std/array +518 -0
- cuda/cccl/headers/include/cuda/std/atomic +810 -0
- cuda/cccl/headers/include/cuda/std/barrier +42 -0
- cuda/cccl/headers/include/cuda/std/bit +35 -0
- cuda/cccl/headers/include/cuda/std/bitset +994 -0
- cuda/cccl/headers/include/cuda/std/cassert +28 -0
- cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
- cuda/cccl/headers/include/cuda/std/cfloat +59 -0
- cuda/cccl/headers/include/cuda/std/chrono +26 -0
- cuda/cccl/headers/include/cuda/std/climits +61 -0
- cuda/cccl/headers/include/cuda/std/cmath +87 -0
- cuda/cccl/headers/include/cuda/std/complex +50 -0
- cuda/cccl/headers/include/cuda/std/concepts +48 -0
- cuda/cccl/headers/include/cuda/std/cstddef +28 -0
- cuda/cccl/headers/include/cuda/std/cstdint +178 -0
- cuda/cccl/headers/include/cuda/std/cstdlib +30 -0
- cuda/cccl/headers/include/cuda/std/cstring +110 -0
- cuda/cccl/headers/include/cuda/std/ctime +154 -0
- cuda/cccl/headers/include/cuda/std/detail/__config +45 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +207 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1721 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +2509 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/iosfwd +128 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/stdexcept +120 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +1365 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +2144 -0
- cuda/cccl/headers/include/cuda/std/execution +29 -0
- cuda/cccl/headers/include/cuda/std/expected +30 -0
- cuda/cccl/headers/include/cuda/std/functional +56 -0
- cuda/cccl/headers/include/cuda/std/initializer_list +44 -0
- cuda/cccl/headers/include/cuda/std/inplace_vector +2170 -0
- cuda/cccl/headers/include/cuda/std/iterator +70 -0
- cuda/cccl/headers/include/cuda/std/latch +34 -0
- cuda/cccl/headers/include/cuda/std/limits +28 -0
- cuda/cccl/headers/include/cuda/std/linalg +30 -0
- cuda/cccl/headers/include/cuda/std/mdspan +38 -0
- cuda/cccl/headers/include/cuda/std/memory +39 -0
- cuda/cccl/headers/include/cuda/std/numbers +346 -0
- cuda/cccl/headers/include/cuda/std/numeric +41 -0
- cuda/cccl/headers/include/cuda/std/optional +31 -0
- cuda/cccl/headers/include/cuda/std/ranges +69 -0
- cuda/cccl/headers/include/cuda/std/ratio +416 -0
- cuda/cccl/headers/include/cuda/std/semaphore +31 -0
- cuda/cccl/headers/include/cuda/std/source_location +83 -0
- cuda/cccl/headers/include/cuda/std/span +628 -0
- cuda/cccl/headers/include/cuda/std/string_view +925 -0
- cuda/cccl/headers/include/cuda/std/tuple +26 -0
- cuda/cccl/headers/include/cuda/std/type_traits +177 -0
- cuda/cccl/headers/include/cuda/std/utility +70 -0
- cuda/cccl/headers/include/cuda/std/variant +25 -0
- cuda/cccl/headers/include/cuda/std/version +240 -0
- cuda/cccl/headers/include/cuda/stream +31 -0
- cuda/cccl/headers/include/cuda/stream_ref +59 -0
- cuda/cccl/headers/include/cuda/type_traits +27 -0
- cuda/cccl/headers/include/cuda/utility +28 -0
- cuda/cccl/headers/include/cuda/version +16 -0
- cuda/cccl/headers/include/cuda/warp +28 -0
- cuda/cccl/headers/include/cuda/work_stealing +26 -0
- cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
- cuda/cccl/headers/include/nv/detail/__target_macros +718 -0
- cuda/cccl/headers/include/nv/target +240 -0
- cuda/cccl/headers/include/thrust/addressof.h +22 -0
- cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
- cuda/cccl/headers/include/thrust/advance.h +57 -0
- cuda/cccl/headers/include/thrust/allocate_unique.h +299 -0
- cuda/cccl/headers/include/thrust/binary_search.h +1910 -0
- cuda/cccl/headers/include/thrust/complex.h +858 -0
- cuda/cccl/headers/include/thrust/copy.h +506 -0
- cuda/cccl/headers/include/thrust/count.h +245 -0
- cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +95 -0
- cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +626 -0
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +192 -0
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +96 -0
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +81 -0
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +78 -0
- cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +76 -0
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +115 -0
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +116 -0
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +77 -0
- cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +99 -0
- cuda/cccl/headers/include/thrust/detail/binary_search.inl +525 -0
- cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
- cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +255 -0
- cuda/cccl/headers/include/thrust/detail/complex/c99math.h +64 -0
- cuda/cccl/headers/include/thrust/detail/complex/catrig.h +875 -0
- cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +589 -0
- cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +233 -0
- cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +161 -0
- cuda/cccl/headers/include/thrust/detail/complex/cexp.h +195 -0
- cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +173 -0
- cuda/cccl/headers/include/thrust/detail/complex/clog.h +223 -0
- cuda/cccl/headers/include/thrust/detail/complex/clogf.h +210 -0
- cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
- cuda/cccl/headers/include/thrust/detail/complex/cpow.h +50 -0
- cuda/cccl/headers/include/thrust/detail/complex/cproj.h +81 -0
- cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
- cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +168 -0
- cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
- cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
- cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +208 -0
- cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +133 -0
- cuda/cccl/headers/include/thrust/detail/complex/math_private.h +138 -0
- cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
- cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
- cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
- cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
- cuda/cccl/headers/include/thrust/detail/config/device_system.h +55 -0
- cuda/cccl/headers/include/thrust/detail/config/host_system.h +48 -0
- cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
- cuda/cccl/headers/include/thrust/detail/config/namespace.h +162 -0
- cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
- cuda/cccl/headers/include/thrust/detail/config.h +36 -0
- cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +228 -0
- cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +273 -0
- cuda/cccl/headers/include/thrust/detail/copy.h +72 -0
- cuda/cccl/headers/include/thrust/detail/copy.inl +129 -0
- cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
- cuda/cccl/headers/include/thrust/detail/copy_if.inl +102 -0
- cuda/cccl/headers/include/thrust/detail/count.h +55 -0
- cuda/cccl/headers/include/thrust/detail/count.inl +89 -0
- cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
- cuda/cccl/headers/include/thrust/detail/equal.inl +93 -0
- cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
- cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +81 -0
- cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
- cuda/cccl/headers/include/thrust/detail/execution_policy.h +120 -0
- cuda/cccl/headers/include/thrust/detail/extrema.inl +184 -0
- cuda/cccl/headers/include/thrust/detail/fill.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/find.inl +113 -0
- cuda/cccl/headers/include/thrust/detail/for_each.inl +84 -0
- cuda/cccl/headers/include/thrust/detail/function.h +49 -0
- cuda/cccl/headers/include/thrust/detail/functional/actor.h +214 -0
- cuda/cccl/headers/include/thrust/detail/functional/operators.h +386 -0
- cuda/cccl/headers/include/thrust/detail/gather.inl +173 -0
- cuda/cccl/headers/include/thrust/detail/generate.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +62 -0
- cuda/cccl/headers/include/thrust/detail/inner_product.inl +118 -0
- cuda/cccl/headers/include/thrust/detail/internal_functional.h +328 -0
- cuda/cccl/headers/include/thrust/detail/logical.inl +113 -0
- cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +77 -0
- cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
- cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +209 -0
- cuda/cccl/headers/include/thrust/detail/merge.inl +276 -0
- cuda/cccl/headers/include/thrust/detail/mismatch.inl +94 -0
- cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +124 -0
- cuda/cccl/headers/include/thrust/detail/partition.inl +378 -0
- cuda/cccl/headers/include/thrust/detail/pointer.h +309 -0
- cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
- cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
- cuda/cccl/headers/include/thrust/detail/range/head_flags.h +116 -0
- cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +130 -0
- cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
- cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +192 -0
- cuda/cccl/headers/include/thrust/detail/reduce.inl +377 -0
- cuda/cccl/headers/include/thrust/detail/reference.h +494 -0
- cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
- cuda/cccl/headers/include/thrust/detail/remove.inl +213 -0
- cuda/cccl/headers/include/thrust/detail/replace.inl +231 -0
- cuda/cccl/headers/include/thrust/detail/reverse.inl +88 -0
- cuda/cccl/headers/include/thrust/detail/scan.inl +518 -0
- cuda/cccl/headers/include/thrust/detail/scatter.inl +157 -0
- cuda/cccl/headers/include/thrust/detail/seq.h +66 -0
- cuda/cccl/headers/include/thrust/detail/sequence.inl +109 -0
- cuda/cccl/headers/include/thrust/detail/set_operations.inl +981 -0
- cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/sort.inl +373 -0
- cuda/cccl/headers/include/thrust/detail/static_assert.h +58 -0
- cuda/cccl/headers/include/thrust/detail/static_map.h +167 -0
- cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +65 -0
- cuda/cccl/headers/include/thrust/detail/tabulate.inl +62 -0
- cuda/cccl/headers/include/thrust/detail/temporary_array.h +153 -0
- cuda/cccl/headers/include/thrust/detail/temporary_array.inl +120 -0
- cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +81 -0
- cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +69 -0
- cuda/cccl/headers/include/thrust/detail/transform_scan.inl +161 -0
- cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +130 -0
- cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +61 -0
- cuda/cccl/headers/include/thrust/detail/type_deduction.h +62 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +47 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +167 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +69 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +59 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +46 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +89 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
- cuda/cccl/headers/include/thrust/detail/type_traits.h +136 -0
- cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +90 -0
- cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/unique.inl +373 -0
- cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
- cuda/cccl/headers/include/thrust/detail/vector_base.h +613 -0
- cuda/cccl/headers/include/thrust/detail/vector_base.inl +1210 -0
- cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
- cuda/cccl/headers/include/thrust/device_delete.h +74 -0
- cuda/cccl/headers/include/thrust/device_free.h +85 -0
- cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
- cuda/cccl/headers/include/thrust/device_malloc.h +84 -0
- cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
- cuda/cccl/headers/include/thrust/device_new.h +112 -0
- cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
- cuda/cccl/headers/include/thrust/device_ptr.h +196 -0
- cuda/cccl/headers/include/thrust/device_reference.h +983 -0
- cuda/cccl/headers/include/thrust/device_vector.h +576 -0
- cuda/cccl/headers/include/thrust/distance.h +43 -0
- cuda/cccl/headers/include/thrust/equal.h +247 -0
- cuda/cccl/headers/include/thrust/execution_policy.h +251 -0
- cuda/cccl/headers/include/thrust/extrema.h +657 -0
- cuda/cccl/headers/include/thrust/fill.h +200 -0
- cuda/cccl/headers/include/thrust/find.h +382 -0
- cuda/cccl/headers/include/thrust/for_each.h +261 -0
- cuda/cccl/headers/include/thrust/functional.h +395 -0
- cuda/cccl/headers/include/thrust/gather.h +464 -0
- cuda/cccl/headers/include/thrust/generate.h +193 -0
- cuda/cccl/headers/include/thrust/host_vector.h +576 -0
- cuda/cccl/headers/include/thrust/inner_product.h +264 -0
- cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +221 -0
- cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +335 -0
- cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
- cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
- cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +38 -0
- cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +38 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +81 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +60 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +65 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +57 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +182 -0
- cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +58 -0
- cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
- cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
- cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +81 -0
- cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +174 -0
- cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +211 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +659 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +334 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_traversal_tags.h +64 -0
- cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +194 -0
- cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
- cuda/cccl/headers/include/thrust/iterator/retag.h +72 -0
- cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +51 -0
- cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +185 -0
- cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
- cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +152 -0
- cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +226 -0
- cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +351 -0
- cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
- cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +359 -0
- cuda/cccl/headers/include/thrust/logical.h +290 -0
- cuda/cccl/headers/include/thrust/memory.h +299 -0
- cuda/cccl/headers/include/thrust/merge.h +725 -0
- cuda/cccl/headers/include/thrust/mismatch.h +261 -0
- cuda/cccl/headers/include/thrust/mr/allocator.h +229 -0
- cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +41 -0
- cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +528 -0
- cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +118 -0
- cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +67 -0
- cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +67 -0
- cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +38 -0
- cuda/cccl/headers/include/thrust/mr/memory_resource.h +217 -0
- cuda/cccl/headers/include/thrust/mr/new.h +100 -0
- cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +63 -0
- cuda/cccl/headers/include/thrust/mr/pool.h +528 -0
- cuda/cccl/headers/include/thrust/mr/pool_options.h +174 -0
- cuda/cccl/headers/include/thrust/mr/sync_pool.h +114 -0
- cuda/cccl/headers/include/thrust/mr/tls_pool.h +64 -0
- cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
- cuda/cccl/headers/include/thrust/mr/validator.h +56 -0
- cuda/cccl/headers/include/thrust/pair.h +99 -0
- cuda/cccl/headers/include/thrust/partition.h +1391 -0
- cuda/cccl/headers/include/thrust/per_device_resource.h +98 -0
- cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +184 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +155 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +104 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +151 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +53 -0
- cuda/cccl/headers/include/thrust/random/detail/mod.h +101 -0
- cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +187 -0
- cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +160 -0
- cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +63 -0
- cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +201 -0
- cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +198 -0
- cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +200 -0
- cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +183 -0
- cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +187 -0
- cuda/cccl/headers/include/thrust/random/discard_block_engine.h +240 -0
- cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +289 -0
- cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +217 -0
- cuda/cccl/headers/include/thrust/random/normal_distribution.h +257 -0
- cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +247 -0
- cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +261 -0
- cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +258 -0
- cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +255 -0
- cuda/cccl/headers/include/thrust/random.h +120 -0
- cuda/cccl/headers/include/thrust/reduce.h +1113 -0
- cuda/cccl/headers/include/thrust/remove.h +768 -0
- cuda/cccl/headers/include/thrust/replace.h +826 -0
- cuda/cccl/headers/include/thrust/reverse.h +215 -0
- cuda/cccl/headers/include/thrust/scan.h +1671 -0
- cuda/cccl/headers/include/thrust/scatter.h +446 -0
- cuda/cccl/headers/include/thrust/sequence.h +277 -0
- cuda/cccl/headers/include/thrust/set_operations.h +3026 -0
- cuda/cccl/headers/include/thrust/shuffle.h +182 -0
- cuda/cccl/headers/include/thrust/sort.h +1320 -0
- cuda/cccl/headers/include/thrust/swap.h +147 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +109 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +60 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +63 -0
- cuda/cccl/headers/include/thrust/system/cpp/memory.h +106 -0
- cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +72 -0
- cuda/cccl/headers/include/thrust/system/cpp/pointer.h +120 -0
- cuda/cccl/headers/include/thrust/system/cpp/vector.h +96 -0
- cuda/cccl/headers/include/thrust/system/cuda/config.h +126 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +219 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +273 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +255 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +289 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +191 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +593 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +243 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +233 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +64 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +96 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +264 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +476 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +100 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +170 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +91 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +60 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +75 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +80 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +61 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +121 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +57 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +228 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +223 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +81 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +405 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +72 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +785 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +1001 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +107 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +122 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +87 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +341 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +414 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +91 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1734 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +469 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +98 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +61 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +429 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +143 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +119 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +117 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +105 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +289 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +310 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +253 -0
- cuda/cccl/headers/include/thrust/system/cuda/error.h +168 -0
- cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +15 -0
- cuda/cccl/headers/include/thrust/system/cuda/memory.h +122 -0
- cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +122 -0
- cuda/cccl/headers/include/thrust/system/cuda/pointer.h +160 -0
- cuda/cccl/headers/include/thrust/system/cuda/vector.h +108 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +52 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/count.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/find.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +61 -0
- cuda/cccl/headers/include/thrust/system/detail/errno.h +120 -0
- cuda/cccl/headers/include/thrust/system/detail/error_category.inl +302 -0
- cuda/cccl/headers/include/thrust/system/detail/error_code.inl +173 -0
- cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +121 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +53 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +79 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +161 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +384 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +45 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +64 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +58 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +146 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/count.h +48 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +84 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +49 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +60 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +66 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +252 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +54 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/find.h +49 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +137 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +58 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +73 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +96 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +45 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +63 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +60 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +72 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +59 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +64 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +86 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +99 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +148 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +49 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +68 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +129 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +207 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +43 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +71 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +100 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +83 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +186 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +86 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +121 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +95 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +175 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +48 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +67 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +63 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +126 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +72 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +85 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +126 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +232 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +73 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +85 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +104 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +282 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +476 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +54 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +125 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +113 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +175 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +44 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +76 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +41 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +54 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +47 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +54 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +82 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +395 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +50 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +56 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +80 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +113 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +45 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +166 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +45 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +115 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +71 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +113 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +81 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +126 -0
- cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +117 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +70 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +42 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +136 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +49 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +119 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +49 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +71 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +52 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +110 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +62 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +74 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +123 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +43 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +141 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +45 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +50 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +75 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +145 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +301 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +64 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +98 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +179 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +154 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +145 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +206 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +59 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +116 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +55 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +356 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +48 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +124 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +48 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +586 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +58 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +115 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +106 -0
- cuda/cccl/headers/include/thrust/system/detail/system_error.inl +108 -0
- cuda/cccl/headers/include/thrust/system/error_code.h +512 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +54 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +77 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +50 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +74 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +56 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +59 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/count.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +50 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +65 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +127 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +66 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/find.h +53 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +56 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +87 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +93 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +88 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +102 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +54 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +54 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +78 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +64 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +65 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +59 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +103 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +72 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +87 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +73 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scan.inl +172 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +36 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +60 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +265 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +60 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +71 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +67 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +75 -0
- cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +62 -0
- cuda/cccl/headers/include/thrust/system/omp/memory.h +111 -0
- cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +75 -0
- cuda/cccl/headers/include/thrust/system/omp/pointer.h +124 -0
- cuda/cccl/headers/include/thrust/system/omp/vector.h +99 -0
- cuda/cccl/headers/include/thrust/system/system_error.h +185 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +54 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +50 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +73 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +47 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +136 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +109 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +66 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +49 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +51 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +91 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +94 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +77 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +327 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +84 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +98 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +54 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +137 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +61 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +400 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +140 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +76 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +87 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +59 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +312 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +33 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +60 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +295 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +60 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +71 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +67 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +75 -0
- cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +62 -0
- cuda/cccl/headers/include/thrust/system/tbb/memory.h +111 -0
- cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +75 -0
- cuda/cccl/headers/include/thrust/system/tbb/pointer.h +124 -0
- cuda/cccl/headers/include/thrust/system/tbb/vector.h +99 -0
- cuda/cccl/headers/include/thrust/system_error.h +57 -0
- cuda/cccl/headers/include/thrust/tabulate.h +125 -0
- cuda/cccl/headers/include/thrust/transform.h +1045 -0
- cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
- cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
- cuda/cccl/headers/include/thrust/tuple.h +139 -0
- cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +261 -0
- cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +154 -0
- cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
- cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +184 -0
- cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +116 -0
- cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +336 -0
- cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
- cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +63 -0
- cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
- cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
- cuda/cccl/headers/include/thrust/unique.h +1088 -0
- cuda/cccl/headers/include/thrust/universal_allocator.h +93 -0
- cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
- cuda/cccl/headers/include/thrust/universal_vector.h +71 -0
- cuda/cccl/headers/include/thrust/version.h +93 -0
- cuda/cccl/headers/include/thrust/zip_function.h +176 -0
- cuda/cccl/headers/include_paths.py +51 -0
- cuda/cccl/parallel/__init__.py +9 -0
- cuda/cccl/parallel/experimental/__init__.py +24 -0
- cuda/cccl/py.typed +0 -0
- cuda/compute/__init__.py +79 -0
- cuda/compute/_bindings.py +79 -0
- cuda/compute/_bindings.pyi +475 -0
- cuda/compute/_bindings_impl.pyx +2273 -0
- cuda/compute/_caching.py +71 -0
- cuda/compute/_cccl_interop.py +422 -0
- cuda/compute/_utils/__init__.py +0 -0
- cuda/compute/_utils/protocols.py +132 -0
- cuda/compute/_utils/temp_storage_buffer.py +86 -0
- cuda/compute/algorithms/__init__.py +54 -0
- cuda/compute/algorithms/_histogram.py +243 -0
- cuda/compute/algorithms/_merge_sort.py +225 -0
- cuda/compute/algorithms/_radix_sort.py +312 -0
- cuda/compute/algorithms/_reduce.py +182 -0
- cuda/compute/algorithms/_scan.py +331 -0
- cuda/compute/algorithms/_segmented_reduce.py +257 -0
- cuda/compute/algorithms/_three_way_partition.py +261 -0
- cuda/compute/algorithms/_transform.py +329 -0
- cuda/compute/algorithms/_unique_by_key.py +252 -0
- cuda/compute/cccl/.gitkeep +0 -0
- cuda/compute/cu12/_bindings_impl.cp313-win_amd64.pyd +0 -0
- cuda/compute/cu12/cccl/cccl.c.parallel.dll +0 -0
- cuda/compute/cu12/cccl/cccl.c.parallel.lib +0 -0
- cuda/compute/cu13/_bindings_impl.cp313-win_amd64.pyd +0 -0
- cuda/compute/cu13/cccl/cccl.c.parallel.dll +0 -0
- cuda/compute/cu13/cccl/cccl.c.parallel.lib +0 -0
- cuda/compute/iterators/__init__.py +21 -0
- cuda/compute/iterators/_factories.py +219 -0
- cuda/compute/iterators/_iterators.py +817 -0
- cuda/compute/iterators/_zip_iterator.py +199 -0
- cuda/compute/numba_utils.py +53 -0
- cuda/compute/op.py +3 -0
- cuda/compute/struct.py +272 -0
- cuda/compute/typing.py +37 -0
- cuda/coop/__init__.py +8 -0
- cuda/coop/_caching.py +48 -0
- cuda/coop/_common.py +275 -0
- cuda/coop/_nvrtc.py +92 -0
- cuda/coop/_scan_op.py +181 -0
- cuda/coop/_types.py +937 -0
- cuda/coop/_typing.py +107 -0
- cuda/coop/block/__init__.py +39 -0
- cuda/coop/block/_block_exchange.py +251 -0
- cuda/coop/block/_block_load_store.py +215 -0
- cuda/coop/block/_block_merge_sort.py +125 -0
- cuda/coop/block/_block_radix_sort.py +214 -0
- cuda/coop/block/_block_reduce.py +294 -0
- cuda/coop/block/_block_scan.py +983 -0
- cuda/coop/warp/__init__.py +9 -0
- cuda/coop/warp/_warp_merge_sort.py +92 -0
- cuda/coop/warp/_warp_reduce.py +153 -0
- cuda/coop/warp/_warp_scan.py +78 -0
- cuda_cccl-0.3.3.dist-info/METADATA +41 -0
- cuda_cccl-0.3.3.dist-info/RECORD +1968 -0
- cuda_cccl-0.3.3.dist-info/WHEEL +5 -0
- cuda_cccl-0.3.3.dist-info/licenses/LICENSE +1 -0
|
@@ -0,0 +1,1182 @@
|
|
|
1
|
+
/******************************************************************************
|
|
2
|
+
* Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
|
|
3
|
+
*
|
|
4
|
+
* Redistribution and use in source and binary forms, with or without
|
|
5
|
+
* modification, are permitted provided that the following conditions are met:
|
|
6
|
+
* * Redistributions of source code must retain the above copyright
|
|
7
|
+
* notice, this list of conditions and the following disclaimer.
|
|
8
|
+
* * Redistributions in binary form must reproduce the above copyright
|
|
9
|
+
* notice, this list of conditions and the following disclaimer in the
|
|
10
|
+
* documentation and/or other materials provided with the distribution.
|
|
11
|
+
* * Neither the name of the NVIDIA CORPORATION nor the
|
|
12
|
+
* names of its contributors may be used to endorse or promote products
|
|
13
|
+
* derived from this software without specific prior written permission.
|
|
14
|
+
*
|
|
15
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
|
16
|
+
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
17
|
+
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
18
|
+
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
|
|
19
|
+
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
|
20
|
+
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
21
|
+
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
|
22
|
+
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
23
|
+
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
24
|
+
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
25
|
+
*
|
|
26
|
+
******************************************************************************/
|
|
27
|
+
|
|
28
|
+
/**
|
|
29
|
+
* \file
|
|
30
|
+
* cub::AgentBatchMemcpy implements device-wide copying of a batch of device-accessible
|
|
31
|
+
* source-buffers to device-accessible destination-buffers.
|
|
32
|
+
*/
|
|
33
|
+
|
|
34
|
+
#pragma once
|
|
35
|
+
|
|
36
|
+
#include <cub/config.cuh>
|
|
37
|
+
|
|
38
|
+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
|
|
39
|
+
# pragma GCC system_header
|
|
40
|
+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
|
|
41
|
+
# pragma clang system_header
|
|
42
|
+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
|
|
43
|
+
# pragma system_header
|
|
44
|
+
#endif // no system header
|
|
45
|
+
|
|
46
|
+
#include <cub/agent/single_pass_scan_operators.cuh>
|
|
47
|
+
#include <cub/block/block_exchange.cuh>
|
|
48
|
+
#include <cub/block/block_load.cuh>
|
|
49
|
+
#include <cub/block/block_run_length_decode.cuh>
|
|
50
|
+
#include <cub/block/block_scan.cuh>
|
|
51
|
+
#include <cub/block/block_store.cuh>
|
|
52
|
+
#include <cub/util_ptx.cuh>
|
|
53
|
+
#include <cub/util_type.cuh>
|
|
54
|
+
|
|
55
|
+
#include <cuda/__cmath/ceil_div.h>
|
|
56
|
+
#include <cuda/__cmath/round_up.h>
|
|
57
|
+
#include <cuda/std/__functional/operations.h>
|
|
58
|
+
#include <cuda/std/__type_traits/conditional.h>
|
|
59
|
+
#include <cuda/std/__type_traits/enable_if.h>
|
|
60
|
+
#include <cuda/std/__type_traits/type_identity.h>
|
|
61
|
+
#include <cuda/std/cstdint>
|
|
62
|
+
|
|
63
|
+
CUB_NAMESPACE_BEGIN
|
|
64
|
+
|
|
65
|
+
namespace detail::batch_memcpy
|
|
66
|
+
{
|
|
67
|
+
template <bool PTR_IS_FOUR_BYTE_ALIGNED>
|
|
68
|
+
_CCCL_FORCEINLINE _CCCL_DEVICE void
|
|
69
|
+
LoadVectorAndFunnelShiftR(uint32_t const* aligned_ptr, uint32_t bit_shift, uint4& data_out)
|
|
70
|
+
{
|
|
71
|
+
data_out = {aligned_ptr[0], aligned_ptr[1], aligned_ptr[2], aligned_ptr[3]};
|
|
72
|
+
|
|
73
|
+
if (!PTR_IS_FOUR_BYTE_ALIGNED)
|
|
74
|
+
{
|
|
75
|
+
uint32_t tail = aligned_ptr[4];
|
|
76
|
+
data_out.x = __funnelshift_r(data_out.x, data_out.y, bit_shift);
|
|
77
|
+
data_out.y = __funnelshift_r(data_out.y, data_out.z, bit_shift);
|
|
78
|
+
data_out.z = __funnelshift_r(data_out.z, data_out.w, bit_shift);
|
|
79
|
+
data_out.w = __funnelshift_r(data_out.w, tail, bit_shift);
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
template <bool PTR_IS_FOUR_BYTE_ALIGNED>
|
|
84
|
+
_CCCL_FORCEINLINE _CCCL_DEVICE void
|
|
85
|
+
LoadVectorAndFunnelShiftR(uint32_t const* aligned_ptr, uint32_t bit_shift, uint2& data_out)
|
|
86
|
+
{
|
|
87
|
+
data_out = {aligned_ptr[0], aligned_ptr[1]};
|
|
88
|
+
|
|
89
|
+
if (!PTR_IS_FOUR_BYTE_ALIGNED)
|
|
90
|
+
{
|
|
91
|
+
uint32_t tail = aligned_ptr[2];
|
|
92
|
+
data_out.x = __funnelshift_r(data_out.x, data_out.y, bit_shift);
|
|
93
|
+
data_out.y = __funnelshift_r(data_out.y, tail, bit_shift);
|
|
94
|
+
}
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
template <bool PTR_IS_FOUR_BYTE_ALIGNED>
|
|
98
|
+
_CCCL_FORCEINLINE _CCCL_DEVICE void
|
|
99
|
+
LoadVectorAndFunnelShiftR(uint32_t const* aligned_ptr, uint32_t bit_shift, uint32_t& data_out)
|
|
100
|
+
{
|
|
101
|
+
data_out = aligned_ptr[0];
|
|
102
|
+
|
|
103
|
+
if (!PTR_IS_FOUR_BYTE_ALIGNED)
|
|
104
|
+
{
|
|
105
|
+
uint32_t tail = aligned_ptr[1];
|
|
106
|
+
data_out = __funnelshift_r(data_out, tail, bit_shift);
|
|
107
|
+
}
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
/**
|
|
111
|
+
* @brief Loads data from \p ptr into \p data_out without requiring \p ptr to be aligned.
|
|
112
|
+
* @note If \p ptr isn't aligned to four bytes, the bytes from the last four-byte aligned address up
|
|
113
|
+
* to \p ptr are loaded too (but dropped) and, hence, need to be device-accessible. Similarly, if
|
|
114
|
+
* \p ptr isn't aligned to four bytes, the bytes from `(ptr + sizeof(VectorT))` up to the following
|
|
115
|
+
* four-byte aligned address are loaded too (but dropped), and, hence, need to be device-accessible.
|
|
116
|
+
*
|
|
117
|
+
* @tparam VectorT The vector type used for vectorized stores (i.e., one of uint4, uint2, uint32_t)
|
|
118
|
+
* @param ptr The pointer from which the data is supposed to be loaded
|
|
119
|
+
* @param data_out The vector type that stores the data loaded from \p ptr
|
|
120
|
+
*/
|
|
121
|
+
template <typename VectorT>
|
|
122
|
+
_CCCL_FORCEINLINE _CCCL_DEVICE void LoadVector(const char* ptr, VectorT& data_out)
|
|
123
|
+
{
|
|
124
|
+
const uint32_t offset = reinterpret_cast<uintptr_t>(ptr) % 4U;
|
|
125
|
+
const uint32_t* aligned_ptr = reinterpret_cast<uint32_t const*>(ptr - offset);
|
|
126
|
+
constexpr uint32_t bits_per_byte = 8U;
|
|
127
|
+
const uint32_t bit_shift = offset * bits_per_byte;
|
|
128
|
+
|
|
129
|
+
// If `ptr` is aligned to four bytes, we can perform a simple uint32_t-aliased load
|
|
130
|
+
if (offset == 0)
|
|
131
|
+
{
|
|
132
|
+
LoadVectorAndFunnelShiftR<true>(aligned_ptr, bit_shift, data_out);
|
|
133
|
+
}
|
|
134
|
+
// Otherwise, we need to load extra bytes and perform funnel-shifting
|
|
135
|
+
else
|
|
136
|
+
{
|
|
137
|
+
LoadVectorAndFunnelShiftR<false>(aligned_ptr, bit_shift, data_out);
|
|
138
|
+
}
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
/**
|
|
142
|
+
* @brief Helper data structure to hold information on the byte range for which we can safely
|
|
143
|
+
* perform vectorized copies.
|
|
144
|
+
*
|
|
145
|
+
* @tparam VectorT The vector type used for vectorized stores (i.e., one of uint4, uint2, uint32_t)
|
|
146
|
+
*/
|
|
147
|
+
template <typename VectorT>
|
|
148
|
+
struct PointerRange
|
|
149
|
+
{
|
|
150
|
+
VectorT* out_begin;
|
|
151
|
+
VectorT* out_end;
|
|
152
|
+
const char* in_begin;
|
|
153
|
+
const char* in_end;
|
|
154
|
+
};
|
|
155
|
+
|
|
156
|
+
/**
|
|
157
|
+
* @brief Both `out_start_aligned` and `out_end_aligned` are indices into `out_ptr`.
|
|
158
|
+
* `out_start_aligned` is the first VectorT-aligned memory location after `out_ptr + 3`.
|
|
159
|
+
* `out_end_aligned` is the last VectorT-aligned memory location before `out_end - 4`, where out_end
|
|
160
|
+
* corresponds to one past the last byte to be copied. Bytes between `[out_start_aligned,
|
|
161
|
+
* out_end_aligned)` will be copied using VectorT. `out_ptr + 3` and `out_end - 4` are used instead
|
|
162
|
+
* of `out_ptr` and `out_end` to avoid `LoadVector` reading beyond data boundaries.
|
|
163
|
+
*
|
|
164
|
+
* @tparam VectorT The vector type used for vectorized stores (i.e., one of uint4, uint2, uint32_t)
|
|
165
|
+
* @tparam ByteOffsetT Type used to index the bytes within the buffers
|
|
166
|
+
* @param in_begin Pointer to the beginning of the byte range that shall be copied
|
|
167
|
+
* @param out_begin Pointer to the beginning of the byte range that shall be copied
|
|
168
|
+
* @param num_bytes Number of bytes that shall be copied
|
|
169
|
+
* @return The byte range that can safely be copied using vectorized stores of type VectorT
|
|
170
|
+
*/
|
|
171
|
+
template <typename VectorT, typename ByteOffsetT>
|
|
172
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE PointerRange<VectorT>
|
|
173
|
+
GetAlignedPtrs(const void* in_begin, void* out_begin, ByteOffsetT num_bytes)
|
|
174
|
+
{
|
|
175
|
+
// Data type size used for vectorized stores
|
|
176
|
+
constexpr auto out_datatype_size = uint32_t{sizeof(VectorT)};
|
|
177
|
+
// Data type size used for type-aliased loads
|
|
178
|
+
constexpr auto in_datatype_size = uint32_t{sizeof(uint32_t)};
|
|
179
|
+
|
|
180
|
+
// char-aliased ptrs to simplify pointer arithmetic
|
|
181
|
+
char* out_ptr = reinterpret_cast<char*>(out_begin);
|
|
182
|
+
const char* in_ptr = reinterpret_cast<const char*>(in_begin);
|
|
183
|
+
|
|
184
|
+
// Number of bytes between the first VectorT-aligned address at or before out_begin and out_begin
|
|
185
|
+
const uint32_t alignment_offset = reinterpret_cast<uintptr_t>(out_ptr) % out_datatype_size;
|
|
186
|
+
|
|
187
|
+
// The first VectorT-aligned address before (or at) out_begin
|
|
188
|
+
char* out_chars_aligned = reinterpret_cast<char*>(out_ptr - alignment_offset);
|
|
189
|
+
|
|
190
|
+
// The number of extra bytes preceding `in_ptr` that are loaded but dropped
|
|
191
|
+
uint32_t in_extra_bytes = reinterpret_cast<uintptr_t>(in_ptr) % in_datatype_size;
|
|
192
|
+
|
|
193
|
+
// The offset required by `LoadVector`:
|
|
194
|
+
// If the input pointer is not aligned, we load data from the last aligned address preceding the
|
|
195
|
+
// pointer. That is, loading up to (in_datatype_size-1) bytes before `in_ptr`
|
|
196
|
+
uint32_t in_offset_req = in_extra_bytes;
|
|
197
|
+
|
|
198
|
+
// Bytes after `out_chars_aligned` to the first VectorT-aligned address at or after `out_begin`
|
|
199
|
+
uint32_t out_start_aligned = ::cuda::round_up(in_offset_req + alignment_offset, out_datatype_size);
|
|
200
|
+
|
|
201
|
+
// Compute the beginning of the aligned ranges (output and input pointers)
|
|
202
|
+
VectorT* out_aligned_begin = reinterpret_cast<VectorT*>(out_chars_aligned + out_start_aligned);
|
|
203
|
+
const char* in_aligned_begin = in_ptr + (reinterpret_cast<char*>(out_aligned_begin) - out_ptr);
|
|
204
|
+
|
|
205
|
+
// If the aligned range is not aligned for the input pointer, we load up to (in_datatype_size-1)
|
|
206
|
+
// bytes after the last byte that is copied. That is, we always load four bytes up to the next
|
|
207
|
+
// aligned input address at a time. E.g., if the last byte loaded is one byte past the last
|
|
208
|
+
// aligned address we'll also load the three bytes after that byte.
|
|
209
|
+
uint32_t in_extra_bytes_from_aligned = (reinterpret_cast<uintptr_t>(in_aligned_begin) % in_datatype_size);
|
|
210
|
+
uint32_t in_end_padding_req = (in_datatype_size - in_extra_bytes_from_aligned) % in_datatype_size;
|
|
211
|
+
|
|
212
|
+
// Bytes after `out_chars_aligned` to the last VectorT-aligned
|
|
213
|
+
// address at (or before) `out_begin` + `num_bytes`
|
|
214
|
+
uint32_t out_end_aligned{};
|
|
215
|
+
if (in_end_padding_req + alignment_offset > num_bytes)
|
|
216
|
+
{
|
|
217
|
+
out_end_aligned = out_start_aligned;
|
|
218
|
+
}
|
|
219
|
+
else
|
|
220
|
+
{
|
|
221
|
+
out_end_aligned = (num_bytes - in_end_padding_req + alignment_offset) / out_datatype_size * out_datatype_size;
|
|
222
|
+
}
|
|
223
|
+
|
|
224
|
+
VectorT* out_aligned_end = reinterpret_cast<VectorT*>(out_chars_aligned + out_end_aligned);
|
|
225
|
+
const char* in_aligned_end = in_ptr + (reinterpret_cast<char*>(out_aligned_end) - out_ptr);
|
|
226
|
+
|
|
227
|
+
return {out_aligned_begin, out_aligned_end, in_aligned_begin, in_aligned_end};
|
|
228
|
+
}
|
|
229
|
+
|
|
230
|
+
/**
|
|
231
|
+
* @brief Cooperatively copies \p num_bytes from \p src to \p dest using vectorized stores of type
|
|
232
|
+
* \p VectorT for addresses within [dest, dest + num_bytes) that are aligned to \p VectorT. A
|
|
233
|
+
* byte-wise copy is used for byte-ranges that are not aligned to \p VectorT.
|
|
234
|
+
*
|
|
235
|
+
* @tparam LOGICAL_WARP_SIZE The number of threads cooperaing to copy the data; all threads within
|
|
236
|
+
* [0, `LOGICAL_WARP_SIZE`) must invoke this method with the same arguments
|
|
237
|
+
* @tparam VectorT The vector type used for vectorized stores (i.e., one of uint4, uint2, uint32_t)
|
|
238
|
+
* @tparam ByteOffsetT Type used to index the bytes within the buffers
|
|
239
|
+
* @param thread_rank The thread rank within the group that cooperates to copy the data must be
|
|
240
|
+
* within [0, `LOGICAL_WARP_SIZE`)
|
|
241
|
+
* @param dest Pointer to the memory location to copy to
|
|
242
|
+
* @param num_bytes Number of bytes to copy
|
|
243
|
+
* @param src Pointer to the memory location to copy from
|
|
244
|
+
*/
|
|
245
|
+
template <int LOGICAL_WARP_SIZE, typename VectorT, typename ByteOffsetT>
|
|
246
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
247
|
+
vectorized_copy(int32_t thread_rank, void* dest, ByteOffsetT num_bytes, const void* src)
|
|
248
|
+
{
|
|
249
|
+
char* out_ptr = reinterpret_cast<char*>(dest);
|
|
250
|
+
const char* in_ptr = reinterpret_cast<const char*>(src);
|
|
251
|
+
|
|
252
|
+
// Gets the byte range that can safely be copied using vectorized stores of type VectorT
|
|
253
|
+
auto aligned_range = GetAlignedPtrs<VectorT>(src, dest, num_bytes);
|
|
254
|
+
|
|
255
|
+
// If byte range for which we can use vectorized copies is empty -> use byte-wise copies
|
|
256
|
+
if (aligned_range.out_end <= aligned_range.out_begin)
|
|
257
|
+
{
|
|
258
|
+
for (ByteOffsetT ichar = thread_rank; ichar < num_bytes; ichar += LOGICAL_WARP_SIZE)
|
|
259
|
+
{
|
|
260
|
+
out_ptr[ichar] = in_ptr[ichar];
|
|
261
|
+
}
|
|
262
|
+
}
|
|
263
|
+
else
|
|
264
|
+
{
|
|
265
|
+
// Copy bytes in range `[dest, aligned_range.out_begin)`
|
|
266
|
+
out_ptr += thread_rank;
|
|
267
|
+
in_ptr += thread_rank;
|
|
268
|
+
while (out_ptr < reinterpret_cast<char*>(aligned_range.out_begin))
|
|
269
|
+
{
|
|
270
|
+
*out_ptr = *in_ptr;
|
|
271
|
+
out_ptr += LOGICAL_WARP_SIZE;
|
|
272
|
+
in_ptr += LOGICAL_WARP_SIZE;
|
|
273
|
+
}
|
|
274
|
+
|
|
275
|
+
// Copy bytes in range `[aligned_range.out_begin, aligned_range.out_end)`
|
|
276
|
+
VectorT* aligned_range_begin = aligned_range.out_begin + thread_rank;
|
|
277
|
+
const char* in_aligned_begin = aligned_range.in_begin + thread_rank * sizeof(VectorT);
|
|
278
|
+
while (aligned_range_begin < aligned_range.out_end)
|
|
279
|
+
{
|
|
280
|
+
VectorT data_in;
|
|
281
|
+
LoadVector(in_aligned_begin, data_in);
|
|
282
|
+
*aligned_range_begin = data_in;
|
|
283
|
+
in_aligned_begin += sizeof(VectorT) * LOGICAL_WARP_SIZE;
|
|
284
|
+
aligned_range_begin += LOGICAL_WARP_SIZE;
|
|
285
|
+
}
|
|
286
|
+
|
|
287
|
+
// Copy bytes in range `[aligned_range.out_end, dest + num_bytes)`.
|
|
288
|
+
out_ptr = reinterpret_cast<char*>(aligned_range.out_end) + thread_rank;
|
|
289
|
+
in_ptr = aligned_range.in_end + thread_rank;
|
|
290
|
+
while (out_ptr < reinterpret_cast<char*>(dest) + num_bytes)
|
|
291
|
+
{
|
|
292
|
+
*out_ptr = *in_ptr;
|
|
293
|
+
out_ptr += LOGICAL_WARP_SIZE;
|
|
294
|
+
in_ptr += LOGICAL_WARP_SIZE;
|
|
295
|
+
}
|
|
296
|
+
}
|
|
297
|
+
}
|
|
298
|
+
|
|
299
|
+
template <bool IsMemcpy,
|
|
300
|
+
uint32_t LOGICAL_WARP_SIZE,
|
|
301
|
+
typename InputBufferT,
|
|
302
|
+
typename OutputBufferT,
|
|
303
|
+
typename OffsetT,
|
|
304
|
+
::cuda::std::enable_if_t<IsMemcpy, int> = 0>
|
|
305
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
306
|
+
copy_items(InputBufferT input_buffer, OutputBufferT output_buffer, OffsetT num_bytes, OffsetT offset = 0)
|
|
307
|
+
{
|
|
308
|
+
vectorized_copy<LOGICAL_WARP_SIZE, uint4>(
|
|
309
|
+
threadIdx.x % LOGICAL_WARP_SIZE,
|
|
310
|
+
&reinterpret_cast<char*>(output_buffer)[offset],
|
|
311
|
+
num_bytes,
|
|
312
|
+
&reinterpret_cast<const char*>(input_buffer)[offset]);
|
|
313
|
+
}
|
|
314
|
+
|
|
315
|
+
template <bool IsMemcpy,
|
|
316
|
+
uint32_t LOGICAL_WARP_SIZE,
|
|
317
|
+
typename InputBufferT,
|
|
318
|
+
typename OutputBufferT,
|
|
319
|
+
typename OffsetT,
|
|
320
|
+
::cuda::std::enable_if_t<!IsMemcpy, int> = 0>
|
|
321
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
322
|
+
copy_items(InputBufferT input_buffer, OutputBufferT output_buffer, OffsetT num_items, OffsetT offset = 0)
|
|
323
|
+
{
|
|
324
|
+
output_buffer += offset;
|
|
325
|
+
input_buffer += offset;
|
|
326
|
+
for (OffsetT i = threadIdx.x % LOGICAL_WARP_SIZE; i < num_items; i += LOGICAL_WARP_SIZE)
|
|
327
|
+
{
|
|
328
|
+
*(output_buffer + i) = *(input_buffer + i);
|
|
329
|
+
}
|
|
330
|
+
}
|
|
331
|
+
|
|
332
|
+
template <bool IsMemcpy, typename AliasT, typename InputIt, typename OffsetT, ::cuda::std::enable_if_t<IsMemcpy, int> = 0>
|
|
333
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE AliasT read_item(InputIt buffer_src, OffsetT offset)
|
|
334
|
+
{
|
|
335
|
+
return *(reinterpret_cast<const AliasT*>(buffer_src) + offset);
|
|
336
|
+
}
|
|
337
|
+
|
|
338
|
+
template <bool IsMemcpy, typename AliasT, typename InputIt, typename OffsetT, ::cuda::std::enable_if_t<!IsMemcpy, int> = 0>
|
|
339
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE AliasT read_item(InputIt buffer_src, OffsetT offset)
|
|
340
|
+
{
|
|
341
|
+
return *(buffer_src + offset);
|
|
342
|
+
}
|
|
343
|
+
|
|
344
|
+
template <bool IsMemcpy, typename AliasT, typename OutputIt, typename OffsetT, ::cuda::std::enable_if_t<IsMemcpy, int> = 0>
|
|
345
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void write_item(OutputIt buffer_dst, OffsetT offset, AliasT value)
|
|
346
|
+
{
|
|
347
|
+
*(reinterpret_cast<AliasT*>(buffer_dst) + offset) = value;
|
|
348
|
+
}
|
|
349
|
+
|
|
350
|
+
template <bool IsMemcpy, typename AliasT, typename OutputIt, typename OffsetT, ::cuda::std::enable_if_t<!IsMemcpy, int> = 0>
|
|
351
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void write_item(OutputIt buffer_dst, OffsetT offset, AliasT value)
|
|
352
|
+
{
|
|
353
|
+
*(buffer_dst + offset) = value;
|
|
354
|
+
}
|
|
355
|
+
|
|
356
|
+
enum class prefer_power_of_two_bits_option
|
|
357
|
+
{
|
|
358
|
+
no,
|
|
359
|
+
yes
|
|
360
|
+
};
|
|
361
|
+
|
|
362
|
+
/**
|
|
363
|
+
* @brief A helper class that allows threads to maintain multiple counters, where the counter that
|
|
364
|
+
* shall be incremented can be addressed dynamically without incurring register spillage.
|
|
365
|
+
*
|
|
366
|
+
* @tparam NumItems The number of counters to allocate
|
|
367
|
+
* @tparam MaxItemValue The maximum count that must be supported.
|
|
368
|
+
* @tparam PreferPowerOfTwoBits Whether the number of bits to dedicate to each counter should be a
|
|
369
|
+
* power-of-two. If enabled, this allows replacing integer multiplication with a bit-shift in
|
|
370
|
+
* exchange for higher register pressure.
|
|
371
|
+
* @tparam BackingUnitT The data type that is used to provide the bits of all the counters that
|
|
372
|
+
* shall be allocated.
|
|
373
|
+
*/
|
|
374
|
+
template <uint32_t NumItems,
|
|
375
|
+
uint32_t MaxItemValue,
|
|
376
|
+
prefer_power_of_two_bits_option PreferPowerOfTwoBits,
|
|
377
|
+
typename BackingUnitT = uint32_t>
|
|
378
|
+
class bit_packed_counter
|
|
379
|
+
{
|
|
380
|
+
private:
|
|
381
|
+
/// The minimum number of bits required to represent all values from [0, MaxItemValue]
|
|
382
|
+
static constexpr uint32_t MIN_BITS_PER_ITEM =
|
|
383
|
+
(MaxItemValue == 0U) ? 1U : cub::Log2<static_cast<int32_t>(MaxItemValue + 1U)>::VALUE;
|
|
384
|
+
|
|
385
|
+
/// The number of bits allocated for each item. For pre-Volta, we prefer a power-of-2 here to
|
|
386
|
+
/// have the compiler replace costly integer multiplication with bit-shifting.
|
|
387
|
+
static constexpr uint32_t BITS_PER_ITEM =
|
|
388
|
+
(PreferPowerOfTwoBits == prefer_power_of_two_bits_option::yes)
|
|
389
|
+
? (0x01ULL << (cub::Log2<static_cast<int32_t>(MIN_BITS_PER_ITEM)>::VALUE))
|
|
390
|
+
: MIN_BITS_PER_ITEM;
|
|
391
|
+
|
|
392
|
+
/// The number of bits that each backing data type can store
|
|
393
|
+
static constexpr uint32_t NUM_BITS_PER_UNIT = sizeof(BackingUnitT) * 8;
|
|
394
|
+
|
|
395
|
+
/// The number of items that each backing data type can store
|
|
396
|
+
static constexpr uint32_t ITEMS_PER_UNIT = NUM_BITS_PER_UNIT / BITS_PER_ITEM;
|
|
397
|
+
|
|
398
|
+
/// The number of bits the backing data type is actually making use of
|
|
399
|
+
static constexpr uint32_t USED_BITS_PER_UNIT = ITEMS_PER_UNIT * BITS_PER_ITEM;
|
|
400
|
+
|
|
401
|
+
/// The number of backing data types required to store the given number of items
|
|
402
|
+
static constexpr uint32_t NUM_TOTAL_UNITS = ::cuda::ceil_div(NumItems, ITEMS_PER_UNIT);
|
|
403
|
+
|
|
404
|
+
/// This is the net number of bit-storage provided by each unit (remainder bits are unused)
|
|
405
|
+
static constexpr uint32_t UNIT_MASK =
|
|
406
|
+
(USED_BITS_PER_UNIT >= (8U * sizeof(uint32_t))) ? 0xFFFFFFFF : (0x01U << USED_BITS_PER_UNIT) - 1;
|
|
407
|
+
/// This is the bit-mask for each item
|
|
408
|
+
static constexpr uint32_t ITEM_MASK =
|
|
409
|
+
(BITS_PER_ITEM >= (8U * sizeof(uint32_t))) ? 0xFFFFFFFF : (0x01U << BITS_PER_ITEM) - 1;
|
|
410
|
+
|
|
411
|
+
//------------------------------------------------------------------------------
|
|
412
|
+
// ACCESSORS
|
|
413
|
+
//------------------------------------------------------------------------------
|
|
414
|
+
|
|
415
|
+
public:
|
|
416
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE uint32_t get(uint32_t index) const
|
|
417
|
+
{
|
|
418
|
+
const uint32_t target_offset = index * BITS_PER_ITEM;
|
|
419
|
+
uint32_t val = 0;
|
|
420
|
+
|
|
421
|
+
_CCCL_PRAGMA_UNROLL_FULL()
|
|
422
|
+
for (uint32_t i = 0; i < NUM_TOTAL_UNITS; ++i)
|
|
423
|
+
{
|
|
424
|
+
// In case the bit-offset of the counter at <index> is larger than the bit range of the
|
|
425
|
+
// current unit, the bit_shift amount will be larger than the bits provided by this unit. As
|
|
426
|
+
// C++'s bit-shift has undefined behaviour if the bits being shifted exceed the operand width,
|
|
427
|
+
// we use the PTX instruction `shr` to make sure behaviour is well-defined.
|
|
428
|
+
// Negative bit-shift amounts wrap around in unsigned integer math and are ultimately clamped.
|
|
429
|
+
const uint32_t bit_shift = target_offset - i * USED_BITS_PER_UNIT;
|
|
430
|
+
val |= detail::LogicShiftRight(data[i], bit_shift) & ITEM_MASK;
|
|
431
|
+
}
|
|
432
|
+
return val;
|
|
433
|
+
}
|
|
434
|
+
|
|
435
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void add(uint32_t index, uint32_t value)
|
|
436
|
+
{
|
|
437
|
+
const uint32_t target_offset = index * BITS_PER_ITEM;
|
|
438
|
+
|
|
439
|
+
_CCCL_PRAGMA_UNROLL_FULL()
|
|
440
|
+
for (uint32_t i = 0; i < NUM_TOTAL_UNITS; ++i)
|
|
441
|
+
{
|
|
442
|
+
// In case the bit-offset of the counter at <index> is larger than the bit range of the
|
|
443
|
+
// current unit, the bit_shift amount will be larger than the bits provided by this unit. As
|
|
444
|
+
// C++'s bit-shift has undefined behaviour if the bits being shifted exceed the operand width,
|
|
445
|
+
// we use the PTX instruction `shl` to make sure behaviour is well-defined.
|
|
446
|
+
// Negative bit-shift amounts wrap around in unsigned integer math and are ultimately clamped.
|
|
447
|
+
const uint32_t bit_shift = target_offset - i * USED_BITS_PER_UNIT;
|
|
448
|
+
data[i] += detail::LogicShiftLeft(value, bit_shift) & UNIT_MASK;
|
|
449
|
+
}
|
|
450
|
+
}
|
|
451
|
+
|
|
452
|
+
_CCCL_DEVICE bit_packed_counter operator+(const bit_packed_counter& rhs) const
|
|
453
|
+
{
|
|
454
|
+
bit_packed_counter result;
|
|
455
|
+
|
|
456
|
+
_CCCL_PRAGMA_UNROLL_FULL()
|
|
457
|
+
for (uint32_t i = 0; i < NUM_TOTAL_UNITS; ++i)
|
|
458
|
+
{
|
|
459
|
+
result.data[i] = data[i] + rhs.data[i];
|
|
460
|
+
}
|
|
461
|
+
return result;
|
|
462
|
+
}
|
|
463
|
+
|
|
464
|
+
//------------------------------------------------------------------------------
|
|
465
|
+
// MEMBER VARIABLES
|
|
466
|
+
//------------------------------------------------------------------------------
|
|
467
|
+
|
|
468
|
+
private:
|
|
469
|
+
BackingUnitT data[NUM_TOTAL_UNITS] = {};
|
|
470
|
+
};
|
|
471
|
+
|
|
472
|
+
/**
|
|
473
|
+
* Parameterizable tuning policy type for AgentBatchMemcpy
|
|
474
|
+
*/
|
|
475
|
+
template <uint32_t _BLOCK_THREADS,
|
|
476
|
+
uint32_t _BUFFERS_PER_THREAD,
|
|
477
|
+
uint32_t _TLEV_BYTES_PER_THREAD,
|
|
478
|
+
bool _PREFER_POW2_BITS,
|
|
479
|
+
uint32_t _BLOCK_LEVEL_TILE_SIZE,
|
|
480
|
+
uint32_t _WARP_LEVEL_THRESHOLD,
|
|
481
|
+
uint32_t _BLOCK_LEVEL_THRESHOLD,
|
|
482
|
+
class BuffDelayConstructor,
|
|
483
|
+
class BlockDelayConstructor>
|
|
484
|
+
struct AgentBatchMemcpyPolicy
|
|
485
|
+
{
|
|
486
|
+
/// Threads per thread block
|
|
487
|
+
static constexpr uint32_t BLOCK_THREADS = _BLOCK_THREADS;
|
|
488
|
+
/// Items per thread (per tile of input)
|
|
489
|
+
static constexpr uint32_t BUFFERS_PER_THREAD = _BUFFERS_PER_THREAD;
|
|
490
|
+
/// The number of bytes that each thread will work on with each iteration of reading in bytes
|
|
491
|
+
/// from one or more
|
|
492
|
+
// source-buffers and writing them out to the respective destination-buffers.
|
|
493
|
+
static constexpr uint32_t TLEV_BYTES_PER_THREAD = _TLEV_BYTES_PER_THREAD;
|
|
494
|
+
/// Whether the bit_packed_counter should prefer allocating a power-of-2 number of bits per
|
|
495
|
+
/// counter
|
|
496
|
+
static constexpr uint32_t PREFER_POW2_BITS = _PREFER_POW2_BITS;
|
|
497
|
+
/// BLEV tile size granularity
|
|
498
|
+
static constexpr uint32_t BLOCK_LEVEL_TILE_SIZE = _BLOCK_LEVEL_TILE_SIZE;
|
|
499
|
+
|
|
500
|
+
static constexpr uint32_t WARP_LEVEL_THRESHOLD = _WARP_LEVEL_THRESHOLD;
|
|
501
|
+
static constexpr uint32_t BLOCK_LEVEL_THRESHOLD = _BLOCK_LEVEL_THRESHOLD;
|
|
502
|
+
|
|
503
|
+
using buff_delay_constructor = BuffDelayConstructor;
|
|
504
|
+
using block_delay_constructor = BlockDelayConstructor;
|
|
505
|
+
};
|
|
506
|
+
|
|
507
|
+
template <typename AgentMemcpySmallBuffersPolicyT,
|
|
508
|
+
typename InputBufferIt,
|
|
509
|
+
typename OutputBufferIt,
|
|
510
|
+
typename BufferSizeIteratorT,
|
|
511
|
+
typename BufferOffsetT,
|
|
512
|
+
typename BlevBufferSrcsOutItT,
|
|
513
|
+
typename BlevBufferDstsOutItT,
|
|
514
|
+
typename BlevBufferSizesOutItT,
|
|
515
|
+
typename BlevBufferTileOffsetsOutItT,
|
|
516
|
+
typename BlockOffsetT,
|
|
517
|
+
typename BLevBufferOffsetTileState,
|
|
518
|
+
typename BLevBlockOffsetTileState,
|
|
519
|
+
bool IsMemcpy>
|
|
520
|
+
class AgentBatchMemcpy
|
|
521
|
+
{
|
|
522
|
+
private:
|
|
523
|
+
//---------------------------------------------------------------------
|
|
524
|
+
// CONFIGS / CONSTANTS
|
|
525
|
+
//---------------------------------------------------------------------
|
|
526
|
+
// Tuning policy-based configurations
|
|
527
|
+
static constexpr uint32_t BLOCK_THREADS = AgentMemcpySmallBuffersPolicyT::BLOCK_THREADS;
|
|
528
|
+
static constexpr uint32_t BUFFERS_PER_THREAD = AgentMemcpySmallBuffersPolicyT::BUFFERS_PER_THREAD;
|
|
529
|
+
static constexpr uint32_t TLEV_BYTES_PER_THREAD = AgentMemcpySmallBuffersPolicyT::TLEV_BYTES_PER_THREAD;
|
|
530
|
+
static constexpr prefer_power_of_two_bits_option PREFER_POW2_BITS =
|
|
531
|
+
(AgentMemcpySmallBuffersPolicyT::PREFER_POW2_BITS)
|
|
532
|
+
? prefer_power_of_two_bits_option::yes
|
|
533
|
+
: prefer_power_of_two_bits_option::no;
|
|
534
|
+
static constexpr uint32_t BLOCK_LEVEL_TILE_SIZE = AgentMemcpySmallBuffersPolicyT::BLOCK_LEVEL_TILE_SIZE;
|
|
535
|
+
|
|
536
|
+
// Derived configs
|
|
537
|
+
static constexpr uint32_t BUFFERS_PER_BLOCK = BUFFERS_PER_THREAD * BLOCK_THREADS;
|
|
538
|
+
static constexpr uint32_t TLEV_BUFFERS_PER_THREAD = BUFFERS_PER_THREAD;
|
|
539
|
+
static constexpr uint32_t BLEV_BUFFERS_PER_THREAD = BUFFERS_PER_THREAD;
|
|
540
|
+
|
|
541
|
+
static constexpr uint32_t WARP_LEVEL_THRESHOLD = AgentMemcpySmallBuffersPolicyT::WARP_LEVEL_THRESHOLD;
|
|
542
|
+
|
|
543
|
+
static constexpr uint32_t BLOCK_LEVEL_THRESHOLD = AgentMemcpySmallBuffersPolicyT::BLOCK_LEVEL_THRESHOLD;
|
|
544
|
+
|
|
545
|
+
static constexpr uint32_t BUFFER_STABLE_PARTITION = false;
|
|
546
|
+
|
|
547
|
+
// Constants
|
|
548
|
+
enum : uint32_t
|
|
549
|
+
{
|
|
550
|
+
TLEV_SIZE_CLASS = 0,
|
|
551
|
+
WLEV_SIZE_CLASS,
|
|
552
|
+
BLEV_SIZE_CLASS,
|
|
553
|
+
NUM_SIZE_CLASSES,
|
|
554
|
+
};
|
|
555
|
+
|
|
556
|
+
//---------------------------------------------------------------------
|
|
557
|
+
// TYPE DECLARATIONS
|
|
558
|
+
//---------------------------------------------------------------------
|
|
559
|
+
/// Internal load/store type. For byte-wise memcpy, a single-byte type
|
|
560
|
+
using AliasT = typename ::cuda::std::
|
|
561
|
+
conditional_t<IsMemcpy, ::cuda::std::type_identity<char>, lazy_trait<it_value_t, it_value_t<InputBufferIt>>>::type;
|
|
562
|
+
|
|
563
|
+
/// Types of the input and output buffers
|
|
564
|
+
using InputBufferT = it_value_t<InputBufferIt>;
|
|
565
|
+
using OutputBufferT = it_value_t<OutputBufferIt>;
|
|
566
|
+
|
|
567
|
+
/// Type that has to be sufficiently large to hold any of the buffers' sizes.
|
|
568
|
+
/// The BufferSizeIteratorT's value type must be convertible to this type.
|
|
569
|
+
using BufferSizeT = it_value_t<BufferSizeIteratorT>;
|
|
570
|
+
|
|
571
|
+
/// Type used to index into the tile of buffers that this thread block is assigned to.
|
|
572
|
+
using BlockBufferOffsetT = uint16_t;
|
|
573
|
+
|
|
574
|
+
/// Internal type used to index into the bytes of and represent size of a TLEV buffer
|
|
575
|
+
using TLevBufferSizeT = uint16_t;
|
|
576
|
+
|
|
577
|
+
/**
|
|
578
|
+
* @brief Helper struct to simplify BlockExchange within a single four-byte word
|
|
579
|
+
*/
|
|
580
|
+
struct ZippedTLevByteAssignment
|
|
581
|
+
{
|
|
582
|
+
// The buffer id within this tile
|
|
583
|
+
BlockBufferOffsetT tile_buffer_id;
|
|
584
|
+
|
|
585
|
+
// Byte-offset within that buffer
|
|
586
|
+
TLevBufferSizeT buffer_byte_offset;
|
|
587
|
+
};
|
|
588
|
+
|
|
589
|
+
/**
|
|
590
|
+
* POD to keep track of <buffer_id, buffer_size> pairs after having partitioned this tile's
|
|
591
|
+
* buffers by their size.
|
|
592
|
+
*/
|
|
593
|
+
struct BufferTuple
|
|
594
|
+
{
|
|
595
|
+
// Size is only valid (and relevant) for buffers that are use thread-level collaboration
|
|
596
|
+
TLevBufferSizeT size;
|
|
597
|
+
|
|
598
|
+
// The buffer id relative to this tile (i.e., the buffer id within this tile)
|
|
599
|
+
BlockBufferOffsetT buffer_id;
|
|
600
|
+
};
|
|
601
|
+
|
|
602
|
+
// Load buffers in a striped arrangement if we do not want to perform a stable partitioning into
|
|
603
|
+
// small, medium, and large buffers, otherwise load them in a blocked arrangement
|
|
604
|
+
using BufferLoadT =
|
|
605
|
+
BlockLoad<BufferSizeT,
|
|
606
|
+
static_cast<int32_t>(BLOCK_THREADS),
|
|
607
|
+
static_cast<int32_t>(BUFFERS_PER_THREAD),
|
|
608
|
+
BUFFER_STABLE_PARTITION ? BLOCK_LOAD_WARP_TRANSPOSE : BLOCK_LOAD_STRIPED>;
|
|
609
|
+
|
|
610
|
+
// A vectorized counter that will count the number of buffers that fall into each of the
|
|
611
|
+
// size-classes. Where the size class represents the collaboration level that is required to
|
|
612
|
+
// process a buffer. The collaboration level being either:
|
|
613
|
+
//-> (1) TLEV (thread-level collaboration), requiring one or multiple threads but not a FULL warp
|
|
614
|
+
// to collaborate
|
|
615
|
+
//-> (2) WLEV (warp-level collaboration), requiring a full warp to collaborate on a buffer
|
|
616
|
+
//-> (3) BLEV (block-level collaboration), requiring one or multiple thread blocks to collaborate
|
|
617
|
+
// on a buffer */
|
|
618
|
+
using VectorizedSizeClassCounterT = bit_packed_counter<NUM_SIZE_CLASSES, BUFFERS_PER_BLOCK, PREFER_POW2_BITS>;
|
|
619
|
+
|
|
620
|
+
// Block-level scan used to compute the write offsets
|
|
621
|
+
using BlockSizeClassScanT = cub::BlockScan<VectorizedSizeClassCounterT, static_cast<int32_t>(BLOCK_THREADS)>;
|
|
622
|
+
|
|
623
|
+
//
|
|
624
|
+
using BlockBLevTileCountScanT = cub::BlockScan<BlockOffsetT, static_cast<int32_t>(BLOCK_THREADS)>;
|
|
625
|
+
|
|
626
|
+
// Block-level run-length decode algorithm to evenly distribute work of all buffers requiring
|
|
627
|
+
// thread-level collaboration
|
|
628
|
+
using BlockRunLengthDecodeT =
|
|
629
|
+
cub::BlockRunLengthDecode<BlockBufferOffsetT,
|
|
630
|
+
static_cast<int32_t>(BLOCK_THREADS),
|
|
631
|
+
static_cast<int32_t>(TLEV_BUFFERS_PER_THREAD),
|
|
632
|
+
static_cast<int32_t>(TLEV_BYTES_PER_THREAD)>;
|
|
633
|
+
|
|
634
|
+
using BlockExchangeTLevT =
|
|
635
|
+
cub::BlockExchange<ZippedTLevByteAssignment,
|
|
636
|
+
static_cast<int32_t>(BLOCK_THREADS),
|
|
637
|
+
static_cast<int32_t>(TLEV_BYTES_PER_THREAD)>;
|
|
638
|
+
|
|
639
|
+
using BLevBuffScanPrefixCallbackOpT =
|
|
640
|
+
TilePrefixCallbackOp<BufferOffsetT,
|
|
641
|
+
::cuda::std::plus<>,
|
|
642
|
+
BLevBufferOffsetTileState,
|
|
643
|
+
typename AgentMemcpySmallBuffersPolicyT::buff_delay_constructor>;
|
|
644
|
+
|
|
645
|
+
using BLevBlockScanPrefixCallbackOpT =
|
|
646
|
+
TilePrefixCallbackOp<BlockOffsetT,
|
|
647
|
+
::cuda::std::plus<>,
|
|
648
|
+
BLevBlockOffsetTileState,
|
|
649
|
+
typename AgentMemcpySmallBuffersPolicyT::block_delay_constructor>;
|
|
650
|
+
|
|
651
|
+
//-----------------------------------------------------------------------------
|
|
652
|
+
// SHARED MEMORY DECLARATIONS
|
|
653
|
+
//-----------------------------------------------------------------------------
|
|
654
|
+
struct _TempStorage
|
|
655
|
+
{
|
|
656
|
+
union
|
|
657
|
+
{
|
|
658
|
+
typename BufferLoadT::TempStorage load_storage;
|
|
659
|
+
|
|
660
|
+
// Stage 1: histogram over the size classes in preparation for partitioning buffers by size
|
|
661
|
+
typename BlockSizeClassScanT::TempStorage size_scan_storage;
|
|
662
|
+
|
|
663
|
+
// Stage 2: Communicate the number ofer buffers requiring block-level collaboration
|
|
664
|
+
typename BLevBuffScanPrefixCallbackOpT::TempStorage buffer_scan_callback;
|
|
665
|
+
|
|
666
|
+
// Stage 3; batch memcpy buffers that require only thread-level collaboration
|
|
667
|
+
struct
|
|
668
|
+
{
|
|
669
|
+
BufferTuple buffers_by_size_class[BUFFERS_PER_BLOCK];
|
|
670
|
+
|
|
671
|
+
// Stage 3.1: Write buffers requiring block-level collaboration to queue
|
|
672
|
+
union
|
|
673
|
+
{
|
|
674
|
+
struct
|
|
675
|
+
{
|
|
676
|
+
typename BLevBlockScanPrefixCallbackOpT::TempStorage block_scan_callback;
|
|
677
|
+
typename BlockBLevTileCountScanT::TempStorage block_scan_storage;
|
|
678
|
+
} blev;
|
|
679
|
+
|
|
680
|
+
// Stage 3.3: run-length decode & block exchange for tlev
|
|
681
|
+
// rld_state needs to be persistent across loop iterations (RunLengthDecode calls) and,
|
|
682
|
+
// hence, cannot alias block_exchange_storage
|
|
683
|
+
struct
|
|
684
|
+
{
|
|
685
|
+
typename BlockRunLengthDecodeT::TempStorage rld_state;
|
|
686
|
+
typename BlockExchangeTLevT::TempStorage block_exchange_storage;
|
|
687
|
+
} tlev;
|
|
688
|
+
};
|
|
689
|
+
} staged;
|
|
690
|
+
};
|
|
691
|
+
BufferOffsetT blev_buffer_offset;
|
|
692
|
+
};
|
|
693
|
+
|
|
694
|
+
//-----------------------------------------------------------------------------
|
|
695
|
+
// PUBLIC TYPE MEMBERS
|
|
696
|
+
//-----------------------------------------------------------------------------
|
|
697
|
+
|
|
698
|
+
public:
|
|
699
|
+
struct TempStorage : Uninitialized<_TempStorage>
|
|
700
|
+
{};
|
|
701
|
+
|
|
702
|
+
//-----------------------------------------------------------------------------
|
|
703
|
+
// PRIVATE MEMBER FUNCTIONS
|
|
704
|
+
//-----------------------------------------------------------------------------
|
|
705
|
+
|
|
706
|
+
private:
|
|
707
|
+
/// Shared storage reference
|
|
708
|
+
_TempStorage& temp_storage;
|
|
709
|
+
|
|
710
|
+
/**
|
|
711
|
+
* @brief Loads this tile's buffers' sizes, without any guards (i.e., out-of-bounds checks)
|
|
712
|
+
*/
|
|
713
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void
|
|
714
|
+
LoadBufferSizesFullTile(BufferSizeIteratorT tile_buffer_sizes_it, BufferSizeT (&buffer_sizes)[BUFFERS_PER_THREAD])
|
|
715
|
+
{
|
|
716
|
+
BufferLoadT(temp_storage.load_storage).Load(tile_buffer_sizes_it, buffer_sizes);
|
|
717
|
+
}
|
|
718
|
+
|
|
719
|
+
/**
|
|
720
|
+
* @brief Loads this tile's buffers' sizes, making sure to read at most \p num_valid items.
|
|
721
|
+
*/
|
|
722
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void LoadBufferSizesPartialTile(
|
|
723
|
+
BufferSizeIteratorT tile_buffer_sizes_it, BufferSizeT (&buffer_sizes)[BUFFERS_PER_THREAD], BufferOffsetT num_valid)
|
|
724
|
+
{
|
|
725
|
+
// Out-of-bounds buffer items are initialized to '0', so those buffers will simply be ignored
|
|
726
|
+
// later on
|
|
727
|
+
constexpr BufferSizeT OOB_DEFAULT_BUFFER_SIZE = 0U;
|
|
728
|
+
|
|
729
|
+
BufferLoadT(temp_storage.load_storage).Load(tile_buffer_sizes_it, buffer_sizes, num_valid, OOB_DEFAULT_BUFFER_SIZE);
|
|
730
|
+
}
|
|
731
|
+
|
|
732
|
+
/**
|
|
733
|
+
* @brief Computes the histogram over the number of buffers belonging to each of the three
|
|
734
|
+
* size-classes (TLEV, WLEV, BLEV).
|
|
735
|
+
*/
|
|
736
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE VectorizedSizeClassCounterT
|
|
737
|
+
GetBufferSizeClassHistogram(const BufferSizeT (&buffer_sizes)[BUFFERS_PER_THREAD])
|
|
738
|
+
{
|
|
739
|
+
VectorizedSizeClassCounterT vectorized_counters{};
|
|
740
|
+
|
|
741
|
+
_CCCL_PRAGMA_UNROLL_FULL()
|
|
742
|
+
for (uint32_t i = 0; i < BUFFERS_PER_THREAD; i++)
|
|
743
|
+
{
|
|
744
|
+
// Whether to increment ANY of the buffer size classes at all
|
|
745
|
+
const uint32_t increment = buffer_sizes[i] > 0 ? 1U : 0U;
|
|
746
|
+
// Identify the buffer's size class
|
|
747
|
+
uint32_t buffer_size_class = 0;
|
|
748
|
+
buffer_size_class += buffer_sizes[i] > WARP_LEVEL_THRESHOLD ? 1U : 0U;
|
|
749
|
+
buffer_size_class += buffer_sizes[i] > BLOCK_LEVEL_THRESHOLD ? 1U : 0U;
|
|
750
|
+
|
|
751
|
+
// Increment the count of the respective size class
|
|
752
|
+
vectorized_counters.add(buffer_size_class, increment);
|
|
753
|
+
}
|
|
754
|
+
return vectorized_counters;
|
|
755
|
+
}
|
|
756
|
+
|
|
757
|
+
/**
|
|
758
|
+
* @brief Scatters the buffers into the respective buffer's size-class partition.
|
|
759
|
+
*/
|
|
760
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void PartitionBuffersBySize(
|
|
761
|
+
const BufferSizeT (&buffer_sizes)[BUFFERS_PER_THREAD],
|
|
762
|
+
VectorizedSizeClassCounterT& vectorized_offsets,
|
|
763
|
+
BufferTuple (&buffers_by_size_class)[BUFFERS_PER_BLOCK])
|
|
764
|
+
{
|
|
765
|
+
// If we intend to perform a stable partitioning, the thread's buffer are in a blocked
|
|
766
|
+
// arrangement, otherwise they are in a striped arrangement
|
|
767
|
+
BlockBufferOffsetT buffer_id = BUFFER_STABLE_PARTITION ? (BUFFERS_PER_THREAD * threadIdx.x) : (threadIdx.x);
|
|
768
|
+
constexpr BlockBufferOffsetT BUFFER_STRIDE =
|
|
769
|
+
BUFFER_STABLE_PARTITION ? static_cast<BlockBufferOffsetT>(1) : static_cast<BlockBufferOffsetT>(BLOCK_THREADS);
|
|
770
|
+
|
|
771
|
+
_CCCL_PRAGMA_UNROLL_FULL()
|
|
772
|
+
for (uint32_t i = 0; i < BUFFERS_PER_THREAD; i++)
|
|
773
|
+
{
|
|
774
|
+
if (buffer_sizes[i] > 0)
|
|
775
|
+
{
|
|
776
|
+
uint32_t buffer_size_class = 0;
|
|
777
|
+
buffer_size_class += buffer_sizes[i] > WARP_LEVEL_THRESHOLD ? 1U : 0U;
|
|
778
|
+
buffer_size_class += buffer_sizes[i] > BLOCK_LEVEL_THRESHOLD ? 1U : 0U;
|
|
779
|
+
const uint32_t write_offset = vectorized_offsets.get(buffer_size_class);
|
|
780
|
+
buffers_by_size_class[write_offset] = {static_cast<TLevBufferSizeT>(buffer_sizes[i]), buffer_id};
|
|
781
|
+
vectorized_offsets.add(buffer_size_class, 1U);
|
|
782
|
+
}
|
|
783
|
+
buffer_id += BUFFER_STRIDE;
|
|
784
|
+
}
|
|
785
|
+
}
|
|
786
|
+
|
|
787
|
+
/**
|
|
788
|
+
* @brief Read in all the buffers that require block-level collaboration and put them to a queue
|
|
789
|
+
* that will get picked up in a separate, subsequent kernel.
|
|
790
|
+
*/
|
|
791
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void EnqueueBLEVBuffers(
|
|
792
|
+
BufferTuple* buffers_by_size_class,
|
|
793
|
+
InputBufferIt tile_buffer_srcs,
|
|
794
|
+
OutputBufferIt tile_buffer_dsts,
|
|
795
|
+
BufferSizeIteratorT tile_buffer_sizes,
|
|
796
|
+
BlockBufferOffsetT num_blev_buffers,
|
|
797
|
+
BufferOffsetT tile_buffer_offset,
|
|
798
|
+
BufferOffsetT tile_id)
|
|
799
|
+
{
|
|
800
|
+
BlockOffsetT block_offset[BLEV_BUFFERS_PER_THREAD];
|
|
801
|
+
// Read in the BLEV buffer partition (i.e., the buffers that require block-level collaboration)
|
|
802
|
+
uint32_t blev_buffer_offset = threadIdx.x * BLEV_BUFFERS_PER_THREAD;
|
|
803
|
+
|
|
804
|
+
_CCCL_PRAGMA_UNROLL_FULL()
|
|
805
|
+
for (uint32_t i = 0; i < BLEV_BUFFERS_PER_THREAD; i++)
|
|
806
|
+
{
|
|
807
|
+
if (blev_buffer_offset < num_blev_buffers)
|
|
808
|
+
{
|
|
809
|
+
BlockBufferOffsetT tile_buffer_id = buffers_by_size_class[blev_buffer_offset].buffer_id;
|
|
810
|
+
block_offset[i] = ::cuda::ceil_div(+tile_buffer_sizes[tile_buffer_id], BLOCK_LEVEL_TILE_SIZE);
|
|
811
|
+
}
|
|
812
|
+
else
|
|
813
|
+
{
|
|
814
|
+
// Out-of-bounds buffers are assigned a tile count of '0'
|
|
815
|
+
block_offset[i] = 0U;
|
|
816
|
+
}
|
|
817
|
+
blev_buffer_offset++;
|
|
818
|
+
}
|
|
819
|
+
|
|
820
|
+
if (tile_id == 0)
|
|
821
|
+
{
|
|
822
|
+
BlockOffsetT block_aggregate;
|
|
823
|
+
BlockBLevTileCountScanT(temp_storage.staged.blev.block_scan_storage)
|
|
824
|
+
.ExclusiveSum(block_offset, block_offset, block_aggregate);
|
|
825
|
+
if (threadIdx.x == 0)
|
|
826
|
+
{
|
|
827
|
+
blev_block_scan_state.SetInclusive(0, block_aggregate);
|
|
828
|
+
}
|
|
829
|
+
}
|
|
830
|
+
else
|
|
831
|
+
{
|
|
832
|
+
BLevBlockScanPrefixCallbackOpT blev_tile_prefix_op(
|
|
833
|
+
blev_block_scan_state, temp_storage.staged.blev.block_scan_callback, ::cuda::std::plus<>{}, tile_id);
|
|
834
|
+
BlockBLevTileCountScanT(temp_storage.staged.blev.block_scan_storage)
|
|
835
|
+
.ExclusiveSum(block_offset, block_offset, blev_tile_prefix_op);
|
|
836
|
+
}
|
|
837
|
+
__syncthreads();
|
|
838
|
+
|
|
839
|
+
// Read in the BLEV buffer partition (i.e., the buffers that require block-level collaboration)
|
|
840
|
+
blev_buffer_offset = threadIdx.x * BLEV_BUFFERS_PER_THREAD;
|
|
841
|
+
|
|
842
|
+
_CCCL_PRAGMA_UNROLL_FULL()
|
|
843
|
+
for (uint32_t i = 0; i < BLEV_BUFFERS_PER_THREAD; i++)
|
|
844
|
+
{
|
|
845
|
+
if (blev_buffer_offset < num_blev_buffers)
|
|
846
|
+
{
|
|
847
|
+
BlockBufferOffsetT tile_buffer_id = buffers_by_size_class[blev_buffer_offset].buffer_id;
|
|
848
|
+
blev_buffer_srcs[tile_buffer_offset + blev_buffer_offset] = tile_buffer_srcs[tile_buffer_id];
|
|
849
|
+
blev_buffer_dsts[tile_buffer_offset + blev_buffer_offset] = tile_buffer_dsts[tile_buffer_id];
|
|
850
|
+
blev_buffer_sizes[tile_buffer_offset + blev_buffer_offset] = tile_buffer_sizes[tile_buffer_id];
|
|
851
|
+
blev_buffer_tile_offsets[tile_buffer_offset + blev_buffer_offset] = block_offset[i];
|
|
852
|
+
blev_buffer_offset++;
|
|
853
|
+
}
|
|
854
|
+
}
|
|
855
|
+
}
|
|
856
|
+
|
|
857
|
+
/**
|
|
858
|
+
* @brief Read in all the buffers of this tile that require warp-level collaboration and copy
|
|
859
|
+
* their bytes to the corresponding destination buffer
|
|
860
|
+
*/
|
|
861
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void BatchMemcpyWLEVBuffers(
|
|
862
|
+
BufferTuple* buffers_by_size_class,
|
|
863
|
+
InputBufferIt tile_buffer_srcs,
|
|
864
|
+
OutputBufferIt tile_buffer_dsts,
|
|
865
|
+
BufferSizeIteratorT tile_buffer_sizes,
|
|
866
|
+
BlockBufferOffsetT num_wlev_buffers)
|
|
867
|
+
{
|
|
868
|
+
const int32_t warp_id = threadIdx.x / warp_threads;
|
|
869
|
+
constexpr uint32_t warps_per_block = BLOCK_THREADS / warp_threads;
|
|
870
|
+
|
|
871
|
+
for (BlockBufferOffsetT buffer_offset = warp_id; buffer_offset < num_wlev_buffers; buffer_offset += warps_per_block)
|
|
872
|
+
{
|
|
873
|
+
const auto buffer_id = buffers_by_size_class[buffer_offset].buffer_id;
|
|
874
|
+
copy_items<IsMemcpy, warp_threads, InputBufferT, OutputBufferT, BufferSizeT>(
|
|
875
|
+
tile_buffer_srcs[buffer_id], tile_buffer_dsts[buffer_id], tile_buffer_sizes[buffer_id]);
|
|
876
|
+
}
|
|
877
|
+
}
|
|
878
|
+
|
|
879
|
+
/**
|
|
880
|
+
* @brief Read in all the buffers of this tile that require thread-level collaboration and copy
|
|
881
|
+
* their bytes to the corresponding destination buffer
|
|
882
|
+
*/
|
|
883
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void BatchMemcpyTLEVBuffers(
|
|
884
|
+
BufferTuple* buffers_by_size_class,
|
|
885
|
+
InputBufferIt tile_buffer_srcs,
|
|
886
|
+
OutputBufferIt tile_buffer_dsts,
|
|
887
|
+
BlockBufferOffsetT num_tlev_buffers)
|
|
888
|
+
{
|
|
889
|
+
// Read in the buffers' ids that require thread-level collaboration (where buffer id is the
|
|
890
|
+
// buffer within this tile)
|
|
891
|
+
BlockBufferOffsetT tlev_buffer_ids[TLEV_BUFFERS_PER_THREAD];
|
|
892
|
+
TLevBufferSizeT tlev_buffer_sizes[TLEV_BUFFERS_PER_THREAD];
|
|
893
|
+
// Currently we do not go over the TLEV buffers in multiple iterations, so we need to make sure
|
|
894
|
+
// we are able to be covered for the case that all our buffers are TLEV buffers
|
|
895
|
+
static_assert(TLEV_BUFFERS_PER_THREAD >= BUFFERS_PER_THREAD,
|
|
896
|
+
"Unsupported confiugraiton: The number of 'thread-level buffers' must be at "
|
|
897
|
+
"least as large as the number of overall buffers being processed by each "
|
|
898
|
+
"thread.");
|
|
899
|
+
|
|
900
|
+
// Read in the TLEV buffer partition (i.e., the buffers that require thread-level collaboration)
|
|
901
|
+
uint32_t tlev_buffer_offset = threadIdx.x * TLEV_BUFFERS_PER_THREAD;
|
|
902
|
+
|
|
903
|
+
// Pre-populate the buffer sizes to 0 (i.e. zero-padding towards the end) to ensure
|
|
904
|
+
// out-of-bounds TLEV buffers will not be considered
|
|
905
|
+
_CCCL_PRAGMA_UNROLL_FULL()
|
|
906
|
+
for (uint32_t i = 0; i < TLEV_BUFFERS_PER_THREAD; i++)
|
|
907
|
+
{
|
|
908
|
+
tlev_buffer_sizes[i] = 0;
|
|
909
|
+
}
|
|
910
|
+
|
|
911
|
+
// Assign TLEV buffers in a blocked arrangement (each thread is assigned consecutive TLEV
|
|
912
|
+
// buffers)
|
|
913
|
+
_CCCL_PRAGMA_UNROLL_FULL()
|
|
914
|
+
for (uint32_t i = 0; i < TLEV_BUFFERS_PER_THREAD; i++)
|
|
915
|
+
{
|
|
916
|
+
if (tlev_buffer_offset < num_tlev_buffers)
|
|
917
|
+
{
|
|
918
|
+
tlev_buffer_ids[i] = buffers_by_size_class[tlev_buffer_offset].buffer_id;
|
|
919
|
+
tlev_buffer_sizes[i] = buffers_by_size_class[tlev_buffer_offset].size;
|
|
920
|
+
}
|
|
921
|
+
tlev_buffer_offset++;
|
|
922
|
+
}
|
|
923
|
+
|
|
924
|
+
// Evenly distribute all the bytes that have to be copied from all the buffers that require
|
|
925
|
+
// thread-level collaboration using BlockRunLengthDecode
|
|
926
|
+
uint32_t num_total_tlev_bytes = 0U;
|
|
927
|
+
BlockRunLengthDecodeT block_run_length_decode(
|
|
928
|
+
temp_storage.staged.tlev.rld_state, tlev_buffer_ids, tlev_buffer_sizes, num_total_tlev_bytes);
|
|
929
|
+
|
|
930
|
+
// Run-length decode the buffers' sizes into a window buffer of limited size. This is repeated
|
|
931
|
+
// until we were able to cover all the bytes of TLEV buffers
|
|
932
|
+
uint32_t decoded_window_offset = 0U;
|
|
933
|
+
while (decoded_window_offset < num_total_tlev_bytes)
|
|
934
|
+
{
|
|
935
|
+
BlockBufferOffsetT buffer_id[TLEV_BYTES_PER_THREAD];
|
|
936
|
+
TLevBufferSizeT buffer_byte_offset[TLEV_BYTES_PER_THREAD];
|
|
937
|
+
|
|
938
|
+
// Now we have a balanced assignment: buffer_id[i] will hold the tile's buffer id and
|
|
939
|
+
// buffer_byte_offset[i] that buffer's byte that this thread supposed to copy
|
|
940
|
+
block_run_length_decode.RunLengthDecode(buffer_id, buffer_byte_offset, decoded_window_offset);
|
|
941
|
+
|
|
942
|
+
// Zip from SoA to AoS
|
|
943
|
+
ZippedTLevByteAssignment zipped_byte_assignment[TLEV_BYTES_PER_THREAD];
|
|
944
|
+
|
|
945
|
+
_CCCL_PRAGMA_UNROLL_FULL()
|
|
946
|
+
for (int32_t i = 0; i < TLEV_BYTES_PER_THREAD; i++)
|
|
947
|
+
{
|
|
948
|
+
zipped_byte_assignment[i] = {buffer_id[i], buffer_byte_offset[i]};
|
|
949
|
+
}
|
|
950
|
+
|
|
951
|
+
// Exchange from blocked to striped arrangement for coalesced memory reads and writes
|
|
952
|
+
BlockExchangeTLevT(temp_storage.staged.tlev.block_exchange_storage)
|
|
953
|
+
.BlockedToStriped(zipped_byte_assignment, zipped_byte_assignment);
|
|
954
|
+
|
|
955
|
+
// Read in the bytes that this thread is assigned to
|
|
956
|
+
constexpr uint32_t WINDOW_SIZE = (TLEV_BYTES_PER_THREAD * BLOCK_THREADS);
|
|
957
|
+
const bool is_full_window = decoded_window_offset + WINDOW_SIZE < num_total_tlev_bytes;
|
|
958
|
+
if (is_full_window)
|
|
959
|
+
{
|
|
960
|
+
uint32_t absolute_tlev_byte_offset = decoded_window_offset + threadIdx.x;
|
|
961
|
+
AliasT src_byte[TLEV_BYTES_PER_THREAD];
|
|
962
|
+
|
|
963
|
+
_CCCL_PRAGMA_UNROLL_FULL()
|
|
964
|
+
for (int32_t i = 0; i < TLEV_BYTES_PER_THREAD; i++)
|
|
965
|
+
{
|
|
966
|
+
src_byte[i] = read_item<IsMemcpy, AliasT, InputBufferT>(
|
|
967
|
+
tile_buffer_srcs[zipped_byte_assignment[i].tile_buffer_id], zipped_byte_assignment[i].buffer_byte_offset);
|
|
968
|
+
absolute_tlev_byte_offset += BLOCK_THREADS;
|
|
969
|
+
}
|
|
970
|
+
|
|
971
|
+
_CCCL_PRAGMA_UNROLL_FULL()
|
|
972
|
+
for (int32_t i = 0; i < TLEV_BYTES_PER_THREAD; i++)
|
|
973
|
+
{
|
|
974
|
+
write_item<IsMemcpy, AliasT, OutputBufferT>(
|
|
975
|
+
tile_buffer_dsts[zipped_byte_assignment[i].tile_buffer_id],
|
|
976
|
+
zipped_byte_assignment[i].buffer_byte_offset,
|
|
977
|
+
src_byte[i]);
|
|
978
|
+
}
|
|
979
|
+
}
|
|
980
|
+
else
|
|
981
|
+
{
|
|
982
|
+
uint32_t absolute_tlev_byte_offset = decoded_window_offset + threadIdx.x;
|
|
983
|
+
|
|
984
|
+
_CCCL_PRAGMA_UNROLL_FULL()
|
|
985
|
+
for (int32_t i = 0; i < TLEV_BYTES_PER_THREAD; i++)
|
|
986
|
+
{
|
|
987
|
+
if (absolute_tlev_byte_offset < num_total_tlev_bytes)
|
|
988
|
+
{
|
|
989
|
+
const AliasT src_byte = read_item<IsMemcpy, AliasT, InputBufferT>(
|
|
990
|
+
tile_buffer_srcs[zipped_byte_assignment[i].tile_buffer_id], zipped_byte_assignment[i].buffer_byte_offset);
|
|
991
|
+
write_item<IsMemcpy, AliasT, OutputBufferT>(
|
|
992
|
+
tile_buffer_dsts[zipped_byte_assignment[i].tile_buffer_id],
|
|
993
|
+
zipped_byte_assignment[i].buffer_byte_offset,
|
|
994
|
+
src_byte);
|
|
995
|
+
}
|
|
996
|
+
absolute_tlev_byte_offset += BLOCK_THREADS;
|
|
997
|
+
}
|
|
998
|
+
}
|
|
999
|
+
|
|
1000
|
+
decoded_window_offset += WINDOW_SIZE;
|
|
1001
|
+
|
|
1002
|
+
// Ensure all threads finished collaborative BlockExchange so temporary storage can be reused
|
|
1003
|
+
// with next iteration
|
|
1004
|
+
__syncthreads();
|
|
1005
|
+
}
|
|
1006
|
+
}
|
|
1007
|
+
|
|
1008
|
+
//-----------------------------------------------------------------------------
|
|
1009
|
+
// PUBLIC MEMBER FUNCTIONS
|
|
1010
|
+
//-----------------------------------------------------------------------------
|
|
1011
|
+
|
|
1012
|
+
public:
|
|
1013
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE void ConsumeTile(BufferOffsetT tile_id)
|
|
1014
|
+
{
|
|
1015
|
+
// Offset into this tile's buffers
|
|
1016
|
+
BufferOffsetT buffer_offset = tile_id * BUFFERS_PER_BLOCK;
|
|
1017
|
+
|
|
1018
|
+
// Indicates whether all of this tiles items are within bounds
|
|
1019
|
+
bool is_full_tile = buffer_offset + BUFFERS_PER_BLOCK < num_buffers;
|
|
1020
|
+
|
|
1021
|
+
// Load the buffer sizes of this tile's buffers
|
|
1022
|
+
BufferSizeIteratorT tile_buffer_sizes_it = buffer_sizes_it + buffer_offset;
|
|
1023
|
+
BufferSizeT buffer_sizes[BUFFERS_PER_THREAD];
|
|
1024
|
+
if (is_full_tile)
|
|
1025
|
+
{
|
|
1026
|
+
LoadBufferSizesFullTile(tile_buffer_sizes_it, buffer_sizes);
|
|
1027
|
+
}
|
|
1028
|
+
else
|
|
1029
|
+
{
|
|
1030
|
+
LoadBufferSizesPartialTile(tile_buffer_sizes_it, buffer_sizes, num_buffers - buffer_offset);
|
|
1031
|
+
}
|
|
1032
|
+
|
|
1033
|
+
// Ensure we can repurpose the BlockLoad's temporary storage
|
|
1034
|
+
__syncthreads();
|
|
1035
|
+
|
|
1036
|
+
// Count how many buffers fall into each size-class
|
|
1037
|
+
VectorizedSizeClassCounterT size_class_histogram = GetBufferSizeClassHistogram(buffer_sizes);
|
|
1038
|
+
|
|
1039
|
+
// Compute the prefix sum over the histogram
|
|
1040
|
+
VectorizedSizeClassCounterT size_class_agg = {};
|
|
1041
|
+
BlockSizeClassScanT(temp_storage.size_scan_storage)
|
|
1042
|
+
.ExclusiveSum(size_class_histogram, size_class_histogram, size_class_agg);
|
|
1043
|
+
|
|
1044
|
+
// Ensure we can repurpose the scan's temporary storage for scattering the buffer ids
|
|
1045
|
+
__syncthreads();
|
|
1046
|
+
|
|
1047
|
+
// Factor in the per-size-class counts / offsets
|
|
1048
|
+
// That is, WLEV buffer offset has to be offset by the TLEV buffer count and BLEV buffer offset
|
|
1049
|
+
// has to be offset by the TLEV+WLEV buffer count
|
|
1050
|
+
uint32_t buffer_count = 0U;
|
|
1051
|
+
for (uint32_t i = 0; i < NUM_SIZE_CLASSES; i++)
|
|
1052
|
+
{
|
|
1053
|
+
size_class_histogram.add(i, buffer_count);
|
|
1054
|
+
buffer_count += size_class_agg.get(i);
|
|
1055
|
+
}
|
|
1056
|
+
|
|
1057
|
+
// Signal the number of BLEV buffers we're planning to write out
|
|
1058
|
+
BufferOffsetT buffer_exclusive_prefix = 0;
|
|
1059
|
+
if (tile_id == 0)
|
|
1060
|
+
{
|
|
1061
|
+
if (threadIdx.x == 0)
|
|
1062
|
+
{
|
|
1063
|
+
blev_buffer_scan_state.SetInclusive(tile_id, size_class_agg.get(BLEV_SIZE_CLASS));
|
|
1064
|
+
}
|
|
1065
|
+
buffer_exclusive_prefix = 0;
|
|
1066
|
+
}
|
|
1067
|
+
else
|
|
1068
|
+
{
|
|
1069
|
+
BLevBuffScanPrefixCallbackOpT blev_buffer_prefix_op(
|
|
1070
|
+
blev_buffer_scan_state, temp_storage.buffer_scan_callback, ::cuda::std::plus<>{}, tile_id);
|
|
1071
|
+
|
|
1072
|
+
// Signal our partial prefix and wait for the inclusive prefix of previous tiles
|
|
1073
|
+
if (threadIdx.x < warp_threads)
|
|
1074
|
+
{
|
|
1075
|
+
buffer_exclusive_prefix = blev_buffer_prefix_op(size_class_agg.get(BLEV_SIZE_CLASS));
|
|
1076
|
+
}
|
|
1077
|
+
}
|
|
1078
|
+
if (threadIdx.x == 0)
|
|
1079
|
+
{
|
|
1080
|
+
temp_storage.blev_buffer_offset = buffer_exclusive_prefix;
|
|
1081
|
+
}
|
|
1082
|
+
|
|
1083
|
+
// Ensure the prefix callback has finished using its temporary storage and that it can be reused
|
|
1084
|
+
// in the next stage
|
|
1085
|
+
__syncthreads();
|
|
1086
|
+
|
|
1087
|
+
// Scatter the buffers into one of the three partitions (TLEV, WLEV, BLEV) depending on their
|
|
1088
|
+
// size
|
|
1089
|
+
PartitionBuffersBySize(buffer_sizes, size_class_histogram, temp_storage.staged.buffers_by_size_class);
|
|
1090
|
+
|
|
1091
|
+
// Ensure all buffers have been partitioned by their size class AND
|
|
1092
|
+
// ensure that blev_buffer_offset has been written to shared memory
|
|
1093
|
+
__syncthreads();
|
|
1094
|
+
|
|
1095
|
+
// TODO: think about prefetching tile_buffer_{srcs,dsts} into shmem
|
|
1096
|
+
InputBufferIt tile_buffer_srcs = input_buffer_it + buffer_offset;
|
|
1097
|
+
OutputBufferIt tile_buffer_dsts = output_buffer_it + buffer_offset;
|
|
1098
|
+
BufferSizeIteratorT tile_buffer_sizes = buffer_sizes_it + buffer_offset;
|
|
1099
|
+
|
|
1100
|
+
// Copy block-level buffers
|
|
1101
|
+
EnqueueBLEVBuffers(
|
|
1102
|
+
&temp_storage.staged
|
|
1103
|
+
.buffers_by_size_class[size_class_agg.get(TLEV_SIZE_CLASS) + size_class_agg.get(WLEV_SIZE_CLASS)],
|
|
1104
|
+
tile_buffer_srcs,
|
|
1105
|
+
tile_buffer_dsts,
|
|
1106
|
+
tile_buffer_sizes,
|
|
1107
|
+
size_class_agg.get(BLEV_SIZE_CLASS),
|
|
1108
|
+
temp_storage.blev_buffer_offset,
|
|
1109
|
+
tile_id);
|
|
1110
|
+
|
|
1111
|
+
// Ensure we can repurpose the temporary storage required by EnqueueBLEVBuffers
|
|
1112
|
+
__syncthreads();
|
|
1113
|
+
|
|
1114
|
+
// Copy warp-level buffers
|
|
1115
|
+
BatchMemcpyWLEVBuffers(
|
|
1116
|
+
&temp_storage.staged.buffers_by_size_class[size_class_agg.get(TLEV_SIZE_CLASS)],
|
|
1117
|
+
tile_buffer_srcs,
|
|
1118
|
+
tile_buffer_dsts,
|
|
1119
|
+
tile_buffer_sizes,
|
|
1120
|
+
size_class_agg.get(WLEV_SIZE_CLASS));
|
|
1121
|
+
|
|
1122
|
+
// Perform batch memcpy for all the buffers that require thread-level collaboration
|
|
1123
|
+
uint32_t num_tlev_buffers = size_class_agg.get(TLEV_SIZE_CLASS);
|
|
1124
|
+
BatchMemcpyTLEVBuffers(
|
|
1125
|
+
temp_storage.staged.buffers_by_size_class, tile_buffer_srcs, tile_buffer_dsts, num_tlev_buffers);
|
|
1126
|
+
}
|
|
1127
|
+
|
|
1128
|
+
//-----------------------------------------------------------------------------
|
|
1129
|
+
// CONSTRUCTOR
|
|
1130
|
+
//-----------------------------------------------------------------------------
|
|
1131
|
+
_CCCL_DEVICE _CCCL_FORCEINLINE AgentBatchMemcpy(
|
|
1132
|
+
TempStorage& temp_storage,
|
|
1133
|
+
InputBufferIt input_buffer_it,
|
|
1134
|
+
OutputBufferIt output_buffer_it,
|
|
1135
|
+
BufferSizeIteratorT buffer_sizes_it,
|
|
1136
|
+
BufferOffsetT num_buffers,
|
|
1137
|
+
BlevBufferSrcsOutItT blev_buffer_srcs,
|
|
1138
|
+
BlevBufferDstsOutItT blev_buffer_dsts,
|
|
1139
|
+
BlevBufferSizesOutItT blev_buffer_sizes,
|
|
1140
|
+
BlevBufferTileOffsetsOutItT blev_buffer_tile_offsets,
|
|
1141
|
+
BLevBufferOffsetTileState blev_buffer_scan_state,
|
|
1142
|
+
BLevBlockOffsetTileState blev_block_scan_state)
|
|
1143
|
+
: temp_storage(temp_storage.Alias())
|
|
1144
|
+
, input_buffer_it(input_buffer_it)
|
|
1145
|
+
, output_buffer_it(output_buffer_it)
|
|
1146
|
+
, buffer_sizes_it(buffer_sizes_it)
|
|
1147
|
+
, num_buffers(num_buffers)
|
|
1148
|
+
, blev_buffer_srcs(blev_buffer_srcs)
|
|
1149
|
+
, blev_buffer_dsts(blev_buffer_dsts)
|
|
1150
|
+
, blev_buffer_sizes(blev_buffer_sizes)
|
|
1151
|
+
, blev_buffer_tile_offsets(blev_buffer_tile_offsets)
|
|
1152
|
+
, blev_buffer_scan_state(blev_buffer_scan_state)
|
|
1153
|
+
, blev_block_scan_state(blev_block_scan_state)
|
|
1154
|
+
{}
|
|
1155
|
+
|
|
1156
|
+
private:
|
|
1157
|
+
// Iterator providing the pointers to the source memory buffers
|
|
1158
|
+
InputBufferIt input_buffer_it;
|
|
1159
|
+
// Iterator providing the pointers to the destination memory buffers
|
|
1160
|
+
OutputBufferIt output_buffer_it;
|
|
1161
|
+
// Iterator providing the number of bytes to be copied for each pair of buffers
|
|
1162
|
+
BufferSizeIteratorT buffer_sizes_it;
|
|
1163
|
+
// The total number of buffer pairs
|
|
1164
|
+
BufferOffsetT num_buffers;
|
|
1165
|
+
// Output iterator to which the source pointers of the BLEV buffers are written
|
|
1166
|
+
BlevBufferSrcsOutItT blev_buffer_srcs;
|
|
1167
|
+
// Output iterator to which the destination pointers of the BLEV buffers are written
|
|
1168
|
+
BlevBufferDstsOutItT blev_buffer_dsts;
|
|
1169
|
+
// Output iterator to which the number of bytes to be copied of the BLEV buffers are written
|
|
1170
|
+
BlevBufferSizesOutItT blev_buffer_sizes;
|
|
1171
|
+
// Output iterator to which the mapping of tiles to BLEV buffers is written
|
|
1172
|
+
BlevBufferTileOffsetsOutItT blev_buffer_tile_offsets;
|
|
1173
|
+
// The single-pass prefix scan's tile state used for tracking the prefix sum over the number of
|
|
1174
|
+
// BLEV buffers
|
|
1175
|
+
BLevBufferOffsetTileState blev_buffer_scan_state;
|
|
1176
|
+
// The single-pass prefix scan's tile state used for tracking the prefix sum over tiles of BLEV
|
|
1177
|
+
// buffers
|
|
1178
|
+
BLevBlockOffsetTileState blev_block_scan_state;
|
|
1179
|
+
};
|
|
1180
|
+
} // namespace detail::batch_memcpy
|
|
1181
|
+
|
|
1182
|
+
CUB_NAMESPACE_END
|