cuda-cccl 0.4.3__cp312-cp312-manylinux_2_26_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cuda/cccl/__init__.py +27 -0
- cuda/cccl/_cuda_version_utils.py +24 -0
- cuda/cccl/cooperative/__init__.py +9 -0
- cuda/cccl/cooperative/experimental/__init__.py +24 -0
- cuda/cccl/headers/__init__.py +7 -0
- cuda/cccl/headers/include/__init__.py +1 -0
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +233 -0
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1158 -0
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +55 -0
- cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +699 -0
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +365 -0
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +721 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +756 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +277 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +715 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +546 -0
- cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +592 -0
- cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +780 -0
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +1092 -0
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +564 -0
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +448 -0
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +263 -0
- cuda/cccl/headers/include/cub/agent/agent_segmented_scan.cuh +292 -0
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1090 -0
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +346 -0
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +584 -0
- cuda/cccl/headers/include/cub/agent/agent_topk.cuh +762 -0
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +599 -0
- cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1384 -0
- cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +939 -0
- cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1200 -0
- cuda/cccl/headers/include/cub/block/block_exchange.cuh +1279 -0
- cuda/cccl/headers/include/cub/block/block_histogram.cuh +396 -0
- cuda/cccl/headers/include/cub/block/block_load.cuh +1269 -0
- cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +437 -0
- cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +771 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1215 -0
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2129 -0
- cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +124 -0
- cuda/cccl/headers/include/cub/block/block_reduce.cuh +661 -0
- cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +406 -0
- cuda/cccl/headers/include/cub/block/block_scan.cuh +2168 -0
- cuda/cccl/headers/include/cub/block/block_shuffle.cuh +319 -0
- cuda/cccl/headers/include/cub/block/block_store.cuh +1238 -0
- cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +597 -0
- cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +62 -0
- cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +209 -0
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +230 -0
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +207 -0
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +257 -0
- cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +766 -0
- cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +514 -0
- cuda/cccl/headers/include/cub/config.cuh +29 -0
- cuda/cccl/headers/include/cub/cub.cuh +96 -0
- cuda/cccl/headers/include/cub/detail/array_utils.cuh +54 -0
- cuda/cccl/headers/include/cub/detail/choose_offset.cuh +135 -0
- cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +50 -0
- cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +94 -0
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +60 -0
- cuda/cccl/headers/include/cub/detail/env_dispatch.cuh +87 -0
- cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +227 -0
- cuda/cccl/headers/include/cub/detail/integer_utils.cuh +87 -0
- cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +149 -0
- cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +103 -0
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +112 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/array.cuh +41 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/json.cuh +39 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/object.cuh +71 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/string.cuh +79 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/value.cuh +95 -0
- cuda/cccl/headers/include/cub/detail/ptx-json-parser.cuh +39 -0
- cuda/cccl/headers/include/cub/detail/rfa.cuh +706 -0
- cuda/cccl/headers/include/cub/detail/strong_load.cuh +163 -0
- cuda/cccl/headers/include/cub/detail/strong_store.cuh +194 -0
- cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +377 -0
- cuda/cccl/headers/include/cub/detail/type_traits.cuh +185 -0
- cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +48 -0
- cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +33 -0
- cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +572 -0
- cuda/cccl/headers/include/cub/device/device_copy.cuh +276 -0
- cuda/cccl/headers/include/cub/device/device_for.cuh +1061 -0
- cuda/cccl/headers/include/cub/device/device_histogram.cuh +1485 -0
- cuda/cccl/headers/include/cub/device/device_memcpy.cuh +171 -0
- cuda/cccl/headers/include/cub/device/device_merge.cuh +203 -0
- cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +955 -0
- cuda/cccl/headers/include/cub/device/device_partition.cuh +644 -0
- cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3413 -0
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +2303 -0
- cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +346 -0
- cuda/cccl/headers/include/cub/device/device_scan.cuh +2152 -0
- cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1472 -0
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1635 -0
- cuda/cccl/headers/include/cub/device/device_segmented_scan.cuh +1398 -0
- cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2787 -0
- cuda/cccl/headers/include/cub/device/device_select.cuh +1204 -0
- cuda/cccl/headers/include/cub/device/device_topk.cuh +521 -0
- cuda/cccl/headers/include/cub/device/device_transform.cuh +666 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +289 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +694 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +50 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +77 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_fixed_size_segmented_reduce.cuh +349 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +160 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1849 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +317 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +429 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1066 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +830 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +629 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +479 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +256 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +603 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +447 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +545 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_radix_sort.cuh +638 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_reduce.cuh +410 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_scan.cuh +278 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +899 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +831 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +321 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +454 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +364 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +626 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +527 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +472 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_for_each.cuh +259 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_histogram.cuh +669 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_merge_sort.cuh +332 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_radix_sort.cuh +553 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_reduce.cuh +584 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_scan.cuh +178 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_radix_sort.cuh +262 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh +295 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_scan.cuh +77 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_sort.cuh +521 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_three_way_partition.cuh +200 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_transform.cuh +1049 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_unique_by_key.cuh +175 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/common.cuh +97 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +43 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +94 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +34 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +268 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +108 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +100 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1045 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +681 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +918 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +647 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +571 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +986 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_scan.cuh +108 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +373 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1563 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +415 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +84 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +476 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +858 -0
- cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +203 -0
- cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +82 -0
- cuda/cccl/headers/include/cub/grid/grid_queue.cuh +175 -0
- cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +230 -0
- cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +235 -0
- cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +226 -0
- cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +293 -0
- cuda/cccl/headers/include/cub/thread/thread_load.cuh +353 -0
- cuda/cccl/headers/include/cub/thread/thread_operators.cuh +664 -0
- cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +525 -0
- cuda/cccl/headers/include/cub/thread/thread_scan.cuh +472 -0
- cuda/cccl/headers/include/cub/thread/thread_search.cuh +214 -0
- cuda/cccl/headers/include/cub/thread/thread_simd.cuh +456 -0
- cuda/cccl/headers/include/cub/thread/thread_sort.cuh +78 -0
- cuda/cccl/headers/include/cub/thread/thread_store.cuh +341 -0
- cuda/cccl/headers/include/cub/util_allocator.cuh +897 -0
- cuda/cccl/headers/include/cub/util_arch.cuh +176 -0
- cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +71 -0
- cuda/cccl/headers/include/cub/util_debug.cuh +183 -0
- cuda/cccl/headers/include/cub/util_device.cuh +838 -0
- cuda/cccl/headers/include/cub/util_macro.cuh +73 -0
- cuda/cccl/headers/include/cub/util_math.cuh +92 -0
- cuda/cccl/headers/include/cub/util_namespace.cuh +152 -0
- cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +31 -0
- cuda/cccl/headers/include/cub/util_ptx.cuh +483 -0
- cuda/cccl/headers/include/cub/util_temporary_storage.cuh +93 -0
- cuda/cccl/headers/include/cub/util_type.cuh +1084 -0
- cuda/cccl/headers/include/cub/util_vsmem.cuh +227 -0
- cuda/cccl/headers/include/cub/version.cuh +65 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +304 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +152 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +567 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +378 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +922 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +691 -0
- cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +381 -0
- cuda/cccl/headers/include/cub/warp/warp_load.cuh +591 -0
- cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +145 -0
- cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +810 -0
- cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1863 -0
- cuda/cccl/headers/include/cub/warp/warp_store.cuh +498 -0
- cuda/cccl/headers/include/cub/warp/warp_utils.cuh +59 -0
- cuda/cccl/headers/include/cuda/__algorithm/common.h +68 -0
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +199 -0
- cuda/cccl/headers/include/cuda/__algorithm/fill.h +110 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +171 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +216 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +83 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +128 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +210 -0
- cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
- cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +101 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +528 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +73 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
- cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +198 -0
- cuda/cccl/headers/include/cuda/__bit/bitfield.h +121 -0
- cuda/cccl/headers/include/cuda/__bit/bitmask.h +89 -0
- cuda/cccl/headers/include/cuda/__cccl_config +38 -0
- cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +123 -0
- cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +178 -0
- cuda/cccl/headers/include/cuda/__cmath/ilog.h +194 -0
- cuda/cccl/headers/include/cuda/__cmath/ipow.h +111 -0
- cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
- cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +145 -0
- cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
- cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
- cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
- cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
- cuda/cccl/headers/include/cuda/__cmath/sincos.h +134 -0
- cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
- cuda/cccl/headers/include/cuda/__complex/complex.h +238 -0
- cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +89 -0
- cuda/cccl/headers/include/cuda/__complex/traits.h +64 -0
- cuda/cccl/headers/include/cuda/__complex_ +28 -0
- cuda/cccl/headers/include/cuda/__container/buffer.h +891 -0
- cuda/cccl/headers/include/cuda/__container/heterogeneous_iterator.h +436 -0
- cuda/cccl/headers/include/cuda/__container/uninitialized_async_buffer.h +416 -0
- cuda/cccl/headers/include/cuda/__device/all_devices.h +140 -0
- cuda/cccl/headers/include/cuda/__device/arch_id.h +194 -0
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +553 -0
- cuda/cccl/headers/include/cuda/__device/attributes.h +772 -0
- cuda/cccl/headers/include/cuda/__device/compute_capability.h +172 -0
- cuda/cccl/headers/include/cuda/__device/device_ref.h +168 -0
- cuda/cccl/headers/include/cuda/__device/physical_device.h +178 -0
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +1041 -0
- cuda/cccl/headers/include/cuda/__event/event.h +171 -0
- cuda/cccl/headers/include/cuda/__event/event_ref.h +157 -0
- cuda/cccl/headers/include/cuda/__event/timed_event.h +120 -0
- cuda/cccl/headers/include/cuda/__execution/determinism.h +89 -0
- cuda/cccl/headers/include/cuda/__execution/output_ordering.h +87 -0
- cuda/cccl/headers/include/cuda/__execution/policy.h +53 -0
- cuda/cccl/headers/include/cuda/__execution/require.h +75 -0
- cuda/cccl/headers/include/cuda/__execution/tune.h +70 -0
- cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
- cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +321 -0
- cuda/cccl/headers/include/cuda/__functional/maximum.h +77 -0
- cuda/cccl/headers/include/cuda/__functional/minimum.h +77 -0
- cuda/cccl/headers/include/cuda/__functional/minimum_maximum_common.h +52 -0
- cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +106 -0
- cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
- cuda/cccl/headers/include/cuda/__fwd/complex.h +48 -0
- cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
- cuda/cccl/headers/include/cuda/__fwd/execution_policy.h +47 -0
- cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
- cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
- cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +58 -0
- cuda/cccl/headers/include/cuda/__hierarchy/dimensions.h +162 -0
- cuda/cccl/headers/include/cuda/__hierarchy/hierarchy_dimensions.h +986 -0
- cuda/cccl/headers/include/cuda/__hierarchy/hierarchy_levels.h +494 -0
- cuda/cccl/headers/include/cuda/__hierarchy/level_dimensions.h +225 -0
- cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +315 -0
- cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +490 -0
- cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +324 -0
- cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +456 -0
- cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +334 -0
- cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +418 -0
- cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +367 -0
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +528 -0
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +527 -0
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +486 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +147 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_function.h +114 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +555 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +589 -0
- cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
- cuda/cccl/headers/include/cuda/__launch/configuration.h +754 -0
- cuda/cccl/headers/include/cuda/__launch/host_launch.h +115 -0
- cuda/cccl/headers/include/cuda/__launch/launch.h +334 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +531 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +239 -0
- cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +152 -0
- cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +118 -0
- cuda/cccl/headers/include/cuda/__mdspan/shared_memory_accessor.h +208 -0
- cuda/cccl/headers/include/cuda/__mdspan/shared_memory_mdspan.h +129 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +77 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +148 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +165 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/elect_one.h +52 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +53 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +102 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +164 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +57 -0
- cuda/cccl/headers/include/cuda/__memory/address_space.h +256 -0
- cuda/cccl/headers/include/cuda/__memory/align_down.h +77 -0
- cuda/cccl/headers/include/cuda/__memory/align_up.h +77 -0
- cuda/cccl/headers/include/cuda/__memory/aligned_size.h +61 -0
- cuda/cccl/headers/include/cuda/__memory/check_address.h +111 -0
- cuda/cccl/headers/include/cuda/__memory/discard_memory.h +64 -0
- cuda/cccl/headers/include/cuda/__memory/get_device_address.h +58 -0
- cuda/cccl/headers/include/cuda/__memory/is_aligned.h +60 -0
- cuda/cccl/headers/include/cuda/__memory/is_pointer_accessible.h +278 -0
- cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +92 -0
- cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
- cuda/cccl/headers/include/cuda/__memory/ranges_overlap.h +125 -0
- cuda/cccl/headers/include/cuda/__memory_pool/device_memory_pool.h +166 -0
- cuda/cccl/headers/include/cuda/__memory_pool/managed_memory_pool.h +161 -0
- cuda/cccl/headers/include/cuda/__memory_pool/memory_pool_base.h +644 -0
- cuda/cccl/headers/include/cuda/__memory_pool/pinned_memory_pool.h +218 -0
- cuda/cccl/headers/include/cuda/__memory_resource/any_resource.h +882 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +82 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +153 -0
- cuda/cccl/headers/include/cuda/__memory_resource/legacy_managed_memory_resource.h +148 -0
- cuda/cccl/headers/include/cuda/__memory_resource/legacy_pinned_memory_resource.h +141 -0
- cuda/cccl/headers/include/cuda/__memory_resource/properties.h +130 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource.h +135 -0
- cuda/cccl/headers/include/cuda/__memory_resource/shared_resource.h +261 -0
- cuda/cccl/headers/include/cuda/__memory_resource/synchronous_resource_adapter.h +136 -0
- cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +331 -0
- cuda/cccl/headers/include/cuda/__numeric/div_overflow.h +150 -0
- cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
- cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +59 -0
- cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
- cuda/cccl/headers/include/cuda/__numeric/sub_overflow.h +359 -0
- cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +120 -0
- cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2977 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +240 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +245 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +52 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +977 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +302 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +631 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1470 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +132 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +64 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +102 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +64 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +5542 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +399 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +184 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +102 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_inval.h +26 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +143 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +144 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +286 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +290 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2202 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1362 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +236 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +460 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/setmaxnreg.h +58 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1490 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +120 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +91 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +693 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +50 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +11437 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +6513 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +6726 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +40 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +4767 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +48 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +71 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +886 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_inval.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/setmaxnreg.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/pragmas/enable_smem_spilling.h +47 -0
- cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
- cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +178 -0
- cuda/cccl/headers/include/cuda/__random/feistel_bijection.h +105 -0
- cuda/cccl/headers/include/cuda/__random/pcg_engine.h +398 -0
- cuda/cccl/headers/include/cuda/__random/random_bijection.h +88 -0
- cuda/cccl/headers/include/cuda/__runtime/api_wrapper.h +62 -0
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +99 -0
- cuda/cccl/headers/include/cuda/__runtime/types.h +41 -0
- cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
- cuda/cccl/headers/include/cuda/__stream/get_stream.h +109 -0
- cuda/cccl/headers/include/cuda/__stream/internal_streams.h +49 -0
- cuda/cccl/headers/include/cuda/__stream/invalid_stream.h +47 -0
- cuda/cccl/headers/include/cuda/__stream/launch_transform.h +193 -0
- cuda/cccl/headers/include/cuda/__stream/stream.h +145 -0
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +356 -0
- cuda/cccl/headers/include/cuda/__tma/make_tma_descriptor.h +657 -0
- cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
- cuda/cccl/headers/include/cuda/__type_traits/is_instantiable_with.h +47 -0
- cuda/cccl/headers/include/cuda/__type_traits/is_specialization_of.h +37 -0
- cuda/cccl/headers/include/cuda/__type_traits/vector_type.h +355 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/access.h +88 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/any_cast.h +83 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_base.h +148 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_from.h +96 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_fwd.h +128 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +304 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ref.h +337 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +611 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/conversions.h +170 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/dynamic_any_cast.h +147 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/interfaces.h +359 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/iset.h +142 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/overrides.h +64 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/rtti.h +256 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +323 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/storage.h +79 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/tagged_ptr.h +58 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +162 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_functions.h +183 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_ptrs.h +80 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +155 -0
- cuda/cccl/headers/include/cuda/__utility/basic_any.h +507 -0
- cuda/cccl/headers/include/cuda/__utility/immovable.h +50 -0
- cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
- cuda/cccl/headers/include/cuda/__utility/inherit.h +36 -0
- cuda/cccl/headers/include/cuda/__utility/no_init.h +29 -0
- cuda/cccl/headers/include/cuda/__utility/static_for.h +79 -0
- cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
- cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
- cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +251 -0
- cuda/cccl/headers/include/cuda/access_property +26 -0
- cuda/cccl/headers/include/cuda/algorithm +28 -0
- cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
- cuda/cccl/headers/include/cuda/atomic +27 -0
- cuda/cccl/headers/include/cuda/barrier +293 -0
- cuda/cccl/headers/include/cuda/bit +29 -0
- cuda/cccl/headers/include/cuda/buffer +27 -0
- cuda/cccl/headers/include/cuda/cmath +38 -0
- cuda/cccl/headers/include/cuda/devices +33 -0
- cuda/cccl/headers/include/cuda/discard_memory +32 -0
- cuda/cccl/headers/include/cuda/functional +32 -0
- cuda/cccl/headers/include/cuda/hierarchy +28 -0
- cuda/cccl/headers/include/cuda/iterator +39 -0
- cuda/cccl/headers/include/cuda/latch +27 -0
- cuda/cccl/headers/include/cuda/launch +28 -0
- cuda/cccl/headers/include/cuda/mdspan +29 -0
- cuda/cccl/headers/include/cuda/memory +37 -0
- cuda/cccl/headers/include/cuda/memory_pool +27 -0
- cuda/cccl/headers/include/cuda/memory_resource +41 -0
- cuda/cccl/headers/include/cuda/numeric +31 -0
- cuda/cccl/headers/include/cuda/pipeline +580 -0
- cuda/cccl/headers/include/cuda/ptx +131 -0
- cuda/cccl/headers/include/cuda/semaphore +31 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +53 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +58 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +143 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/in_fun_result.h +55 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +90 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/inplace_merge.h +293 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +88 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +67 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +139 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/move.h +91 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/nth_element.h +309 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +93 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +100 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_find_if.h +78 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_find_if_not.h +85 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each.h +84 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each_n.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +97 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/sample.h +116 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/shuffle.h +71 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/sort.h +1097 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/stable_partition.h +359 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/stable_sort.h +321 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +153 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
- cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
- cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +136 -0
- cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +208 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +458 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +4436 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +184 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +242 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
- cuda/cccl/headers/include/cuda/std/__atomic/order.h +158 -0
- cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
- cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
- cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +242 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +103 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +72 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +238 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types.h +51 -0
- cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
- cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
- cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
- cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
- cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
- cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +81 -0
- cuda/cccl/headers/include/cuda/std/__bit/blsr.h +51 -0
- cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +209 -0
- cuda/cccl/headers/include/cuda/std/__bit/countl.h +191 -0
- cuda/cccl/headers/include/cuda/std/__bit/countr.h +202 -0
- cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
- cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
- cuda/cccl/headers/include/cuda/std/__bit/integral.h +125 -0
- cuda/cccl/headers/include/cuda/std/__bit/popcount.h +172 -0
- cuda/cccl/headers/include/cuda/std/__bit/reference.h +1272 -0
- cuda/cccl/headers/include/cuda/std/__bit/rotate.h +185 -0
- cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
- cuda/cccl/headers/include/cuda/std/__cccl/assert.h +161 -0
- cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +206 -0
- cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +494 -0
- cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +213 -0
- cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +51 -0
- cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +56 -0
- cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +87 -0
- cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +131 -0
- cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +197 -0
- cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +355 -0
- cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +74 -0
- cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +139 -0
- cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +59 -0
- cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +71 -0
- cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/os.h +54 -0
- cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1288 -0
- cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +312 -0
- cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +363 -0
- cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
- cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
- cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
- cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
- cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
- cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +171 -0
- cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
- cuda/cccl/headers/include/cuda/std/__charconv/from_chars.h +171 -0
- cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
- cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +192 -0
- cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
- cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +52 -0
- cuda/cccl/headers/include/cuda/std/__chrono/day.h +160 -0
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +499 -0
- cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +53 -0
- cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +44 -0
- cuda/cccl/headers/include/cuda/std/__chrono/month.h +185 -0
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +58 -0
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +78 -0
- cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +255 -0
- cuda/cccl/headers/include/cuda/std/__chrono/year.h +184 -0
- cuda/cccl/headers/include/cuda/std/__cmath/abs.h +127 -0
- cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +88 -0
- cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +204 -0
- cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +783 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +122 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fma.h +129 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +230 -0
- cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +204 -0
- cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +285 -0
- cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +220 -0
- cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +285 -0
- cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +370 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +166 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +203 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +184 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
- cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +101 -0
- cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +533 -0
- cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +287 -0
- cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
- cuda/cccl/headers/include/cuda/std/__cmath/nan.h +54 -0
- cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +210 -0
- cuda/cccl/headers/include/cuda/std/__cmath/roots.h +198 -0
- cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +983 -0
- cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +56 -0
- cuda/cccl/headers/include/cuda/std/__cmath/traits.h +242 -0
- cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +327 -0
- cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
- cuda/cccl/headers/include/cuda/std/__complex/complex.h +669 -0
- cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +403 -0
- cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +119 -0
- cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +522 -0
- cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
- cuda/cccl/headers/include/cuda/std/__complex/literals.h +86 -0
- cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +302 -0
- cuda/cccl/headers/include/cuda/std/__complex/math.h +161 -0
- cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +323 -0
- cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +322 -0
- cuda/cccl/headers/include/cuda/std/__complex/roots.h +214 -0
- cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
- cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
- cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +130 -0
- cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
- cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
- cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
- cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +45 -0
- cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
- cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
- cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +385 -0
- cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +174 -0
- cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +70 -0
- cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +56 -0
- cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
- cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
- cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +98 -0
- cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
- cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
- cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
- cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
- cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
- cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +39 -0
- cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
- cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
- cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
- cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
- cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +70 -0
- cuda/cccl/headers/include/cuda/std/__cstring/memcpy.h +61 -0
- cuda/cccl/headers/include/cuda/std/__cstring/memset.h +46 -0
- cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +110 -0
- cuda/cccl/headers/include/cuda/std/__exception/exception_macros.h +108 -0
- cuda/cccl/headers/include/cuda/std/__exception/format_error.h +62 -0
- cuda/cccl/headers/include/cuda/std/__exception/msg_storage.h +41 -0
- cuda/cccl/headers/include/cuda/std/__exception/terminate.h +74 -0
- cuda/cccl/headers/include/cuda/std/__exception/throw_error.h +120 -0
- cuda/cccl/headers/include/cuda/std/__execution/env.h +455 -0
- cuda/cccl/headers/include/cuda/std/__execution/policy.h +90 -0
- cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
- cuda/cccl/headers/include/cuda/std/__expected/expected.h +1941 -0
- cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1051 -0
- cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
- cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +164 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +812 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +375 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +124 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +116 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/decompose.h +126 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +40 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +78 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
- cuda/cccl/headers/include/cuda/std/__format/buffer.h +48 -0
- cuda/cccl/headers/include/cuda/std/__format/concepts.h +69 -0
- cuda/cccl/headers/include/cuda/std/__format/format_arg.h +282 -0
- cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +279 -0
- cuda/cccl/headers/include/cuda/std/__format/format_args.h +122 -0
- cuda/cccl/headers/include/cuda/std/__format/format_context.h +93 -0
- cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
- cuda/cccl/headers/include/cuda/std/__format/format_integral.h +237 -0
- cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +124 -0
- cuda/cccl/headers/include/cuda/std/__format/format_spec_parser.h +1265 -0
- cuda/cccl/headers/include/cuda/std/__format/formatter.h +59 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/bool.h +101 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/char.h +124 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/fp.h +101 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/int.h +174 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/ptr.h +104 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/str.h +178 -0
- cuda/cccl/headers/include/cuda/std/__format/output_utils.h +272 -0
- cuda/cccl/headers/include/cuda/std/__format/parse_arg_id.h +138 -0
- cuda/cccl/headers/include/cuda/std/__format_ +45 -0
- cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
- cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
- cuda/cccl/headers/include/cuda/std/__functional/bind.h +334 -0
- cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +81 -0
- cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +74 -0
- cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
- cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
- cuda/cccl/headers/include/cuda/std/__functional/compose.h +69 -0
- cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +76 -0
- cuda/cccl/headers/include/cuda/std/__functional/function.h +1271 -0
- cuda/cccl/headers/include/cuda/std/__functional/hash.h +649 -0
- cuda/cccl/headers/include/cuda/std/__functional/identity.h +57 -0
- cuda/cccl/headers/include/cuda/std/__functional/invoke.h +298 -0
- cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +41 -0
- cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +66 -0
- cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +211 -0
- cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
- cuda/cccl/headers/include/cuda/std/__functional/operations.h +535 -0
- cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +128 -0
- cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +64 -0
- cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +63 -0
- cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
- cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +114 -0
- cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
- cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
- cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
- cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +262 -0
- cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +53 -0
- cuda/cccl/headers/include/cuda/std/__fwd/array.h +42 -0
- cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +74 -0
- cuda/cccl/headers/include/cuda/std/__fwd/complex.h +75 -0
- cuda/cccl/headers/include/cuda/std/__fwd/execution_policy.h +73 -0
- cuda/cccl/headers/include/cuda/std/__fwd/expected.h +46 -0
- cuda/cccl/headers/include/cuda/std/__fwd/format.h +84 -0
- cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
- cuda/cccl/headers/include/cuda/std/__fwd/get.h +122 -0
- cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
- cuda/cccl/headers/include/cuda/std/__fwd/ios.h +123 -0
- cuda/cccl/headers/include/cuda/std/__fwd/iterator.h +43 -0
- cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +122 -0
- cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
- cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
- cuda/cccl/headers/include/cuda/std/__fwd/pair.h +47 -0
- cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +39 -0
- cuda/cccl/headers/include/cuda/std/__fwd/span.h +45 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string.h +112 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +91 -0
- cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
- cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +37 -0
- cuda/cccl/headers/include/cuda/std/__fwd/unexpected.h +40 -0
- cuda/cccl/headers/include/cuda/std/__fwd/variant.h +51 -0
- cuda/cccl/headers/include/cuda/std/__internal/atomic.h +55 -0
- cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
- cuda/cccl/headers/include/cuda/std/__internal/features.h +104 -0
- cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +132 -0
- cuda/cccl/headers/include/cuda/std/__internal/pstl_config.h +32 -0
- cuda/cccl/headers/include/cuda/std/__internal/thread_api.h +58 -0
- cuda/cccl/headers/include/cuda/std/__internal/version.h +52 -0
- cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
- cuda/cccl/headers/include/cuda/std/__iterator/advance.h +227 -0
- cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +164 -0
- cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +253 -0
- cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
- cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +465 -0
- cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
- cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
- cuda/cccl/headers/include/cuda/std/__iterator/distance.h +124 -0
- cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
- cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
- cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +100 -0
- cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +143 -0
- cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
- cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +107 -0
- cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +146 -0
- cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +161 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +181 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +847 -0
- cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
- cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +432 -0
- cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
- cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
- cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +95 -0
- cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
- cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
- cuda/cccl/headers/include/cuda/std/__iterator/prev.h +90 -0
- cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
- cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +156 -0
- cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
- cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +366 -0
- cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
- cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
- cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
- cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +245 -0
- cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
- cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +603 -0
- cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +753 -0
- cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +76 -0
- cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +52 -0
- cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +136 -0
- cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +128 -0
- cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +315 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +97 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +137 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +348 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +749 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +314 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +307 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +598 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +515 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +190 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +187 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +339 -0
- cuda/cccl/headers/include/cuda/std/__memory/addressof.h +90 -0
- cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +82 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator.h +327 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +85 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +517 -0
- cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +59 -0
- cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
- cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +225 -0
- cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +241 -0
- cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
- cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +51 -0
- cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +246 -0
- cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
- cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +99 -0
- cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +678 -0
- cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +765 -0
- cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +59 -0
- cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
- cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
- cuda/cccl/headers/include/cuda/std/__new/allocate.h +131 -0
- cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
- cuda/cccl/headers/include/cuda/std/__new/device_new.h +30 -0
- cuda/cccl/headers/include/cuda/std/__new/launder.h +53 -0
- cuda/cccl/headers/include/cuda/std/__new_ +30 -0
- cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
- cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
- cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
- cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +78 -0
- cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
- cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
- cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
- cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +97 -0
- cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
- cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
- cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
- cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
- cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
- cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +74 -0
- cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
- cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
- cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
- cuda/cccl/headers/include/cuda/std/__optional/optional.h +861 -0
- cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +439 -0
- cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
- cuda/cccl/headers/include/cuda/std/__pstl/cuda/for_each_n.h +97 -0
- cuda/cccl/headers/include/cuda/std/__pstl/dispatch.h +123 -0
- cuda/cccl/headers/include/cuda/std/__pstl/for_each.h +71 -0
- cuda/cccl/headers/include/cuda/std/__pstl/for_each_n.h +68 -0
- cuda/cccl/headers/include/cuda/std/__random/bernoulli_distribution.h +173 -0
- cuda/cccl/headers/include/cuda/std/__random/binomial_distribution.h +254 -0
- cuda/cccl/headers/include/cuda/std/__random/cauchy_distribution.h +192 -0
- cuda/cccl/headers/include/cuda/std/__random/chi_squared_distribution.h +179 -0
- cuda/cccl/headers/include/cuda/std/__random/exponential_distribution.h +187 -0
- cuda/cccl/headers/include/cuda/std/__random/extreme_value_distribution.h +196 -0
- cuda/cccl/headers/include/cuda/std/__random/fisher_f_distribution.h +196 -0
- cuda/cccl/headers/include/cuda/std/__random/gamma_distribution.h +257 -0
- cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
- cuda/cccl/headers/include/cuda/std/__random/geometric_distribution.h +179 -0
- cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
- cuda/cccl/headers/include/cuda/std/__random/is_valid.h +70 -0
- cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +398 -0
- cuda/cccl/headers/include/cuda/std/__random/lognormal_distribution.h +174 -0
- cuda/cccl/headers/include/cuda/std/__random/negative_binomial_distribution.h +212 -0
- cuda/cccl/headers/include/cuda/std/__random/normal_distribution.h +232 -0
- cuda/cccl/headers/include/cuda/std/__random/philox_engine.h +562 -0
- cuda/cccl/headers/include/cuda/std/__random/poisson_distribution.h +338 -0
- cuda/cccl/headers/include/cuda/std/__random/seed_seq.h +204 -0
- cuda/cccl/headers/include/cuda/std/__random/student_t_distribution.h +186 -0
- cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +341 -0
- cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +192 -0
- cuda/cccl/headers/include/cuda/std/__random/weibull_distribution.h +189 -0
- cuda/cccl/headers/include/cuda/std/__random_ +47 -0
- cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
- cuda/cccl/headers/include/cuda/std/__ranges/all.h +98 -0
- cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +889 -0
- cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +302 -0
- cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
- cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
- cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
- cuda/cccl/headers/include/cuda/std/__ranges/drop_view.h +389 -0
- cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
- cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
- cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
- cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +78 -0
- cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
- cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +264 -0
- cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +409 -0
- cuda/cccl/headers/include/cuda/std/__ranges/non_propagating_cache.h +210 -0
- cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +163 -0
- cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +111 -0
- cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +175 -0
- cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
- cuda/cccl/headers/include/cuda/std/__ranges/rend.h +182 -0
- cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +346 -0
- cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
- cuda/cccl/headers/include/cuda/std/__ranges/size.h +201 -0
- cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +510 -0
- cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +472 -0
- cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
- cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
- cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +52 -0
- cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +183 -0
- cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
- cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +234 -0
- cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
- cuda/cccl/headers/include/cuda/std/__string/char_traits.h +190 -0
- cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +580 -0
- cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
- cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
- cuda/cccl/headers/include/cuda/std/__string_ +29 -0
- cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
- cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +106 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +143 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/apply.h +82 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/get.h +122 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +120 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +100 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +212 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tie.h +55 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple.h +457 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_cat.h +158 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_constraints.h +286 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +77 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_leaf.h +452 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +83 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +68 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +290 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +155 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +68 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +173 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +46 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_comparable.h +78 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +58 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +49 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +211 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_execution_policy.h +63 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +79 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_fully_bounded_array.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +72 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +84 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +121 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +88 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +200 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +46 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +45 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +58 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +45 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +44 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +122 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +131 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +52 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
- cuda/cccl/headers/include/cuda/std/__utility/as_const.h +73 -0
- cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +34 -0
- cuda/cccl/headers/include/cuda/std/__utility/cmp.h +114 -0
- cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
- cuda/cccl/headers/include/cuda/std/__utility/ctad_support.h +27 -0
- cuda/cccl/headers/include/cuda/std/__utility/declval.h +76 -0
- cuda/cccl/headers/include/cuda/std/__utility/delegate_constructors.h +51 -0
- cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +162 -0
- cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
- cuda/cccl/headers/include/cuda/std/__utility/forward.h +82 -0
- cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +82 -0
- cuda/cccl/headers/include/cuda/std/__utility/in_place.h +86 -0
- cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +249 -0
- cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
- cuda/cccl/headers/include/cuda/std/__utility/move.h +126 -0
- cuda/cccl/headers/include/cuda/std/__utility/pair.h +791 -0
- cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
- cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +425 -0
- cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
- cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +61 -0
- cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
- cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
- cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
- cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
- cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
- cuda/cccl/headers/include/cuda/std/__variant/bad_variant_access.h +74 -0
- cuda/cccl/headers/include/cuda/std/__variant/comparison.h +207 -0
- cuda/cccl/headers/include/cuda/std/__variant/get.h +192 -0
- cuda/cccl/headers/include/cuda/std/__variant/hash.h +82 -0
- cuda/cccl/headers/include/cuda/std/__variant/sfinae_helpers.h +89 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant.h +250 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_access.h +70 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_base.h +683 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_constraints.h +135 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_match.h +126 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_traits.h +184 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_visit.h +225 -0
- cuda/cccl/headers/include/cuda/std/__variant/visit.h +148 -0
- cuda/cccl/headers/include/cuda/std/algorithm +138 -0
- cuda/cccl/headers/include/cuda/std/array +519 -0
- cuda/cccl/headers/include/cuda/std/atomic +810 -0
- cuda/cccl/headers/include/cuda/std/barrier +42 -0
- cuda/cccl/headers/include/cuda/std/bit +35 -0
- cuda/cccl/headers/include/cuda/std/bitset +986 -0
- cuda/cccl/headers/include/cuda/std/cassert +28 -0
- cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
- cuda/cccl/headers/include/cuda/std/cfloat +59 -0
- cuda/cccl/headers/include/cuda/std/charconv +31 -0
- cuda/cccl/headers/include/cuda/std/chrono +26 -0
- cuda/cccl/headers/include/cuda/std/climits +61 -0
- cuda/cccl/headers/include/cuda/std/cmath +87 -0
- cuda/cccl/headers/include/cuda/std/complex +50 -0
- cuda/cccl/headers/include/cuda/std/concepts +48 -0
- cuda/cccl/headers/include/cuda/std/cstddef +28 -0
- cuda/cccl/headers/include/cuda/std/cstdint +178 -0
- cuda/cccl/headers/include/cuda/std/cstdlib +31 -0
- cuda/cccl/headers/include/cuda/std/cstring +110 -0
- cuda/cccl/headers/include/cuda/std/ctime +155 -0
- cuda/cccl/headers/include/cuda/std/detail/__config +22 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +2506 -0
- cuda/cccl/headers/include/cuda/std/execution +29 -0
- cuda/cccl/headers/include/cuda/std/expected +30 -0
- cuda/cccl/headers/include/cuda/std/functional +56 -0
- cuda/cccl/headers/include/cuda/std/initializer_list +44 -0
- cuda/cccl/headers/include/cuda/std/inplace_vector +2171 -0
- cuda/cccl/headers/include/cuda/std/iterator +70 -0
- cuda/cccl/headers/include/cuda/std/latch +34 -0
- cuda/cccl/headers/include/cuda/std/limits +28 -0
- cuda/cccl/headers/include/cuda/std/linalg +30 -0
- cuda/cccl/headers/include/cuda/std/mdspan +38 -0
- cuda/cccl/headers/include/cuda/std/memory +40 -0
- cuda/cccl/headers/include/cuda/std/numbers +344 -0
- cuda/cccl/headers/include/cuda/std/numeric +41 -0
- cuda/cccl/headers/include/cuda/std/optional +31 -0
- cuda/cccl/headers/include/cuda/std/ranges +70 -0
- cuda/cccl/headers/include/cuda/std/ratio +416 -0
- cuda/cccl/headers/include/cuda/std/semaphore +31 -0
- cuda/cccl/headers/include/cuda/std/source_location +107 -0
- cuda/cccl/headers/include/cuda/std/span +599 -0
- cuda/cccl/headers/include/cuda/std/string_view +924 -0
- cuda/cccl/headers/include/cuda/std/tuple +43 -0
- cuda/cccl/headers/include/cuda/std/type_traits +176 -0
- cuda/cccl/headers/include/cuda/std/utility +70 -0
- cuda/cccl/headers/include/cuda/std/variant +32 -0
- cuda/cccl/headers/include/cuda/std/version +240 -0
- cuda/cccl/headers/include/cuda/stream +32 -0
- cuda/cccl/headers/include/cuda/stream_ref +59 -0
- cuda/cccl/headers/include/cuda/tma +25 -0
- cuda/cccl/headers/include/cuda/type_traits +27 -0
- cuda/cccl/headers/include/cuda/utility +28 -0
- cuda/cccl/headers/include/cuda/version +16 -0
- cuda/cccl/headers/include/cuda/warp +28 -0
- cuda/cccl/headers/include/cuda/work_stealing +26 -0
- cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
- cuda/cccl/headers/include/nv/detail/__target_macros +739 -0
- cuda/cccl/headers/include/nv/target +241 -0
- cuda/cccl/headers/include/thrust/addressof.h +22 -0
- cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
- cuda/cccl/headers/include/thrust/advance.h +60 -0
- cuda/cccl/headers/include/thrust/allocate_unique.h +301 -0
- cuda/cccl/headers/include/thrust/binary_search.h +1911 -0
- cuda/cccl/headers/include/thrust/complex.h +859 -0
- cuda/cccl/headers/include/thrust/copy.h +506 -0
- cuda/cccl/headers/include/thrust/count.h +245 -0
- cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +108 -0
- cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +629 -0
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +193 -0
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +95 -0
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +79 -0
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +76 -0
- cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +74 -0
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +113 -0
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +114 -0
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +77 -0
- cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +96 -0
- cuda/cccl/headers/include/thrust/detail/binary_search.inl +537 -0
- cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
- cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +210 -0
- cuda/cccl/headers/include/thrust/detail/complex/catrig.h +877 -0
- cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +591 -0
- cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +234 -0
- cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +162 -0
- cuda/cccl/headers/include/thrust/detail/complex/cexp.h +194 -0
- cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +170 -0
- cuda/cccl/headers/include/thrust/detail/complex/clog.h +222 -0
- cuda/cccl/headers/include/thrust/detail/complex/clogf.h +209 -0
- cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
- cuda/cccl/headers/include/thrust/detail/complex/cpow.h +53 -0
- cuda/cccl/headers/include/thrust/detail/complex/cproj.h +75 -0
- cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
- cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +169 -0
- cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
- cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
- cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +212 -0
- cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +136 -0
- cuda/cccl/headers/include/thrust/detail/complex/math_private.h +130 -0
- cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
- cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
- cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
- cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
- cuda/cccl/headers/include/thrust/detail/config/device_system.h +57 -0
- cuda/cccl/headers/include/thrust/detail/config/host_system.h +50 -0
- cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
- cuda/cccl/headers/include/thrust/detail/config/namespace.h +164 -0
- cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
- cuda/cccl/headers/include/thrust/detail/config.h +36 -0
- cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +227 -0
- cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +272 -0
- cuda/cccl/headers/include/thrust/detail/copy.h +70 -0
- cuda/cccl/headers/include/thrust/detail/copy.inl +146 -0
- cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
- cuda/cccl/headers/include/thrust/detail/copy_if.inl +114 -0
- cuda/cccl/headers/include/thrust/detail/count.h +55 -0
- cuda/cccl/headers/include/thrust/detail/count.inl +101 -0
- cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
- cuda/cccl/headers/include/thrust/detail/equal.inl +105 -0
- cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
- cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +79 -0
- cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
- cuda/cccl/headers/include/thrust/detail/execution_policy.h +120 -0
- cuda/cccl/headers/include/thrust/detail/extrema.inl +196 -0
- cuda/cccl/headers/include/thrust/detail/fill.inl +97 -0
- cuda/cccl/headers/include/thrust/detail/find.inl +125 -0
- cuda/cccl/headers/include/thrust/detail/for_each.inl +96 -0
- cuda/cccl/headers/include/thrust/detail/function.h +49 -0
- cuda/cccl/headers/include/thrust/detail/functional/actor.h +213 -0
- cuda/cccl/headers/include/thrust/detail/functional/operators.h +384 -0
- cuda/cccl/headers/include/thrust/detail/gather.inl +185 -0
- cuda/cccl/headers/include/thrust/detail/generate.inl +97 -0
- cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +60 -0
- cuda/cccl/headers/include/thrust/detail/inner_product.inl +130 -0
- cuda/cccl/headers/include/thrust/detail/internal_functional.h +335 -0
- cuda/cccl/headers/include/thrust/detail/logical.inl +125 -0
- cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +89 -0
- cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
- cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +208 -0
- cuda/cccl/headers/include/thrust/detail/merge.inl +288 -0
- cuda/cccl/headers/include/thrust/detail/mismatch.inl +107 -0
- cuda/cccl/headers/include/thrust/detail/nvtx_policy.h +41 -0
- cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +120 -0
- cuda/cccl/headers/include/thrust/detail/partition.inl +390 -0
- cuda/cccl/headers/include/thrust/detail/pointer.h +313 -0
- cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
- cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
- cuda/cccl/headers/include/thrust/detail/range/head_flags.h +114 -0
- cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +128 -0
- cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
- cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +191 -0
- cuda/cccl/headers/include/thrust/detail/reduce.inl +396 -0
- cuda/cccl/headers/include/thrust/detail/reference.h +521 -0
- cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
- cuda/cccl/headers/include/thrust/detail/remove.inl +225 -0
- cuda/cccl/headers/include/thrust/detail/replace.inl +243 -0
- cuda/cccl/headers/include/thrust/detail/reverse.inl +100 -0
- cuda/cccl/headers/include/thrust/detail/scan.inl +536 -0
- cuda/cccl/headers/include/thrust/detail/scatter.inl +169 -0
- cuda/cccl/headers/include/thrust/detail/seq.h +66 -0
- cuda/cccl/headers/include/thrust/detail/sequence.inl +121 -0
- cuda/cccl/headers/include/thrust/detail/set_operations.inl +993 -0
- cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/sort.inl +385 -0
- cuda/cccl/headers/include/thrust/detail/static_assert.h +56 -0
- cuda/cccl/headers/include/thrust/detail/static_map.h +164 -0
- cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +77 -0
- cuda/cccl/headers/include/thrust/detail/tabulate.inl +74 -0
- cuda/cccl/headers/include/thrust/detail/temporary_array.h +150 -0
- cuda/cccl/headers/include/thrust/detail/temporary_array.inl +121 -0
- cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +93 -0
- cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +81 -0
- cuda/cccl/headers/include/thrust/detail/transform_scan.inl +173 -0
- cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +128 -0
- cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +59 -0
- cuda/cccl/headers/include/thrust/detail/type_deduction.h +61 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +50 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +162 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +67 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +55 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +48 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +91 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
- cuda/cccl/headers/include/thrust/detail/type_traits.h +143 -0
- cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +102 -0
- cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +97 -0
- cuda/cccl/headers/include/thrust/detail/unique.inl +391 -0
- cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
- cuda/cccl/headers/include/thrust/detail/vector_base.h +613 -0
- cuda/cccl/headers/include/thrust/detail/vector_base.inl +1216 -0
- cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
- cuda/cccl/headers/include/thrust/device_delete.h +74 -0
- cuda/cccl/headers/include/thrust/device_free.h +85 -0
- cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
- cuda/cccl/headers/include/thrust/device_malloc.h +84 -0
- cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
- cuda/cccl/headers/include/thrust/device_new.h +112 -0
- cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
- cuda/cccl/headers/include/thrust/device_ptr.h +196 -0
- cuda/cccl/headers/include/thrust/device_reference.h +983 -0
- cuda/cccl/headers/include/thrust/device_vector.h +576 -0
- cuda/cccl/headers/include/thrust/distance.h +44 -0
- cuda/cccl/headers/include/thrust/equal.h +247 -0
- cuda/cccl/headers/include/thrust/execution_policy.h +252 -0
- cuda/cccl/headers/include/thrust/extrema.h +657 -0
- cuda/cccl/headers/include/thrust/fill.h +200 -0
- cuda/cccl/headers/include/thrust/find.h +382 -0
- cuda/cccl/headers/include/thrust/for_each.h +261 -0
- cuda/cccl/headers/include/thrust/functional.h +399 -0
- cuda/cccl/headers/include/thrust/gather.h +464 -0
- cuda/cccl/headers/include/thrust/generate.h +193 -0
- cuda/cccl/headers/include/thrust/host_vector.h +576 -0
- cuda/cccl/headers/include/thrust/inner_product.h +264 -0
- cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +221 -0
- cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +338 -0
- cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
- cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
- cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +43 -0
- cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +42 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +83 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +60 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +65 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +56 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +184 -0
- cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +55 -0
- cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
- cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
- cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +80 -0
- cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +171 -0
- cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +211 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +659 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +334 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_traversal_tags.h +64 -0
- cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +194 -0
- cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
- cuda/cccl/headers/include/thrust/iterator/retag.h +72 -0
- cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +51 -0
- cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +190 -0
- cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
- cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +152 -0
- cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +225 -0
- cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +353 -0
- cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
- cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +365 -0
- cuda/cccl/headers/include/thrust/logical.h +290 -0
- cuda/cccl/headers/include/thrust/memory.h +299 -0
- cuda/cccl/headers/include/thrust/merge.h +726 -0
- cuda/cccl/headers/include/thrust/mismatch.h +262 -0
- cuda/cccl/headers/include/thrust/mr/allocator.h +227 -0
- cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +46 -0
- cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +526 -0
- cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +116 -0
- cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +65 -0
- cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +65 -0
- cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +42 -0
- cuda/cccl/headers/include/thrust/mr/memory_resource.h +215 -0
- cuda/cccl/headers/include/thrust/mr/new.h +98 -0
- cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +61 -0
- cuda/cccl/headers/include/thrust/mr/pool.h +526 -0
- cuda/cccl/headers/include/thrust/mr/pool_options.h +172 -0
- cuda/cccl/headers/include/thrust/mr/sync_pool.h +112 -0
- cuda/cccl/headers/include/thrust/mr/tls_pool.h +62 -0
- cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
- cuda/cccl/headers/include/thrust/mr/validator.h +54 -0
- cuda/cccl/headers/include/thrust/pair.h +102 -0
- cuda/cccl/headers/include/thrust/partition.h +1392 -0
- cuda/cccl/headers/include/thrust/per_device_resource.h +110 -0
- cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +182 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +153 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +97 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +149 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +46 -0
- cuda/cccl/headers/include/thrust/random/detail/mod.h +94 -0
- cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +185 -0
- cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +157 -0
- cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +56 -0
- cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +199 -0
- cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +196 -0
- cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +198 -0
- cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +181 -0
- cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +186 -0
- cuda/cccl/headers/include/thrust/random/discard_block_engine.h +238 -0
- cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +287 -0
- cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +215 -0
- cuda/cccl/headers/include/thrust/random/normal_distribution.h +256 -0
- cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +245 -0
- cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +259 -0
- cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +257 -0
- cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +253 -0
- cuda/cccl/headers/include/thrust/random.h +118 -0
- cuda/cccl/headers/include/thrust/reduce.h +1114 -0
- cuda/cccl/headers/include/thrust/remove.h +768 -0
- cuda/cccl/headers/include/thrust/replace.h +826 -0
- cuda/cccl/headers/include/thrust/reverse.h +215 -0
- cuda/cccl/headers/include/thrust/scan.h +1671 -0
- cuda/cccl/headers/include/thrust/scatter.h +446 -0
- cuda/cccl/headers/include/thrust/sequence.h +277 -0
- cuda/cccl/headers/include/thrust/set_operations.h +3027 -0
- cuda/cccl/headers/include/thrust/shuffle.h +182 -0
- cuda/cccl/headers/include/thrust/sort.h +1320 -0
- cuda/cccl/headers/include/thrust/swap.h +147 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +113 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +55 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +63 -0
- cuda/cccl/headers/include/thrust/system/cpp/memory.h +105 -0
- cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +70 -0
- cuda/cccl/headers/include/thrust/system/cpp/pointer.h +118 -0
- cuda/cccl/headers/include/thrust/system/cpp/vector.h +95 -0
- cuda/cccl/headers/include/thrust/system/cuda/config.h +126 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +218 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +272 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +251 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +280 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +162 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +578 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +241 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +230 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +62 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +87 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +266 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +473 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +99 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +165 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +89 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +59 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +77 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +79 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +55 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +119 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +55 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +205 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +88 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +79 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +401 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +70 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +774 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +994 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +105 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +121 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +83 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +340 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +412 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +90 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1722 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +473 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +99 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +62 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +428 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +139 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +117 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +114 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +102 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +288 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +307 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +253 -0
- cuda/cccl/headers/include/thrust/system/cuda/error.h +159 -0
- cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +15 -0
- cuda/cccl/headers/include/thrust/system/cuda/memory.h +118 -0
- cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +115 -0
- cuda/cccl/headers/include/thrust/system/cuda/pointer.h +155 -0
- cuda/cccl/headers/include/thrust/system/cuda/vector.h +104 -0
- cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +59 -0
- cuda/cccl/headers/include/thrust/system/detail/errno.h +118 -0
- cuda/cccl/headers/include/thrust/system/detail/error_category.inl +298 -0
- cuda/cccl/headers/include/thrust/system/detail/error_code.inl +171 -0
- cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +119 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +77 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +159 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +370 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +43 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +62 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +56 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +145 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/count.h +46 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +82 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +47 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +58 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +65 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +246 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +52 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/find.h +47 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +135 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +56 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +71 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +94 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +43 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +64 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +58 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +70 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +57 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +61 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +83 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +97 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +146 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +47 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +67 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +127 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +208 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +41 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +69 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +98 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +81 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +183 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +84 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +119 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +93 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +172 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +46 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +65 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +61 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +125 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +86 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +124 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +229 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +71 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +83 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +105 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +281 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +474 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +52 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +123 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +111 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +176 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +42 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +73 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +39 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +54 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +45 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +53 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +81 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +393 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +48 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +54 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +78 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +111 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +43 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +164 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +43 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +112 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +69 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +112 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +80 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +124 -0
- cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +115 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +68 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +40 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +136 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +120 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +47 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +69 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +52 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +109 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +60 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +72 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +120 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +41 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +139 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +43 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +48 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +147 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +298 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +62 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +97 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +177 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +152 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +143 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +204 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +120 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +353 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +121 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +584 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +56 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +112 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +105 -0
- cuda/cccl/headers/include/thrust/system/detail/system_error.inl +106 -0
- cuda/cccl/headers/include/thrust/system/error_code.h +508 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +34 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +57 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +61 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +39 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/count.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +48 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +132 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +46 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/find.h +33 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +74 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +84 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +16 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +63 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +48 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +87 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +67 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +216 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +23 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +245 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +16 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +52 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +56 -0
- cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +62 -0
- cuda/cccl/headers/include/thrust/system/omp/memory.h +153 -0
- cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +71 -0
- cuda/cccl/headers/include/thrust/system/omp/pointer.h +120 -0
- cuda/cccl/headers/include/thrust/system/omp/vector.h +96 -0
- cuda/cccl/headers/include/thrust/system/system_error.h +183 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +33 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +59 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +117 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +113 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +45 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +28 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +72 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +302 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +79 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +16 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +121 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +378 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +118 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +66 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +294 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +20 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +273 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +16 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +49 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +55 -0
- cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +62 -0
- cuda/cccl/headers/include/thrust/system/tbb/memory.h +139 -0
- cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +57 -0
- cuda/cccl/headers/include/thrust/system/tbb/pointer.h +106 -0
- cuda/cccl/headers/include/thrust/system/tbb/vector.h +82 -0
- cuda/cccl/headers/include/thrust/system_error.h +57 -0
- cuda/cccl/headers/include/thrust/tabulate.h +125 -0
- cuda/cccl/headers/include/thrust/transform.h +1056 -0
- cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
- cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
- cuda/cccl/headers/include/thrust/tuple.h +139 -0
- cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +256 -0
- cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +156 -0
- cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
- cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +180 -0
- cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +112 -0
- cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +333 -0
- cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
- cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +63 -0
- cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
- cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
- cuda/cccl/headers/include/thrust/unique.h +1089 -0
- cuda/cccl/headers/include/thrust/universal_allocator.h +101 -0
- cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
- cuda/cccl/headers/include/thrust/universal_vector.h +80 -0
- cuda/cccl/headers/include/thrust/version.h +93 -0
- cuda/cccl/headers/include/thrust/zip_function.h +149 -0
- cuda/cccl/headers/include_paths.py +51 -0
- cuda/cccl/headers/lib/cmake/cccl/cccl-config-version.cmake +25 -0
- cuda/cccl/headers/lib/cmake/cccl/cccl-config.cmake +143 -0
- cuda/cccl/headers/lib/cmake/cub/cub-config-version.cmake +29 -0
- cuda/cccl/headers/lib/cmake/cub/cub-config.cmake +172 -0
- cuda/cccl/headers/lib/cmake/cub/cub-header-search.cmake +15 -0
- cuda/cccl/headers/lib/cmake/libcudacxx/libcudacxx-config-version.cmake +37 -0
- cuda/cccl/headers/lib/cmake/libcudacxx/libcudacxx-config.cmake +297 -0
- cuda/cccl/headers/lib/cmake/libcudacxx/libcudacxx-header-search.cmake +15 -0
- cuda/cccl/headers/lib/cmake/thrust/FindTBB.cmake +498 -0
- cuda/cccl/headers/lib/cmake/thrust/README.md +258 -0
- cuda/cccl/headers/lib/cmake/thrust/thrust-config-version.cmake +37 -0
- cuda/cccl/headers/lib/cmake/thrust/thrust-config.cmake +983 -0
- cuda/cccl/headers/lib/cmake/thrust/thrust-header-search.cmake +15 -0
- cuda/cccl/parallel/__init__.py +9 -0
- cuda/cccl/parallel/experimental/__init__.py +24 -0
- cuda/cccl/py.typed +0 -0
- cuda/compute/__init__.py +91 -0
- cuda/compute/_bindings.py +79 -0
- cuda/compute/_bindings.pyi +516 -0
- cuda/compute/_bindings_impl.pyx +2470 -0
- cuda/compute/_caching.py +83 -0
- cuda/compute/_cccl_interop.py +354 -0
- cuda/compute/_odr_helpers.py +238 -0
- cuda/compute/_utils/__init__.py +0 -0
- cuda/compute/_utils/protocols.py +145 -0
- cuda/compute/_utils/temp_storage_buffer.py +87 -0
- cuda/compute/algorithms/__init__.py +62 -0
- cuda/compute/algorithms/_histogram.py +243 -0
- cuda/compute/algorithms/_reduce.py +205 -0
- cuda/compute/algorithms/_scan.py +344 -0
- cuda/compute/algorithms/_segmented_reduce.py +265 -0
- cuda/compute/algorithms/_select.py +196 -0
- cuda/compute/algorithms/_sort/__init__.py +23 -0
- cuda/compute/algorithms/_sort/_merge_sort.py +235 -0
- cuda/compute/algorithms/_sort/_radix_sort.py +263 -0
- cuda/compute/algorithms/_sort/_segmented_sort.py +288 -0
- cuda/compute/algorithms/_sort/_sort_common.py +52 -0
- cuda/compute/algorithms/_three_way_partition.py +292 -0
- cuda/compute/algorithms/_transform.py +317 -0
- cuda/compute/algorithms/_unique_by_key.py +259 -0
- cuda/compute/cccl/.gitkeep +0 -0
- cuda/compute/cu12/_bindings_impl.cpython-312-x86_64-linux-gnu.so +0 -0
- cuda/compute/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/cu13/_bindings_impl.cpython-312-x86_64-linux-gnu.so +0 -0
- cuda/compute/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/determinism.py +3 -0
- cuda/compute/iterators/__init__.py +23 -0
- cuda/compute/iterators/_factories.py +251 -0
- cuda/compute/iterators/_iterators.py +680 -0
- cuda/compute/iterators/_permutation_iterator.py +266 -0
- cuda/compute/iterators/_zip_iterator.py +268 -0
- cuda/compute/numba_utils.py +54 -0
- cuda/compute/op.py +140 -0
- cuda/compute/struct.py +520 -0
- cuda/compute/typing.py +36 -0
- cuda/coop/__init__.py +8 -0
- cuda/coop/_caching.py +48 -0
- cuda/coop/_common.py +275 -0
- cuda/coop/_nvrtc.py +92 -0
- cuda/coop/_scan_op.py +181 -0
- cuda/coop/_types.py +937 -0
- cuda/coop/_typing.py +107 -0
- cuda/coop/block/__init__.py +39 -0
- cuda/coop/block/_block_exchange.py +251 -0
- cuda/coop/block/_block_load_store.py +215 -0
- cuda/coop/block/_block_merge_sort.py +125 -0
- cuda/coop/block/_block_radix_sort.py +214 -0
- cuda/coop/block/_block_reduce.py +294 -0
- cuda/coop/block/_block_scan.py +983 -0
- cuda/coop/warp/__init__.py +9 -0
- cuda/coop/warp/_warp_merge_sort.py +92 -0
- cuda/coop/warp/_warp_reduce.py +153 -0
- cuda/coop/warp/_warp_scan.py +78 -0
- cuda_cccl-0.4.3.dist-info/METADATA +84 -0
- cuda_cccl-0.4.3.dist-info/RECORD +2024 -0
- cuda_cccl-0.4.3.dist-info/WHEEL +5 -0
- cuda_cccl-0.4.3.dist-info/licenses/LICENSE +1 -0
|
@@ -0,0 +1,1398 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
3
|
+
|
|
4
|
+
//! @file
|
|
5
|
+
//! cub::DeviceSegmentedScan provides device-wide, parallel operations for computing a batched prefix
|
|
6
|
+
//! scan across multiple sequences of data items residing within device-accessible memory.
|
|
7
|
+
|
|
8
|
+
#pragma once
|
|
9
|
+
|
|
10
|
+
#include <cub/config.cuh>
|
|
11
|
+
|
|
12
|
+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
|
|
13
|
+
# pragma GCC system_header
|
|
14
|
+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
|
|
15
|
+
# pragma clang system_header
|
|
16
|
+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
|
|
17
|
+
# pragma system_header
|
|
18
|
+
#endif // no system header
|
|
19
|
+
|
|
20
|
+
#include <cub/device/dispatch/dispatch_segmented_scan.cuh>
|
|
21
|
+
|
|
22
|
+
#include <cuda/std/cstdint>
|
|
23
|
+
|
|
24
|
+
CUB_NAMESPACE_BEGIN
|
|
25
|
+
|
|
26
|
+
//! @rst
|
|
27
|
+
//! DeviceSegmentedScan provides device-wide, parallel operations for computing a
|
|
28
|
+
//! batched prefix scan across multiple sequences of data items residing within
|
|
29
|
+
//! device-accessible memory.
|
|
30
|
+
//!
|
|
31
|
+
//! Overview
|
|
32
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
33
|
+
//!
|
|
34
|
+
//! Given a sequence of input elements and a binary reduction operator, a
|
|
35
|
+
//! `prefix scan <http://en.wikipedia.org/wiki/Prefix_sum>`_ produces an output
|
|
36
|
+
//! sequence where each element is computed to be the reduction of the elements
|
|
37
|
+
//! occurring earlier in the input sequence. *Prefix sum* connotes a prefix scan
|
|
38
|
+
//! with the addition operator. The term *inclusive* indicates that the
|
|
39
|
+
//! \ *i*\ :sup:`th` output reduction incorporates the \ *i*\ :sup:`th` input.
|
|
40
|
+
//! The term *exclusive* indicates the *i*\ :sup:`th` input is not
|
|
41
|
+
//! incorporated into the \ *i*\ :sup:`th` output reduction. When the input and
|
|
42
|
+
//! output sequences are the same, the scan is performed in-place.
|
|
43
|
+
//!
|
|
44
|
+
//! In order to provide an efficient parallel implementation, the binary reduction operator must be associative. That
|
|
45
|
+
//! is, ``op(op(a, b), c)`` must be equivalent to ``op(a, op(b, c))`` for any input values ``a``, ``b``, and ``c``.
|
|
46
|
+
//!
|
|
47
|
+
//! Usage Considerations
|
|
48
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
49
|
+
//!
|
|
50
|
+
//! @cdp_class{DeviceSegmentedScan}
|
|
51
|
+
//!
|
|
52
|
+
//! @endrst
|
|
53
|
+
struct DeviceSegmentedScan
|
|
54
|
+
{
|
|
55
|
+
//! @rst
|
|
56
|
+
//! Computes a device-wide segmented exclusive prefix sum.
|
|
57
|
+
//!
|
|
58
|
+
//! - Results are not deterministic for computation of prefix sum on floating-point types
|
|
59
|
+
//! and may vary from run to run.
|
|
60
|
+
//! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The input and output sequences
|
|
61
|
+
//! shall not overlap in any other way.
|
|
62
|
+
//! - @devicestorage
|
|
63
|
+
//!
|
|
64
|
+
//! Preconditions
|
|
65
|
+
//! +++++++++++++
|
|
66
|
+
//!
|
|
67
|
+
//! - When ``d_in`` and ``d_out`` are equal, the segmented scan is performed in-place.
|
|
68
|
+
//! The range ``[d_in, d_in + num_items_in)`` and ``[d_out, d_out + num_items_out)``
|
|
69
|
+
//! shall not overlap in any other way.
|
|
70
|
+
//! - ``d_in`` and ``d_out`` must not be null pointers
|
|
71
|
+
//!
|
|
72
|
+
//! Snippet
|
|
73
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
74
|
+
//!
|
|
75
|
+
//! The code snippet below illustrates the exclusive segmented prefix sum of an ``int``
|
|
76
|
+
//! device vector.
|
|
77
|
+
//!
|
|
78
|
+
//! .. code-block:: c++
|
|
79
|
+
//!
|
|
80
|
+
//! #include <cub/cub.cuh>
|
|
81
|
+
//! // or, equivalently
|
|
82
|
+
//! // #include <cub/device/device_segmented_scan.cuh>
|
|
83
|
+
//!
|
|
84
|
+
//! // Declare, allocate, and initialize device-accessible pointers for
|
|
85
|
+
//! // input and output
|
|
86
|
+
//! int num_segments; // e.g., 3
|
|
87
|
+
//! int *d_in; // e.g., [8, 6, 7, 5, 3, -2, 9]
|
|
88
|
+
//! int *d_offsets; // e.g., [0, 2, 5, 7]
|
|
89
|
+
//! int *d_out; // e.g., [ , , , , , , ]
|
|
90
|
+
//! ...
|
|
91
|
+
//!
|
|
92
|
+
//! // Determine temporary device storage requirements
|
|
93
|
+
//! void *d_temp_storage = nullptr;
|
|
94
|
+
//! size_t temp_storage_bytes = 0;
|
|
95
|
+
//! cub::DeviceSegmentedScan::ExclusiveSegmentedSum(
|
|
96
|
+
//! d_temp_storage, temp_storage_bytes,
|
|
97
|
+
//! d_in, d_out, d_offsets, d_offsets + 1, num_segments);
|
|
98
|
+
//!
|
|
99
|
+
//! // Allocate temporary storage
|
|
100
|
+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
|
|
101
|
+
//!
|
|
102
|
+
//! // Run exclusive prefix sum
|
|
103
|
+
//! cub::DeviceSegmentedScan::ExclusiveSegmentedSum(
|
|
104
|
+
//! d_temp_storage, temp_storage_bytes,
|
|
105
|
+
//! d_in, d_out, d_offsets, d_offsets + 1, num_segments);
|
|
106
|
+
//!
|
|
107
|
+
//! // d_out <-- [0, 8, 0, 7, 12, 0, -2]
|
|
108
|
+
//!
|
|
109
|
+
//! @endrst
|
|
110
|
+
//!
|
|
111
|
+
//! @tparam InputIteratorT
|
|
112
|
+
//! **[inferred]** Random-access input iterator type for reading segmented scan inputs @iterator
|
|
113
|
+
//!
|
|
114
|
+
//! @tparam OutputIteratorT
|
|
115
|
+
//! **[inferred]** Random-access output iterator type for writing segmented scan outputs @iterator
|
|
116
|
+
//!
|
|
117
|
+
//! @tparam BeginOffsetIteratorInputT
|
|
118
|
+
//! **[inferred]** Random-access input iterator type for reading segment beginning offsets in the input data
|
|
119
|
+
//! sequence @iterator
|
|
120
|
+
//!
|
|
121
|
+
//! @tparam EndOffsetIteratorInputT
|
|
122
|
+
//! **[inferred]** Random-access input iterator type for reading segment ending offsets in the input data sequence
|
|
123
|
+
//! @iterator
|
|
124
|
+
//!
|
|
125
|
+
//! @param[in] d_temp_storage
|
|
126
|
+
//! Device-accessible allocation of temporary storage. When `nullptr`, the
|
|
127
|
+
//! required allocation size is written to `temp_storage_bytes` and no work is done.
|
|
128
|
+
//!
|
|
129
|
+
//! @param[in,out] temp_storage_bytes
|
|
130
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
131
|
+
//!
|
|
132
|
+
//! @param[in] d_in
|
|
133
|
+
//! Random-access iterator to the input sequence of data items
|
|
134
|
+
//!
|
|
135
|
+
//! @param[out] d_out
|
|
136
|
+
//! Random-access iterator to the output sequence of data items
|
|
137
|
+
//!
|
|
138
|
+
//! @param[in] d_in_begin_offsets
|
|
139
|
+
//! @rst
|
|
140
|
+
//! Random-access input iterator to the sequence of beginning offsets of
|
|
141
|
+
//! length ``num_segments``, such that ``d_in_begin_offsets[i]`` is the first
|
|
142
|
+
//! element of the \ *i*\ :sup:`th` data segment in ``d_in`` and in ``d_out``.
|
|
143
|
+
//! @endrst
|
|
144
|
+
//!
|
|
145
|
+
//! @param[in] d_in_end_offsets
|
|
146
|
+
//! @rst
|
|
147
|
+
//! Random-access input iterator to the sequence of ending offsets of length
|
|
148
|
+
//! ``num_segments``, such that ``d_in_end_offsets[i] - 1`` is the last element of
|
|
149
|
+
//! the \ *i*\ :sup:`th` data segment in ``d_in``.
|
|
150
|
+
//! If ``d_in_end_offsets[i] <= d_in_begin_offsets[i]``, the \ *i*\ :sup:`th`
|
|
151
|
+
//! segment is considered empty.
|
|
152
|
+
//! @endrst
|
|
153
|
+
//!
|
|
154
|
+
//! @param[in] num_segments
|
|
155
|
+
//! The number of segments that comprise the segmented prefix scan data.
|
|
156
|
+
//!
|
|
157
|
+
//! @param[in] stream
|
|
158
|
+
//! @rst
|
|
159
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
160
|
+
//! @endrst
|
|
161
|
+
template <typename InputIteratorT,
|
|
162
|
+
typename OutputIteratorT,
|
|
163
|
+
typename BeginOffsetIteratorInputT,
|
|
164
|
+
typename EndOffsetIteratorInputT>
|
|
165
|
+
CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSegmentedSum(
|
|
166
|
+
void* d_temp_storage,
|
|
167
|
+
size_t& temp_storage_bytes,
|
|
168
|
+
InputIteratorT d_in,
|
|
169
|
+
OutputIteratorT d_out,
|
|
170
|
+
BeginOffsetIteratorInputT d_in_begin_offsets,
|
|
171
|
+
EndOffsetIteratorInputT d_in_end_offsets,
|
|
172
|
+
::cuda::std::int64_t num_segments,
|
|
173
|
+
cudaStream_t stream = 0)
|
|
174
|
+
{
|
|
175
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedScan::ExclusiveSegmentedSum");
|
|
176
|
+
|
|
177
|
+
using offset_t = detail::common_iterator_value_t<BeginOffsetIteratorInputT, EndOffsetIteratorInputT>;
|
|
178
|
+
using integral_offset_check = ::cuda::std::is_integral<offset_t>;
|
|
179
|
+
|
|
180
|
+
static_assert(integral_offset_check::value, "Offset iterator value type should be integral.");
|
|
181
|
+
|
|
182
|
+
using scan_op_t = ::cuda::std::plus<>;
|
|
183
|
+
scan_op_t scan_op{};
|
|
184
|
+
|
|
185
|
+
using init_value_t = cub::detail::it_value_t<InputIteratorT>;
|
|
186
|
+
init_value_t init_value{};
|
|
187
|
+
|
|
188
|
+
return cub::detail::segmented_scan::dispatch_segmented_scan<
|
|
189
|
+
InputIteratorT,
|
|
190
|
+
OutputIteratorT,
|
|
191
|
+
BeginOffsetIteratorInputT,
|
|
192
|
+
EndOffsetIteratorInputT,
|
|
193
|
+
BeginOffsetIteratorInputT,
|
|
194
|
+
scan_op_t,
|
|
195
|
+
detail::InputValue<init_value_t>>::
|
|
196
|
+
dispatch(
|
|
197
|
+
d_temp_storage,
|
|
198
|
+
temp_storage_bytes,
|
|
199
|
+
d_in,
|
|
200
|
+
d_out,
|
|
201
|
+
num_segments,
|
|
202
|
+
d_in_begin_offsets,
|
|
203
|
+
d_in_end_offsets,
|
|
204
|
+
d_in_begin_offsets,
|
|
205
|
+
scan_op,
|
|
206
|
+
detail::InputValue<init_value_t>(init_value),
|
|
207
|
+
stream);
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
//! @rst
|
|
211
|
+
//! Computes a device-wide segmented exclusive prefix sum.
|
|
212
|
+
//!
|
|
213
|
+
//! - Results are not deterministic for computation of prefix sum on floating-point types
|
|
214
|
+
//! and may vary from run to run.
|
|
215
|
+
//! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The input and output sequences
|
|
216
|
+
//! shall not overlap in any other way.
|
|
217
|
+
//! - @devicestorage
|
|
218
|
+
//!
|
|
219
|
+
//! Snippet
|
|
220
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
221
|
+
//!
|
|
222
|
+
//! The code snippet below illustrates the exclusive segmented prefix sum of an ``int``
|
|
223
|
+
//! device vector.
|
|
224
|
+
//!
|
|
225
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_segmented_scan_api.cu
|
|
226
|
+
//! :language: c++
|
|
227
|
+
//! :dedent:
|
|
228
|
+
//! :start-after: example-begin exclusive-segmented-sum-three-offsets
|
|
229
|
+
//! :end-before: example-end exclusive-segmented-sum-three-offsets
|
|
230
|
+
//!
|
|
231
|
+
//! @endrst
|
|
232
|
+
//! @tparam InputIteratorT
|
|
233
|
+
//! **[inferred]** Random-access input iterator type for reading segmented scan inputs @iterator
|
|
234
|
+
//!
|
|
235
|
+
//! @tparam OutputIteratorT
|
|
236
|
+
//! **[inferred]** Random-access output iterator type for writing segmented scan outputs @iterator
|
|
237
|
+
//!
|
|
238
|
+
//! @tparam BeginOffsetIteratorInputT
|
|
239
|
+
//! **[inferred]** Random-access input iterator type for reading segment beginning offsets in the input data
|
|
240
|
+
//! sequence @iterator
|
|
241
|
+
//!
|
|
242
|
+
//! @tparam EndOffsetIteratorInputT
|
|
243
|
+
//! **[inferred]** Random-access input iterator type for reading segment ending offsets in the input data sequence
|
|
244
|
+
//! @iterator
|
|
245
|
+
//!
|
|
246
|
+
//! @tparam BeginOffsetIteratorOutputT
|
|
247
|
+
//! **[inferred]** Random-access input iterator type for reading segment beginning offsets in the output sequence
|
|
248
|
+
//! @iterator
|
|
249
|
+
//!
|
|
250
|
+
//! @param[in] d_temp_storage
|
|
251
|
+
//! Device-accessible allocation of temporary storage. When `nullptr`, the
|
|
252
|
+
//! required allocation size is written to `temp_storage_bytes` and no work is done.
|
|
253
|
+
//!
|
|
254
|
+
//! @param[in,out] temp_storage_bytes
|
|
255
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
256
|
+
//!
|
|
257
|
+
//! @param[in] d_in
|
|
258
|
+
//! Random-access iterator to the input sequence of data items
|
|
259
|
+
//!
|
|
260
|
+
//! @param[out] d_out
|
|
261
|
+
//! Random-access iterator to the output sequence of data items
|
|
262
|
+
//!
|
|
263
|
+
//! @param[in] d_in_begin_offsets
|
|
264
|
+
//! @rst
|
|
265
|
+
//! Random-access input iterator to the sequence of beginning offsets of
|
|
266
|
+
//! length ``num_segments``, such that ``d_in_begin_offsets[i]`` is the first
|
|
267
|
+
//! element of the \ *i*\ :sup:`th` data segment in ``d_in``
|
|
268
|
+
//! @endrst
|
|
269
|
+
//!
|
|
270
|
+
//! @param[in] d_in_end_offsets
|
|
271
|
+
//! @rst
|
|
272
|
+
//! Random-access input iterator to the sequence of ending offsets of length
|
|
273
|
+
//! ``num_segments``, such that ``d_in_end_offsets[i] - 1`` is the last element of
|
|
274
|
+
//! the \ *i*\ :sup:`th` data segment in ``d_in``.
|
|
275
|
+
//! If ``d_in_end_offsets[i] <= d_in_begin_offsets[i]``, the \ *i*\ :sup:`th`
|
|
276
|
+
//! segment is considered empty.
|
|
277
|
+
//! @endrst
|
|
278
|
+
//!
|
|
279
|
+
//! @param[in] d_out_begin_offsets
|
|
280
|
+
//! @rst
|
|
281
|
+
//! Random-access input iterator to the sequence of beginning offsets of
|
|
282
|
+
//! length ``num_segments``, such that ``d_out_begin_offsets[i]`` is the first
|
|
283
|
+
//! element of the \ *i*\ :sup:`th` data segment in ``d_out``
|
|
284
|
+
//! @endrst
|
|
285
|
+
//!
|
|
286
|
+
//! @param[in] num_segments
|
|
287
|
+
//! The number of segments that comprise the segmented prefix scan data.
|
|
288
|
+
//!
|
|
289
|
+
//! @param[in] stream
|
|
290
|
+
//! @rst
|
|
291
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
292
|
+
//! @endrst
|
|
293
|
+
template <typename InputIteratorT,
|
|
294
|
+
typename OutputIteratorT,
|
|
295
|
+
typename BeginOffsetIteratorInputT,
|
|
296
|
+
typename EndOffsetIteratorInputT,
|
|
297
|
+
typename BeginOffsetIteratorOutputT>
|
|
298
|
+
CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSegmentedSum(
|
|
299
|
+
void* d_temp_storage,
|
|
300
|
+
size_t& temp_storage_bytes,
|
|
301
|
+
InputIteratorT d_in,
|
|
302
|
+
OutputIteratorT d_out,
|
|
303
|
+
BeginOffsetIteratorInputT d_in_begin_offsets,
|
|
304
|
+
EndOffsetIteratorInputT d_in_end_offsets,
|
|
305
|
+
BeginOffsetIteratorOutputT d_out_begin_offsets,
|
|
306
|
+
::cuda::std::int64_t num_segments,
|
|
307
|
+
cudaStream_t stream = 0)
|
|
308
|
+
{
|
|
309
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedScan::ExclusiveSegmentedSum");
|
|
310
|
+
|
|
311
|
+
using offset_t =
|
|
312
|
+
detail::common_iterator_value_t<BeginOffsetIteratorInputT, EndOffsetIteratorInputT, BeginOffsetIteratorOutputT>;
|
|
313
|
+
using integral_offset_check = ::cuda::std::is_integral<offset_t>;
|
|
314
|
+
|
|
315
|
+
static_assert(integral_offset_check::value, "Offset iterator value type should be integral.");
|
|
316
|
+
|
|
317
|
+
using scan_op_t = ::cuda::std::plus<>;
|
|
318
|
+
scan_op_t scan_op{};
|
|
319
|
+
|
|
320
|
+
using init_value_t = cub::detail::it_value_t<InputIteratorT>;
|
|
321
|
+
init_value_t init_value{};
|
|
322
|
+
|
|
323
|
+
return cub::detail::segmented_scan::dispatch_segmented_scan<
|
|
324
|
+
InputIteratorT,
|
|
325
|
+
OutputIteratorT,
|
|
326
|
+
BeginOffsetIteratorInputT,
|
|
327
|
+
EndOffsetIteratorInputT,
|
|
328
|
+
BeginOffsetIteratorOutputT,
|
|
329
|
+
scan_op_t,
|
|
330
|
+
detail::InputValue<init_value_t>>::
|
|
331
|
+
dispatch(
|
|
332
|
+
d_temp_storage,
|
|
333
|
+
temp_storage_bytes,
|
|
334
|
+
d_in,
|
|
335
|
+
d_out,
|
|
336
|
+
num_segments,
|
|
337
|
+
d_in_begin_offsets,
|
|
338
|
+
d_in_end_offsets,
|
|
339
|
+
d_out_begin_offsets,
|
|
340
|
+
scan_op,
|
|
341
|
+
detail::InputValue<init_value_t>(init_value),
|
|
342
|
+
stream);
|
|
343
|
+
}
|
|
344
|
+
|
|
345
|
+
//! @rst
|
|
346
|
+
//! Computes a device-wide segmented exclusive prefix scan using the specified
|
|
347
|
+
//! binary associative ``scan_op`` functor. The ``init_value`` value is applied as
|
|
348
|
+
//! the initial value, and is assigned to the first element in each output segment.
|
|
349
|
+
//!
|
|
350
|
+
//! - Supports non-commutative scan operators.
|
|
351
|
+
//! - Results are not deterministic for pseudo-associative operators (e.g.,
|
|
352
|
+
//! addition of floating-point types). Results for pseudo-associative
|
|
353
|
+
//! operators may vary from run to run.
|
|
354
|
+
//! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The input and output sequences
|
|
355
|
+
//! shall not overlap in any other way.
|
|
356
|
+
//! - @devicestorage
|
|
357
|
+
//!
|
|
358
|
+
//! Snippet
|
|
359
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
360
|
+
//!
|
|
361
|
+
//! The code snippet below illustrates the exclusive segmented prefix scan of an ``int``
|
|
362
|
+
//! device vector.
|
|
363
|
+
//!
|
|
364
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_segmented_scan_api.cu
|
|
365
|
+
//! :language: c++
|
|
366
|
+
//! :dedent:
|
|
367
|
+
//! :start-after: example-begin exclusive-segmented-scan-two-offsets
|
|
368
|
+
//! :end-before: example-end exclusive-segmented-scan-two-offsets
|
|
369
|
+
//!
|
|
370
|
+
//! @endrst
|
|
371
|
+
//! @tparam InputIteratorT
|
|
372
|
+
//! **[inferred]** Random-access input iterator type for reading segmented scan inputs @iterator
|
|
373
|
+
//!
|
|
374
|
+
//! @tparam OutputIteratorT
|
|
375
|
+
//! **[inferred]** Random-access output iterator type for writing segmented scan outputs @iterator
|
|
376
|
+
//!
|
|
377
|
+
//! @tparam BeginOffsetIteratorInputT
|
|
378
|
+
//! **[inferred]** Random-access input iterator type for reading segment beginning offsets in the input data
|
|
379
|
+
//! sequence @iterator
|
|
380
|
+
//!
|
|
381
|
+
//! @tparam EndOffsetIteratorInputT
|
|
382
|
+
//! **[inferred]** Random-access input iterator type for reading segment ending offsets in the input data sequence
|
|
383
|
+
//! @iterator
|
|
384
|
+
//!
|
|
385
|
+
//! @tparam ScanOpT
|
|
386
|
+
//! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
|
|
387
|
+
//!
|
|
388
|
+
//! @tparam InitValueT
|
|
389
|
+
//! **[inferred]** Type of the `init_value`
|
|
390
|
+
//!
|
|
391
|
+
//! @param[in] d_temp_storage
|
|
392
|
+
//! Device-accessible allocation of temporary storage. When `nullptr`, the
|
|
393
|
+
//! required allocation size is written to `temp_storage_bytes` and no work is done.
|
|
394
|
+
//!
|
|
395
|
+
//! @param[in,out] temp_storage_bytes
|
|
396
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
397
|
+
//!
|
|
398
|
+
//! @param[in] d_in
|
|
399
|
+
//! Random-access iterator to the input sequence of data items
|
|
400
|
+
//!
|
|
401
|
+
//! @param[out] d_out
|
|
402
|
+
//! Random-access iterator to the output sequence of data items
|
|
403
|
+
//!
|
|
404
|
+
//! @param[in] d_in_begin_offsets
|
|
405
|
+
//! @rst
|
|
406
|
+
//! Random-access input iterator to the sequence of beginning offsets of
|
|
407
|
+
//! length ``num_segments``, such that ``d_in_begin_offsets[i]`` is the first
|
|
408
|
+
//! element of the \ *i*\ :sup:`th` data segment in ``d_in`` and in ``d_out``
|
|
409
|
+
//! @endrst
|
|
410
|
+
//!
|
|
411
|
+
//! @param[in] d_in_end_offsets
|
|
412
|
+
//! @rst
|
|
413
|
+
//! Random-access input iterator to the sequence of ending offsets of length
|
|
414
|
+
//! ``num_segments``, such that ``d_in_end_offsets[i] - 1`` is the last element of
|
|
415
|
+
//! the \ *i*\ :sup:`th` data segment in ``d_in``.
|
|
416
|
+
//! If ``d_in_end_offsets[i] <= d_in_begin_offsets[i]``, the \ *i*\ :sup:`th`
|
|
417
|
+
//! segment is considered empty.
|
|
418
|
+
//! @endrst
|
|
419
|
+
//!
|
|
420
|
+
//! @param[in] num_segments
|
|
421
|
+
//! The number of segments that comprise the segmented prefix scan data.
|
|
422
|
+
//!
|
|
423
|
+
//! @param[in] scan_op
|
|
424
|
+
//! Binary associative scan functor
|
|
425
|
+
//!
|
|
426
|
+
//! @param[in] init_value
|
|
427
|
+
//! Initial value to seed the exclusive scan for each segment in the output sequence
|
|
428
|
+
//!
|
|
429
|
+
//! @param[in] stream
|
|
430
|
+
//! @rst
|
|
431
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
432
|
+
//! @endrst
|
|
433
|
+
template <typename InputIteratorT,
|
|
434
|
+
typename OutputIteratorT,
|
|
435
|
+
typename BeginOffsetIteratorInputT,
|
|
436
|
+
typename EndOffsetIteratorInputT,
|
|
437
|
+
typename ScanOpT,
|
|
438
|
+
typename InitValueT>
|
|
439
|
+
CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSegmentedScan(
|
|
440
|
+
void* d_temp_storage,
|
|
441
|
+
size_t& temp_storage_bytes,
|
|
442
|
+
InputIteratorT d_in,
|
|
443
|
+
OutputIteratorT d_out,
|
|
444
|
+
BeginOffsetIteratorInputT d_in_begin_offsets,
|
|
445
|
+
EndOffsetIteratorInputT d_in_end_offsets,
|
|
446
|
+
::cuda::std::int64_t num_segments,
|
|
447
|
+
ScanOpT scan_op,
|
|
448
|
+
InitValueT init_value,
|
|
449
|
+
cudaStream_t stream = 0)
|
|
450
|
+
{
|
|
451
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedScan::ExclusiveSegmentedScan");
|
|
452
|
+
|
|
453
|
+
using offset_t = detail::common_iterator_value_t<BeginOffsetIteratorInputT, EndOffsetIteratorInputT>;
|
|
454
|
+
using integral_offset_check = ::cuda::std::is_integral<offset_t>;
|
|
455
|
+
|
|
456
|
+
static_assert(integral_offset_check::value, "Offset iterator value type should be integral.");
|
|
457
|
+
|
|
458
|
+
return cub::detail::segmented_scan::dispatch_segmented_scan<
|
|
459
|
+
InputIteratorT,
|
|
460
|
+
OutputIteratorT,
|
|
461
|
+
BeginOffsetIteratorInputT,
|
|
462
|
+
EndOffsetIteratorInputT,
|
|
463
|
+
BeginOffsetIteratorInputT,
|
|
464
|
+
ScanOpT,
|
|
465
|
+
detail::InputValue<InitValueT>>::
|
|
466
|
+
dispatch(
|
|
467
|
+
d_temp_storage,
|
|
468
|
+
temp_storage_bytes,
|
|
469
|
+
d_in,
|
|
470
|
+
d_out,
|
|
471
|
+
num_segments,
|
|
472
|
+
d_in_begin_offsets,
|
|
473
|
+
d_in_end_offsets,
|
|
474
|
+
d_in_begin_offsets,
|
|
475
|
+
scan_op,
|
|
476
|
+
detail::InputValue<InitValueT>(init_value),
|
|
477
|
+
stream);
|
|
478
|
+
}
|
|
479
|
+
|
|
480
|
+
//! @rst
|
|
481
|
+
//! Computes a device-wide segmented exclusive prefix scan using the specified
|
|
482
|
+
//! binary associative ``scan_op`` functor. The ``init_value`` value is applied as
|
|
483
|
+
//! the initial value, and is assigned to the first element in each output segment.
|
|
484
|
+
//!
|
|
485
|
+
//! - Supports non-commutative scan operators.
|
|
486
|
+
//! - Results are not deterministic for pseudo-associative operators (e.g.,
|
|
487
|
+
//! addition of floating-point types). Results for pseudo-associative
|
|
488
|
+
//! operators may vary from run to run.
|
|
489
|
+
//! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The input and output sequences
|
|
490
|
+
//! shall not overlap in any other way.
|
|
491
|
+
//! - @devicestorage
|
|
492
|
+
//!
|
|
493
|
+
//! @endrst
|
|
494
|
+
//! @tparam InputIteratorT
|
|
495
|
+
//! **[inferred]** Random-access input iterator type for reading segmented scan inputs @iterator
|
|
496
|
+
//!
|
|
497
|
+
//! @tparam OutputIteratorT
|
|
498
|
+
//! **[inferred]** Random-access output iterator type for writing segmented scan outputs @iterator
|
|
499
|
+
//!
|
|
500
|
+
//! @tparam BeginOffsetIteratorInputT
|
|
501
|
+
//! **[inferred]** Random-access input iterator type for reading segment beginning offsets in the input data
|
|
502
|
+
//! sequence @iterator
|
|
503
|
+
//!
|
|
504
|
+
//! @tparam EndOffsetIteratorInputT
|
|
505
|
+
//! **[inferred]** Random-access input iterator type for reading segment ending offsets in the input data sequence
|
|
506
|
+
//! @iterator
|
|
507
|
+
//!
|
|
508
|
+
//! @tparam BeginOffsetIteratorOutputT
|
|
509
|
+
//! **[inferred]** Random-access input iterator type for reading segment beginning offsets in the output sequence
|
|
510
|
+
//! @iterator
|
|
511
|
+
//!
|
|
512
|
+
//! @tparam ScanOpT
|
|
513
|
+
//! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
|
|
514
|
+
//!
|
|
515
|
+
//! @tparam InitValueT
|
|
516
|
+
//! **[inferred]** Type of the `init_value`
|
|
517
|
+
//!
|
|
518
|
+
//! @param[in] d_temp_storage
|
|
519
|
+
//! Device-accessible allocation of temporary storage. When `nullptr`, the
|
|
520
|
+
//! required allocation size is written to `temp_storage_bytes` and no work is done.
|
|
521
|
+
//!
|
|
522
|
+
//! @param[in,out] temp_storage_bytes
|
|
523
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
524
|
+
//!
|
|
525
|
+
//! @param[in] d_in
|
|
526
|
+
//! Random-access iterator to the input sequence of data items
|
|
527
|
+
//!
|
|
528
|
+
//! @param[out] d_out
|
|
529
|
+
//! Random-access iterator to the output sequence of data items
|
|
530
|
+
//!
|
|
531
|
+
//! @param[in] d_in_begin_offsets
|
|
532
|
+
//! @rst
|
|
533
|
+
//! Random-access input iterator to the sequence of beginning offsets of
|
|
534
|
+
//! length ``num_segments``, such that ``d_in_begin_offsets[i]`` is the first
|
|
535
|
+
//! element of the \ *i*\ :sup:`th` data segment in ``d_in``
|
|
536
|
+
//! @endrst
|
|
537
|
+
//!
|
|
538
|
+
//! @param[in] d_in_end_offsets
|
|
539
|
+
//! @rst
|
|
540
|
+
//! Random-access input iterator to the sequence of ending offsets of length
|
|
541
|
+
//! ``num_segments``, such that ``d_in_end_offsets[i] - 1`` is the last element of
|
|
542
|
+
//! the \ *i*\ :sup:`th` data segment in ``d_in``.
|
|
543
|
+
//! If ``d_in_end_offsets[i] <= d_in_begin_offsets[i]``, the \ *i*\ :sup:`th`
|
|
544
|
+
//! segment is considered empty.
|
|
545
|
+
//! @endrst
|
|
546
|
+
//!
|
|
547
|
+
//! @param[in] d_out_begin_offsets
|
|
548
|
+
//! @rst
|
|
549
|
+
//! Random-access input iterator to the sequence of beginning offsets of
|
|
550
|
+
//! length ``num_segments``, such that ``d_out_begin_offsets[i]`` is the first
|
|
551
|
+
//! element of the \ *i*\ :sup:`th` data segment in ``d_out``
|
|
552
|
+
//! @endrst
|
|
553
|
+
//!
|
|
554
|
+
//! @param[in] num_segments
|
|
555
|
+
//! The number of segments that comprise the segmented prefix scan data.
|
|
556
|
+
//!
|
|
557
|
+
//! @param[in] scan_op
|
|
558
|
+
//! Binary associative scan functor
|
|
559
|
+
//!
|
|
560
|
+
//! @param[in] init_value
|
|
561
|
+
//! Initial value to seed the exclusive scan for each segment in the output sequence
|
|
562
|
+
//!
|
|
563
|
+
//! @param[in] stream
|
|
564
|
+
//! @rst
|
|
565
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
566
|
+
//! @endrst
|
|
567
|
+
template <typename InputIteratorT,
|
|
568
|
+
typename OutputIteratorT,
|
|
569
|
+
typename BeginOffsetIteratorInputT,
|
|
570
|
+
typename EndOffsetIteratorInputT,
|
|
571
|
+
typename BeginOffsetIteratorOutputT,
|
|
572
|
+
typename ScanOpT,
|
|
573
|
+
typename InitValueT>
|
|
574
|
+
CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSegmentedScan(
|
|
575
|
+
void* d_temp_storage,
|
|
576
|
+
size_t& temp_storage_bytes,
|
|
577
|
+
InputIteratorT d_in,
|
|
578
|
+
OutputIteratorT d_out,
|
|
579
|
+
BeginOffsetIteratorInputT d_in_begin_offsets,
|
|
580
|
+
EndOffsetIteratorInputT d_in_end_offsets,
|
|
581
|
+
BeginOffsetIteratorOutputT d_out_begin_offsets,
|
|
582
|
+
::cuda::std::int64_t num_segments,
|
|
583
|
+
ScanOpT scan_op,
|
|
584
|
+
InitValueT init_value,
|
|
585
|
+
cudaStream_t stream = 0)
|
|
586
|
+
{
|
|
587
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedScan::ExclusiveSegmentedScan");
|
|
588
|
+
|
|
589
|
+
using offset_t =
|
|
590
|
+
detail::common_iterator_value_t<BeginOffsetIteratorInputT, EndOffsetIteratorInputT, BeginOffsetIteratorOutputT>;
|
|
591
|
+
using integral_offset_check = ::cuda::std::is_integral<offset_t>;
|
|
592
|
+
|
|
593
|
+
static_assert(integral_offset_check::value, "Offset iterator value type should be integral.");
|
|
594
|
+
|
|
595
|
+
return cub::detail::segmented_scan::dispatch_segmented_scan<
|
|
596
|
+
InputIteratorT,
|
|
597
|
+
OutputIteratorT,
|
|
598
|
+
BeginOffsetIteratorInputT,
|
|
599
|
+
EndOffsetIteratorInputT,
|
|
600
|
+
BeginOffsetIteratorOutputT,
|
|
601
|
+
ScanOpT,
|
|
602
|
+
detail::InputValue<InitValueT>>::
|
|
603
|
+
dispatch(
|
|
604
|
+
d_temp_storage,
|
|
605
|
+
temp_storage_bytes,
|
|
606
|
+
d_in,
|
|
607
|
+
d_out,
|
|
608
|
+
num_segments,
|
|
609
|
+
d_in_begin_offsets,
|
|
610
|
+
d_in_end_offsets,
|
|
611
|
+
d_out_begin_offsets,
|
|
612
|
+
scan_op,
|
|
613
|
+
detail::InputValue<InitValueT>(init_value),
|
|
614
|
+
stream);
|
|
615
|
+
}
|
|
616
|
+
|
|
617
|
+
//! @rst
|
|
618
|
+
//! Computes a device-wide segmented inclusive prefix sum.
|
|
619
|
+
//!
|
|
620
|
+
//! - Results are not deterministic for computation of prefix sum on floating-point types
|
|
621
|
+
//! and may vary from run to run.
|
|
622
|
+
//! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The input and output sequences
|
|
623
|
+
//! shall not overlap in any other way.
|
|
624
|
+
//! - @devicestorage
|
|
625
|
+
//!
|
|
626
|
+
//! Snippet
|
|
627
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
628
|
+
//!
|
|
629
|
+
//! The code snippet below illustrates the inclusive segmented prefix sum of an ``int``
|
|
630
|
+
//! device vector.
|
|
631
|
+
//!
|
|
632
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_segmented_scan_api.cu
|
|
633
|
+
//! :language: c++
|
|
634
|
+
//! :dedent:
|
|
635
|
+
//! :start-after: example-begin inclusive-segmented-sum-two-offsets
|
|
636
|
+
//! :end-before: example-end inclusive-segmented-sum-two-offsets
|
|
637
|
+
//!
|
|
638
|
+
//! @endrst
|
|
639
|
+
//! @tparam InputIteratorT
|
|
640
|
+
//! **[inferred]** Random-access input iterator type for reading segmented scan inputs @iterator
|
|
641
|
+
//!
|
|
642
|
+
//! @tparam OutputIteratorT
|
|
643
|
+
//! **[inferred]** Random-access output iterator type for writing segmented scan outputs @iterator
|
|
644
|
+
//!
|
|
645
|
+
//! @tparam BeginOffsetIteratorInputT
|
|
646
|
+
//! **[inferred]** Random-access input iterator type for reading segment beginning offsets in the input data
|
|
647
|
+
//! sequence @iterator
|
|
648
|
+
//!
|
|
649
|
+
//! @tparam EndOffsetIteratorInputT
|
|
650
|
+
//! **[inferred]** Random-access input iterator type for reading segment ending offsets in the input data sequence
|
|
651
|
+
//! @iterator
|
|
652
|
+
//!
|
|
653
|
+
//! @tparam ScanOpT
|
|
654
|
+
//! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
|
|
655
|
+
//!
|
|
656
|
+
//! @param[in] d_temp_storage
|
|
657
|
+
//! Device-accessible allocation of temporary storage. When `nullptr`, the
|
|
658
|
+
//! required allocation size is written to `temp_storage_bytes` and no work is done.
|
|
659
|
+
//!
|
|
660
|
+
//! @param[in,out] temp_storage_bytes
|
|
661
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
662
|
+
//!
|
|
663
|
+
//! @param[in] d_in
|
|
664
|
+
//! Random-access iterator to the input sequence of data items
|
|
665
|
+
//!
|
|
666
|
+
//! @param[out] d_out
|
|
667
|
+
//! Random-access iterator to the output sequence of data items
|
|
668
|
+
//!
|
|
669
|
+
//! @param[in] d_in_begin_offsets
|
|
670
|
+
//! @rst
|
|
671
|
+
//! Random-access input iterator to the sequence of beginning offsets of
|
|
672
|
+
//! length ``num_segments``, such that ``d_in_begin_offsets[i]`` is the first
|
|
673
|
+
//! element of the \ *i*\ :sup:`th` data segment in ``d_in`` and in ``d_out``
|
|
674
|
+
//! @endrst
|
|
675
|
+
//!
|
|
676
|
+
//! @param[in] d_in_end_offsets
|
|
677
|
+
//! @rst
|
|
678
|
+
//! Random-access input iterator to the sequence of ending offsets of length
|
|
679
|
+
//! ``num_segments``, such that ``d_in_end_offsets[i] - 1`` is the last element of
|
|
680
|
+
//! the \ *i*\ :sup:`th` data segment in ``d_in``.
|
|
681
|
+
//! If ``d_in_end_offsets[i] <= d_in_begin_offsets[i]``, the \ *i*\ :sup:`th`
|
|
682
|
+
//! segment is considered empty.
|
|
683
|
+
//! @endrst
|
|
684
|
+
//!
|
|
685
|
+
//! @param[in] num_segments
|
|
686
|
+
//! The number of segments that comprise the segmented prefix scan data.
|
|
687
|
+
//!
|
|
688
|
+
//! @param[in] stream
|
|
689
|
+
//! @rst
|
|
690
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
691
|
+
//! @endrst
|
|
692
|
+
template <typename InputIteratorT,
|
|
693
|
+
typename OutputIteratorT,
|
|
694
|
+
typename BeginOffsetIteratorInputT,
|
|
695
|
+
typename EndOffsetIteratorInputT>
|
|
696
|
+
CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSegmentedSum(
|
|
697
|
+
void* d_temp_storage,
|
|
698
|
+
size_t& temp_storage_bytes,
|
|
699
|
+
InputIteratorT d_in,
|
|
700
|
+
OutputIteratorT d_out,
|
|
701
|
+
BeginOffsetIteratorInputT d_in_begin_offsets,
|
|
702
|
+
EndOffsetIteratorInputT d_in_end_offsets,
|
|
703
|
+
::cuda::std::int64_t num_segments,
|
|
704
|
+
cudaStream_t stream = 0)
|
|
705
|
+
{
|
|
706
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedScan::InclusiveSegmentedSum");
|
|
707
|
+
|
|
708
|
+
using offset_t = detail::common_iterator_value_t<BeginOffsetIteratorInputT, EndOffsetIteratorInputT>;
|
|
709
|
+
using integral_offset_check = ::cuda::std::is_integral<offset_t>;
|
|
710
|
+
|
|
711
|
+
static_assert(integral_offset_check::value, "Offset iterator value type should be integral.");
|
|
712
|
+
|
|
713
|
+
using scan_op_t = ::cuda::std::plus<>;
|
|
714
|
+
scan_op_t scan_op{};
|
|
715
|
+
|
|
716
|
+
return cub::detail::segmented_scan::dispatch_segmented_scan<
|
|
717
|
+
InputIteratorT,
|
|
718
|
+
OutputIteratorT,
|
|
719
|
+
BeginOffsetIteratorInputT,
|
|
720
|
+
EndOffsetIteratorInputT,
|
|
721
|
+
BeginOffsetIteratorInputT,
|
|
722
|
+
scan_op_t,
|
|
723
|
+
NullType>::dispatch(d_temp_storage,
|
|
724
|
+
temp_storage_bytes,
|
|
725
|
+
d_in,
|
|
726
|
+
d_out,
|
|
727
|
+
num_segments,
|
|
728
|
+
d_in_begin_offsets,
|
|
729
|
+
d_in_end_offsets,
|
|
730
|
+
d_in_begin_offsets,
|
|
731
|
+
scan_op,
|
|
732
|
+
NullType(),
|
|
733
|
+
stream);
|
|
734
|
+
}
|
|
735
|
+
|
|
736
|
+
//! @rst
|
|
737
|
+
//! Computes a device-wide segmented inclusive prefix sum.
|
|
738
|
+
//!
|
|
739
|
+
//! - Results are not deterministic for computation of prefix sum on floating-point types
|
|
740
|
+
//! and may vary from run to run.
|
|
741
|
+
//! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The input and output sequences
|
|
742
|
+
//! shall not overlap in any other way.
|
|
743
|
+
//! - @devicestorage
|
|
744
|
+
//!
|
|
745
|
+
//! Snippet
|
|
746
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
747
|
+
//!
|
|
748
|
+
//! The code snippet below illustrates the inclusive segmented prefix sum of an ``int``
|
|
749
|
+
//! device vector.
|
|
750
|
+
//!
|
|
751
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_segmented_scan_api.cu
|
|
752
|
+
//! :language: c++
|
|
753
|
+
//! :dedent:
|
|
754
|
+
//! :start-after: example-begin inclusive-segmented-sum-three-offsets
|
|
755
|
+
//! :end-before: example-end inclusive-segmented-sum-three-offsets
|
|
756
|
+
//!
|
|
757
|
+
//! @endrst
|
|
758
|
+
//! @tparam InputIteratorT
|
|
759
|
+
//! **[inferred]** Random-access input iterator type for reading segmented scan inputs @iterator
|
|
760
|
+
//!
|
|
761
|
+
//! @tparam OutputIteratorT
|
|
762
|
+
//! **[inferred]** Random-access output iterator type for writing segmented scan outputs @iterator
|
|
763
|
+
//!
|
|
764
|
+
//! @tparam BeginOffsetIteratorInputT
|
|
765
|
+
//! **[inferred]** Random-access input iterator type for reading segment beginning offsets in the input data
|
|
766
|
+
//! sequence @iterator
|
|
767
|
+
//!
|
|
768
|
+
//! @tparam EndOffsetIteratorInputT
|
|
769
|
+
//! **[inferred]** Random-access input iterator type for reading segment ending offsets in the input data sequence
|
|
770
|
+
//! @iterator
|
|
771
|
+
//!
|
|
772
|
+
//! @tparam BeginOffsetIteratorOutputT
|
|
773
|
+
//! **[inferred]** Random-access input iterator type for reading segment beginning offsets in the output sequence
|
|
774
|
+
//! @iterator
|
|
775
|
+
//!
|
|
776
|
+
//! @tparam ScanOpT
|
|
777
|
+
//! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
|
|
778
|
+
//!
|
|
779
|
+
//! @param[in] d_temp_storage
|
|
780
|
+
//! Device-accessible allocation of temporary storage. When `nullptr`, the
|
|
781
|
+
//! required allocation size is written to `temp_storage_bytes` and no work is done.
|
|
782
|
+
//!
|
|
783
|
+
//! @param[in,out] temp_storage_bytes
|
|
784
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
785
|
+
//!
|
|
786
|
+
//! @param[in] d_in
|
|
787
|
+
//! Random-access iterator to the input sequence of data items
|
|
788
|
+
//!
|
|
789
|
+
//! @param[out] d_out
|
|
790
|
+
//! Random-access iterator to the output sequence of data items
|
|
791
|
+
//!
|
|
792
|
+
//! @param[in] d_in_begin_offsets
|
|
793
|
+
//! @rst
|
|
794
|
+
//! Random-access input iterator to the sequence of beginning offsets of
|
|
795
|
+
//! length ``num_segments``, such that ``d_in_begin_offsets[i]`` is the first
|
|
796
|
+
//! element of the \ *i*\ :sup:`th` data segment in ``d_in``
|
|
797
|
+
//! @endrst
|
|
798
|
+
//!
|
|
799
|
+
//! @param[in] d_in_end_offsets
|
|
800
|
+
//! @rst
|
|
801
|
+
//! Random-access input iterator to the sequence of ending offsets of length
|
|
802
|
+
//! ``num_segments``, such that ``d_in_end_offsets[i] - 1`` is the last element of
|
|
803
|
+
//! the \ *i*\ :sup:`th` data segment in ``d_in``.
|
|
804
|
+
//! If ``d_in_end_offsets[i] <= d_in_begin_offsets[i]``, the \ *i*\ :sup:`th`
|
|
805
|
+
//! segment is considered empty.
|
|
806
|
+
//! @endrst
|
|
807
|
+
//!
|
|
808
|
+
//! @param[in] d_out_begin_offsets
|
|
809
|
+
//! @rst
|
|
810
|
+
//! Random-access input iterator to the sequence of beginning offsets of
|
|
811
|
+
//! length ``num_segments``, such that ``d_out_begin_offsets[i]`` is the first
|
|
812
|
+
//! element of the \ *i*\ :sup:`th` data segment in ``d_out``
|
|
813
|
+
//! @endrst
|
|
814
|
+
//!
|
|
815
|
+
//! @param[in] num_segments
|
|
816
|
+
//! The number of segments that comprise the segmented prefix scan data.
|
|
817
|
+
//!
|
|
818
|
+
//! @param[in] stream
|
|
819
|
+
//! @rst
|
|
820
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
821
|
+
//! @endrst
|
|
822
|
+
template <typename InputIteratorT,
|
|
823
|
+
typename OutputIteratorT,
|
|
824
|
+
typename BeginOffsetIteratorInputT,
|
|
825
|
+
typename EndOffsetIteratorInputT,
|
|
826
|
+
typename BeginOffsetIteratorOutputT>
|
|
827
|
+
CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSegmentedSum(
|
|
828
|
+
void* d_temp_storage,
|
|
829
|
+
size_t& temp_storage_bytes,
|
|
830
|
+
InputIteratorT d_in,
|
|
831
|
+
OutputIteratorT d_out,
|
|
832
|
+
BeginOffsetIteratorInputT d_in_begin_offsets,
|
|
833
|
+
EndOffsetIteratorInputT d_in_end_offsets,
|
|
834
|
+
BeginOffsetIteratorOutputT d_out_begin_offsets,
|
|
835
|
+
::cuda::std::int64_t num_segments,
|
|
836
|
+
cudaStream_t stream = 0)
|
|
837
|
+
{
|
|
838
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedScan::InclusiveSegmentedSum");
|
|
839
|
+
|
|
840
|
+
using offset_t =
|
|
841
|
+
detail::common_iterator_value_t<BeginOffsetIteratorInputT, EndOffsetIteratorInputT, BeginOffsetIteratorOutputT>;
|
|
842
|
+
using integral_offset_check = ::cuda::std::is_integral<offset_t>;
|
|
843
|
+
|
|
844
|
+
static_assert(integral_offset_check::value, "Offset iterator value type should be integral.");
|
|
845
|
+
|
|
846
|
+
using scan_op_t = ::cuda::std::plus<>;
|
|
847
|
+
scan_op_t scan_op{};
|
|
848
|
+
|
|
849
|
+
return cub::detail::segmented_scan::dispatch_segmented_scan<
|
|
850
|
+
InputIteratorT,
|
|
851
|
+
OutputIteratorT,
|
|
852
|
+
BeginOffsetIteratorInputT,
|
|
853
|
+
EndOffsetIteratorInputT,
|
|
854
|
+
BeginOffsetIteratorOutputT,
|
|
855
|
+
scan_op_t,
|
|
856
|
+
NullType>::dispatch(d_temp_storage,
|
|
857
|
+
temp_storage_bytes,
|
|
858
|
+
d_in,
|
|
859
|
+
d_out,
|
|
860
|
+
num_segments,
|
|
861
|
+
d_in_begin_offsets,
|
|
862
|
+
d_in_end_offsets,
|
|
863
|
+
d_out_begin_offsets,
|
|
864
|
+
scan_op,
|
|
865
|
+
NullType(),
|
|
866
|
+
stream);
|
|
867
|
+
}
|
|
868
|
+
|
|
869
|
+
//! @rst
|
|
870
|
+
//! Computes a device-wide segmented inclusive prefix scan using the specified binary associative ``scan_op`` functor.
|
|
871
|
+
//!
|
|
872
|
+
//! - Supports non-commutative scan operators.
|
|
873
|
+
//! - Results are not deterministic for pseudo-associative operators (e.g.,
|
|
874
|
+
//! addition of floating-point types). Results for pseudo-associative
|
|
875
|
+
//! operators may vary from run to run.
|
|
876
|
+
//! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The input and output sequences
|
|
877
|
+
//! shall not overlap in any other way.
|
|
878
|
+
//! - @devicestorage
|
|
879
|
+
//!
|
|
880
|
+
//! @endrst
|
|
881
|
+
//! @tparam InputIteratorT
|
|
882
|
+
//! **[inferred]** Random-access input iterator type for reading segmented scan inputs @iterator
|
|
883
|
+
//!
|
|
884
|
+
//! @tparam OutputIteratorT
|
|
885
|
+
//! **[inferred]** Random-access output iterator type for writing segmented scan outputs @iterator
|
|
886
|
+
//!
|
|
887
|
+
//! @tparam BeginOffsetIteratorInputT
|
|
888
|
+
//! **[inferred]** Random-access input iterator type for reading segment beginning offsets in the input data
|
|
889
|
+
//! sequence @iterator
|
|
890
|
+
//!
|
|
891
|
+
//! @tparam EndOffsetIteratorInputT
|
|
892
|
+
//! **[inferred]** Random-access input iterator type for reading segment ending offsets in the input data sequence
|
|
893
|
+
//! @iterator
|
|
894
|
+
//!
|
|
895
|
+
//! @tparam ScanOpT
|
|
896
|
+
//! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
|
|
897
|
+
//!
|
|
898
|
+
//! @param[in] d_temp_storage
|
|
899
|
+
//! Device-accessible allocation of temporary storage. When `nullptr`, the
|
|
900
|
+
//! required allocation size is written to `temp_storage_bytes` and no work is done.
|
|
901
|
+
//!
|
|
902
|
+
//! @param[in,out] temp_storage_bytes
|
|
903
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
904
|
+
//!
|
|
905
|
+
//! @param[in] d_in
|
|
906
|
+
//! Random-access iterator to the input sequence of data items
|
|
907
|
+
//!
|
|
908
|
+
//! @param[out] d_out
|
|
909
|
+
//! Random-access iterator to the output sequence of data items
|
|
910
|
+
//!
|
|
911
|
+
//! @param[in] d_in_begin_offsets
|
|
912
|
+
//! @rst
|
|
913
|
+
//! Random-access input iterator to the sequence of beginning offsets of
|
|
914
|
+
//! length ``num_segments``, such that ``d_in_begin_offsets[i]`` is the first
|
|
915
|
+
//! element of the \ *i*\ :sup:`th` data segment in ``d_in`` and in ``d_out``
|
|
916
|
+
//! @endrst
|
|
917
|
+
//!
|
|
918
|
+
//! @param[in] d_in_end_offsets
|
|
919
|
+
//! @rst
|
|
920
|
+
//! Random-access input iterator to the sequence of ending offsets of length
|
|
921
|
+
//! ``num_segments``, such that ``d_in_end_offsets[i] - 1`` is the last element of
|
|
922
|
+
//! the \ *i*\ :sup:`th` data segment in ``d_in``.
|
|
923
|
+
//! If ``d_in_end_offsets[i] <= d_in_begin_offsets[i]``, the \ *i*\ :sup:`th`
|
|
924
|
+
//! segment is considered empty.
|
|
925
|
+
//! @endrst
|
|
926
|
+
//!
|
|
927
|
+
//! @param[in] num_segments
|
|
928
|
+
//! The number of segments that comprise the segmented prefix scan data.
|
|
929
|
+
//!
|
|
930
|
+
//! @param[in] scan_op
|
|
931
|
+
//! Binary associative scan functor
|
|
932
|
+
//!
|
|
933
|
+
//! @param[in] stream
|
|
934
|
+
//! @rst
|
|
935
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
936
|
+
//! @endrst
|
|
937
|
+
template <typename InputIteratorT,
|
|
938
|
+
typename OutputIteratorT,
|
|
939
|
+
typename BeginOffsetIteratorInputT,
|
|
940
|
+
typename EndOffsetIteratorInputT,
|
|
941
|
+
typename ScanOpT>
|
|
942
|
+
CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSegmentedScan(
|
|
943
|
+
void* d_temp_storage,
|
|
944
|
+
size_t& temp_storage_bytes,
|
|
945
|
+
InputIteratorT d_in,
|
|
946
|
+
OutputIteratorT d_out,
|
|
947
|
+
BeginOffsetIteratorInputT d_in_begin_offsets,
|
|
948
|
+
EndOffsetIteratorInputT d_in_end_offsets,
|
|
949
|
+
::cuda::std::int64_t num_segments,
|
|
950
|
+
ScanOpT scan_op,
|
|
951
|
+
cudaStream_t stream = 0)
|
|
952
|
+
{
|
|
953
|
+
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedScan::InclusiveSegmentedScan");
|
|
954
|
+
|
|
955
|
+
using offset_t = detail::common_iterator_value_t<BeginOffsetIteratorInputT, EndOffsetIteratorInputT>;
|
|
956
|
+
using integral_offset_check = ::cuda::std::is_integral<offset_t>;
|
|
957
|
+
|
|
958
|
+
static_assert(integral_offset_check::value, "Offset iterator value type should be integral.");
|
|
959
|
+
|
|
960
|
+
return cub::detail::segmented_scan::dispatch_segmented_scan<
|
|
961
|
+
InputIteratorT,
|
|
962
|
+
OutputIteratorT,
|
|
963
|
+
BeginOffsetIteratorInputT,
|
|
964
|
+
EndOffsetIteratorInputT,
|
|
965
|
+
BeginOffsetIteratorInputT,
|
|
966
|
+
ScanOpT,
|
|
967
|
+
NullType>::dispatch(d_temp_storage,
|
|
968
|
+
temp_storage_bytes,
|
|
969
|
+
d_in,
|
|
970
|
+
d_out,
|
|
971
|
+
num_segments,
|
|
972
|
+
d_in_begin_offsets,
|
|
973
|
+
d_in_end_offsets,
|
|
974
|
+
d_in_begin_offsets,
|
|
975
|
+
scan_op,
|
|
976
|
+
NullType(),
|
|
977
|
+
stream);
|
|
978
|
+
}
|
|
979
|
+
|
|
980
|
+
//! @rst
|
|
981
|
+
//! Computes a device-wide segmented inclusive prefix scan using the specified binary associative ``scan_op`` functor.
|
|
982
|
+
//!
|
|
983
|
+
//! - Supports non-commutative scan operators.
|
|
984
|
+
//! - Results are not deterministic for pseudo-associative operators (e.g.,
|
|
985
|
+
//! addition of floating-point types). Results for pseudo-associative
|
|
986
|
+
//! operators may vary from run to run.
|
|
987
|
+
//! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The input and output sequences
|
|
988
|
+
//! shall not overlap in any other way.
|
|
989
|
+
//! - @devicestorage
|
|
990
|
+
//!
|
|
991
|
+
//! Snippet
|
|
992
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
993
|
+
//!
|
|
994
|
+
//! The code snippet below illustrates the inclusive segmented prefix scan of an ``int``
|
|
995
|
+
//! device vector.
|
|
996
|
+
//!
|
|
997
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_segmented_scan_api.cu
|
|
998
|
+
//! :language: c++
|
|
999
|
+
//! :dedent:
|
|
1000
|
+
//! :start-after: example-begin inclusive-segmented-scan-three-offsets
|
|
1001
|
+
//! :end-before: example-end inclusive-segmented-scan-three-offsets
|
|
1002
|
+
//!
|
|
1003
|
+
//! @endrst
|
|
1004
|
+
//! @tparam InputIteratorT
|
|
1005
|
+
//! **[inferred]** Random-access input iterator type for reading segmented scan inputs @iterator
|
|
1006
|
+
//!
|
|
1007
|
+
//! @tparam OutputIteratorT
|
|
1008
|
+
//! **[inferred]** Random-access output iterator type for writing segmented scan outputs @iterator
|
|
1009
|
+
//!
|
|
1010
|
+
//! @tparam BeginOffsetIteratorInputT
|
|
1011
|
+
//! **[inferred]** Random-access input iterator type for reading segment beginning offsets in the input data
|
|
1012
|
+
//! sequence @iterator
|
|
1013
|
+
//!
|
|
1014
|
+
//! @tparam EndOffsetIteratorInputT
|
|
1015
|
+
//! **[inferred]** Random-access input iterator type for reading segment ending offsets in the input data sequence
|
|
1016
|
+
//! @iterator
|
|
1017
|
+
//!
|
|
1018
|
+
//! @tparam BeginOffsetIteratorOutputT
|
|
1019
|
+
//! **[inferred]** Random-access input iterator type for reading segment beginning offsets in the output sequence
|
|
1020
|
+
//! @iterator
|
|
1021
|
+
//!
|
|
1022
|
+
//! @tparam ScanOpT
|
|
1023
|
+
//! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
|
|
1024
|
+
//!
|
|
1025
|
+
//! @param[in] d_temp_storage
|
|
1026
|
+
//! Device-accessible allocation of temporary storage. When `nullptr`, the
|
|
1027
|
+
//! required allocation size is written to `temp_storage_bytes` and no work is done.
|
|
1028
|
+
//!
|
|
1029
|
+
//! @param[in,out] temp_storage_bytes
|
|
1030
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
1031
|
+
//!
|
|
1032
|
+
//! @param[in] d_in
|
|
1033
|
+
//! Random-access iterator to the input sequence of data items
|
|
1034
|
+
//!
|
|
1035
|
+
//! @param[out] d_out
|
|
1036
|
+
//! Random-access iterator to the output sequence of data items
|
|
1037
|
+
//!
|
|
1038
|
+
//! @param[in] d_in_begin_offsets
|
|
1039
|
+
//! @rst
|
|
1040
|
+
//! Random-access input iterator to the sequence of beginning offsets of
|
|
1041
|
+
//! length ``num_segments``, such that ``d_in_begin_offsets[i]`` is the first
|
|
1042
|
+
//! element of the \ *i*\ :sup:`th` data segment in ``d_in``
|
|
1043
|
+
//! @endrst
|
|
1044
|
+
//!
|
|
1045
|
+
//! @param[in] d_in_end_offsets
|
|
1046
|
+
//! @rst
|
|
1047
|
+
//! Random-access input iterator to the sequence of ending offsets of length
|
|
1048
|
+
//! ``num_segments``, such that ``d_in_end_offsets[i] - 1`` is the last element of
|
|
1049
|
+
//! the \ *i*\ :sup:`th` data segment in ``d_in``.
|
|
1050
|
+
//! If ``d_in_end_offsets[i] <= d_in_begin_offsets[i]``, the \ *i*\ :sup:`th`
|
|
1051
|
+
//! segment is considered empty.
|
|
1052
|
+
//! @endrst
|
|
1053
|
+
//!
|
|
1054
|
+
//! @param[in] d_out_begin_offsets
|
|
1055
|
+
//! @rst
|
|
1056
|
+
//! Random-access input iterator to the sequence of beginning offsets of
|
|
1057
|
+
//! length ``num_segments``, such that ``d_out_begin_offsets[i]`` is the first
|
|
1058
|
+
//! element of the \ *i*\ :sup:`th` data segment in ``d_out``
|
|
1059
|
+
//! @endrst
|
|
1060
|
+
//!
|
|
1061
|
+
//! @param[in] num_segments
|
|
1062
|
+
//! The number of segments that comprise the segmented prefix scan data.
|
|
1063
|
+
//!
|
|
1064
|
+
//! @param[in] scan_op
|
|
1065
|
+
//! Binary associative scan functor
|
|
1066
|
+
//!
|
|
1067
|
+
//! @param[in] stream
|
|
1068
|
+
//! @rst
|
|
1069
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
1070
|
+
//! @endrst
|
|
1071
|
+
template <typename InputIteratorT,
          typename OutputIteratorT,
          typename BeginOffsetIteratorInputT,
          typename EndOffsetIteratorInputT,
          typename BeginOffsetIteratorOutputT,
          typename ScanOpT>
CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSegmentedScan(
  void* d_temp_storage,
  size_t& temp_storage_bytes,
  InputIteratorT d_in,
  OutputIteratorT d_out,
  BeginOffsetIteratorInputT d_in_begin_offsets,
  EndOffsetIteratorInputT d_in_end_offsets,
  BeginOffsetIteratorOutputT d_out_begin_offsets,
  ::cuda::std::int64_t num_segments,
  ScanOpT scan_op,
  cudaStream_t stream = 0)
{
  _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedScan::InclusiveSegmentedScan");

  // Compile-time guard: the type that common_iterator_value_t yields for the
  // three offset iterators has to be integral.
  using segment_offset_t =
    detail::common_iterator_value_t<BeginOffsetIteratorInputT, EndOffsetIteratorInputT, BeginOffsetIteratorOutputT>;
  static_assert(::cuda::std::is_integral<segment_offset_t>::value, "Offset iterator value type should be integral.");

  // This overload carries no initial value, so NullType is handed to the
  // dispatcher both as the init-value type and as the init-value argument.
  using dispatch_t = cub::detail::segmented_scan::dispatch_segmented_scan<
    InputIteratorT,
    OutputIteratorT,
    BeginOffsetIteratorInputT,
    EndOffsetIteratorInputT,
    BeginOffsetIteratorOutputT,
    ScanOpT,
    NullType>;

  // Forward everything unchanged; the dispatcher's cudaError_t is the result.
  return dispatch_t::dispatch(
    d_temp_storage,
    temp_storage_bytes,
    d_in,
    d_out,
    num_segments,
    d_in_begin_offsets,
    d_in_end_offsets,
    d_out_begin_offsets,
    scan_op,
    NullType(),
    stream);
}
|
|
1116
|
+
|
|
1117
|
+
//! @rst
|
|
1118
|
+
//! Computes a device-wide segmented inclusive prefix scan using the specified binary associative ``scan_op`` functor.
|
|
1119
|
+
//! The result of applying the ``scan_op`` binary operator to ``init_value`` and the first value in each input
|
|
1120
|
+
//! segment is assigned to the first value of the corresponding output segment.
|
|
1121
|
+
//!
|
|
1122
|
+
//! - Supports non-commutative scan operators.
|
|
1123
|
+
//! - Results are not deterministic for pseudo-associative operators (e.g.,
|
|
1124
|
+
//! addition of floating-point types). Results for pseudo-associative
|
|
1125
|
+
//! operators may vary from run to run.
|
|
1126
|
+
//! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The input and output sequences
|
|
1127
|
+
//! shall not overlap in any other way.
|
|
1128
|
+
//! - @devicestorage
|
|
1129
|
+
//!
|
|
1130
|
+
//! Snippet
|
|
1131
|
+
//! +++++++++++++++++++++++++++++++++++++++++++++
|
|
1132
|
+
//!
|
|
1133
|
+
//! The code snippet below illustrates the inclusive segmented prefix scan of an ``int``
|
|
1134
|
+
//! device vector.
|
|
1135
|
+
//!
|
|
1136
|
+
//! .. literalinclude:: ../../../cub/test/catch2_test_segmented_scan_api.cu
|
|
1137
|
+
//! :language: c++
|
|
1138
|
+
//! :dedent:
|
|
1139
|
+
//! :start-after: example-begin inclusive-segmented-scan-init-two-offsets
|
|
1140
|
+
//! :end-before: example-end inclusive-segmented-scan-init-two-offsets
|
|
1141
|
+
//!
|
|
1142
|
+
//! @endrst
|
|
1143
|
+
//!
|
|
1144
|
+
//! @tparam InputIteratorT
|
|
1145
|
+
//! **[inferred]** Random-access input iterator type for reading segmented scan inputs @iterator
|
|
1146
|
+
//!
|
|
1147
|
+
//! @tparam OutputIteratorT
|
|
1148
|
+
//! **[inferred]** Random-access output iterator type for writing segmented scan outputs @iterator
|
|
1149
|
+
//!
|
|
1150
|
+
//! @tparam BeginOffsetIteratorInputT
|
|
1151
|
+
//! **[inferred]** Random-access input iterator type for reading segment beginning offsets in the input data
|
|
1152
|
+
//! sequence @iterator
|
|
1153
|
+
//!
|
|
1154
|
+
//! @tparam EndOffsetIteratorInputT
|
|
1155
|
+
//! **[inferred]** Random-access input iterator type for reading segment ending offsets in the input data sequence
|
|
1156
|
+
//! @iterator
|
|
1157
|
+
//!
|
|
1158
|
+
//! @tparam ScanOpT
|
|
1159
|
+
//! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
|
|
1160
|
+
//!
|
|
1161
|
+
//! @tparam InitValueT
|
|
1162
|
+
//! **[inferred]** Type of the `init_value`
|
|
1163
|
+
//!
|
|
1164
|
+
//! @param[in] d_temp_storage
|
|
1165
|
+
//! Device-accessible allocation of temporary storage. When `nullptr`, the
|
|
1166
|
+
//! required allocation size is written to `temp_storage_bytes` and no work is done.
|
|
1167
|
+
//!
|
|
1168
|
+
//! @param[in,out] temp_storage_bytes
|
|
1169
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
1170
|
+
//!
|
|
1171
|
+
//! @param[in] d_in
|
|
1172
|
+
//! Random-access iterator to the input sequence of data items
|
|
1173
|
+
//!
|
|
1174
|
+
//! @param[out] d_out
|
|
1175
|
+
//! Random-access iterator to the output sequence of data items
|
|
1176
|
+
//!
|
|
1177
|
+
//! @param[in] d_in_begin_offsets
|
|
1178
|
+
//! @rst
|
|
1179
|
+
//! Random-access input iterator to the sequence of beginning offsets of
|
|
1180
|
+
//! length ``num_segments``, such that ``d_in_begin_offsets[i]`` is the first
|
|
1181
|
+
//! element of the \ *i*\ :sup:`th` data segment in ``d_in`` and in ``d_out``
|
|
1182
|
+
//! @endrst
|
|
1183
|
+
//!
|
|
1184
|
+
//! @param[in] d_in_end_offsets
|
|
1185
|
+
//! @rst
|
|
1186
|
+
//! Random-access input iterator to the sequence of ending offsets of length
|
|
1187
|
+
//! ``num_segments``, such that ``d_in_end_offsets[i] - 1`` is the last element of
|
|
1188
|
+
//! the \ *i*\ :sup:`th` data segment in ``d_in``.
|
|
1189
|
+
//! If ``d_in_end_offsets[i] - 1 <= d_in_begin_offsets[i]``, the \ *i*\ :sup:`th`
|
|
1190
|
+
//! segment is considered empty.
|
|
1191
|
+
//! @endrst
|
|
1192
|
+
//!
|
|
1193
|
+
//! @param[in] num_segments
|
|
1194
|
+
//! The number of segments that comprise the segmented prefix scan data.
|
|
1195
|
+
//!
|
|
1196
|
+
//! @param[in] scan_op
|
|
1197
|
+
//! Binary associative scan functor
|
|
1198
|
+
//!
|
|
1199
|
+
//! @param[in] init_value
|
|
1200
|
+
//! Initial value to seed the inclusive scan for each segment in the output sequence
|
|
1201
|
+
//!
|
|
1202
|
+
//! @param[in] stream
|
|
1203
|
+
//! @rst
|
|
1204
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
1205
|
+
//! @endrst
|
|
1206
|
+
template <typename InputIteratorT,
          typename OutputIteratorT,
          typename BeginOffsetIteratorInputT,
          typename EndOffsetIteratorInputT,
          typename ScanOpT,
          typename InitValueT>
CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSegmentedScanInit(
  void* d_temp_storage,
  size_t& temp_storage_bytes,
  InputIteratorT d_in,
  OutputIteratorT d_out,
  BeginOffsetIteratorInputT d_in_begin_offsets,
  EndOffsetIteratorInputT d_in_end_offsets,
  ::cuda::std::int64_t num_segments,
  ScanOpT scan_op,
  InitValueT init_value,
  cudaStream_t stream = 0)
{
  _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedScan::InclusiveSegmentedScanInit");

  // Compile-time guards: the type that common_iterator_value_t yields for the
  // two offset iterators has to be integral, and a real initial-value type
  // (anything but NullType) has to be supplied.
  using segment_offset_t = detail::common_iterator_value_t<BeginOffsetIteratorInputT, EndOffsetIteratorInputT>;
  static_assert(::cuda::std::is_integral<segment_offset_t>::value, "Offset iterator value type should be integral.");
  static_assert(!::cuda::std::is_same_v<InitValueT, NullType>);

  // Accumulator type is derived from the scan operator, the input value type
  // and the initial-value type.
  using input_value_t = cub::detail::it_value_t<InputIteratorT>;
  using accumulator_t = ::cuda::std::__accumulator_t<ScanOpT, input_value_t, InitValueT>;

  // The initial value travels to the dispatcher wrapped in detail::InputValue.
  using wrapped_init_t = detail::InputValue<InitValueT>;

  // Two-offset form: the input begin-offset iterator is passed a second time
  // in the slot for the output begin offsets.
  using dispatch_t = cub::detail::segmented_scan::dispatch_segmented_scan<
    InputIteratorT,
    OutputIteratorT,
    BeginOffsetIteratorInputT,
    EndOffsetIteratorInputT,
    BeginOffsetIteratorInputT,
    ScanOpT,
    wrapped_init_t,
    accumulator_t,
    ForceInclusive::Yes>;

  return dispatch_t::dispatch(
    d_temp_storage,
    temp_storage_bytes,
    d_in,
    d_out,
    num_segments,
    d_in_begin_offsets,
    d_in_end_offsets,
    d_in_begin_offsets,
    scan_op,
    wrapped_init_t(init_value),
    stream);
}
|
|
1255
|
+
|
|
1256
|
+
//! @rst
|
|
1257
|
+
//! Computes a device-wide segmented inclusive prefix scan using the specified binary associative ``scan_op`` functor.
|
|
1258
|
+
//! The result of applying the ``scan_op`` binary operator to ``init_value`` and the first value in each input
|
|
1259
|
+
//! segment is assigned to the first value of the corresponding output segment.
|
|
1260
|
+
//!
|
|
1261
|
+
//! - Supports non-commutative scan operators.
|
|
1262
|
+
//! - Results are not deterministic for pseudo-associative operators (e.g.,
|
|
1263
|
+
//! addition of floating-point types). Results for pseudo-associative
|
|
1264
|
+
//! operators may vary from run to run.
|
|
1265
|
+
//! - When ``d_in`` and ``d_out`` are equal, the scan is performed in-place. The input and output sequences
|
|
1266
|
+
//! shall not overlap in any other way.
|
|
1267
|
+
//! - @devicestorage
|
|
1268
|
+
//!
|
|
1269
|
+
//! @endrst
|
|
1270
|
+
//!
|
|
1271
|
+
//! @tparam InputIteratorT
|
|
1272
|
+
//! **[inferred]** Random-access input iterator type for reading segmented scan inputs @iterator
|
|
1273
|
+
//!
|
|
1274
|
+
//! @tparam OutputIteratorT
|
|
1275
|
+
//! **[inferred]** Random-access output iterator type for writing segmented scan outputs @iterator
|
|
1276
|
+
//!
|
|
1277
|
+
//! @tparam BeginOffsetIteratorInputT
|
|
1278
|
+
//! **[inferred]** Random-access input iterator type for reading segment beginning offsets in the input data
|
|
1279
|
+
//! sequence @iterator
|
|
1280
|
+
//!
|
|
1281
|
+
//! @tparam EndOffsetIteratorInputT
|
|
1282
|
+
//! **[inferred]** Random-access input iterator type for reading segment ending offsets in the input data sequence
|
|
1283
|
+
//! @iterator
|
|
1284
|
+
//!
|
|
1285
|
+
//! @tparam BeginOffsetIteratorOutputT
|
|
1286
|
+
//! **[inferred]** Random-access input iterator type for reading segment beginning offsets in the output sequence
|
|
1287
|
+
//! @iterator
|
|
1288
|
+
//!
|
|
1289
|
+
//! @tparam ScanOpT
|
|
1290
|
+
//! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)`
|
|
1291
|
+
//!
|
|
1292
|
+
//! @tparam InitValueT
|
|
1293
|
+
//! **[inferred]** Type of the `init_value`
|
|
1294
|
+
//!
|
|
1295
|
+
//! @param[in] d_temp_storage
|
|
1296
|
+
//! Device-accessible allocation of temporary storage. When `nullptr`, the
|
|
1297
|
+
//! required allocation size is written to `temp_storage_bytes` and no work is done.
|
|
1298
|
+
//!
|
|
1299
|
+
//! @param[in,out] temp_storage_bytes
|
|
1300
|
+
//! Reference to size in bytes of `d_temp_storage` allocation
|
|
1301
|
+
//!
|
|
1302
|
+
//! @param[in] d_in
|
|
1303
|
+
//! Random-access iterator to the input sequence of data items
|
|
1304
|
+
//!
|
|
1305
|
+
//! @param[out] d_out
|
|
1306
|
+
//! Random-access iterator to the output sequence of data items
|
|
1307
|
+
//!
|
|
1308
|
+
//! @param[in] d_in_begin_offsets
|
|
1309
|
+
//! @rst
|
|
1310
|
+
//! Random-access input iterator to the sequence of beginning offsets of
|
|
1311
|
+
//! length ``num_segments``, such that ``d_in_begin_offsets[i]`` is the first
|
|
1312
|
+
//! element of the \ *i*\ :sup:`th` data segment in ``d_in``
|
|
1313
|
+
//! @endrst
|
|
1314
|
+
//!
|
|
1315
|
+
//! @param[in] d_in_end_offsets
|
|
1316
|
+
//! @rst
|
|
1317
|
+
//! Random-access input iterator to the sequence of ending offsets of length
|
|
1318
|
+
//! ``num_segments``, such that ``d_in_end_offsets[i] - 1`` is the last element of
|
|
1319
|
+
//! the \ *i*\ :sup:`th` data segment in ``d_in``.
|
|
1320
|
+
//! If ``d_in_end_offsets[i] - 1 <= d_in_begin_offsets[i]``, the \ *i*\ :sup:`th`
|
|
1321
|
+
//! segment is considered empty.
|
|
1322
|
+
//! @endrst
|
|
1323
|
+
//!
|
|
1324
|
+
//! @param[in] d_out_begin_offsets
|
|
1325
|
+
//! @rst
|
|
1326
|
+
//! Random-access input iterator to the sequence of beginning offsets of
|
|
1327
|
+
//! length ``num_segments``, such that ``d_out_begin_offsets[i]`` is the first
|
|
1328
|
+
//! element of the \ *i*\ :sup:`th` data segment in ``d_out``
|
|
1329
|
+
//! @endrst
|
|
1330
|
+
//!
|
|
1331
|
+
//! @param[in] num_segments
|
|
1332
|
+
//! The number of segments that comprise the segmented prefix scan data.
|
|
1333
|
+
//!
|
|
1334
|
+
//! @param[in] scan_op
|
|
1335
|
+
//! Binary associative scan functor
|
|
1336
|
+
//!
|
|
1337
|
+
//! @param[in] init_value
|
|
1338
|
+
//! Initial value to seed the inclusive scan for each segment in the output sequence
|
|
1339
|
+
//!
|
|
1340
|
+
//! @param[in] stream
|
|
1341
|
+
//! @rst
|
|
1342
|
+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
|
|
1343
|
+
//! @endrst
|
|
1344
|
+
template <typename InputIteratorT,
          typename OutputIteratorT,
          typename BeginOffsetIteratorInputT,
          typename EndOffsetIteratorInputT,
          typename BeginOffsetIteratorOutputT,
          typename ScanOpT,
          typename InitValueT>
CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSegmentedScanInit(
  void* d_temp_storage,
  size_t& temp_storage_bytes,
  InputIteratorT d_in,
  OutputIteratorT d_out,
  BeginOffsetIteratorInputT d_in_begin_offsets,
  EndOffsetIteratorInputT d_in_end_offsets,
  BeginOffsetIteratorOutputT d_out_begin_offsets,
  ::cuda::std::int64_t num_segments,
  ScanOpT scan_op,
  InitValueT init_value,
  cudaStream_t stream = 0)
{
  _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedScan::InclusiveSegmentedScanInit");

  // Compile-time guards: the type that common_iterator_value_t yields for the
  // three offset iterators has to be integral, and a real initial-value type
  // (anything but NullType) has to be supplied.
  using segment_offset_t =
    detail::common_iterator_value_t<BeginOffsetIteratorInputT, EndOffsetIteratorInputT, BeginOffsetIteratorOutputT>;
  static_assert(::cuda::std::is_integral<segment_offset_t>::value, "Offset iterator value type should be integral.");
  static_assert(!::cuda::std::is_same_v<InitValueT, NullType>);

  // Accumulator type is derived from the scan operator, the input value type
  // and the initial-value type.
  using input_value_t = cub::detail::it_value_t<InputIteratorT>;
  using accumulator_t = ::cuda::std::__accumulator_t<ScanOpT, input_value_t, InitValueT>;

  // The initial value travels to the dispatcher wrapped in detail::InputValue.
  using wrapped_init_t = detail::InputValue<InitValueT>;

  // Three-offset form: input and output segments get their own begin offsets.
  using dispatch_t = cub::detail::segmented_scan::dispatch_segmented_scan<
    InputIteratorT,
    OutputIteratorT,
    BeginOffsetIteratorInputT,
    EndOffsetIteratorInputT,
    BeginOffsetIteratorOutputT,
    ScanOpT,
    wrapped_init_t,
    accumulator_t,
    ForceInclusive::Yes>;

  return dispatch_t::dispatch(
    d_temp_storage,
    temp_storage_bytes,
    d_in,
    d_out,
    num_segments,
    d_in_begin_offsets,
    d_in_end_offsets,
    d_out_begin_offsets,
    scan_op,
    wrapped_init_t(init_value),
    stream);
}
|
|
1396
|
+
};
|
|
1397
|
+
|
|
1398
|
+
CUB_NAMESPACE_END
|