cuda-cccl 0.4.3__cp312-cp312-manylinux_2_26_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cuda/cccl/__init__.py +27 -0
- cuda/cccl/_cuda_version_utils.py +24 -0
- cuda/cccl/cooperative/__init__.py +9 -0
- cuda/cccl/cooperative/experimental/__init__.py +24 -0
- cuda/cccl/headers/__init__.py +7 -0
- cuda/cccl/headers/include/__init__.py +1 -0
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +233 -0
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1158 -0
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +55 -0
- cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +699 -0
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +365 -0
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +721 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +756 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +277 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +715 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +546 -0
- cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +592 -0
- cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +780 -0
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +1092 -0
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +564 -0
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +448 -0
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +263 -0
- cuda/cccl/headers/include/cub/agent/agent_segmented_scan.cuh +292 -0
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1090 -0
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +346 -0
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +584 -0
- cuda/cccl/headers/include/cub/agent/agent_topk.cuh +762 -0
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +599 -0
- cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1384 -0
- cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +939 -0
- cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1200 -0
- cuda/cccl/headers/include/cub/block/block_exchange.cuh +1279 -0
- cuda/cccl/headers/include/cub/block/block_histogram.cuh +396 -0
- cuda/cccl/headers/include/cub/block/block_load.cuh +1269 -0
- cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +437 -0
- cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +771 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1215 -0
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2129 -0
- cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +124 -0
- cuda/cccl/headers/include/cub/block/block_reduce.cuh +661 -0
- cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +406 -0
- cuda/cccl/headers/include/cub/block/block_scan.cuh +2168 -0
- cuda/cccl/headers/include/cub/block/block_shuffle.cuh +319 -0
- cuda/cccl/headers/include/cub/block/block_store.cuh +1238 -0
- cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +597 -0
- cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +62 -0
- cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +209 -0
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +230 -0
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +207 -0
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +257 -0
- cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +766 -0
- cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +514 -0
- cuda/cccl/headers/include/cub/config.cuh +29 -0
- cuda/cccl/headers/include/cub/cub.cuh +96 -0
- cuda/cccl/headers/include/cub/detail/array_utils.cuh +54 -0
- cuda/cccl/headers/include/cub/detail/choose_offset.cuh +135 -0
- cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +50 -0
- cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +94 -0
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +60 -0
- cuda/cccl/headers/include/cub/detail/env_dispatch.cuh +87 -0
- cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +227 -0
- cuda/cccl/headers/include/cub/detail/integer_utils.cuh +87 -0
- cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +149 -0
- cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +103 -0
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +112 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/array.cuh +41 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/json.cuh +39 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/object.cuh +71 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/string.cuh +79 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/value.cuh +95 -0
- cuda/cccl/headers/include/cub/detail/ptx-json-parser.cuh +39 -0
- cuda/cccl/headers/include/cub/detail/rfa.cuh +706 -0
- cuda/cccl/headers/include/cub/detail/strong_load.cuh +163 -0
- cuda/cccl/headers/include/cub/detail/strong_store.cuh +194 -0
- cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +377 -0
- cuda/cccl/headers/include/cub/detail/type_traits.cuh +185 -0
- cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +48 -0
- cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +33 -0
- cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +572 -0
- cuda/cccl/headers/include/cub/device/device_copy.cuh +276 -0
- cuda/cccl/headers/include/cub/device/device_for.cuh +1061 -0
- cuda/cccl/headers/include/cub/device/device_histogram.cuh +1485 -0
- cuda/cccl/headers/include/cub/device/device_memcpy.cuh +171 -0
- cuda/cccl/headers/include/cub/device/device_merge.cuh +203 -0
- cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +955 -0
- cuda/cccl/headers/include/cub/device/device_partition.cuh +644 -0
- cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3413 -0
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +2303 -0
- cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +346 -0
- cuda/cccl/headers/include/cub/device/device_scan.cuh +2152 -0
- cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1472 -0
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1635 -0
- cuda/cccl/headers/include/cub/device/device_segmented_scan.cuh +1398 -0
- cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2787 -0
- cuda/cccl/headers/include/cub/device/device_select.cuh +1204 -0
- cuda/cccl/headers/include/cub/device/device_topk.cuh +521 -0
- cuda/cccl/headers/include/cub/device/device_transform.cuh +666 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +289 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +694 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +50 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +77 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_fixed_size_segmented_reduce.cuh +349 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +160 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1849 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +317 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +429 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1066 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +830 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +629 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +479 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +256 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +603 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +447 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +545 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_radix_sort.cuh +638 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_reduce.cuh +410 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_scan.cuh +278 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +899 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +831 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +321 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +454 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +364 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +626 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +527 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +472 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_for_each.cuh +259 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_histogram.cuh +669 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_merge_sort.cuh +332 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_radix_sort.cuh +553 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_reduce.cuh +584 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_scan.cuh +178 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_radix_sort.cuh +262 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh +295 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_scan.cuh +77 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_segmented_sort.cuh +521 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_three_way_partition.cuh +200 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_transform.cuh +1049 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/kernel_unique_by_key.cuh +175 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/common.cuh +97 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +43 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +94 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +34 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +268 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +108 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +100 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1045 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +681 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +918 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +647 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +571 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +986 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_scan.cuh +108 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +373 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1563 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +415 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +84 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +476 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +858 -0
- cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +203 -0
- cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +82 -0
- cuda/cccl/headers/include/cub/grid/grid_queue.cuh +175 -0
- cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +230 -0
- cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +235 -0
- cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +226 -0
- cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +293 -0
- cuda/cccl/headers/include/cub/thread/thread_load.cuh +353 -0
- cuda/cccl/headers/include/cub/thread/thread_operators.cuh +664 -0
- cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +525 -0
- cuda/cccl/headers/include/cub/thread/thread_scan.cuh +472 -0
- cuda/cccl/headers/include/cub/thread/thread_search.cuh +214 -0
- cuda/cccl/headers/include/cub/thread/thread_simd.cuh +456 -0
- cuda/cccl/headers/include/cub/thread/thread_sort.cuh +78 -0
- cuda/cccl/headers/include/cub/thread/thread_store.cuh +341 -0
- cuda/cccl/headers/include/cub/util_allocator.cuh +897 -0
- cuda/cccl/headers/include/cub/util_arch.cuh +176 -0
- cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +71 -0
- cuda/cccl/headers/include/cub/util_debug.cuh +183 -0
- cuda/cccl/headers/include/cub/util_device.cuh +838 -0
- cuda/cccl/headers/include/cub/util_macro.cuh +73 -0
- cuda/cccl/headers/include/cub/util_math.cuh +92 -0
- cuda/cccl/headers/include/cub/util_namespace.cuh +152 -0
- cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +31 -0
- cuda/cccl/headers/include/cub/util_ptx.cuh +483 -0
- cuda/cccl/headers/include/cub/util_temporary_storage.cuh +93 -0
- cuda/cccl/headers/include/cub/util_type.cuh +1084 -0
- cuda/cccl/headers/include/cub/util_vsmem.cuh +227 -0
- cuda/cccl/headers/include/cub/version.cuh +65 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +304 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +152 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +567 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +378 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +922 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +691 -0
- cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +381 -0
- cuda/cccl/headers/include/cub/warp/warp_load.cuh +591 -0
- cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +145 -0
- cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +810 -0
- cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1863 -0
- cuda/cccl/headers/include/cub/warp/warp_store.cuh +498 -0
- cuda/cccl/headers/include/cub/warp/warp_utils.cuh +59 -0
- cuda/cccl/headers/include/cuda/__algorithm/common.h +68 -0
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +199 -0
- cuda/cccl/headers/include/cuda/__algorithm/fill.h +110 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +171 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +216 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +83 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +128 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +210 -0
- cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
- cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +101 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +528 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +73 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
- cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +198 -0
- cuda/cccl/headers/include/cuda/__bit/bitfield.h +121 -0
- cuda/cccl/headers/include/cuda/__bit/bitmask.h +89 -0
- cuda/cccl/headers/include/cuda/__cccl_config +38 -0
- cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +123 -0
- cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +178 -0
- cuda/cccl/headers/include/cuda/__cmath/ilog.h +194 -0
- cuda/cccl/headers/include/cuda/__cmath/ipow.h +111 -0
- cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
- cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +145 -0
- cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
- cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
- cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
- cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
- cuda/cccl/headers/include/cuda/__cmath/sincos.h +134 -0
- cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
- cuda/cccl/headers/include/cuda/__complex/complex.h +238 -0
- cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +89 -0
- cuda/cccl/headers/include/cuda/__complex/traits.h +64 -0
- cuda/cccl/headers/include/cuda/__complex_ +28 -0
- cuda/cccl/headers/include/cuda/__container/buffer.h +891 -0
- cuda/cccl/headers/include/cuda/__container/heterogeneous_iterator.h +436 -0
- cuda/cccl/headers/include/cuda/__container/uninitialized_async_buffer.h +416 -0
- cuda/cccl/headers/include/cuda/__device/all_devices.h +140 -0
- cuda/cccl/headers/include/cuda/__device/arch_id.h +194 -0
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +553 -0
- cuda/cccl/headers/include/cuda/__device/attributes.h +772 -0
- cuda/cccl/headers/include/cuda/__device/compute_capability.h +172 -0
- cuda/cccl/headers/include/cuda/__device/device_ref.h +168 -0
- cuda/cccl/headers/include/cuda/__device/physical_device.h +178 -0
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +1041 -0
- cuda/cccl/headers/include/cuda/__event/event.h +171 -0
- cuda/cccl/headers/include/cuda/__event/event_ref.h +157 -0
- cuda/cccl/headers/include/cuda/__event/timed_event.h +120 -0
- cuda/cccl/headers/include/cuda/__execution/determinism.h +89 -0
- cuda/cccl/headers/include/cuda/__execution/output_ordering.h +87 -0
- cuda/cccl/headers/include/cuda/__execution/policy.h +53 -0
- cuda/cccl/headers/include/cuda/__execution/require.h +75 -0
- cuda/cccl/headers/include/cuda/__execution/tune.h +70 -0
- cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
- cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +321 -0
- cuda/cccl/headers/include/cuda/__functional/maximum.h +77 -0
- cuda/cccl/headers/include/cuda/__functional/minimum.h +77 -0
- cuda/cccl/headers/include/cuda/__functional/minimum_maximum_common.h +52 -0
- cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +106 -0
- cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
- cuda/cccl/headers/include/cuda/__fwd/complex.h +48 -0
- cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
- cuda/cccl/headers/include/cuda/__fwd/execution_policy.h +47 -0
- cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
- cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
- cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +58 -0
- cuda/cccl/headers/include/cuda/__hierarchy/dimensions.h +162 -0
- cuda/cccl/headers/include/cuda/__hierarchy/hierarchy_dimensions.h +986 -0
- cuda/cccl/headers/include/cuda/__hierarchy/hierarchy_levels.h +494 -0
- cuda/cccl/headers/include/cuda/__hierarchy/level_dimensions.h +225 -0
- cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +315 -0
- cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +490 -0
- cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +324 -0
- cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +456 -0
- cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +334 -0
- cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +418 -0
- cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +367 -0
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +528 -0
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +527 -0
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +486 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +147 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_function.h +114 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +555 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +589 -0
- cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
- cuda/cccl/headers/include/cuda/__launch/configuration.h +754 -0
- cuda/cccl/headers/include/cuda/__launch/host_launch.h +115 -0
- cuda/cccl/headers/include/cuda/__launch/launch.h +334 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +531 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +239 -0
- cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +152 -0
- cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +118 -0
- cuda/cccl/headers/include/cuda/__mdspan/shared_memory_accessor.h +208 -0
- cuda/cccl/headers/include/cuda/__mdspan/shared_memory_mdspan.h +129 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +77 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +148 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +165 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/elect_one.h +52 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +53 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +102 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +164 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +57 -0
- cuda/cccl/headers/include/cuda/__memory/address_space.h +256 -0
- cuda/cccl/headers/include/cuda/__memory/align_down.h +77 -0
- cuda/cccl/headers/include/cuda/__memory/align_up.h +77 -0
- cuda/cccl/headers/include/cuda/__memory/aligned_size.h +61 -0
- cuda/cccl/headers/include/cuda/__memory/check_address.h +111 -0
- cuda/cccl/headers/include/cuda/__memory/discard_memory.h +64 -0
- cuda/cccl/headers/include/cuda/__memory/get_device_address.h +58 -0
- cuda/cccl/headers/include/cuda/__memory/is_aligned.h +60 -0
- cuda/cccl/headers/include/cuda/__memory/is_pointer_accessible.h +278 -0
- cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +92 -0
- cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
- cuda/cccl/headers/include/cuda/__memory/ranges_overlap.h +125 -0
- cuda/cccl/headers/include/cuda/__memory_pool/device_memory_pool.h +166 -0
- cuda/cccl/headers/include/cuda/__memory_pool/managed_memory_pool.h +161 -0
- cuda/cccl/headers/include/cuda/__memory_pool/memory_pool_base.h +644 -0
- cuda/cccl/headers/include/cuda/__memory_pool/pinned_memory_pool.h +218 -0
- cuda/cccl/headers/include/cuda/__memory_resource/any_resource.h +882 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +82 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +153 -0
- cuda/cccl/headers/include/cuda/__memory_resource/legacy_managed_memory_resource.h +148 -0
- cuda/cccl/headers/include/cuda/__memory_resource/legacy_pinned_memory_resource.h +141 -0
- cuda/cccl/headers/include/cuda/__memory_resource/properties.h +130 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource.h +135 -0
- cuda/cccl/headers/include/cuda/__memory_resource/shared_resource.h +261 -0
- cuda/cccl/headers/include/cuda/__memory_resource/synchronous_resource_adapter.h +136 -0
- cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +331 -0
- cuda/cccl/headers/include/cuda/__numeric/div_overflow.h +150 -0
- cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
- cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +59 -0
- cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
- cuda/cccl/headers/include/cuda/__numeric/sub_overflow.h +359 -0
- cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +120 -0
- cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2977 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +240 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +245 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +52 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +977 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +302 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +631 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1470 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +132 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +64 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +102 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +64 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +5542 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +399 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +184 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +102 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_inval.h +26 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +143 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +144 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +286 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +290 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2202 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1362 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +236 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +460 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/setmaxnreg.h +58 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1490 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +120 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +91 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +693 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +50 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +11437 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +6513 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +6726 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +40 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +4767 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +48 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +71 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +886 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_inval.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/setmaxnreg.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/pragmas/enable_smem_spilling.h +47 -0
- cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
- cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +178 -0
- cuda/cccl/headers/include/cuda/__random/feistel_bijection.h +105 -0
- cuda/cccl/headers/include/cuda/__random/pcg_engine.h +398 -0
- cuda/cccl/headers/include/cuda/__random/random_bijection.h +88 -0
- cuda/cccl/headers/include/cuda/__runtime/api_wrapper.h +62 -0
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +99 -0
- cuda/cccl/headers/include/cuda/__runtime/types.h +41 -0
- cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
- cuda/cccl/headers/include/cuda/__stream/get_stream.h +109 -0
- cuda/cccl/headers/include/cuda/__stream/internal_streams.h +49 -0
- cuda/cccl/headers/include/cuda/__stream/invalid_stream.h +47 -0
- cuda/cccl/headers/include/cuda/__stream/launch_transform.h +193 -0
- cuda/cccl/headers/include/cuda/__stream/stream.h +145 -0
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +356 -0
- cuda/cccl/headers/include/cuda/__tma/make_tma_descriptor.h +657 -0
- cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
- cuda/cccl/headers/include/cuda/__type_traits/is_instantiable_with.h +47 -0
- cuda/cccl/headers/include/cuda/__type_traits/is_specialization_of.h +37 -0
- cuda/cccl/headers/include/cuda/__type_traits/vector_type.h +355 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/access.h +88 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/any_cast.h +83 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_base.h +148 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_from.h +96 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_fwd.h +128 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +304 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ref.h +337 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +611 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/conversions.h +170 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/dynamic_any_cast.h +147 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/interfaces.h +359 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/iset.h +142 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/overrides.h +64 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/rtti.h +256 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +323 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/storage.h +79 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/tagged_ptr.h +58 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +162 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_functions.h +183 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_ptrs.h +80 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +155 -0
- cuda/cccl/headers/include/cuda/__utility/basic_any.h +507 -0
- cuda/cccl/headers/include/cuda/__utility/immovable.h +50 -0
- cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
- cuda/cccl/headers/include/cuda/__utility/inherit.h +36 -0
- cuda/cccl/headers/include/cuda/__utility/no_init.h +29 -0
- cuda/cccl/headers/include/cuda/__utility/static_for.h +79 -0
- cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
- cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
- cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +251 -0
- cuda/cccl/headers/include/cuda/access_property +26 -0
- cuda/cccl/headers/include/cuda/algorithm +28 -0
- cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
- cuda/cccl/headers/include/cuda/atomic +27 -0
- cuda/cccl/headers/include/cuda/barrier +293 -0
- cuda/cccl/headers/include/cuda/bit +29 -0
- cuda/cccl/headers/include/cuda/buffer +27 -0
- cuda/cccl/headers/include/cuda/cmath +38 -0
- cuda/cccl/headers/include/cuda/devices +33 -0
- cuda/cccl/headers/include/cuda/discard_memory +32 -0
- cuda/cccl/headers/include/cuda/functional +32 -0
- cuda/cccl/headers/include/cuda/hierarchy +28 -0
- cuda/cccl/headers/include/cuda/iterator +39 -0
- cuda/cccl/headers/include/cuda/latch +27 -0
- cuda/cccl/headers/include/cuda/launch +28 -0
- cuda/cccl/headers/include/cuda/mdspan +29 -0
- cuda/cccl/headers/include/cuda/memory +37 -0
- cuda/cccl/headers/include/cuda/memory_pool +27 -0
- cuda/cccl/headers/include/cuda/memory_resource +41 -0
- cuda/cccl/headers/include/cuda/numeric +31 -0
- cuda/cccl/headers/include/cuda/pipeline +580 -0
- cuda/cccl/headers/include/cuda/ptx +131 -0
- cuda/cccl/headers/include/cuda/semaphore +31 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +53 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +58 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +143 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/in_fun_result.h +55 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +90 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/inplace_merge.h +293 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +88 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +67 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +139 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/move.h +91 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/nth_element.h +309 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +93 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +100 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_find_if.h +78 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_find_if_not.h +85 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each.h +84 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each_n.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +97 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/sample.h +116 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/shuffle.h +71 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/sort.h +1097 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/stable_partition.h +359 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/stable_sort.h +321 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +153 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
- cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
- cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +136 -0
- cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +208 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +458 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +4436 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +184 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +242 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
- cuda/cccl/headers/include/cuda/std/__atomic/order.h +158 -0
- cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
- cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
- cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +242 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +103 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +72 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +238 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types.h +51 -0
- cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
- cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
- cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
- cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
- cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
- cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +81 -0
- cuda/cccl/headers/include/cuda/std/__bit/blsr.h +51 -0
- cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +209 -0
- cuda/cccl/headers/include/cuda/std/__bit/countl.h +191 -0
- cuda/cccl/headers/include/cuda/std/__bit/countr.h +202 -0
- cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
- cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
- cuda/cccl/headers/include/cuda/std/__bit/integral.h +125 -0
- cuda/cccl/headers/include/cuda/std/__bit/popcount.h +172 -0
- cuda/cccl/headers/include/cuda/std/__bit/reference.h +1272 -0
- cuda/cccl/headers/include/cuda/std/__bit/rotate.h +185 -0
- cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
- cuda/cccl/headers/include/cuda/std/__cccl/assert.h +161 -0
- cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +206 -0
- cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +494 -0
- cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +213 -0
- cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +51 -0
- cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +56 -0
- cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +87 -0
- cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +131 -0
- cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +197 -0
- cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +355 -0
- cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +74 -0
- cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +139 -0
- cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +59 -0
- cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +71 -0
- cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/os.h +54 -0
- cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1288 -0
- cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +312 -0
- cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +363 -0
- cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
- cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
- cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
- cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
- cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
- cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +171 -0
- cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
- cuda/cccl/headers/include/cuda/std/__charconv/from_chars.h +171 -0
- cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
- cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +192 -0
- cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
- cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +52 -0
- cuda/cccl/headers/include/cuda/std/__chrono/day.h +160 -0
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +499 -0
- cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +53 -0
- cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +44 -0
- cuda/cccl/headers/include/cuda/std/__chrono/month.h +185 -0
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +58 -0
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +78 -0
- cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +255 -0
- cuda/cccl/headers/include/cuda/std/__chrono/year.h +184 -0
- cuda/cccl/headers/include/cuda/std/__cmath/abs.h +127 -0
- cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +88 -0
- cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +204 -0
- cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +783 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +122 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fma.h +129 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +230 -0
- cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +204 -0
- cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +285 -0
- cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +220 -0
- cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +285 -0
- cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +370 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +166 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +203 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +184 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
- cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +101 -0
- cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +533 -0
- cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +287 -0
- cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
- cuda/cccl/headers/include/cuda/std/__cmath/nan.h +54 -0
- cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +210 -0
- cuda/cccl/headers/include/cuda/std/__cmath/roots.h +198 -0
- cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +983 -0
- cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +56 -0
- cuda/cccl/headers/include/cuda/std/__cmath/traits.h +242 -0
- cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +327 -0
- cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
- cuda/cccl/headers/include/cuda/std/__complex/complex.h +669 -0
- cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +403 -0
- cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +119 -0
- cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +522 -0
- cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
- cuda/cccl/headers/include/cuda/std/__complex/literals.h +86 -0
- cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +302 -0
- cuda/cccl/headers/include/cuda/std/__complex/math.h +161 -0
- cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +323 -0
- cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +322 -0
- cuda/cccl/headers/include/cuda/std/__complex/roots.h +214 -0
- cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
- cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
- cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +130 -0
- cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
- cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
- cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
- cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +45 -0
- cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
- cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
- cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +385 -0
- cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +174 -0
- cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +70 -0
- cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +56 -0
- cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
- cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
- cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +98 -0
- cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
- cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
- cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
- cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
- cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
- cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +39 -0
- cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
- cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
- cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
- cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
- cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +70 -0
- cuda/cccl/headers/include/cuda/std/__cstring/memcpy.h +61 -0
- cuda/cccl/headers/include/cuda/std/__cstring/memset.h +46 -0
- cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +110 -0
- cuda/cccl/headers/include/cuda/std/__exception/exception_macros.h +108 -0
- cuda/cccl/headers/include/cuda/std/__exception/format_error.h +62 -0
- cuda/cccl/headers/include/cuda/std/__exception/msg_storage.h +41 -0
- cuda/cccl/headers/include/cuda/std/__exception/terminate.h +74 -0
- cuda/cccl/headers/include/cuda/std/__exception/throw_error.h +120 -0
- cuda/cccl/headers/include/cuda/std/__execution/env.h +455 -0
- cuda/cccl/headers/include/cuda/std/__execution/policy.h +90 -0
- cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
- cuda/cccl/headers/include/cuda/std/__expected/expected.h +1941 -0
- cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1051 -0
- cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
- cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +164 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +812 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +375 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +124 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +116 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/decompose.h +126 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +40 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +78 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
- cuda/cccl/headers/include/cuda/std/__format/buffer.h +48 -0
- cuda/cccl/headers/include/cuda/std/__format/concepts.h +69 -0
- cuda/cccl/headers/include/cuda/std/__format/format_arg.h +282 -0
- cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +279 -0
- cuda/cccl/headers/include/cuda/std/__format/format_args.h +122 -0
- cuda/cccl/headers/include/cuda/std/__format/format_context.h +93 -0
- cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
- cuda/cccl/headers/include/cuda/std/__format/format_integral.h +237 -0
- cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +124 -0
- cuda/cccl/headers/include/cuda/std/__format/format_spec_parser.h +1265 -0
- cuda/cccl/headers/include/cuda/std/__format/formatter.h +59 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/bool.h +101 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/char.h +124 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/fp.h +101 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/int.h +174 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/ptr.h +104 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/str.h +178 -0
- cuda/cccl/headers/include/cuda/std/__format/output_utils.h +272 -0
- cuda/cccl/headers/include/cuda/std/__format/parse_arg_id.h +138 -0
- cuda/cccl/headers/include/cuda/std/__format_ +45 -0
- cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
- cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
- cuda/cccl/headers/include/cuda/std/__functional/bind.h +334 -0
- cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +81 -0
- cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +74 -0
- cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
- cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
- cuda/cccl/headers/include/cuda/std/__functional/compose.h +69 -0
- cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +76 -0
- cuda/cccl/headers/include/cuda/std/__functional/function.h +1271 -0
- cuda/cccl/headers/include/cuda/std/__functional/hash.h +649 -0
- cuda/cccl/headers/include/cuda/std/__functional/identity.h +57 -0
- cuda/cccl/headers/include/cuda/std/__functional/invoke.h +298 -0
- cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +41 -0
- cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +66 -0
- cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +211 -0
- cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
- cuda/cccl/headers/include/cuda/std/__functional/operations.h +535 -0
- cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +128 -0
- cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +64 -0
- cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +63 -0
- cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
- cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +114 -0
- cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
- cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
- cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
- cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +262 -0
- cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +53 -0
- cuda/cccl/headers/include/cuda/std/__fwd/array.h +42 -0
- cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +74 -0
- cuda/cccl/headers/include/cuda/std/__fwd/complex.h +75 -0
- cuda/cccl/headers/include/cuda/std/__fwd/execution_policy.h +73 -0
- cuda/cccl/headers/include/cuda/std/__fwd/expected.h +46 -0
- cuda/cccl/headers/include/cuda/std/__fwd/format.h +84 -0
- cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
- cuda/cccl/headers/include/cuda/std/__fwd/get.h +122 -0
- cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
- cuda/cccl/headers/include/cuda/std/__fwd/ios.h +123 -0
- cuda/cccl/headers/include/cuda/std/__fwd/iterator.h +43 -0
- cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +122 -0
- cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
- cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
- cuda/cccl/headers/include/cuda/std/__fwd/pair.h +47 -0
- cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +39 -0
- cuda/cccl/headers/include/cuda/std/__fwd/span.h +45 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string.h +112 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +91 -0
- cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
- cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +37 -0
- cuda/cccl/headers/include/cuda/std/__fwd/unexpected.h +40 -0
- cuda/cccl/headers/include/cuda/std/__fwd/variant.h +51 -0
- cuda/cccl/headers/include/cuda/std/__internal/atomic.h +55 -0
- cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
- cuda/cccl/headers/include/cuda/std/__internal/features.h +104 -0
- cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +132 -0
- cuda/cccl/headers/include/cuda/std/__internal/pstl_config.h +32 -0
- cuda/cccl/headers/include/cuda/std/__internal/thread_api.h +58 -0
- cuda/cccl/headers/include/cuda/std/__internal/version.h +52 -0
- cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
- cuda/cccl/headers/include/cuda/std/__iterator/advance.h +227 -0
- cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +164 -0
- cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +253 -0
- cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
- cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +465 -0
- cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
- cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
- cuda/cccl/headers/include/cuda/std/__iterator/distance.h +124 -0
- cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
- cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
- cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +100 -0
- cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +143 -0
- cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
- cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +107 -0
- cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +146 -0
- cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +161 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +181 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +847 -0
- cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
- cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +432 -0
- cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
- cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
- cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +95 -0
- cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
- cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
- cuda/cccl/headers/include/cuda/std/__iterator/prev.h +90 -0
- cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
- cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +156 -0
- cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
- cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +366 -0
- cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
- cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
- cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
- cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +245 -0
- cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
- cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +603 -0
- cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +753 -0
- cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +76 -0
- cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +52 -0
- cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +136 -0
- cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +128 -0
- cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +315 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +97 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +137 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +348 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +749 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +314 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +307 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +598 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +515 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +190 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +187 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +339 -0
- cuda/cccl/headers/include/cuda/std/__memory/addressof.h +90 -0
- cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +82 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator.h +327 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +85 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +517 -0
- cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +59 -0
- cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
- cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +225 -0
- cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +241 -0
- cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
- cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +51 -0
- cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +246 -0
- cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
- cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +99 -0
- cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +678 -0
- cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +765 -0
- cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +59 -0
- cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
- cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
- cuda/cccl/headers/include/cuda/std/__new/allocate.h +131 -0
- cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
- cuda/cccl/headers/include/cuda/std/__new/device_new.h +30 -0
- cuda/cccl/headers/include/cuda/std/__new/launder.h +53 -0
- cuda/cccl/headers/include/cuda/std/__new_ +30 -0
- cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
- cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
- cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
- cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +78 -0
- cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
- cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
- cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
- cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +97 -0
- cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
- cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
- cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
- cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
- cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
- cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +74 -0
- cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
- cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
- cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
- cuda/cccl/headers/include/cuda/std/__optional/optional.h +861 -0
- cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +439 -0
- cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
- cuda/cccl/headers/include/cuda/std/__pstl/cuda/for_each_n.h +97 -0
- cuda/cccl/headers/include/cuda/std/__pstl/dispatch.h +123 -0
- cuda/cccl/headers/include/cuda/std/__pstl/for_each.h +71 -0
- cuda/cccl/headers/include/cuda/std/__pstl/for_each_n.h +68 -0
- cuda/cccl/headers/include/cuda/std/__random/bernoulli_distribution.h +173 -0
- cuda/cccl/headers/include/cuda/std/__random/binomial_distribution.h +254 -0
- cuda/cccl/headers/include/cuda/std/__random/cauchy_distribution.h +192 -0
- cuda/cccl/headers/include/cuda/std/__random/chi_squared_distribution.h +179 -0
- cuda/cccl/headers/include/cuda/std/__random/exponential_distribution.h +187 -0
- cuda/cccl/headers/include/cuda/std/__random/extreme_value_distribution.h +196 -0
- cuda/cccl/headers/include/cuda/std/__random/fisher_f_distribution.h +196 -0
- cuda/cccl/headers/include/cuda/std/__random/gamma_distribution.h +257 -0
- cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
- cuda/cccl/headers/include/cuda/std/__random/geometric_distribution.h +179 -0
- cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
- cuda/cccl/headers/include/cuda/std/__random/is_valid.h +70 -0
- cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +398 -0
- cuda/cccl/headers/include/cuda/std/__random/lognormal_distribution.h +174 -0
- cuda/cccl/headers/include/cuda/std/__random/negative_binomial_distribution.h +212 -0
- cuda/cccl/headers/include/cuda/std/__random/normal_distribution.h +232 -0
- cuda/cccl/headers/include/cuda/std/__random/philox_engine.h +562 -0
- cuda/cccl/headers/include/cuda/std/__random/poisson_distribution.h +338 -0
- cuda/cccl/headers/include/cuda/std/__random/seed_seq.h +204 -0
- cuda/cccl/headers/include/cuda/std/__random/student_t_distribution.h +186 -0
- cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +341 -0
- cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +192 -0
- cuda/cccl/headers/include/cuda/std/__random/weibull_distribution.h +189 -0
- cuda/cccl/headers/include/cuda/std/__random_ +47 -0
- cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
- cuda/cccl/headers/include/cuda/std/__ranges/all.h +98 -0
- cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +889 -0
- cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +302 -0
- cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
- cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
- cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
- cuda/cccl/headers/include/cuda/std/__ranges/drop_view.h +389 -0
- cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
- cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
- cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
- cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +78 -0
- cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
- cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +264 -0
- cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +409 -0
- cuda/cccl/headers/include/cuda/std/__ranges/non_propagating_cache.h +210 -0
- cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +163 -0
- cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +111 -0
- cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +175 -0
- cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
- cuda/cccl/headers/include/cuda/std/__ranges/rend.h +182 -0
- cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +346 -0
- cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
- cuda/cccl/headers/include/cuda/std/__ranges/size.h +201 -0
- cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +510 -0
- cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +472 -0
- cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
- cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
- cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +52 -0
- cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +183 -0
- cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
- cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +234 -0
- cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
- cuda/cccl/headers/include/cuda/std/__string/char_traits.h +190 -0
- cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +580 -0
- cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
- cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
- cuda/cccl/headers/include/cuda/std/__string_ +29 -0
- cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
- cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +106 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +143 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/apply.h +82 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/get.h +122 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +120 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +100 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +212 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tie.h +55 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple.h +457 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_cat.h +158 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_constraints.h +286 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +77 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_leaf.h +452 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +83 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +68 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +290 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +155 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +68 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +173 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +46 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_comparable.h +78 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +58 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +49 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +211 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_execution_policy.h +63 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +79 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_fully_bounded_array.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +72 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +84 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +121 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +88 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +200 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +46 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +45 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +58 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +45 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +44 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +122 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +131 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +52 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
- cuda/cccl/headers/include/cuda/std/__utility/as_const.h +73 -0
- cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +34 -0
- cuda/cccl/headers/include/cuda/std/__utility/cmp.h +114 -0
- cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
- cuda/cccl/headers/include/cuda/std/__utility/ctad_support.h +27 -0
- cuda/cccl/headers/include/cuda/std/__utility/declval.h +76 -0
- cuda/cccl/headers/include/cuda/std/__utility/delegate_constructors.h +51 -0
- cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +162 -0
- cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
- cuda/cccl/headers/include/cuda/std/__utility/forward.h +82 -0
- cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +82 -0
- cuda/cccl/headers/include/cuda/std/__utility/in_place.h +86 -0
- cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +249 -0
- cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
- cuda/cccl/headers/include/cuda/std/__utility/move.h +126 -0
- cuda/cccl/headers/include/cuda/std/__utility/pair.h +791 -0
- cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
- cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +425 -0
- cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
- cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +61 -0
- cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
- cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
- cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
- cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
- cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
- cuda/cccl/headers/include/cuda/std/__variant/bad_variant_access.h +74 -0
- cuda/cccl/headers/include/cuda/std/__variant/comparison.h +207 -0
- cuda/cccl/headers/include/cuda/std/__variant/get.h +192 -0
- cuda/cccl/headers/include/cuda/std/__variant/hash.h +82 -0
- cuda/cccl/headers/include/cuda/std/__variant/sfinae_helpers.h +89 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant.h +250 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_access.h +70 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_base.h +683 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_constraints.h +135 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_match.h +126 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_traits.h +184 -0
- cuda/cccl/headers/include/cuda/std/__variant/variant_visit.h +225 -0
- cuda/cccl/headers/include/cuda/std/__variant/visit.h +148 -0
- cuda/cccl/headers/include/cuda/std/algorithm +138 -0
- cuda/cccl/headers/include/cuda/std/array +519 -0
- cuda/cccl/headers/include/cuda/std/atomic +810 -0
- cuda/cccl/headers/include/cuda/std/barrier +42 -0
- cuda/cccl/headers/include/cuda/std/bit +35 -0
- cuda/cccl/headers/include/cuda/std/bitset +986 -0
- cuda/cccl/headers/include/cuda/std/cassert +28 -0
- cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
- cuda/cccl/headers/include/cuda/std/cfloat +59 -0
- cuda/cccl/headers/include/cuda/std/charconv +31 -0
- cuda/cccl/headers/include/cuda/std/chrono +26 -0
- cuda/cccl/headers/include/cuda/std/climits +61 -0
- cuda/cccl/headers/include/cuda/std/cmath +87 -0
- cuda/cccl/headers/include/cuda/std/complex +50 -0
- cuda/cccl/headers/include/cuda/std/concepts +48 -0
- cuda/cccl/headers/include/cuda/std/cstddef +28 -0
- cuda/cccl/headers/include/cuda/std/cstdint +178 -0
- cuda/cccl/headers/include/cuda/std/cstdlib +31 -0
- cuda/cccl/headers/include/cuda/std/cstring +110 -0
- cuda/cccl/headers/include/cuda/std/ctime +155 -0
- cuda/cccl/headers/include/cuda/std/detail/__config +22 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +2506 -0
- cuda/cccl/headers/include/cuda/std/execution +29 -0
- cuda/cccl/headers/include/cuda/std/expected +30 -0
- cuda/cccl/headers/include/cuda/std/functional +56 -0
- cuda/cccl/headers/include/cuda/std/initializer_list +44 -0
- cuda/cccl/headers/include/cuda/std/inplace_vector +2171 -0
- cuda/cccl/headers/include/cuda/std/iterator +70 -0
- cuda/cccl/headers/include/cuda/std/latch +34 -0
- cuda/cccl/headers/include/cuda/std/limits +28 -0
- cuda/cccl/headers/include/cuda/std/linalg +30 -0
- cuda/cccl/headers/include/cuda/std/mdspan +38 -0
- cuda/cccl/headers/include/cuda/std/memory +40 -0
- cuda/cccl/headers/include/cuda/std/numbers +344 -0
- cuda/cccl/headers/include/cuda/std/numeric +41 -0
- cuda/cccl/headers/include/cuda/std/optional +31 -0
- cuda/cccl/headers/include/cuda/std/ranges +70 -0
- cuda/cccl/headers/include/cuda/std/ratio +416 -0
- cuda/cccl/headers/include/cuda/std/semaphore +31 -0
- cuda/cccl/headers/include/cuda/std/source_location +107 -0
- cuda/cccl/headers/include/cuda/std/span +599 -0
- cuda/cccl/headers/include/cuda/std/string_view +924 -0
- cuda/cccl/headers/include/cuda/std/tuple +43 -0
- cuda/cccl/headers/include/cuda/std/type_traits +176 -0
- cuda/cccl/headers/include/cuda/std/utility +70 -0
- cuda/cccl/headers/include/cuda/std/variant +32 -0
- cuda/cccl/headers/include/cuda/std/version +240 -0
- cuda/cccl/headers/include/cuda/stream +32 -0
- cuda/cccl/headers/include/cuda/stream_ref +59 -0
- cuda/cccl/headers/include/cuda/tma +25 -0
- cuda/cccl/headers/include/cuda/type_traits +27 -0
- cuda/cccl/headers/include/cuda/utility +28 -0
- cuda/cccl/headers/include/cuda/version +16 -0
- cuda/cccl/headers/include/cuda/warp +28 -0
- cuda/cccl/headers/include/cuda/work_stealing +26 -0
- cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
- cuda/cccl/headers/include/nv/detail/__target_macros +739 -0
- cuda/cccl/headers/include/nv/target +241 -0
- cuda/cccl/headers/include/thrust/addressof.h +22 -0
- cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
- cuda/cccl/headers/include/thrust/advance.h +60 -0
- cuda/cccl/headers/include/thrust/allocate_unique.h +301 -0
- cuda/cccl/headers/include/thrust/binary_search.h +1911 -0
- cuda/cccl/headers/include/thrust/complex.h +859 -0
- cuda/cccl/headers/include/thrust/copy.h +506 -0
- cuda/cccl/headers/include/thrust/count.h +245 -0
- cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +108 -0
- cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +629 -0
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +193 -0
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +95 -0
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +79 -0
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +76 -0
- cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +74 -0
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +113 -0
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +114 -0
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +77 -0
- cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +96 -0
- cuda/cccl/headers/include/thrust/detail/binary_search.inl +537 -0
- cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
- cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +210 -0
- cuda/cccl/headers/include/thrust/detail/complex/catrig.h +877 -0
- cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +591 -0
- cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +234 -0
- cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +162 -0
- cuda/cccl/headers/include/thrust/detail/complex/cexp.h +194 -0
- cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +170 -0
- cuda/cccl/headers/include/thrust/detail/complex/clog.h +222 -0
- cuda/cccl/headers/include/thrust/detail/complex/clogf.h +209 -0
- cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
- cuda/cccl/headers/include/thrust/detail/complex/cpow.h +53 -0
- cuda/cccl/headers/include/thrust/detail/complex/cproj.h +75 -0
- cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
- cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +169 -0
- cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
- cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
- cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +212 -0
- cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +136 -0
- cuda/cccl/headers/include/thrust/detail/complex/math_private.h +130 -0
- cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
- cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
- cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
- cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
- cuda/cccl/headers/include/thrust/detail/config/device_system.h +57 -0
- cuda/cccl/headers/include/thrust/detail/config/host_system.h +50 -0
- cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
- cuda/cccl/headers/include/thrust/detail/config/namespace.h +164 -0
- cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
- cuda/cccl/headers/include/thrust/detail/config.h +36 -0
- cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +227 -0
- cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +272 -0
- cuda/cccl/headers/include/thrust/detail/copy.h +70 -0
- cuda/cccl/headers/include/thrust/detail/copy.inl +146 -0
- cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
- cuda/cccl/headers/include/thrust/detail/copy_if.inl +114 -0
- cuda/cccl/headers/include/thrust/detail/count.h +55 -0
- cuda/cccl/headers/include/thrust/detail/count.inl +101 -0
- cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
- cuda/cccl/headers/include/thrust/detail/equal.inl +105 -0
- cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
- cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +79 -0
- cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
- cuda/cccl/headers/include/thrust/detail/execution_policy.h +120 -0
- cuda/cccl/headers/include/thrust/detail/extrema.inl +196 -0
- cuda/cccl/headers/include/thrust/detail/fill.inl +97 -0
- cuda/cccl/headers/include/thrust/detail/find.inl +125 -0
- cuda/cccl/headers/include/thrust/detail/for_each.inl +96 -0
- cuda/cccl/headers/include/thrust/detail/function.h +49 -0
- cuda/cccl/headers/include/thrust/detail/functional/actor.h +213 -0
- cuda/cccl/headers/include/thrust/detail/functional/operators.h +384 -0
- cuda/cccl/headers/include/thrust/detail/gather.inl +185 -0
- cuda/cccl/headers/include/thrust/detail/generate.inl +97 -0
- cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +60 -0
- cuda/cccl/headers/include/thrust/detail/inner_product.inl +130 -0
- cuda/cccl/headers/include/thrust/detail/internal_functional.h +335 -0
- cuda/cccl/headers/include/thrust/detail/logical.inl +125 -0
- cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +89 -0
- cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
- cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +208 -0
- cuda/cccl/headers/include/thrust/detail/merge.inl +288 -0
- cuda/cccl/headers/include/thrust/detail/mismatch.inl +107 -0
- cuda/cccl/headers/include/thrust/detail/nvtx_policy.h +41 -0
- cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +120 -0
- cuda/cccl/headers/include/thrust/detail/partition.inl +390 -0
- cuda/cccl/headers/include/thrust/detail/pointer.h +313 -0
- cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
- cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
- cuda/cccl/headers/include/thrust/detail/range/head_flags.h +114 -0
- cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +128 -0
- cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
- cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +191 -0
- cuda/cccl/headers/include/thrust/detail/reduce.inl +396 -0
- cuda/cccl/headers/include/thrust/detail/reference.h +521 -0
- cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
- cuda/cccl/headers/include/thrust/detail/remove.inl +225 -0
- cuda/cccl/headers/include/thrust/detail/replace.inl +243 -0
- cuda/cccl/headers/include/thrust/detail/reverse.inl +100 -0
- cuda/cccl/headers/include/thrust/detail/scan.inl +536 -0
- cuda/cccl/headers/include/thrust/detail/scatter.inl +169 -0
- cuda/cccl/headers/include/thrust/detail/seq.h +66 -0
- cuda/cccl/headers/include/thrust/detail/sequence.inl +121 -0
- cuda/cccl/headers/include/thrust/detail/set_operations.inl +993 -0
- cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/sort.inl +385 -0
- cuda/cccl/headers/include/thrust/detail/static_assert.h +56 -0
- cuda/cccl/headers/include/thrust/detail/static_map.h +164 -0
- cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +77 -0
- cuda/cccl/headers/include/thrust/detail/tabulate.inl +74 -0
- cuda/cccl/headers/include/thrust/detail/temporary_array.h +150 -0
- cuda/cccl/headers/include/thrust/detail/temporary_array.inl +121 -0
- cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +93 -0
- cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +81 -0
- cuda/cccl/headers/include/thrust/detail/transform_scan.inl +173 -0
- cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +128 -0
- cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +59 -0
- cuda/cccl/headers/include/thrust/detail/type_deduction.h +61 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +50 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +162 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +67 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +55 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +48 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +91 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
- cuda/cccl/headers/include/thrust/detail/type_traits.h +143 -0
- cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +102 -0
- cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +97 -0
- cuda/cccl/headers/include/thrust/detail/unique.inl +391 -0
- cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
- cuda/cccl/headers/include/thrust/detail/vector_base.h +613 -0
- cuda/cccl/headers/include/thrust/detail/vector_base.inl +1216 -0
- cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
- cuda/cccl/headers/include/thrust/device_delete.h +74 -0
- cuda/cccl/headers/include/thrust/device_free.h +85 -0
- cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
- cuda/cccl/headers/include/thrust/device_malloc.h +84 -0
- cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
- cuda/cccl/headers/include/thrust/device_new.h +112 -0
- cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
- cuda/cccl/headers/include/thrust/device_ptr.h +196 -0
- cuda/cccl/headers/include/thrust/device_reference.h +983 -0
- cuda/cccl/headers/include/thrust/device_vector.h +576 -0
- cuda/cccl/headers/include/thrust/distance.h +44 -0
- cuda/cccl/headers/include/thrust/equal.h +247 -0
- cuda/cccl/headers/include/thrust/execution_policy.h +252 -0
- cuda/cccl/headers/include/thrust/extrema.h +657 -0
- cuda/cccl/headers/include/thrust/fill.h +200 -0
- cuda/cccl/headers/include/thrust/find.h +382 -0
- cuda/cccl/headers/include/thrust/for_each.h +261 -0
- cuda/cccl/headers/include/thrust/functional.h +399 -0
- cuda/cccl/headers/include/thrust/gather.h +464 -0
- cuda/cccl/headers/include/thrust/generate.h +193 -0
- cuda/cccl/headers/include/thrust/host_vector.h +576 -0
- cuda/cccl/headers/include/thrust/inner_product.h +264 -0
- cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +221 -0
- cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +338 -0
- cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
- cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
- cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +43 -0
- cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +42 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +83 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +60 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +65 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +56 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +184 -0
- cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +55 -0
- cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
- cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
- cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +80 -0
- cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +171 -0
- cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +211 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +659 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +334 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_traversal_tags.h +64 -0
- cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +194 -0
- cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
- cuda/cccl/headers/include/thrust/iterator/retag.h +72 -0
- cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +51 -0
- cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +190 -0
- cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
- cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +152 -0
- cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +225 -0
- cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +353 -0
- cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
- cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +365 -0
- cuda/cccl/headers/include/thrust/logical.h +290 -0
- cuda/cccl/headers/include/thrust/memory.h +299 -0
- cuda/cccl/headers/include/thrust/merge.h +726 -0
- cuda/cccl/headers/include/thrust/mismatch.h +262 -0
- cuda/cccl/headers/include/thrust/mr/allocator.h +227 -0
- cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +46 -0
- cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +526 -0
- cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +116 -0
- cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +65 -0
- cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +65 -0
- cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +42 -0
- cuda/cccl/headers/include/thrust/mr/memory_resource.h +215 -0
- cuda/cccl/headers/include/thrust/mr/new.h +98 -0
- cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +61 -0
- cuda/cccl/headers/include/thrust/mr/pool.h +526 -0
- cuda/cccl/headers/include/thrust/mr/pool_options.h +172 -0
- cuda/cccl/headers/include/thrust/mr/sync_pool.h +112 -0
- cuda/cccl/headers/include/thrust/mr/tls_pool.h +62 -0
- cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
- cuda/cccl/headers/include/thrust/mr/validator.h +54 -0
- cuda/cccl/headers/include/thrust/pair.h +102 -0
- cuda/cccl/headers/include/thrust/partition.h +1392 -0
- cuda/cccl/headers/include/thrust/per_device_resource.h +110 -0
- cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +182 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +153 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +97 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +149 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +46 -0
- cuda/cccl/headers/include/thrust/random/detail/mod.h +94 -0
- cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +185 -0
- cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +157 -0
- cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +56 -0
- cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +199 -0
- cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +196 -0
- cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +198 -0
- cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +181 -0
- cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +186 -0
- cuda/cccl/headers/include/thrust/random/discard_block_engine.h +238 -0
- cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +287 -0
- cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +215 -0
- cuda/cccl/headers/include/thrust/random/normal_distribution.h +256 -0
- cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +245 -0
- cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +259 -0
- cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +257 -0
- cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +253 -0
- cuda/cccl/headers/include/thrust/random.h +118 -0
- cuda/cccl/headers/include/thrust/reduce.h +1114 -0
- cuda/cccl/headers/include/thrust/remove.h +768 -0
- cuda/cccl/headers/include/thrust/replace.h +826 -0
- cuda/cccl/headers/include/thrust/reverse.h +215 -0
- cuda/cccl/headers/include/thrust/scan.h +1671 -0
- cuda/cccl/headers/include/thrust/scatter.h +446 -0
- cuda/cccl/headers/include/thrust/sequence.h +277 -0
- cuda/cccl/headers/include/thrust/set_operations.h +3027 -0
- cuda/cccl/headers/include/thrust/shuffle.h +182 -0
- cuda/cccl/headers/include/thrust/sort.h +1320 -0
- cuda/cccl/headers/include/thrust/swap.h +147 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +113 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +55 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +63 -0
- cuda/cccl/headers/include/thrust/system/cpp/memory.h +105 -0
- cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +70 -0
- cuda/cccl/headers/include/thrust/system/cpp/pointer.h +118 -0
- cuda/cccl/headers/include/thrust/system/cpp/vector.h +95 -0
- cuda/cccl/headers/include/thrust/system/cuda/config.h +126 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +218 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +272 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +251 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +280 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +162 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +578 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +241 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +230 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +62 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +87 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +266 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +473 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +99 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +165 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +89 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +59 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +77 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +79 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +55 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +119 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +55 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +205 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +88 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +79 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +401 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +70 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +774 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +994 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +105 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +121 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +83 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +340 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +412 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +90 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1722 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +473 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +99 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +62 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +428 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +139 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +117 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +114 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +102 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +288 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +307 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +253 -0
- cuda/cccl/headers/include/thrust/system/cuda/error.h +159 -0
- cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +15 -0
- cuda/cccl/headers/include/thrust/system/cuda/memory.h +118 -0
- cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +115 -0
- cuda/cccl/headers/include/thrust/system/cuda/pointer.h +155 -0
- cuda/cccl/headers/include/thrust/system/cuda/vector.h +104 -0
- cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +59 -0
- cuda/cccl/headers/include/thrust/system/detail/errno.h +118 -0
- cuda/cccl/headers/include/thrust/system/detail/error_category.inl +298 -0
- cuda/cccl/headers/include/thrust/system/detail/error_code.inl +171 -0
- cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +119 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +77 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +159 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +370 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +43 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +62 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +56 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +145 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/count.h +46 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +82 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +47 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +58 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +65 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +246 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +52 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/find.h +47 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +135 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +56 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +71 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +94 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +43 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +64 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +58 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +70 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +57 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +61 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +83 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +97 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +146 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +47 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +67 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +127 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +208 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +41 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +69 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +98 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +81 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +183 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +84 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +119 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +93 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +172 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +46 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +65 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +61 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +125 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +86 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +124 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +229 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +71 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +83 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +105 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +281 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +474 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +52 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +123 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +111 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +176 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +42 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +73 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +39 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +54 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +45 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +53 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +81 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +393 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +48 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +54 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +78 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +111 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +43 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +164 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +43 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +112 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +69 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +112 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +80 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +124 -0
- cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +115 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +68 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +40 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +136 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +120 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +47 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +69 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +52 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +109 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +60 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +72 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +120 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +41 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +139 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +43 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +48 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +147 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +298 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +62 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +97 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +177 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +152 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +143 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +204 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +120 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +353 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +121 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +584 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +56 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +112 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +105 -0
- cuda/cccl/headers/include/thrust/system/detail/system_error.inl +106 -0
- cuda/cccl/headers/include/thrust/system/error_code.h +508 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +34 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +57 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +61 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +39 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/count.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +48 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +132 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +46 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/find.h +33 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +74 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +84 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +16 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +63 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +48 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +87 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +67 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +216 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +23 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +245 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +16 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +17 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +52 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +56 -0
- cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +62 -0
- cuda/cccl/headers/include/thrust/system/omp/memory.h +153 -0
- cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +71 -0
- cuda/cccl/headers/include/thrust/system/omp/pointer.h +120 -0
- cuda/cccl/headers/include/thrust/system/omp/vector.h +96 -0
- cuda/cccl/headers/include/thrust/system/system_error.h +183 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +33 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +59 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +117 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +113 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +45 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +28 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +72 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +302 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +79 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +16 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +121 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +378 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +118 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +66 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +294 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +20 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +273 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +16 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +17 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +49 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +55 -0
- cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +62 -0
- cuda/cccl/headers/include/thrust/system/tbb/memory.h +139 -0
- cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +57 -0
- cuda/cccl/headers/include/thrust/system/tbb/pointer.h +106 -0
- cuda/cccl/headers/include/thrust/system/tbb/vector.h +82 -0
- cuda/cccl/headers/include/thrust/system_error.h +57 -0
- cuda/cccl/headers/include/thrust/tabulate.h +125 -0
- cuda/cccl/headers/include/thrust/transform.h +1056 -0
- cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
- cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
- cuda/cccl/headers/include/thrust/tuple.h +139 -0
- cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +256 -0
- cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +156 -0
- cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
- cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +180 -0
- cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +112 -0
- cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +333 -0
- cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
- cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +63 -0
- cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
- cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
- cuda/cccl/headers/include/thrust/unique.h +1089 -0
- cuda/cccl/headers/include/thrust/universal_allocator.h +101 -0
- cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
- cuda/cccl/headers/include/thrust/universal_vector.h +80 -0
- cuda/cccl/headers/include/thrust/version.h +93 -0
- cuda/cccl/headers/include/thrust/zip_function.h +149 -0
- cuda/cccl/headers/include_paths.py +51 -0
- cuda/cccl/headers/lib/cmake/cccl/cccl-config-version.cmake +25 -0
- cuda/cccl/headers/lib/cmake/cccl/cccl-config.cmake +143 -0
- cuda/cccl/headers/lib/cmake/cub/cub-config-version.cmake +29 -0
- cuda/cccl/headers/lib/cmake/cub/cub-config.cmake +172 -0
- cuda/cccl/headers/lib/cmake/cub/cub-header-search.cmake +15 -0
- cuda/cccl/headers/lib/cmake/libcudacxx/libcudacxx-config-version.cmake +37 -0
- cuda/cccl/headers/lib/cmake/libcudacxx/libcudacxx-config.cmake +297 -0
- cuda/cccl/headers/lib/cmake/libcudacxx/libcudacxx-header-search.cmake +15 -0
- cuda/cccl/headers/lib/cmake/thrust/FindTBB.cmake +498 -0
- cuda/cccl/headers/lib/cmake/thrust/README.md +258 -0
- cuda/cccl/headers/lib/cmake/thrust/thrust-config-version.cmake +37 -0
- cuda/cccl/headers/lib/cmake/thrust/thrust-config.cmake +983 -0
- cuda/cccl/headers/lib/cmake/thrust/thrust-header-search.cmake +15 -0
- cuda/cccl/parallel/__init__.py +9 -0
- cuda/cccl/parallel/experimental/__init__.py +24 -0
- cuda/cccl/py.typed +0 -0
- cuda/compute/__init__.py +91 -0
- cuda/compute/_bindings.py +79 -0
- cuda/compute/_bindings.pyi +516 -0
- cuda/compute/_bindings_impl.pyx +2470 -0
- cuda/compute/_caching.py +83 -0
- cuda/compute/_cccl_interop.py +354 -0
- cuda/compute/_odr_helpers.py +238 -0
- cuda/compute/_utils/__init__.py +0 -0
- cuda/compute/_utils/protocols.py +145 -0
- cuda/compute/_utils/temp_storage_buffer.py +87 -0
- cuda/compute/algorithms/__init__.py +62 -0
- cuda/compute/algorithms/_histogram.py +243 -0
- cuda/compute/algorithms/_reduce.py +205 -0
- cuda/compute/algorithms/_scan.py +344 -0
- cuda/compute/algorithms/_segmented_reduce.py +265 -0
- cuda/compute/algorithms/_select.py +196 -0
- cuda/compute/algorithms/_sort/__init__.py +23 -0
- cuda/compute/algorithms/_sort/_merge_sort.py +235 -0
- cuda/compute/algorithms/_sort/_radix_sort.py +263 -0
- cuda/compute/algorithms/_sort/_segmented_sort.py +288 -0
- cuda/compute/algorithms/_sort/_sort_common.py +52 -0
- cuda/compute/algorithms/_three_way_partition.py +292 -0
- cuda/compute/algorithms/_transform.py +317 -0
- cuda/compute/algorithms/_unique_by_key.py +259 -0
- cuda/compute/cccl/.gitkeep +0 -0
- cuda/compute/cu12/_bindings_impl.cpython-312-x86_64-linux-gnu.so +0 -0
- cuda/compute/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/cu13/_bindings_impl.cpython-312-x86_64-linux-gnu.so +0 -0
- cuda/compute/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/determinism.py +3 -0
- cuda/compute/iterators/__init__.py +23 -0
- cuda/compute/iterators/_factories.py +251 -0
- cuda/compute/iterators/_iterators.py +680 -0
- cuda/compute/iterators/_permutation_iterator.py +266 -0
- cuda/compute/iterators/_zip_iterator.py +268 -0
- cuda/compute/numba_utils.py +54 -0
- cuda/compute/op.py +140 -0
- cuda/compute/struct.py +520 -0
- cuda/compute/typing.py +36 -0
- cuda/coop/__init__.py +8 -0
- cuda/coop/_caching.py +48 -0
- cuda/coop/_common.py +275 -0
- cuda/coop/_nvrtc.py +92 -0
- cuda/coop/_scan_op.py +181 -0
- cuda/coop/_types.py +937 -0
- cuda/coop/_typing.py +107 -0
- cuda/coop/block/__init__.py +39 -0
- cuda/coop/block/_block_exchange.py +251 -0
- cuda/coop/block/_block_load_store.py +215 -0
- cuda/coop/block/_block_merge_sort.py +125 -0
- cuda/coop/block/_block_radix_sort.py +214 -0
- cuda/coop/block/_block_reduce.py +294 -0
- cuda/coop/block/_block_scan.py +983 -0
- cuda/coop/warp/__init__.py +9 -0
- cuda/coop/warp/_warp_merge_sort.py +92 -0
- cuda/coop/warp/_warp_reduce.py +153 -0
- cuda/coop/warp/_warp_scan.py +78 -0
- cuda_cccl-0.4.3.dist-info/METADATA +84 -0
- cuda_cccl-0.4.3.dist-info/RECORD +2024 -0
- cuda_cccl-0.4.3.dist-info/WHEEL +5 -0
- cuda_cccl-0.4.3.dist-info/licenses/LICENSE +1 -0
|
@@ -0,0 +1,1849 @@
|
|
|
1
|
+
// SPDX-FileCopyrightText: Copyright (c) 2011, Duane Merrill. All rights reserved.
|
|
2
|
+
// SPDX-FileCopyrightText: Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
|
|
3
|
+
// SPDX-License-Identifier: BSD-3
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* @file
|
|
7
|
+
* cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s)
|
|
8
|
+
* from a sequence of sample data residing within device-accessible memory.
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
#pragma once
|
|
12
|
+
|
|
13
|
+
#include <cub/config.cuh>
|
|
14
|
+
|
|
15
|
+
#include <cuda/std/__type_traits/is_void.h>
|
|
16
|
+
|
|
17
|
+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
|
|
18
|
+
# pragma GCC system_header
|
|
19
|
+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
|
|
20
|
+
# pragma clang system_header
|
|
21
|
+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
|
|
22
|
+
# pragma system_header
|
|
23
|
+
#endif // no system header
|
|
24
|
+
|
|
25
|
+
#include <cub/agent/agent_histogram.cuh>
|
|
26
|
+
#include <cub/device/dispatch/kernels/kernel_histogram.cuh>
|
|
27
|
+
#include <cub/device/dispatch/tuning/tuning_histogram.cuh>
|
|
28
|
+
#include <cub/grid/grid_queue.cuh>
|
|
29
|
+
#include <cub/thread/thread_search.cuh>
|
|
30
|
+
#include <cub/util_debug.cuh>
|
|
31
|
+
#include <cub/util_device.cuh>
|
|
32
|
+
#include <cub/util_math.cuh>
|
|
33
|
+
#include <cub/util_temporary_storage.cuh>
|
|
34
|
+
#include <cub/util_type.cuh>
|
|
35
|
+
|
|
36
|
+
#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
|
|
37
|
+
|
|
38
|
+
#include <cuda/__cmath/ceil_div.h>
|
|
39
|
+
#include <cuda/__functional/proclaim_return_type.h>
|
|
40
|
+
#include <cuda/std/__algorithm/copy.h>
|
|
41
|
+
#include <cuda/std/__algorithm/min.h>
|
|
42
|
+
#include <cuda/std/__algorithm/transform.h>
|
|
43
|
+
#include <cuda/std/__tuple_dir/apply.h>
|
|
44
|
+
#include <cuda/std/__type_traits/conditional.h>
|
|
45
|
+
#include <cuda/std/__type_traits/is_void.h>
|
|
46
|
+
#include <cuda/std/array>
|
|
47
|
+
#include <cuda/std/limits>
|
|
48
|
+
#include <cuda/std/tuple>
|
|
49
|
+
|
|
50
|
+
#include <nv/target>
|
|
51
|
+
|
|
52
|
+
CUB_NAMESPACE_BEGIN
|
|
53
|
+
|
|
54
|
+
namespace detail::histogram
|
|
55
|
+
{
|
|
56
|
+
// Maximum number of bins per channel for which we will use a privatized smem strategy
|
|
57
|
+
static constexpr int max_privatized_smem_bins = 256;
|
|
58
|
+
|
|
59
|
+
template <int NUM_CHANNELS,
|
|
60
|
+
int NUM_ACTIVE_CHANNELS,
|
|
61
|
+
typename SampleIteratorT,
|
|
62
|
+
typename CounterT,
|
|
63
|
+
typename LevelT,
|
|
64
|
+
typename OffsetT,
|
|
65
|
+
typename SampleT>
|
|
66
|
+
struct DeviceHistogramKernelSource
|
|
67
|
+
{
|
|
68
|
+
using TransformsT = detail::histogram::Transforms<LevelT, OffsetT, SampleT>;
|
|
69
|
+
|
|
70
|
+
template <typename PolicyT>
|
|
71
|
+
_CCCL_HIDE_FROM_ABI CUB_RUNTIME_FUNCTION static constexpr auto HistogramInitKernel()
|
|
72
|
+
{
|
|
73
|
+
return &DeviceHistogramInitKernel<PolicyT, NUM_ACTIVE_CHANNELS, CounterT, OffsetT>;
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
/// Returns the default histogram sweep kernel that receives pre-initialized decode operators from the host.
|
|
77
|
+
template <typename PolicyT, int PRIVATIZED_SMEM_BINS, typename PrivatizedDecodeOpT, typename OutputDecodeOpT>
|
|
78
|
+
_CCCL_HIDE_FROM_ABI CUB_RUNTIME_FUNCTION static constexpr auto HistogramSweepKernel()
|
|
79
|
+
{
|
|
80
|
+
return &DeviceHistogramSweepKernel<
|
|
81
|
+
PolicyT,
|
|
82
|
+
PRIVATIZED_SMEM_BINS,
|
|
83
|
+
NUM_CHANNELS,
|
|
84
|
+
NUM_ACTIVE_CHANNELS,
|
|
85
|
+
SampleIteratorT,
|
|
86
|
+
CounterT,
|
|
87
|
+
PrivatizedDecodeOpT,
|
|
88
|
+
OutputDecodeOpT,
|
|
89
|
+
OffsetT>;
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
/// Returns the device-init histogram sweep kernel that initializes decode operators from level arrays in the kernel.
|
|
93
|
+
template <typename PolicyT,
|
|
94
|
+
int PRIVATIZED_SMEM_BINS,
|
|
95
|
+
typename FirstLevelArrayT,
|
|
96
|
+
typename SecondLevelArrayT,
|
|
97
|
+
bool IsEven,
|
|
98
|
+
bool IsByteSample>
|
|
99
|
+
_CCCL_HIDE_FROM_ABI CUB_RUNTIME_FUNCTION static constexpr auto HistogramSweepKernelDeviceInit()
|
|
100
|
+
{
|
|
101
|
+
// For DispatchEven, we use the scale transform to convert samples to
|
|
102
|
+
// privatized bins and pass-thru transform to convert privatized bins to
|
|
103
|
+
// output bins, vice versa for byte samples.
|
|
104
|
+
|
|
105
|
+
// For DispatchRange, we use the search transform to convert samples to
|
|
106
|
+
// privatized bins and scale transform to convert privatized bins to output bins,
|
|
107
|
+
// vice versa for byte samples.
|
|
108
|
+
|
|
109
|
+
using DecodeOpT = ::cuda::std::conditional_t<IsEven,
|
|
110
|
+
typename TransformsT::ScaleTransform,
|
|
111
|
+
typename TransformsT::template SearchTransform<const LevelT*>>;
|
|
112
|
+
|
|
113
|
+
using PrivatizedDecodeOpT =
|
|
114
|
+
::cuda::std::conditional_t<IsByteSample, typename TransformsT::PassThruTransform, DecodeOpT>;
|
|
115
|
+
using OutputDecodeOpT =
|
|
116
|
+
::cuda::std::conditional_t<IsByteSample, DecodeOpT, typename TransformsT::PassThruTransform>;
|
|
117
|
+
|
|
118
|
+
return &DeviceHistogramSweepDeviceInitKernel<
|
|
119
|
+
PolicyT,
|
|
120
|
+
PRIVATIZED_SMEM_BINS,
|
|
121
|
+
NUM_CHANNELS,
|
|
122
|
+
NUM_ACTIVE_CHANNELS,
|
|
123
|
+
SampleIteratorT,
|
|
124
|
+
CounterT,
|
|
125
|
+
FirstLevelArrayT,
|
|
126
|
+
SecondLevelArrayT,
|
|
127
|
+
PrivatizedDecodeOpT,
|
|
128
|
+
OutputDecodeOpT,
|
|
129
|
+
OffsetT,
|
|
130
|
+
IsEven>;
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
CUB_RUNTIME_FUNCTION static constexpr size_t CounterSize()
|
|
134
|
+
{
|
|
135
|
+
return sizeof(CounterT);
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
template <typename NumBinsT, typename UpperLevelArrayT, typename LowerLevelArrayT>
|
|
139
|
+
CUB_RUNTIME_FUNCTION static constexpr bool MayOverflow(
|
|
140
|
+
[[maybe_unused]] NumBinsT num_bins,
|
|
141
|
+
[[maybe_unused]] const UpperLevelArrayT& upper_level,
|
|
142
|
+
[[maybe_unused]] const LowerLevelArrayT& lower_level,
|
|
143
|
+
[[maybe_unused]] int channel)
|
|
144
|
+
{
|
|
145
|
+
using CommonT = typename TransformsT::ScaleTransform::CommonT;
|
|
146
|
+
|
|
147
|
+
if constexpr (::cuda::std::is_integral_v<CommonT>)
|
|
148
|
+
{
|
|
149
|
+
using IntArithmeticT = typename TransformsT::ScaleTransform::IntArithmeticT;
|
|
150
|
+
return static_cast<IntArithmeticT>(upper_level[channel] - lower_level[channel])
|
|
151
|
+
> (::cuda::std::numeric_limits<IntArithmeticT>::max() / static_cast<IntArithmeticT>(num_bins));
|
|
152
|
+
}
|
|
153
|
+
else
|
|
154
|
+
{
|
|
155
|
+
return false;
|
|
156
|
+
}
|
|
157
|
+
}
|
|
158
|
+
};
|
|
159
|
+
|
|
160
|
+
/// Dispatch struct for histogram.
|
|
161
|
+
/// This struct is used for both the host-init and device-init paths, selected by IsDeviceInit.
|
|
162
|
+
template <int NUM_CHANNELS,
|
|
163
|
+
int NUM_ACTIVE_CHANNELS,
|
|
164
|
+
int PRIVATIZED_SMEM_BINS,
|
|
165
|
+
typename SampleIteratorT,
|
|
166
|
+
typename CounterT,
|
|
167
|
+
typename FirstLevelArrayT,
|
|
168
|
+
typename SecondLevelArrayT,
|
|
169
|
+
typename OffsetT,
|
|
170
|
+
bool IsDeviceInit,
|
|
171
|
+
bool IsEven,
|
|
172
|
+
bool IsByteSample,
|
|
173
|
+
typename MaxPolicyT,
|
|
174
|
+
typename KernelSource,
|
|
175
|
+
typename KernelLauncherFactory>
|
|
176
|
+
struct dispatch_histogram
|
|
177
|
+
{
|
|
178
|
+
void* d_temp_storage;
|
|
179
|
+
size_t& temp_storage_bytes;
|
|
180
|
+
SampleIteratorT d_samples;
|
|
181
|
+
::cuda::std::array<CounterT*, NUM_ACTIVE_CHANNELS> d_output_histograms;
|
|
182
|
+
::cuda::std::array<int, NUM_ACTIVE_CHANNELS> num_privatized_levels;
|
|
183
|
+
::cuda::std::array<int, NUM_ACTIVE_CHANNELS> num_output_levels;
|
|
184
|
+
|
|
185
|
+
// - For host-init (IsDeviceInit=false): FirstLevelArrayT = array of output decode ops,
|
|
186
|
+
// SecondLevelArrayT = array of privatized decode ops
|
|
187
|
+
// - For device-init (IsDeviceInit=true): FirstLevelArrayT = upper level array (Even) or num_output_levels (Range),
|
|
188
|
+
// SecondLevelArrayT = lower level array (Even) or d_levels (Range)
|
|
189
|
+
FirstLevelArrayT first_level_array;
|
|
190
|
+
SecondLevelArrayT second_level_array;
|
|
191
|
+
int max_num_output_bins;
|
|
192
|
+
OffsetT num_row_pixels;
|
|
193
|
+
OffsetT num_rows;
|
|
194
|
+
OffsetT row_stride_samples;
|
|
195
|
+
cudaStream_t stream;
|
|
196
|
+
KernelSource kernel_source;
|
|
197
|
+
KernelLauncherFactory launcher_factory;
|
|
198
|
+
|
|
199
|
+
template <typename ActivePolicyT, typename DeviceHistogramInitKernelT, typename DeviceHistogramSweepKernelT>
|
|
200
|
+
CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t
|
|
201
|
+
Invoke(DeviceHistogramInitKernelT histogram_init_kernel,
|
|
202
|
+
DeviceHistogramSweepKernelT histogram_sweep_kernel,
|
|
203
|
+
ActivePolicyT policy = {})
|
|
204
|
+
{
|
|
205
|
+
cudaError error = cudaSuccess;
|
|
206
|
+
|
|
207
|
+
auto wrapped_policy = detail::histogram::MakeHistogramPolicyWrapper(policy);
|
|
208
|
+
|
|
209
|
+
const int block_threads = wrapped_policy.BlockThreads();
|
|
210
|
+
const int pixels_per_thread = wrapped_policy.PixelsPerThread();
|
|
211
|
+
|
|
212
|
+
do
|
|
213
|
+
{
|
|
214
|
+
// Get SM count
|
|
215
|
+
int sm_count;
|
|
216
|
+
error = CubDebug(launcher_factory.MultiProcessorCount(sm_count));
|
|
217
|
+
|
|
218
|
+
if (cudaSuccess != error)
|
|
219
|
+
{
|
|
220
|
+
break;
|
|
221
|
+
}
|
|
222
|
+
|
|
223
|
+
// Get SM occupancy for histogram_sweep_kernel
|
|
224
|
+
int histogram_sweep_sm_occupancy;
|
|
225
|
+
error =
|
|
226
|
+
CubDebug(launcher_factory.MaxSmOccupancy(histogram_sweep_sm_occupancy, histogram_sweep_kernel, block_threads));
|
|
227
|
+
if (cudaSuccess != error)
|
|
228
|
+
{
|
|
229
|
+
break;
|
|
230
|
+
}
|
|
231
|
+
|
|
232
|
+
// Get device occupancy for histogram_sweep_kernel
|
|
233
|
+
int histogram_sweep_occupancy = histogram_sweep_sm_occupancy * sm_count;
|
|
234
|
+
|
|
235
|
+
if (num_row_pixels * NUM_CHANNELS == row_stride_samples)
|
|
236
|
+
{
|
|
237
|
+
// Treat as a single linear array of samples
|
|
238
|
+
num_row_pixels *= num_rows;
|
|
239
|
+
num_rows = 1;
|
|
240
|
+
row_stride_samples = num_row_pixels * NUM_CHANNELS;
|
|
241
|
+
}
|
|
242
|
+
|
|
243
|
+
// Get grid dimensions, trying to keep total blocks ~histogram_sweep_occupancy
|
|
244
|
+
int pixels_per_tile = block_threads * pixels_per_thread;
|
|
245
|
+
int tiles_per_row = static_cast<int>(::cuda::ceil_div(num_row_pixels, pixels_per_tile));
|
|
246
|
+
int blocks_per_row = ::cuda::std::min(histogram_sweep_occupancy, tiles_per_row);
|
|
247
|
+
int blocks_per_col =
|
|
248
|
+
(blocks_per_row > 0)
|
|
249
|
+
? int(::cuda::std::min(static_cast<OffsetT>(histogram_sweep_occupancy / blocks_per_row), num_rows))
|
|
250
|
+
: 0;
|
|
251
|
+
int num_thread_blocks = blocks_per_row * blocks_per_col;
|
|
252
|
+
|
|
253
|
+
dim3 sweep_grid_dims;
|
|
254
|
+
sweep_grid_dims.x = (unsigned int) blocks_per_row;
|
|
255
|
+
sweep_grid_dims.y = (unsigned int) blocks_per_col;
|
|
256
|
+
sweep_grid_dims.z = 1;
|
|
257
|
+
|
|
258
|
+
// Temporary storage allocation requirements
|
|
259
|
+
constexpr int NUM_ALLOCATIONS = NUM_ACTIVE_CHANNELS + 1;
|
|
260
|
+
void* allocations[NUM_ALLOCATIONS] = {};
|
|
261
|
+
size_t allocation_sizes[NUM_ALLOCATIONS];
|
|
262
|
+
|
|
263
|
+
for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
|
|
264
|
+
{
|
|
265
|
+
allocation_sizes[CHANNEL] =
|
|
266
|
+
size_t(num_thread_blocks) * (num_privatized_levels[CHANNEL] - 1) * kernel_source.CounterSize();
|
|
267
|
+
}
|
|
268
|
+
|
|
269
|
+
allocation_sizes[NUM_ALLOCATIONS - 1] = GridQueue<int>::AllocationSize();
|
|
270
|
+
|
|
271
|
+
// Alias the temporary allocations from the single storage blob (or compute the
|
|
272
|
+
// necessary size of the blob)
|
|
273
|
+
error = CubDebug(detail::alias_temporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes));
|
|
274
|
+
if (cudaSuccess != error)
|
|
275
|
+
{
|
|
276
|
+
break;
|
|
277
|
+
}
|
|
278
|
+
|
|
279
|
+
if (d_temp_storage == nullptr)
|
|
280
|
+
{
|
|
281
|
+
// Return if the caller is simply requesting the size of the storage allocation
|
|
282
|
+
break;
|
|
283
|
+
}
|
|
284
|
+
|
|
285
|
+
// Construct the grid queue descriptor
|
|
286
|
+
GridQueue<int> tile_queue(allocations[NUM_ALLOCATIONS - 1]);
|
|
287
|
+
|
|
288
|
+
// Wrap arrays so we can pass them by-value to the kernel
|
|
289
|
+
::cuda::std::array<CounterT*, NUM_ACTIVE_CHANNELS> d_privatized_histograms_wrapper;
|
|
290
|
+
::cuda::std::array<int, NUM_ACTIVE_CHANNELS> num_privatized_bins_wrapper;
|
|
291
|
+
::cuda::std::array<int, NUM_ACTIVE_CHANNELS> num_output_bins_wrapper;
|
|
292
|
+
|
|
293
|
+
auto* typedAllocations = reinterpret_cast<CounterT**>(allocations);
|
|
294
|
+
::cuda::std::copy(
|
|
295
|
+
typedAllocations, typedAllocations + NUM_ACTIVE_CHANNELS, d_privatized_histograms_wrapper.begin());
|
|
296
|
+
|
|
297
|
+
auto minus_one = ::cuda::proclaim_return_type<int>([](int levels) {
|
|
298
|
+
return levels - 1;
|
|
299
|
+
});
|
|
300
|
+
::cuda::std::transform(
|
|
301
|
+
num_privatized_levels.begin(), num_privatized_levels.end(), num_privatized_bins_wrapper.begin(), minus_one);
|
|
302
|
+
::cuda::std::transform(
|
|
303
|
+
num_output_levels.begin(), num_output_levels.end(), num_output_bins_wrapper.begin(), minus_one);
|
|
304
|
+
int histogram_init_block_threads = 256;
|
|
305
|
+
|
|
306
|
+
int histogram_init_grid_dims =
|
|
307
|
+
(max_num_output_bins + histogram_init_block_threads - 1) / histogram_init_block_threads;
|
|
308
|
+
|
|
309
|
+
// Log DeviceHistogramInitKernel configuration
|
|
310
|
+
#ifdef CUB_DEBUG_LOG
|
|
311
|
+
_CubLog("Invoking DeviceHistogramInitKernel<<<%d, %d, 0, %lld>>>()\n",
|
|
312
|
+
histogram_init_grid_dims,
|
|
313
|
+
histogram_init_block_threads,
|
|
314
|
+
(long long) stream);
|
|
315
|
+
#endif // CUB_DEBUG_LOG
|
|
316
|
+
|
|
317
|
+
// Invoke histogram_init_kernel
|
|
318
|
+
launcher_factory(histogram_init_grid_dims, histogram_init_block_threads, 0, stream, true)
|
|
319
|
+
.doit(histogram_init_kernel, num_output_bins_wrapper, d_output_histograms, tile_queue);
|
|
320
|
+
|
|
321
|
+
// Return if empty problem
|
|
322
|
+
if ((blocks_per_row == 0) || (blocks_per_col == 0))
|
|
323
|
+
{
|
|
324
|
+
break;
|
|
325
|
+
}
|
|
326
|
+
|
|
327
|
+
// Log histogram_sweep_kernel configuration
|
|
328
|
+
#ifdef CUB_DEBUG_LOG
|
|
329
|
+
_CubLog("Invoking histogram_sweep_kernel<<<{%d, %d, %d}, %d, 0, %lld>>>(), %d pixels "
|
|
330
|
+
"per thread, %d SM occupancy\n",
|
|
331
|
+
sweep_grid_dims.x,
|
|
332
|
+
sweep_grid_dims.y,
|
|
333
|
+
sweep_grid_dims.z,
|
|
334
|
+
block_threads,
|
|
335
|
+
(long long) stream,
|
|
336
|
+
pixels_per_thread,
|
|
337
|
+
histogram_sweep_sm_occupancy);
|
|
338
|
+
#endif // CUB_DEBUG_LOG
|
|
339
|
+
|
|
340
|
+
launcher_factory(sweep_grid_dims, block_threads, 0, stream, true)
|
|
341
|
+
.doit(histogram_sweep_kernel,
|
|
342
|
+
d_samples,
|
|
343
|
+
num_output_bins_wrapper,
|
|
344
|
+
num_privatized_bins_wrapper,
|
|
345
|
+
d_output_histograms,
|
|
346
|
+
d_privatized_histograms_wrapper,
|
|
347
|
+
first_level_array,
|
|
348
|
+
second_level_array,
|
|
349
|
+
num_row_pixels,
|
|
350
|
+
num_rows,
|
|
351
|
+
row_stride_samples,
|
|
352
|
+
tiles_per_row,
|
|
353
|
+
tile_queue);
|
|
354
|
+
|
|
355
|
+
// Check for failure to launch
|
|
356
|
+
error = CubDebug(cudaPeekAtLastError());
|
|
357
|
+
if (cudaSuccess != error)
|
|
358
|
+
{
|
|
359
|
+
break;
|
|
360
|
+
}
|
|
361
|
+
|
|
362
|
+
// Sync the stream if specified to flush runtime errors
|
|
363
|
+
error = CubDebug(detail::DebugSyncStream(stream));
|
|
364
|
+
if (cudaSuccess != error)
|
|
365
|
+
{
|
|
366
|
+
break;
|
|
367
|
+
}
|
|
368
|
+
} while (0);
|
|
369
|
+
|
|
370
|
+
return error;
|
|
371
|
+
}
|
|
372
|
+
|
|
373
|
+
template <typename ActivePolicyT>
|
|
374
|
+
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke(ActivePolicyT active_policy = {})
|
|
375
|
+
{
|
|
376
|
+
if constexpr (IsDeviceInit)
|
|
377
|
+
{
|
|
378
|
+
// Device-init path: kernel initializes decode operators from level arrays
|
|
379
|
+
return Invoke<ActivePolicyT>(
|
|
380
|
+
kernel_source.template HistogramInitKernel<MaxPolicyT>(),
|
|
381
|
+
kernel_source.template HistogramSweepKernelDeviceInit<
|
|
382
|
+
MaxPolicyT,
|
|
383
|
+
PRIVATIZED_SMEM_BINS,
|
|
384
|
+
FirstLevelArrayT,
|
|
385
|
+
SecondLevelArrayT,
|
|
386
|
+
IsEven,
|
|
387
|
+
IsByteSample>(),
|
|
388
|
+
active_policy);
|
|
389
|
+
}
|
|
390
|
+
else
|
|
391
|
+
{
|
|
392
|
+
// Host-init path: decode operators are pre-initialized and passed as arrays
|
|
393
|
+
// FirstLevelArrayT is array<OutputDecodeOpT, N>, SecondLevelArrayT is array<PrivatizedDecodeOpT, N>
|
|
394
|
+
using OutputDecodeOpT = typename FirstLevelArrayT::value_type;
|
|
395
|
+
using PrivatizedDecodeOpT = typename SecondLevelArrayT::value_type;
|
|
396
|
+
return Invoke<ActivePolicyT>(
|
|
397
|
+
kernel_source.template HistogramInitKernel<MaxPolicyT>(),
|
|
398
|
+
kernel_source
|
|
399
|
+
.template HistogramSweepKernel<MaxPolicyT, PRIVATIZED_SMEM_BINS, PrivatizedDecodeOpT, OutputDecodeOpT>(),
|
|
400
|
+
active_policy);
|
|
401
|
+
}
|
|
402
|
+
}
|
|
403
|
+
};
|
|
404
|
+
} // namespace detail::histogram
|
|
405
|
+
|
|
406
|
+
/******************************************************************************
|
|
407
|
+
* Dispatch
|
|
408
|
+
******************************************************************************/
|
|
409
|
+
|
|
410
|
+
/**
|
|
411
|
+
* Utility class for dispatching the appropriately-tuned kernels for DeviceHistogram
|
|
412
|
+
*
|
|
413
|
+
* @tparam NUM_CHANNELS
|
|
414
|
+
* Number of channels interleaved in the input data (may be greater than the number of channels
|
|
415
|
+
* being actively histogrammed)
|
|
416
|
+
*
|
|
417
|
+
* @tparam NUM_ACTIVE_CHANNELS
|
|
418
|
+
* Number of channels actively being histogrammed
|
|
419
|
+
*
|
|
420
|
+
* @tparam SampleIteratorT
|
|
421
|
+
* Random-access input iterator type for reading input items @iterator
|
|
422
|
+
*
|
|
423
|
+
* @tparam CounterT
|
|
424
|
+
* Integer type for counting sample occurrences per histogram bin
|
|
425
|
+
*
|
|
426
|
+
* @tparam LevelT
|
|
427
|
+
* Type for specifying bin level boundaries
|
|
428
|
+
*
|
|
429
|
+
* @tparam OffsetT
|
|
430
|
+
* Signed integer type for global offsets
|
|
431
|
+
*
|
|
432
|
+
* @tparam PolicyHub
|
|
433
|
+
* Implementation detail, do not specify directly, requirements on the
|
|
434
|
+
* content of this type are subject to breaking change.
|
|
435
|
+
*/
|
|
436
|
+
template <
|
|
437
|
+
int NUM_CHANNELS,
|
|
438
|
+
int NUM_ACTIVE_CHANNELS,
|
|
439
|
+
typename SampleIteratorT,
|
|
440
|
+
typename CounterT,
|
|
441
|
+
typename LevelT,
|
|
442
|
+
typename OffsetT,
|
|
443
|
+
typename PolicyHub = void, // if user passes a custom Policy this should not be void
|
|
444
|
+
typename SampleT = cub::detail::it_value_t<SampleIteratorT>, /// The sample value type of the input iterator
|
|
445
|
+
typename KernelSource = detail::histogram::
|
|
446
|
+
DeviceHistogramKernelSource<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT, SampleT>,
|
|
447
|
+
typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY>
|
|
448
|
+
struct DispatchHistogram
|
|
449
|
+
{
|
|
450
|
+
static_assert(NUM_CHANNELS <= 4, "Histograms only support up to 4 channels");
|
|
451
|
+
static_assert(NUM_ACTIVE_CHANNELS <= NUM_CHANNELS,
|
|
452
|
+
"Active channels must be at most the number of total channels of the input samples");
|
|
453
|
+
|
|
454
|
+
public:
|
|
455
|
+
//---------------------------------------------------------------------
|
|
456
|
+
// Dispatch entrypoints
|
|
457
|
+
//---------------------------------------------------------------------
|
|
458
|
+
|
|
459
|
+
//---------------------------------------------------------------------
|
|
460
|
+
// Default (host-init) dispatch entrypoints
|
|
461
|
+
// These methods initialize decode operators on the host before kernel launch.
|
|
462
|
+
//---------------------------------------------------------------------
|
|
463
|
+
|
|
464
|
+
/**
|
|
465
|
+
* Dispatch routine for HistogramRange with host-side decode operator initialization,
|
|
466
|
+
* specialized for sample types larger than 8-bit.
|
|
467
|
+
* This variant initializes the decode operators on the host before kernel launch.
|
|
468
|
+
*
|
|
469
|
+
* @param d_temp_storage
|
|
470
|
+
* Device-accessible allocation of temporary storage.
|
|
471
|
+
* When nullptr, the required allocation size is written to `temp_storage_bytes` and
|
|
472
|
+
* no work is done.
|
|
473
|
+
*
|
|
474
|
+
* @param temp_storage_bytes
|
|
475
|
+
* Reference to size in bytes of `d_temp_storage` allocation
|
|
476
|
+
*
|
|
477
|
+
* @param d_samples
|
|
478
|
+
* The pointer to the multi-channel input sequence of data samples.
|
|
479
|
+
* The samples from different channels are assumed to be interleaved
|
|
480
|
+
* (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
|
|
481
|
+
*
|
|
482
|
+
* @param d_output_histograms
|
|
483
|
+
* The pointers to the histogram counter output arrays, one for each active channel.
|
|
484
|
+
* For channel<sub><em>i</em></sub>, the allocation length of `d_output_histograms[i]` should be
|
|
485
|
+
* `num_output_levels[i] - 1`.
|
|
486
|
+
*
|
|
487
|
+
* @param num_output_levels
|
|
488
|
+
* The number of boundaries (levels) for delineating histogram samples in each active channel.
|
|
489
|
+
* Implies that the number of bins for channel<sub><em>i</em></sub> is
|
|
490
|
+
* `num_output_levels[i] - 1`.
|
|
491
|
+
*
|
|
492
|
+
* @param d_levels
|
|
493
|
+
* The pointers to the arrays of boundaries (levels), one for each active channel.
|
|
494
|
+
* Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are
|
|
495
|
+
* inclusive and upper sample value boundaries are exclusive.
|
|
496
|
+
*
|
|
497
|
+
* @param num_row_pixels
|
|
498
|
+
* The number of multi-channel pixels per row in the region of interest
|
|
499
|
+
*
|
|
500
|
+
* @param num_rows
|
|
501
|
+
* The number of rows in the region of interest
|
|
502
|
+
*
|
|
503
|
+
* @param row_stride_samples
|
|
504
|
+
* The number of samples between starts of consecutive rows in the region of interest
|
|
505
|
+
*
|
|
506
|
+
* @param stream
|
|
507
|
+
* CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
|
|
508
|
+
*/
|
|
509
|
+
template <typename MaxPolicyT = typename ::cuda::std::_If<
            ::cuda::std::is_void_v<PolicyHub>,
            /* fallback_policy_hub */
            detail::histogram::policy_hub<SampleT, CounterT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, /* isEven */ 0>,
            PolicyHub>::MaxPolicy>
CUB_RUNTIME_FUNCTION static cudaError_t DispatchRange(
  void* d_temp_storage,
  size_t& temp_storage_bytes,
  SampleIteratorT d_samples,
  ::cuda::std::array<CounterT*, NUM_ACTIVE_CHANNELS> d_output_histograms,
  ::cuda::std::array<int, NUM_ACTIVE_CHANNELS> num_output_levels,
  ::cuda::std::array<const LevelT*, NUM_ACTIVE_CHANNELS> d_levels,
  OffsetT num_row_pixels,
  OffsetT num_rows,
  OffsetT row_stride_samples,
  cudaStream_t stream,
  ::cuda::std::false_type /*is_byte_sample*/,
  KernelSource kernel_source             = {},
  KernelLauncherFactory launcher_factory = {},
  MaxPolicyT max_policy                  = {})
{
  // The PTX version is queried first and handed to the policy invocation below.
  int ptx_version               = 0;
  const cudaError_t ptx_status = CubDebug(launcher_factory.PtxVersion(ptx_version));
  if (ptx_status != cudaSuccess)
  {
    return ptx_status;
  }

  using TransformsT = detail::histogram::Transforms<LevelT, OffsetT, SampleT>;

  // Samples -> privatized bins: search transform over the user-provided level arrays.
  using PrivatizedDecodeOpT = typename TransformsT::template SearchTransform<const LevelT*>;

  // Privatized bins -> output bins: pass-thru transform.
  using OutputDecodeOpT = typename TransformsT::PassThruTransform;

  using PrivatizedOpArrayT = ::cuda::std::array<PrivatizedDecodeOpT, NUM_ACTIVE_CHANNELS>;
  using OutputOpArrayT     = ::cuda::std::array<OutputDecodeOpT, NUM_ACTIVE_CHANNELS>;

  PrivatizedOpArrayT privatized_decode_op{};
  OutputOpArrayT output_decode_op{};

  // Initialize one search transform per active channel on the host and track the
  // largest level count across channels.
  int widest_level_count = num_output_levels[0];
  for (int ch = 0; ch < NUM_ACTIVE_CHANNELS; ++ch)
  {
    privatized_decode_op[ch].Init(d_levels[ch], num_output_levels[ch]);
    if (widest_level_count < num_output_levels[ch])
    {
      widest_level_count = num_output_levels[ch];
    }
  }
  int max_num_output_bins = widest_level_count - 1;

  if (max_num_output_bins > detail::histogram::max_privatized_smem_bins)
  {
    // Too many bins to keep in shared memory: instantiate with zero shared-memory bins.
    constexpr int smem_bins = 0;

    using DispatchT = detail::histogram::dispatch_histogram<
      NUM_CHANNELS,
      NUM_ACTIVE_CHANNELS,
      smem_bins,
      SampleIteratorT,
      CounterT,
      OutputOpArrayT,
      PrivatizedOpArrayT,
      OffsetT,
      false, // IsDeviceInit
      false, // IsEven (unused for host-init)
      false, // IsByteSample (unused for host-init)
      MaxPolicyT,
      KernelSource,
      KernelLauncherFactory>;

    // The output level counts are passed for both level-count arguments.
    DispatchT dispatch{
      d_temp_storage,
      temp_storage_bytes,
      d_samples,
      d_output_histograms,
      num_output_levels,
      num_output_levels,
      output_decode_op,
      privatized_decode_op,
      max_num_output_bins,
      num_row_pixels,
      num_rows,
      row_stride_samples,
      stream,
      kernel_source,
      launcher_factory};

    return CubDebug(max_policy.Invoke(ptx_version, dispatch));
  }

  // Otherwise dispatch the shared-privatized approach.
  constexpr int smem_bins = detail::histogram::max_privatized_smem_bins;

  using DispatchT = detail::histogram::dispatch_histogram<
    NUM_CHANNELS,
    NUM_ACTIVE_CHANNELS,
    smem_bins,
    SampleIteratorT,
    CounterT,
    OutputOpArrayT,
    PrivatizedOpArrayT,
    OffsetT,
    false, // IsDeviceInit
    false, // IsEven (unused for host-init)
    false, // IsByteSample (unused for host-init)
    MaxPolicyT,
    KernelSource,
    KernelLauncherFactory>;

  DispatchT dispatch{
    d_temp_storage,
    temp_storage_bytes,
    d_samples,
    d_output_histograms,
    num_output_levels,
    num_output_levels,
    output_decode_op,
    privatized_decode_op,
    max_num_output_bins,
    num_row_pixels,
    num_rows,
    row_stride_samples,
    stream,
    kernel_source,
    launcher_factory};

  return CubDebug(max_policy.Invoke(ptx_version, dispatch));
}
|
|
655
|
+
|
|
656
|
+
/**
|
|
657
|
+
* Dispatch routine for HistogramRange with host-side decode operator initialization,
|
|
658
|
+
* specialized for 8-bit sample types
|
|
659
|
+
* (computes 256-bin privatized histograms and then reduces to user-specified levels).
|
|
660
|
+
* This variant initializes the decode operators on the host before kernel launch.
|
|
661
|
+
*
|
|
662
|
+
* @param d_temp_storage
|
|
663
|
+
* Device-accessible allocation of temporary storage.
|
|
664
|
+
* When nullptr, the required allocation size is written to `temp_storage_bytes` and
|
|
665
|
+
* no work is done.
|
|
666
|
+
*
|
|
667
|
+
* @param temp_storage_bytes
|
|
668
|
+
* Reference to size in bytes of `d_temp_storage` allocation
|
|
669
|
+
*
|
|
670
|
+
* @param d_samples
|
|
671
|
+
* The pointer to the multi-channel input sequence of data samples.
|
|
672
|
+
* The samples from different channels are assumed to be interleaved
|
|
673
|
+
* (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
|
|
674
|
+
*
|
|
675
|
+
* @param d_output_histograms
|
|
676
|
+
* The pointers to the histogram counter output arrays, one for each active channel.
|
|
677
|
+
* For channel<sub><em>i</em></sub>, the allocation length of
|
|
678
|
+
* `d_output_histograms[i]` should be `num_output_levels[i] - 1`.
|
|
679
|
+
*
|
|
680
|
+
* @param num_output_levels
|
|
681
|
+
* The number of boundaries (levels) for delineating histogram samples in each active channel.
|
|
682
|
+
* Implies that the number of bins for channel<sub><em>i</em></sub> is
|
|
683
|
+
* `num_output_levels[i] - 1`.
|
|
684
|
+
*
|
|
685
|
+
* @param d_levels
|
|
686
|
+
* The pointers to the arrays of boundaries (levels), one for each active channel.
|
|
687
|
+
* Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are
|
|
688
|
+
* inclusive and upper sample value boundaries are exclusive.
|
|
689
|
+
*
|
|
690
|
+
* @param num_row_pixels
|
|
691
|
+
* The number of multi-channel pixels per row in the region of interest
|
|
692
|
+
*
|
|
693
|
+
* @param num_rows
|
|
694
|
+
* The number of rows in the region of interest
|
|
695
|
+
*
|
|
696
|
+
* @param row_stride_samples
|
|
697
|
+
* The number of samples between starts of consecutive rows in the region of interest
|
|
698
|
+
*
|
|
699
|
+
* @param stream
|
|
700
|
+
* CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
|
|
701
|
+
*
|
|
702
|
+
*/
|
|
703
|
+
template <typename MaxPolicyT = typename ::cuda::std::_If<
            ::cuda::std::is_void_v<PolicyHub>,
            /* fallback_policy_hub */
            detail::histogram::policy_hub<SampleT, CounterT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, /* isEven */ 0>,
            PolicyHub>::MaxPolicy>
CUB_RUNTIME_FUNCTION static cudaError_t DispatchRange(
  void* d_temp_storage,
  size_t& temp_storage_bytes,
  SampleIteratorT d_samples,
  ::cuda::std::array<CounterT*, NUM_ACTIVE_CHANNELS> d_output_histograms,
  ::cuda::std::array<int, NUM_ACTIVE_CHANNELS> num_output_levels,
  ::cuda::std::array<const LevelT*, NUM_ACTIVE_CHANNELS> d_levels,
  OffsetT num_row_pixels,
  OffsetT num_rows,
  OffsetT row_stride_samples,
  cudaStream_t stream,
  ::cuda::std::true_type /*is_byte_sample*/,
  KernelSource kernel_source             = {},
  KernelLauncherFactory launcher_factory = {},
  MaxPolicyT max_policy                  = {})
{
  // The PTX version is queried first and handed to the policy invocation below.
  int ptx_version               = 0;
  const cudaError_t ptx_status = CubDebug(launcher_factory.PtxVersion(ptx_version));
  if (ptx_status != cudaSuccess)
  {
    return ptx_status;
  }

  using TransformsT = detail::histogram::Transforms<LevelT, OffsetT, SampleT>;

  // Samples -> privatized bins: pass-thru transform.
  using PrivatizedDecodeOpT = typename TransformsT::PassThruTransform;

  // Privatized bins -> output bins: search transform over the user-provided level arrays.
  using OutputDecodeOpT = typename TransformsT::template SearchTransform<const LevelT*>;

  using PrivatizedOpArrayT = ::cuda::std::array<PrivatizedDecodeOpT, NUM_ACTIVE_CHANNELS>;
  using OutputOpArrayT     = ::cuda::std::array<OutputDecodeOpT, NUM_ACTIVE_CHANNELS>;

  ::cuda::std::array<int, NUM_ACTIVE_CHANNELS> num_privatized_levels;
  PrivatizedOpArrayT privatized_decode_op{};
  OutputOpArrayT output_decode_op{};

  // Maximum number of levels in any channel.
  int widest_level_count = num_output_levels[0];
  for (int ch = 0; ch < NUM_ACTIVE_CHANNELS; ++ch)
  {
    // Every channel gets 257 privatized levels (matching the 256 privatized bins below).
    num_privatized_levels[ch] = 257;
    output_decode_op[ch].Init(d_levels[ch], num_output_levels[ch]);

    if (widest_level_count < num_output_levels[ch])
    {
      widest_level_count = num_output_levels[ch];
    }
  }
  int max_num_output_bins = widest_level_count - 1;

  // The byte-sample path always instantiates with 256 shared-memory bins.
  constexpr int smem_bins = 256;

  using DispatchT = detail::histogram::dispatch_histogram<
    NUM_CHANNELS,
    NUM_ACTIVE_CHANNELS,
    smem_bins,
    SampleIteratorT,
    CounterT,
    OutputOpArrayT,
    PrivatizedOpArrayT,
    OffsetT,
    false, // IsDeviceInit
    false, // IsEven (unused for host-init)
    false, // IsByteSample (unused for host-init)
    MaxPolicyT,
    KernelSource,
    KernelLauncherFactory>;

  DispatchT dispatch{
    d_temp_storage,
    temp_storage_bytes,
    d_samples,
    d_output_histograms,
    num_privatized_levels,
    num_output_levels,
    output_decode_op,
    privatized_decode_op,
    max_num_output_bins,
    num_row_pixels,
    num_rows,
    row_stride_samples,
    stream,
    kernel_source,
    launcher_factory};

  return CubDebug(max_policy.Invoke(ptx_version, dispatch));
}
|
|
804
|
+
|
|
805
|
+
/**
|
|
806
|
+
* Dispatch routine for HistogramEven with host-side decode operator initialization,
|
|
807
|
+
* specialized for sample types larger than 8-bit.
|
|
808
|
+
* This variant initializes the decode operators on the host before kernel launch.
|
|
809
|
+
*
|
|
810
|
+
* @param d_temp_storage
|
|
811
|
+
* Device-accessible allocation of temporary storage.
|
|
812
|
+
* When nullptr, the required allocation size is written to
|
|
813
|
+
* `temp_storage_bytes` and no work is done.
|
|
814
|
+
*
|
|
815
|
+
* @param temp_storage_bytes
|
|
816
|
+
* Reference to size in bytes of `d_temp_storage` allocation
|
|
817
|
+
*
|
|
818
|
+
* @param d_samples
|
|
819
|
+
* The pointer to the input sequence of sample items.
|
|
820
|
+
* The samples from different channels are assumed to be interleaved
|
|
821
|
+
* (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
|
|
822
|
+
*
|
|
823
|
+
* @param d_output_histograms
|
|
824
|
+
* The pointers to the histogram counter output arrays, one for each active channel.
|
|
825
|
+
* For channel<sub><em>i</em></sub>, the allocation length of `d_output_histograms[i]` should be
|
|
826
|
+
* `num_output_levels[i] - 1`.
|
|
827
|
+
*
|
|
828
|
+
* @param num_output_levels
|
|
829
|
+
* The number of bin level boundaries for delineating histogram samples in each active channel.
|
|
830
|
+
* Implies that the number of bins for channel<sub><em>i</em></sub> is
|
|
831
|
+
* `num_output_levels[i] - 1`.
|
|
832
|
+
*
|
|
833
|
+
* @param lower_level
|
|
834
|
+
* The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
|
|
835
|
+
*
|
|
836
|
+
* @param upper_level
|
|
837
|
+
* The upper sample value bound (exclusive) for the highest histogram bin in each active
|
|
838
|
+
* channel.
|
|
839
|
+
*
|
|
840
|
+
* @param num_row_pixels
|
|
841
|
+
* The number of multi-channel pixels per row in the region of interest
|
|
842
|
+
*
|
|
843
|
+
* @param num_rows
|
|
844
|
+
* The number of rows in the region of interest
|
|
845
|
+
*
|
|
846
|
+
* @param row_stride_samples
|
|
847
|
+
* The number of samples between starts of consecutive rows in the region of interest
|
|
848
|
+
*
|
|
849
|
+
* @param stream
|
|
850
|
+
* CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
|
|
851
|
+
*
|
|
852
|
+
*/
|
|
853
|
+
template <typename MaxPolicyT = typename ::cuda::std::_If<
            ::cuda::std::is_void_v<PolicyHub>,
            /* fallback_policy_hub */
            detail::histogram::policy_hub<SampleT, CounterT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, /* isEven */ 1>,
            PolicyHub>::MaxPolicy>
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t DispatchEven(
  void* d_temp_storage,
  size_t& temp_storage_bytes,
  SampleIteratorT d_samples,
  ::cuda::std::array<CounterT*, NUM_ACTIVE_CHANNELS> d_output_histograms,
  ::cuda::std::array<int, NUM_ACTIVE_CHANNELS> num_output_levels,
  ::cuda::std::array<LevelT, NUM_ACTIVE_CHANNELS> lower_level,
  ::cuda::std::array<LevelT, NUM_ACTIVE_CHANNELS> upper_level,
  OffsetT num_row_pixels,
  OffsetT num_rows,
  OffsetT row_stride_samples,
  cudaStream_t stream,
  ::cuda::std::false_type /*is_byte_sample*/,
  KernelSource kernel_source             = {},
  KernelLauncherFactory launcher_factory = {},
  MaxPolicyT max_policy                  = {})
{
  // The PTX version is queried first and handed to the policy invocation below.
  int ptx_version               = 0;
  const cudaError_t ptx_status = CubDebug(launcher_factory.PtxVersion(ptx_version));
  if (ptx_status != cudaSuccess)
  {
    return ptx_status;
  }

  using TransformsT = detail::histogram::Transforms<LevelT, OffsetT, SampleT>;

  // Samples -> privatized bins: scale transform.
  using PrivatizedDecodeOpT = typename TransformsT::ScaleTransform;

  // Privatized bins -> output bins: pass-thru transform.
  using OutputDecodeOpT = typename TransformsT::PassThruTransform;

  using CommonT = typename TransformsT::ScaleTransform::CommonT;

  using PrivatizedOpArrayT = ::cuda::std::array<PrivatizedDecodeOpT, NUM_ACTIVE_CHANNELS>;
  using OutputOpArrayT     = ::cuda::std::array<OutputDecodeOpT, NUM_ACTIVE_CHANNELS>;

  PrivatizedOpArrayT privatized_decode_op{};
  OutputOpArrayT output_decode_op{};

  int widest_level_count = num_output_levels[0];
  for (int ch = 0; ch < NUM_ACTIVE_CHANNELS; ++ch)
  {
    int num_levels = num_output_levels[ch];

    if (kernel_source.MayOverflow(static_cast<CommonT>(num_levels - 1), upper_level, lower_level, ch))
    {
      // The bin computation may overflow for this channel. Still report a reasonable
      // `temp_storage_bytes` for a size query; a subsequent algorithm invocation
      // will fail as well.
      if (d_temp_storage == nullptr)
      {
        temp_storage_bytes = 1U;
      }
      return cudaErrorInvalidValue;
    }

    privatized_decode_op[ch].Init(num_levels, upper_level[ch], lower_level[ch]);

    if (widest_level_count < num_levels)
    {
      widest_level_count = num_levels;
    }
  }
  int max_num_output_bins = widest_level_count - 1;

  if (max_num_output_bins > detail::histogram::max_privatized_smem_bins)
  {
    // Too many bins to keep in shared memory: instantiate with zero shared-memory bins.
    constexpr int smem_bins = 0;

    using DispatchT = detail::histogram::dispatch_histogram<
      NUM_CHANNELS,
      NUM_ACTIVE_CHANNELS,
      smem_bins,
      SampleIteratorT,
      CounterT,
      OutputOpArrayT,
      PrivatizedOpArrayT,
      OffsetT,
      false, // IsDeviceInit
      false, // IsEven (unused for host-init)
      false, // IsByteSample (unused for host-init)
      MaxPolicyT,
      KernelSource,
      KernelLauncherFactory>;

    // The output level counts are passed for both level-count arguments.
    DispatchT dispatch{
      d_temp_storage,
      temp_storage_bytes,
      d_samples,
      d_output_histograms,
      num_output_levels,
      num_output_levels,
      output_decode_op,
      privatized_decode_op,
      max_num_output_bins,
      num_row_pixels,
      num_rows,
      row_stride_samples,
      stream,
      kernel_source,
      launcher_factory};

    return CubDebug(max_policy.Invoke(ptx_version, dispatch));
  }

  // Otherwise dispatch the shared-privatized approach.
  constexpr int smem_bins = detail::histogram::max_privatized_smem_bins;

  using DispatchT = detail::histogram::dispatch_histogram<
    NUM_CHANNELS,
    NUM_ACTIVE_CHANNELS,
    smem_bins,
    SampleIteratorT,
    CounterT,
    OutputOpArrayT,
    PrivatizedOpArrayT,
    OffsetT,
    false, // IsDeviceInit
    false, // IsEven (unused for host-init)
    false, // IsByteSample (unused for host-init)
    MaxPolicyT,
    KernelSource,
    KernelLauncherFactory>;

  DispatchT dispatch{
    d_temp_storage,
    temp_storage_bytes,
    d_samples,
    d_output_histograms,
    num_output_levels,
    num_output_levels,
    output_decode_op,
    privatized_decode_op,
    max_num_output_bins,
    num_row_pixels,
    num_rows,
    row_stride_samples,
    stream,
    kernel_source,
    launcher_factory};

  return CubDebug(max_policy.Invoke(ptx_version, dispatch));
}
|
|
1015
|
+
|
|
1016
|
+
/**
|
|
1017
|
+
* Dispatch routine for HistogramEven with host-side decode operator initialization,
|
|
1018
|
+
* specialized for 8-bit sample types
|
|
1019
|
+
* (computes 256-bin privatized histograms and then reduces to user-specified levels).
|
|
1020
|
+
* This variant initializes the decode operators on the host before kernel launch.
|
|
1021
|
+
*
|
|
1022
|
+
* @param d_temp_storage
|
|
1023
|
+
* Device-accessible allocation of temporary storage.
|
|
1024
|
+
* When nullptr, the required allocation size is written to `temp_storage_bytes` and
|
|
1025
|
+
* no work is done.
|
|
1026
|
+
*
|
|
1027
|
+
* @param temp_storage_bytes
|
|
1028
|
+
* Reference to size in bytes of `d_temp_storage` allocation
|
|
1029
|
+
*
|
|
1030
|
+
* @param d_samples
|
|
1031
|
+
* The pointer to the input sequence of sample items. The samples from different channels are
|
|
1032
|
+
* assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of
|
|
1033
|
+
* four RGBA 8-bit samples).
|
|
1034
|
+
*
|
|
1035
|
+
* @param d_output_histograms
|
|
1036
|
+
* The pointers to the histogram counter output arrays, one for each active channel.
|
|
1037
|
+
* For channel<sub><em>i</em></sub>, the allocation length of `d_output_histograms[i]` should be
|
|
1038
|
+
* `num_output_levels[i] - 1`.
|
|
1039
|
+
*
|
|
1040
|
+
* @param num_output_levels
|
|
1041
|
+
* The number of bin level boundaries for delineating histogram samples in each active channel.
|
|
1042
|
+
* Implies that the number of bins for channel<sub><em>i</em></sub> is
|
|
1043
|
+
* `num_output_levels[i] - 1`.
|
|
1044
|
+
*
|
|
1045
|
+
* @param lower_level
|
|
1046
|
+
* The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
|
|
1047
|
+
*
|
|
1048
|
+
* @param upper_level
|
|
1049
|
+
* The upper sample value bound (exclusive) for the highest histogram bin in each active
|
|
1050
|
+
* channel.
|
|
1051
|
+
*
|
|
1052
|
+
* @param num_row_pixels
|
|
1053
|
+
* The number of multi-channel pixels per row in the region of interest
|
|
1054
|
+
*
|
|
1055
|
+
* @param num_rows
|
|
1056
|
+
* The number of rows in the region of interest
|
|
1057
|
+
*
|
|
1058
|
+
* @param row_stride_samples
|
|
1059
|
+
* The number of samples between starts of consecutive rows in the region of interest
|
|
1060
|
+
*
|
|
1061
|
+
* @param stream
|
|
1062
|
+
* CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
|
|
1063
|
+
*
|
|
1064
|
+
*/
|
|
1065
|
+
template <typename MaxPolicyT = typename ::cuda::std::_If<
            ::cuda::std::is_void_v<PolicyHub>,
            /* fallback_policy_hub */
            detail::histogram::policy_hub<SampleT, CounterT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, /* isEven */ 1>,
            PolicyHub>::MaxPolicy>
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t DispatchEven(
  void* d_temp_storage,
  size_t& temp_storage_bytes,
  SampleIteratorT d_samples,
  ::cuda::std::array<CounterT*, NUM_ACTIVE_CHANNELS> d_output_histograms,
  ::cuda::std::array<int, NUM_ACTIVE_CHANNELS> num_output_levels,
  ::cuda::std::array<LevelT, NUM_ACTIVE_CHANNELS> lower_level,
  ::cuda::std::array<LevelT, NUM_ACTIVE_CHANNELS> upper_level,
  OffsetT num_row_pixels,
  OffsetT num_rows,
  OffsetT row_stride_samples,
  cudaStream_t stream,
  ::cuda::std::true_type /*is_byte_sample*/,
  KernelSource kernel_source             = {},
  KernelLauncherFactory launcher_factory = {},
  MaxPolicyT max_policy                  = {})
{
  // The PTX version is queried first and handed to the policy invocation below.
  int ptx_version               = 0;
  const cudaError_t ptx_status = CubDebug(launcher_factory.PtxVersion(ptx_version));
  if (ptx_status != cudaSuccess)
  {
    return ptx_status;
  }

  using TransformsT = detail::histogram::Transforms<LevelT, OffsetT, SampleT>;

  // Samples -> privatized bins: pass-thru transform.
  using PrivatizedDecodeOpT = typename TransformsT::PassThruTransform;

  // Privatized bins -> output bins: scale transform.
  using OutputDecodeOpT = typename TransformsT::ScaleTransform;

  using CommonT = typename TransformsT::ScaleTransform::CommonT;

  using PrivatizedOpArrayT = ::cuda::std::array<PrivatizedDecodeOpT, NUM_ACTIVE_CHANNELS>;
  using OutputOpArrayT     = ::cuda::std::array<OutputDecodeOpT, NUM_ACTIVE_CHANNELS>;

  ::cuda::std::array<int, NUM_ACTIVE_CHANNELS> num_privatized_levels;
  PrivatizedOpArrayT privatized_decode_op{};
  OutputOpArrayT output_decode_op{};

  int widest_level_count = num_output_levels[0];
  for (int ch = 0; ch < NUM_ACTIVE_CHANNELS; ++ch)
  {
    // Every channel gets 257 privatized levels (matching the 256 privatized bins below).
    num_privatized_levels[ch] = 257;

    int num_levels = num_output_levels[ch];

    if (kernel_source.MayOverflow(static_cast<CommonT>(num_levels - 1), upper_level, lower_level, ch))
    {
      // The bin computation may overflow for this channel. Still report a reasonable
      // `temp_storage_bytes` for a size query; a subsequent algorithm invocation
      // will fail as well.
      if (d_temp_storage == nullptr)
      {
        temp_storage_bytes = 1U;
      }
      return cudaErrorInvalidValue;
    }

    output_decode_op[ch].Init(num_levels, upper_level[ch], lower_level[ch]);

    if (widest_level_count < num_levels)
    {
      widest_level_count = num_levels;
    }
  }
  int max_num_output_bins = widest_level_count - 1;

  // The byte-sample path always instantiates with 256 shared-memory bins.
  constexpr int smem_bins = 256;

  using DispatchT = detail::histogram::dispatch_histogram<
    NUM_CHANNELS,
    NUM_ACTIVE_CHANNELS,
    smem_bins,
    SampleIteratorT,
    CounterT,
    OutputOpArrayT,
    PrivatizedOpArrayT,
    OffsetT,
    false, // IsDeviceInit
    false, // IsEven (unused for host-init)
    false, // IsByteSample (unused for host-init)
    MaxPolicyT,
    KernelSource,
    KernelLauncherFactory>;

  DispatchT dispatch{
    d_temp_storage,
    temp_storage_bytes,
    d_samples,
    d_output_histograms,
    num_privatized_levels,
    num_output_levels,
    output_decode_op,
    privatized_decode_op,
    max_num_output_bins,
    num_row_pixels,
    num_rows,
    row_stride_samples,
    stream,
    kernel_source,
    launcher_factory};

  return CubDebug(max_policy.Invoke(ptx_version, dispatch));
}
|
|
1183
|
+
|
|
1184
|
+
// Dispatch routines for device-side decode operator initialization. These
// differ from the default dispatch routines in that they initialize the
// decode operators inside the kernel from the level arrays, instead of
// initializing them on the host; they are otherwise the same. This is
// needed for c.parallel, since we cannot instantiate the Transforms class on
// the host, as SampleT and LevelT are type-erased. Another required change is
// that the level-array parameters are now template types instead of concrete
// ::cuda::std::array types, since we are passing indirect_args from
// c.parallel.
//
// Initializing the decode operators inside the kernel results in some
// regressions (and some performance improvements) in the benchmark, which
// indicates that we need to re-tune the algorithm. This is why we kept the
// two dispatch paths (host init and device init) separate. We should think
// about merging them back together later on.
|
|
1199
|
+
|
|
1200
|
+
/**
|
|
1201
|
+
* Dispatch routine for HistogramRange with device-side decode operator initialization,
|
|
1202
|
+
* specialized for sample types larger than 8bit.
|
|
1203
|
+
* This variant initializes the decode operators inside the kernel from level arrays.
|
|
1204
|
+
*
|
|
1205
|
+
* @param d_temp_storage
|
|
1206
|
+
* Device-accessible allocation of temporary storage.
|
|
1207
|
+
* When nullptr, the required allocation size is written to `temp_storage_bytes` and
|
|
1208
|
+
* no work is done.
|
|
1209
|
+
*
|
|
1210
|
+
* @param temp_storage_bytes
|
|
1211
|
+
* Reference to size in bytes of `d_temp_storage` allocation
|
|
1212
|
+
*
|
|
1213
|
+
* @param d_samples
|
|
1214
|
+
* The pointer to the multi-channel input sequence of data samples.
|
|
1215
|
+
* The samples from different channels are assumed to be interleaved
|
|
1216
|
+
* (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
|
|
1217
|
+
*
|
|
1218
|
+
* @param d_output_histograms
|
|
1219
|
+
* The pointers to the histogram counter output arrays, one for each active channel.
|
|
1220
|
+
* For channel<sub><em>i</em></sub>, the allocation length of `d_histograms[i]` should be
|
|
1221
|
+
* `num_output_levels[i] - 1`.
|
|
1222
|
+
*
|
|
1223
|
+
* @param num_output_levels
|
|
1224
|
+
* The number of boundaries (levels) for delineating histogram samples in each active channel.
|
|
1225
|
+
* Implies that the number of bins for channel<sub><em>i</em></sub> is
|
|
1226
|
+
* `num_output_levels[i] - 1`.
|
|
1227
|
+
*
|
|
1228
|
+
* @param d_levels
|
|
1229
|
+
* The pointers to the arrays of boundaries (levels), one for each active channel.
|
|
1230
|
+
* Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are
|
|
1231
|
+
* inclusive and upper sample value boundaries are exclusive.
|
|
1232
|
+
*
|
|
1233
|
+
* @param num_row_pixels
|
|
1234
|
+
* The number of multi-channel pixels per row in the region of interest
|
|
1235
|
+
*
|
|
1236
|
+
* @param num_rows
|
|
1237
|
+
* The number of rows in the region of interest
|
|
1238
|
+
*
|
|
1239
|
+
* @param row_stride_samples
|
|
1240
|
+
* The number of samples between starts of consecutive rows in the region of interest
|
|
1241
|
+
*
|
|
1242
|
+
* @param stream
|
|
1243
|
+
* CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
|
|
1244
|
+
*/
|
|
1245
|
+
template <typename MaxPolicyT = typename ::cuda::std::conditional_t<
|
|
1246
|
+
::cuda::std::is_void_v<PolicyHub>,
|
|
1247
|
+
/* fallback_policy_hub */
|
|
1248
|
+
detail::histogram::policy_hub<SampleT, CounterT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, /* isEven */ 0>,
|
|
1249
|
+
PolicyHub>::MaxPolicy,
|
|
1250
|
+
typename NumOutputLevelsArrayT = ::cuda::std::array<int, NUM_ACTIVE_CHANNELS>,
|
|
1251
|
+
typename LevelsArrayT = ::cuda::std::array<const LevelT*, NUM_ACTIVE_CHANNELS>>
|
|
1252
|
+
CUB_RUNTIME_FUNCTION static cudaError_t __dispatch_range_device_init(
|
|
1253
|
+
void* d_temp_storage,
|
|
1254
|
+
size_t& temp_storage_bytes,
|
|
1255
|
+
SampleIteratorT d_samples,
|
|
1256
|
+
::cuda::std::array<CounterT*, NUM_ACTIVE_CHANNELS> d_output_histograms,
|
|
1257
|
+
NumOutputLevelsArrayT num_output_levels,
|
|
1258
|
+
LevelsArrayT d_levels,
|
|
1259
|
+
OffsetT num_row_pixels,
|
|
1260
|
+
OffsetT num_rows,
|
|
1261
|
+
OffsetT row_stride_samples,
|
|
1262
|
+
cudaStream_t stream,
|
|
1263
|
+
::cuda::std::false_type /*is_byte_sample*/,
|
|
1264
|
+
KernelSource kernel_source = {},
|
|
1265
|
+
KernelLauncherFactory launcher_factory = {},
|
|
1266
|
+
MaxPolicyT max_policy = {})
|
|
1267
|
+
{
|
|
1268
|
+
// Get PTX version
|
|
1269
|
+
int ptx_version = 0;
|
|
1270
|
+
if (const auto error = CubDebug(launcher_factory.PtxVersion(ptx_version)))
|
|
1271
|
+
{
|
|
1272
|
+
return error;
|
|
1273
|
+
}
|
|
1274
|
+
|
|
1275
|
+
int max_levels = num_output_levels[0];
|
|
1276
|
+
|
|
1277
|
+
for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
|
|
1278
|
+
{
|
|
1279
|
+
if (num_output_levels[channel] > max_levels)
|
|
1280
|
+
{
|
|
1281
|
+
max_levels = num_output_levels[channel];
|
|
1282
|
+
}
|
|
1283
|
+
}
|
|
1284
|
+
int max_num_output_bins = max_levels - 1;
|
|
1285
|
+
|
|
1286
|
+
// Dispatch
|
|
1287
|
+
if (max_num_output_bins > detail::histogram::max_privatized_smem_bins)
|
|
1288
|
+
{
|
|
1289
|
+
// Too many bins to keep in shared memory.
|
|
1290
|
+
constexpr int PRIVATIZED_SMEM_BINS = 0;
|
|
1291
|
+
|
|
1292
|
+
detail::histogram::dispatch_histogram<
|
|
1293
|
+
NUM_CHANNELS,
|
|
1294
|
+
NUM_ACTIVE_CHANNELS,
|
|
1295
|
+
PRIVATIZED_SMEM_BINS,
|
|
1296
|
+
SampleIteratorT,
|
|
1297
|
+
CounterT,
|
|
1298
|
+
NumOutputLevelsArrayT,
|
|
1299
|
+
LevelsArrayT,
|
|
1300
|
+
OffsetT,
|
|
1301
|
+
true, // IsDeviceInit
|
|
1302
|
+
false, // IsEven
|
|
1303
|
+
false, // IsByteSample
|
|
1304
|
+
MaxPolicyT,
|
|
1305
|
+
KernelSource,
|
|
1306
|
+
KernelLauncherFactory>
|
|
1307
|
+
dispatch{
|
|
1308
|
+
d_temp_storage,
|
|
1309
|
+
temp_storage_bytes,
|
|
1310
|
+
d_samples,
|
|
1311
|
+
d_output_histograms,
|
|
1312
|
+
num_output_levels,
|
|
1313
|
+
num_output_levels,
|
|
1314
|
+
num_output_levels,
|
|
1315
|
+
d_levels,
|
|
1316
|
+
max_num_output_bins,
|
|
1317
|
+
num_row_pixels,
|
|
1318
|
+
num_rows,
|
|
1319
|
+
row_stride_samples,
|
|
1320
|
+
stream,
|
|
1321
|
+
kernel_source,
|
|
1322
|
+
launcher_factory};
|
|
1323
|
+
|
|
1324
|
+
if (const auto error = CubDebug(max_policy.Invoke(ptx_version, dispatch)))
|
|
1325
|
+
{
|
|
1326
|
+
return error;
|
|
1327
|
+
}
|
|
1328
|
+
}
|
|
1329
|
+
else
|
|
1330
|
+
{
|
|
1331
|
+
// Dispatch shared-privatized approach
|
|
1332
|
+
constexpr int PRIVATIZED_SMEM_BINS = detail::histogram::max_privatized_smem_bins;
|
|
1333
|
+
|
|
1334
|
+
detail::histogram::dispatch_histogram<
|
|
1335
|
+
NUM_CHANNELS,
|
|
1336
|
+
NUM_ACTIVE_CHANNELS,
|
|
1337
|
+
PRIVATIZED_SMEM_BINS,
|
|
1338
|
+
SampleIteratorT,
|
|
1339
|
+
CounterT,
|
|
1340
|
+
NumOutputLevelsArrayT,
|
|
1341
|
+
LevelsArrayT,
|
|
1342
|
+
OffsetT,
|
|
1343
|
+
true, // IsDeviceInit
|
|
1344
|
+
false, // IsEven
|
|
1345
|
+
false, // IsByteSample
|
|
1346
|
+
MaxPolicyT,
|
|
1347
|
+
KernelSource,
|
|
1348
|
+
KernelLauncherFactory>
|
|
1349
|
+
dispatch{
|
|
1350
|
+
d_temp_storage,
|
|
1351
|
+
temp_storage_bytes,
|
|
1352
|
+
d_samples,
|
|
1353
|
+
d_output_histograms,
|
|
1354
|
+
num_output_levels,
|
|
1355
|
+
num_output_levels,
|
|
1356
|
+
num_output_levels,
|
|
1357
|
+
d_levels,
|
|
1358
|
+
max_num_output_bins,
|
|
1359
|
+
num_row_pixels,
|
|
1360
|
+
num_rows,
|
|
1361
|
+
row_stride_samples,
|
|
1362
|
+
stream,
|
|
1363
|
+
kernel_source,
|
|
1364
|
+
launcher_factory};
|
|
1365
|
+
|
|
1366
|
+
if (const auto error = CubDebug(max_policy.Invoke(ptx_version, dispatch)))
|
|
1367
|
+
{
|
|
1368
|
+
return error;
|
|
1369
|
+
}
|
|
1370
|
+
}
|
|
1371
|
+
|
|
1372
|
+
return cudaSuccess;
|
|
1373
|
+
}
|
|
1374
|
+
|
|
1375
|
+
/**
|
|
1376
|
+
* Dispatch routine for HistogramRange with device-side decode operator initialization,
|
|
1377
|
+
* specialized for 8-bit sample types
|
|
1378
|
+
* (computes 256-bin privatized histograms and then reduces to user-specified levels).
|
|
1379
|
+
* This variant initializes the decode operators inside the kernel from level arrays.
|
|
1380
|
+
*
|
|
1381
|
+
* @param d_temp_storage
|
|
1382
|
+
* Device-accessible allocation of temporary storage.
|
|
1383
|
+
* When nullptr, the required allocation size is written to `temp_storage_bytes` and
|
|
1384
|
+
* no work is done.
|
|
1385
|
+
*
|
|
1386
|
+
* @param temp_storage_bytes
|
|
1387
|
+
* Reference to size in bytes of `d_temp_storage` allocation
|
|
1388
|
+
*
|
|
1389
|
+
* @param d_samples
|
|
1390
|
+
* The pointer to the multi-channel input sequence of data samples.
|
|
1391
|
+
* The samples from different channels are assumed to be interleaved
|
|
1392
|
+
* (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
|
|
1393
|
+
*
|
|
1394
|
+
* @param d_output_histograms
|
|
1395
|
+
* The pointers to the histogram counter output arrays, one for each active channel.
|
|
1396
|
+
* For channel<sub><em>i</em></sub>, the allocation length of
|
|
1397
|
+
* `d_histograms[i]` should be `num_output_levels[i] - 1`.
|
|
1398
|
+
*
|
|
1399
|
+
* @param num_output_levels
|
|
1400
|
+
* The number of boundaries (levels) for delineating histogram samples in each active channel.
|
|
1401
|
+
* Implies that the number of bins for channel<sub><em>i</em></sub> is
|
|
1402
|
+
* `num_output_levels[i] - 1`.
|
|
1403
|
+
*
|
|
1404
|
+
* @param d_levels
|
|
1405
|
+
* The pointers to the arrays of boundaries (levels), one for each active channel.
|
|
1406
|
+
* Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are
|
|
1407
|
+
* inclusive and upper sample value boundaries are exclusive.
|
|
1408
|
+
*
|
|
1409
|
+
* @param num_row_pixels
|
|
1410
|
+
* The number of multi-channel pixels per row in the region of interest
|
|
1411
|
+
*
|
|
1412
|
+
* @param num_rows
|
|
1413
|
+
* The number of rows in the region of interest
|
|
1414
|
+
*
|
|
1415
|
+
* @param row_stride_samples
|
|
1416
|
+
* The number of samples between starts of consecutive rows in the region of interest
|
|
1417
|
+
*
|
|
1418
|
+
* @param stream
|
|
1419
|
+
* CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
|
|
1420
|
+
*
|
|
1421
|
+
*/
|
|
1422
|
+
template <typename MaxPolicyT = typename ::cuda::std::conditional_t<
|
|
1423
|
+
::cuda::std::is_void_v<PolicyHub>,
|
|
1424
|
+
/* fallback_policy_hub */
|
|
1425
|
+
detail::histogram::policy_hub<SampleT, CounterT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, /* isEven */ 0>,
|
|
1426
|
+
PolicyHub>::MaxPolicy,
|
|
1427
|
+
typename NumOutputLevelsArrayT = ::cuda::std::array<int, NUM_ACTIVE_CHANNELS>,
|
|
1428
|
+
typename LevelsArrayT = ::cuda::std::array<const LevelT*, NUM_ACTIVE_CHANNELS>>
|
|
1429
|
+
CUB_RUNTIME_FUNCTION static cudaError_t __dispatch_range_device_init(
|
|
1430
|
+
void* d_temp_storage,
|
|
1431
|
+
size_t& temp_storage_bytes,
|
|
1432
|
+
SampleIteratorT d_samples,
|
|
1433
|
+
::cuda::std::array<CounterT*, NUM_ACTIVE_CHANNELS> d_output_histograms,
|
|
1434
|
+
NumOutputLevelsArrayT num_output_levels,
|
|
1435
|
+
LevelsArrayT d_levels,
|
|
1436
|
+
OffsetT num_row_pixels,
|
|
1437
|
+
OffsetT num_rows,
|
|
1438
|
+
OffsetT row_stride_samples,
|
|
1439
|
+
cudaStream_t stream,
|
|
1440
|
+
::cuda::std::true_type /*is_byte_sample*/,
|
|
1441
|
+
KernelSource kernel_source = {},
|
|
1442
|
+
KernelLauncherFactory launcher_factory = {},
|
|
1443
|
+
MaxPolicyT max_policy = {})
|
|
1444
|
+
{
|
|
1445
|
+
// Get PTX version
|
|
1446
|
+
int ptx_version = 0;
|
|
1447
|
+
if (const auto error = CubDebug(launcher_factory.PtxVersion(ptx_version)))
|
|
1448
|
+
{
|
|
1449
|
+
return error;
|
|
1450
|
+
}
|
|
1451
|
+
|
|
1452
|
+
::cuda::std::array<int, NUM_ACTIVE_CHANNELS> num_privatized_levels;
|
|
1453
|
+
int max_levels = num_output_levels[0]; // Maximum number of levels in any channel
|
|
1454
|
+
|
|
1455
|
+
for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
|
|
1456
|
+
{
|
|
1457
|
+
num_privatized_levels[channel] = 257;
|
|
1458
|
+
|
|
1459
|
+
if (num_output_levels[channel] > max_levels)
|
|
1460
|
+
{
|
|
1461
|
+
max_levels = num_output_levels[channel];
|
|
1462
|
+
}
|
|
1463
|
+
}
|
|
1464
|
+
int max_num_output_bins = max_levels - 1;
|
|
1465
|
+
|
|
1466
|
+
constexpr int PRIVATIZED_SMEM_BINS = 256;
|
|
1467
|
+
|
|
1468
|
+
detail::histogram::dispatch_histogram<
|
|
1469
|
+
NUM_CHANNELS,
|
|
1470
|
+
NUM_ACTIVE_CHANNELS,
|
|
1471
|
+
PRIVATIZED_SMEM_BINS,
|
|
1472
|
+
SampleIteratorT,
|
|
1473
|
+
CounterT,
|
|
1474
|
+
NumOutputLevelsArrayT,
|
|
1475
|
+
LevelsArrayT,
|
|
1476
|
+
OffsetT,
|
|
1477
|
+
true, // IsDeviceInit
|
|
1478
|
+
false, // IsEven
|
|
1479
|
+
true, // IsByteSample
|
|
1480
|
+
MaxPolicyT,
|
|
1481
|
+
KernelSource,
|
|
1482
|
+
KernelLauncherFactory>
|
|
1483
|
+
dispatch{
|
|
1484
|
+
d_temp_storage,
|
|
1485
|
+
temp_storage_bytes,
|
|
1486
|
+
d_samples,
|
|
1487
|
+
d_output_histograms,
|
|
1488
|
+
num_privatized_levels,
|
|
1489
|
+
num_output_levels,
|
|
1490
|
+
num_output_levels,
|
|
1491
|
+
d_levels,
|
|
1492
|
+
max_num_output_bins,
|
|
1493
|
+
num_row_pixels,
|
|
1494
|
+
num_rows,
|
|
1495
|
+
row_stride_samples,
|
|
1496
|
+
stream,
|
|
1497
|
+
kernel_source,
|
|
1498
|
+
launcher_factory};
|
|
1499
|
+
|
|
1500
|
+
if (const auto error = CubDebug(max_policy.Invoke(ptx_version, dispatch)))
|
|
1501
|
+
{
|
|
1502
|
+
return error;
|
|
1503
|
+
}
|
|
1504
|
+
|
|
1505
|
+
return cudaSuccess;
|
|
1506
|
+
}
|
|
1507
|
+
|
|
1508
|
+
/**
|
|
1509
|
+
* Dispatch routine for HistogramEven with device-side decode operator initialization,
|
|
1510
|
+
* specialized for sample types larger than 8-bit.
|
|
1511
|
+
* This variant initializes the decode operators inside the kernel from level bounds.
|
|
1512
|
+
*
|
|
1513
|
+
* @param d_temp_storage
|
|
1514
|
+
* Device-accessible allocation of temporary storage.
|
|
1515
|
+
* When nullptr, the required allocation size is written to
|
|
1516
|
+
* `temp_storage_bytes` and no work is done.
|
|
1517
|
+
*
|
|
1518
|
+
* @param temp_storage_bytes
|
|
1519
|
+
* Reference to size in bytes of `d_temp_storage` allocation
|
|
1520
|
+
*
|
|
1521
|
+
* @param d_samples
|
|
1522
|
+
* The pointer to the input sequence of sample items.
|
|
1523
|
+
* The samples from different channels are assumed to be interleaved
|
|
1524
|
+
* (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
|
|
1525
|
+
*
|
|
1526
|
+
* @param d_output_histograms
|
|
1527
|
+
* The pointers to the histogram counter output arrays, one for each active channel.
|
|
1528
|
+
* For channel<sub><em>i</em></sub>, the allocation length of `d_histograms[i]` should be
|
|
1529
|
+
* `num_output_levels[i] - 1`.
|
|
1530
|
+
*
|
|
1531
|
+
* @param num_output_levels
|
|
1532
|
+
* The number of bin level boundaries for delineating histogram samples in each active channel.
|
|
1533
|
+
* Implies that the number of bins for channel<sub><em>i</em></sub> is
|
|
1534
|
+
* `num_output_levels[i] - 1`.
|
|
1535
|
+
*
|
|
1536
|
+
* @param lower_level
|
|
1537
|
+
* The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
|
|
1538
|
+
*
|
|
1539
|
+
* @param upper_level
|
|
1540
|
+
* The upper sample value bound (exclusive) for the highest histogram bin in each active
|
|
1541
|
+
* channel.
|
|
1542
|
+
*
|
|
1543
|
+
* @param num_row_pixels
|
|
1544
|
+
* The number of multi-channel pixels per row in the region of interest
|
|
1545
|
+
*
|
|
1546
|
+
* @param num_rows
|
|
1547
|
+
* The number of rows in the region of interest
|
|
1548
|
+
*
|
|
1549
|
+
* @param row_stride_samples
|
|
1550
|
+
* The number of samples between starts of consecutive rows in the region of interest
|
|
1551
|
+
*
|
|
1552
|
+
* @param stream
|
|
1553
|
+
* CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
|
|
1554
|
+
*
|
|
1555
|
+
*/
|
|
1556
|
+
template <typename MaxPolicyT = typename ::cuda::std::conditional_t<
|
|
1557
|
+
::cuda::std::is_void_v<PolicyHub>,
|
|
1558
|
+
/* fallback_policy_hub */
|
|
1559
|
+
detail::histogram::policy_hub<SampleT, CounterT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, /* isEven */ 1>,
|
|
1560
|
+
PolicyHub>::MaxPolicy,
|
|
1561
|
+
typename LowerLevelArrayT = ::cuda::std::array<LevelT, NUM_ACTIVE_CHANNELS>,
|
|
1562
|
+
typename UpperLevelArrayT = ::cuda::std::array<LevelT, NUM_ACTIVE_CHANNELS>>
|
|
1563
|
+
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t __dispatch_even_device_init(
|
|
1564
|
+
void* d_temp_storage,
|
|
1565
|
+
size_t& temp_storage_bytes,
|
|
1566
|
+
SampleIteratorT d_samples,
|
|
1567
|
+
::cuda::std::array<CounterT*, NUM_ACTIVE_CHANNELS> d_output_histograms,
|
|
1568
|
+
::cuda::std::array<int, NUM_ACTIVE_CHANNELS> num_output_levels,
|
|
1569
|
+
LowerLevelArrayT lower_level,
|
|
1570
|
+
UpperLevelArrayT upper_level,
|
|
1571
|
+
OffsetT num_row_pixels,
|
|
1572
|
+
OffsetT num_rows,
|
|
1573
|
+
OffsetT row_stride_samples,
|
|
1574
|
+
cudaStream_t stream,
|
|
1575
|
+
::cuda::std::false_type /*is_byte_sample*/,
|
|
1576
|
+
KernelSource kernel_source = {},
|
|
1577
|
+
KernelLauncherFactory launcher_factory = {},
|
|
1578
|
+
MaxPolicyT max_policy = {})
|
|
1579
|
+
{
|
|
1580
|
+
// Get PTX version
|
|
1581
|
+
int ptx_version = 0;
|
|
1582
|
+
if (const auto error = CubDebug(launcher_factory.PtxVersion(ptx_version)))
|
|
1583
|
+
{
|
|
1584
|
+
return error;
|
|
1585
|
+
}
|
|
1586
|
+
|
|
1587
|
+
int max_levels = num_output_levels[0];
|
|
1588
|
+
|
|
1589
|
+
for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
|
|
1590
|
+
{
|
|
1591
|
+
int num_levels = num_output_levels[channel];
|
|
1592
|
+
if (kernel_source.MayOverflow(num_levels - 1, upper_level, lower_level, channel))
|
|
1593
|
+
{
|
|
1594
|
+
// Make sure to also return a reasonable value for `temp_storage_bytes` in case of
|
|
1595
|
+
// an overflow of the bin computation, in which case a subsequent algorithm
|
|
1596
|
+
// invocation will also fail
|
|
1597
|
+
if (!d_temp_storage)
|
|
1598
|
+
{
|
|
1599
|
+
temp_storage_bytes = 1U;
|
|
1600
|
+
}
|
|
1601
|
+
return cudaErrorInvalidValue;
|
|
1602
|
+
}
|
|
1603
|
+
|
|
1604
|
+
if (num_levels > max_levels)
|
|
1605
|
+
{
|
|
1606
|
+
max_levels = num_levels;
|
|
1607
|
+
}
|
|
1608
|
+
}
|
|
1609
|
+
int max_num_output_bins = max_levels - 1;
|
|
1610
|
+
|
|
1611
|
+
if (max_num_output_bins > detail::histogram::max_privatized_smem_bins)
|
|
1612
|
+
{
|
|
1613
|
+
// Dispatch shared-privatized approach
|
|
1614
|
+
constexpr int PRIVATIZED_SMEM_BINS = 0;
|
|
1615
|
+
|
|
1616
|
+
detail::histogram::dispatch_histogram<
|
|
1617
|
+
NUM_CHANNELS,
|
|
1618
|
+
NUM_ACTIVE_CHANNELS,
|
|
1619
|
+
PRIVATIZED_SMEM_BINS,
|
|
1620
|
+
SampleIteratorT,
|
|
1621
|
+
CounterT,
|
|
1622
|
+
UpperLevelArrayT,
|
|
1623
|
+
LowerLevelArrayT,
|
|
1624
|
+
OffsetT,
|
|
1625
|
+
true, // IsDeviceInit
|
|
1626
|
+
true, // IsEven
|
|
1627
|
+
false, // IsByteSample
|
|
1628
|
+
MaxPolicyT,
|
|
1629
|
+
KernelSource,
|
|
1630
|
+
KernelLauncherFactory>
|
|
1631
|
+
dispatch{
|
|
1632
|
+
d_temp_storage,
|
|
1633
|
+
temp_storage_bytes,
|
|
1634
|
+
d_samples,
|
|
1635
|
+
d_output_histograms,
|
|
1636
|
+
num_output_levels,
|
|
1637
|
+
num_output_levels,
|
|
1638
|
+
upper_level,
|
|
1639
|
+
lower_level,
|
|
1640
|
+
max_num_output_bins,
|
|
1641
|
+
num_row_pixels,
|
|
1642
|
+
num_rows,
|
|
1643
|
+
row_stride_samples,
|
|
1644
|
+
stream,
|
|
1645
|
+
kernel_source,
|
|
1646
|
+
launcher_factory};
|
|
1647
|
+
|
|
1648
|
+
if (const auto error = CubDebug(max_policy.Invoke(ptx_version, dispatch)))
|
|
1649
|
+
{
|
|
1650
|
+
return error;
|
|
1651
|
+
}
|
|
1652
|
+
}
|
|
1653
|
+
else
|
|
1654
|
+
{
|
|
1655
|
+
// Dispatch shared-privatized approach
|
|
1656
|
+
constexpr int PRIVATIZED_SMEM_BINS = detail::histogram::max_privatized_smem_bins;
|
|
1657
|
+
|
|
1658
|
+
detail::histogram::dispatch_histogram<
|
|
1659
|
+
NUM_CHANNELS,
|
|
1660
|
+
NUM_ACTIVE_CHANNELS,
|
|
1661
|
+
PRIVATIZED_SMEM_BINS,
|
|
1662
|
+
SampleIteratorT,
|
|
1663
|
+
CounterT,
|
|
1664
|
+
UpperLevelArrayT,
|
|
1665
|
+
LowerLevelArrayT,
|
|
1666
|
+
OffsetT,
|
|
1667
|
+
true, // IsDeviceInit
|
|
1668
|
+
true, // IsEven
|
|
1669
|
+
false, // IsByteSample
|
|
1670
|
+
MaxPolicyT,
|
|
1671
|
+
KernelSource,
|
|
1672
|
+
KernelLauncherFactory>
|
|
1673
|
+
dispatch{
|
|
1674
|
+
d_temp_storage,
|
|
1675
|
+
temp_storage_bytes,
|
|
1676
|
+
d_samples,
|
|
1677
|
+
d_output_histograms,
|
|
1678
|
+
num_output_levels,
|
|
1679
|
+
num_output_levels,
|
|
1680
|
+
upper_level,
|
|
1681
|
+
lower_level,
|
|
1682
|
+
max_num_output_bins,
|
|
1683
|
+
num_row_pixels,
|
|
1684
|
+
num_rows,
|
|
1685
|
+
row_stride_samples,
|
|
1686
|
+
stream,
|
|
1687
|
+
kernel_source,
|
|
1688
|
+
launcher_factory};
|
|
1689
|
+
|
|
1690
|
+
if (const auto error = CubDebug(max_policy.Invoke(ptx_version, dispatch)))
|
|
1691
|
+
{
|
|
1692
|
+
return error;
|
|
1693
|
+
}
|
|
1694
|
+
}
|
|
1695
|
+
|
|
1696
|
+
return cudaSuccess;
|
|
1697
|
+
}
|
|
1698
|
+
|
|
1699
|
+
/**
|
|
1700
|
+
* Dispatch routine for HistogramEven with device-side decode operator initialization,
|
|
1701
|
+
* specialized for 8-bit sample types
|
|
1702
|
+
* (computes 256-bin privatized histograms and then reduces to user-specified levels).
|
|
1703
|
+
* This variant initializes the decode operators inside the kernel from level bounds.
|
|
1704
|
+
*
|
|
1705
|
+
* @param d_temp_storage
|
|
1706
|
+
* Device-accessible allocation of temporary storage.
|
|
1707
|
+
* When nullptr, the required allocation size is written to `temp_storage_bytes` and
|
|
1708
|
+
* no work is done.
|
|
1709
|
+
*
|
|
1710
|
+
* @param temp_storage_bytes
|
|
1711
|
+
* Reference to size in bytes of `d_temp_storage` allocation
|
|
1712
|
+
*
|
|
1713
|
+
* @param d_samples
|
|
1714
|
+
* The pointer to the input sequence of sample items. The samples from different channels are
|
|
1715
|
+
* assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of
|
|
1716
|
+
* four RGBA 8-bit samples).
|
|
1717
|
+
*
|
|
1718
|
+
* @param d_output_histograms
|
|
1719
|
+
* The pointers to the histogram counter output arrays, one for each active channel.
|
|
1720
|
+
* For channel<sub><em>i</em></sub>, the allocation length of `d_histograms[i]` should be
|
|
1721
|
+
* `num_output_levels[i] - 1`.
|
|
1722
|
+
*
|
|
1723
|
+
* @param num_output_levels
|
|
1724
|
+
* The number of bin level boundaries for delineating histogram samples in each active channel.
|
|
1725
|
+
* Implies that the number of bins for channel<sub><em>i</em></sub> is
|
|
1726
|
+
* `num_output_levels[i] - 1`.
|
|
1727
|
+
*
|
|
1728
|
+
* @param lower_level
|
|
1729
|
+
* The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
|
|
1730
|
+
*
|
|
1731
|
+
* @param upper_level
|
|
1732
|
+
* The upper sample value bound (exclusive) for the highest histogram bin in each active
|
|
1733
|
+
* channel.
|
|
1734
|
+
*
|
|
1735
|
+
* @param num_row_pixels
|
|
1736
|
+
* The number of multi-channel pixels per row in the region of interest
|
|
1737
|
+
*
|
|
1738
|
+
* @param num_rows
|
|
1739
|
+
* The number of rows in the region of interest
|
|
1740
|
+
*
|
|
1741
|
+
* @param row_stride_samples
|
|
1742
|
+
* The number of samples between starts of consecutive rows in the region of interest
|
|
1743
|
+
*
|
|
1744
|
+
* @param stream
|
|
1745
|
+
* CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
|
|
1746
|
+
*
|
|
1747
|
+
*/
|
|
1748
|
+
template <typename MaxPolicyT = typename ::cuda::std::conditional_t<
|
|
1749
|
+
::cuda::std::is_void_v<PolicyHub>,
|
|
1750
|
+
/* fallback_policy_hub */
|
|
1751
|
+
detail::histogram::policy_hub<SampleT, CounterT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, /* isEven */ 1>,
|
|
1752
|
+
PolicyHub>::MaxPolicy,
|
|
1753
|
+
typename LowerLevelArrayT = ::cuda::std::array<LevelT, NUM_ACTIVE_CHANNELS>,
|
|
1754
|
+
typename UpperLevelArrayT = ::cuda::std::array<LevelT, NUM_ACTIVE_CHANNELS>>
|
|
1755
|
+
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t __dispatch_even_device_init(
|
|
1756
|
+
void* d_temp_storage,
|
|
1757
|
+
size_t& temp_storage_bytes,
|
|
1758
|
+
SampleIteratorT d_samples,
|
|
1759
|
+
::cuda::std::array<CounterT*, NUM_ACTIVE_CHANNELS> d_output_histograms,
|
|
1760
|
+
::cuda::std::array<int, NUM_ACTIVE_CHANNELS> num_output_levels,
|
|
1761
|
+
LowerLevelArrayT lower_level,
|
|
1762
|
+
UpperLevelArrayT upper_level,
|
|
1763
|
+
OffsetT num_row_pixels,
|
|
1764
|
+
OffsetT num_rows,
|
|
1765
|
+
OffsetT row_stride_samples,
|
|
1766
|
+
cudaStream_t stream,
|
|
1767
|
+
::cuda::std::true_type /*is_byte_sample*/,
|
|
1768
|
+
KernelSource kernel_source = {},
|
|
1769
|
+
KernelLauncherFactory launcher_factory = {},
|
|
1770
|
+
MaxPolicyT max_policy = {})
|
|
1771
|
+
{
|
|
1772
|
+
// Get PTX version
|
|
1773
|
+
int ptx_version = 0;
|
|
1774
|
+
if (const auto error = CubDebug(launcher_factory.PtxVersion(ptx_version)))
|
|
1775
|
+
{
|
|
1776
|
+
return error;
|
|
1777
|
+
}
|
|
1778
|
+
|
|
1779
|
+
::cuda::std::array<int, NUM_ACTIVE_CHANNELS> num_privatized_levels;
|
|
1780
|
+
int max_levels = num_output_levels[0];
|
|
1781
|
+
|
|
1782
|
+
for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
|
|
1783
|
+
{
|
|
1784
|
+
num_privatized_levels[channel] = 257;
|
|
1785
|
+
|
|
1786
|
+
int num_levels = num_output_levels[channel];
|
|
1787
|
+
if (kernel_source.MayOverflow(num_levels - 1, upper_level, lower_level, channel))
|
|
1788
|
+
{
|
|
1789
|
+
// Make sure to also return a reasonable value for `temp_storage_bytes` in case of
|
|
1790
|
+
// an overflow of the bin computation, in which case a subsequent algorithm
|
|
1791
|
+
// invocation will also fail
|
|
1792
|
+
if (!d_temp_storage)
|
|
1793
|
+
{
|
|
1794
|
+
temp_storage_bytes = 1U;
|
|
1795
|
+
}
|
|
1796
|
+
return cudaErrorInvalidValue;
|
|
1797
|
+
}
|
|
1798
|
+
|
|
1799
|
+
if (num_levels > max_levels)
|
|
1800
|
+
{
|
|
1801
|
+
max_levels = num_levels;
|
|
1802
|
+
}
|
|
1803
|
+
}
|
|
1804
|
+
int max_num_output_bins = max_levels - 1;
|
|
1805
|
+
|
|
1806
|
+
constexpr int PRIVATIZED_SMEM_BINS = 256;
|
|
1807
|
+
|
|
1808
|
+
detail::histogram::dispatch_histogram<
|
|
1809
|
+
NUM_CHANNELS,
|
|
1810
|
+
NUM_ACTIVE_CHANNELS,
|
|
1811
|
+
PRIVATIZED_SMEM_BINS,
|
|
1812
|
+
SampleIteratorT,
|
|
1813
|
+
CounterT,
|
|
1814
|
+
UpperLevelArrayT,
|
|
1815
|
+
LowerLevelArrayT,
|
|
1816
|
+
OffsetT,
|
|
1817
|
+
true, // IsDeviceInit
|
|
1818
|
+
true, // IsEven
|
|
1819
|
+
true, // IsByteSample
|
|
1820
|
+
MaxPolicyT,
|
|
1821
|
+
KernelSource,
|
|
1822
|
+
KernelLauncherFactory>
|
|
1823
|
+
dispatch{
|
|
1824
|
+
d_temp_storage,
|
|
1825
|
+
temp_storage_bytes,
|
|
1826
|
+
d_samples,
|
|
1827
|
+
d_output_histograms,
|
|
1828
|
+
num_privatized_levels,
|
|
1829
|
+
num_output_levels,
|
|
1830
|
+
upper_level,
|
|
1831
|
+
lower_level,
|
|
1832
|
+
max_num_output_bins,
|
|
1833
|
+
num_row_pixels,
|
|
1834
|
+
num_rows,
|
|
1835
|
+
row_stride_samples,
|
|
1836
|
+
stream,
|
|
1837
|
+
kernel_source,
|
|
1838
|
+
launcher_factory};
|
|
1839
|
+
|
|
1840
|
+
if (const auto error = CubDebug(max_policy.Invoke(ptx_version, dispatch)))
|
|
1841
|
+
{
|
|
1842
|
+
return error;
|
|
1843
|
+
}
|
|
1844
|
+
|
|
1845
|
+
return cudaSuccess;
|
|
1846
|
+
}
|
|
1847
|
+
};
|
|
1848
|
+
|
|
1849
|
+
CUB_NAMESPACE_END
|