cuda-cccl 0.3.3__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/__init__.py +27 -0
- cuda/cccl/_cuda_version_utils.py +24 -0
- cuda/cccl/cooperative/__init__.py +9 -0
- cuda/cccl/cooperative/experimental/__init__.py +24 -0
- cuda/cccl/headers/__init__.py +7 -0
- cuda/cccl/headers/include/__init__.py +1 -0
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +259 -0
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +1182 -0
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +81 -0
- cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +709 -0
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +234 -0
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +748 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +786 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +286 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +703 -0
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +555 -0
- cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +619 -0
- cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +806 -0
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +1124 -0
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +589 -0
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +474 -0
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +289 -0
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +1117 -0
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +346 -0
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +606 -0
- cuda/cccl/headers/include/cub/agent/agent_topk.cuh +764 -0
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +631 -0
- cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +1424 -0
- cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +963 -0
- cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +1227 -0
- cuda/cccl/headers/include/cub/block/block_exchange.cuh +1313 -0
- cuda/cccl/headers/include/cub/block/block_histogram.cuh +424 -0
- cuda/cccl/headers/include/cub/block/block_load.cuh +1264 -0
- cuda/cccl/headers/include/cub/block/block_load_to_shared.cuh +432 -0
- cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +800 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +1225 -0
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +2196 -0
- cuda/cccl/headers/include/cub/block/block_raking_layout.cuh +150 -0
- cuda/cccl/headers/include/cub/block/block_reduce.cuh +667 -0
- cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +434 -0
- cuda/cccl/headers/include/cub/block/block_scan.cuh +2315 -0
- cuda/cccl/headers/include/cub/block/block_shuffle.cuh +346 -0
- cuda/cccl/headers/include/cub/block/block_store.cuh +1247 -0
- cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +624 -0
- cuda/cccl/headers/include/cub/block/specializations/block_histogram_atomic.cuh +86 -0
- cuda/cccl/headers/include/cub/block/specializations/block_histogram_sort.cuh +240 -0
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking.cuh +252 -0
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_raking_commutative_only.cuh +238 -0
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +281 -0
- cuda/cccl/headers/include/cub/block/specializations/block_scan_raking.cuh +790 -0
- cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +538 -0
- cuda/cccl/headers/include/cub/config.cuh +53 -0
- cuda/cccl/headers/include/cub/cub.cuh +120 -0
- cuda/cccl/headers/include/cub/detail/array_utils.cuh +78 -0
- cuda/cccl/headers/include/cub/detail/choose_offset.cuh +161 -0
- cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +74 -0
- cuda/cccl/headers/include/cub/detail/device_double_buffer.cuh +96 -0
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +62 -0
- cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +253 -0
- cuda/cccl/headers/include/cub/detail/integer_utils.cuh +88 -0
- cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +142 -0
- cuda/cccl/headers/include/cub/detail/launcher/cuda_runtime.cuh +100 -0
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +114 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/README.md +71 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/array.h +68 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/json.h +62 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/object.h +100 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/string.h +53 -0
- cuda/cccl/headers/include/cub/detail/ptx-json/value.h +95 -0
- cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +63 -0
- cuda/cccl/headers/include/cub/detail/rfa.cuh +731 -0
- cuda/cccl/headers/include/cub/detail/strong_load.cuh +189 -0
- cuda/cccl/headers/include/cub/detail/strong_store.cuh +220 -0
- cuda/cccl/headers/include/cub/detail/temporary_storage.cuh +384 -0
- cuda/cccl/headers/include/cub/detail/type_traits.cuh +187 -0
- cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +73 -0
- cuda/cccl/headers/include/cub/detail/unsafe_bitcast.cuh +56 -0
- cuda/cccl/headers/include/cub/device/device_adjacent_difference.cuh +596 -0
- cuda/cccl/headers/include/cub/device/device_copy.cuh +276 -0
- cuda/cccl/headers/include/cub/device/device_for.cuh +1063 -0
- cuda/cccl/headers/include/cub/device/device_histogram.cuh +1509 -0
- cuda/cccl/headers/include/cub/device/device_memcpy.cuh +195 -0
- cuda/cccl/headers/include/cub/device/device_merge.cuh +203 -0
- cuda/cccl/headers/include/cub/device/device_merge_sort.cuh +979 -0
- cuda/cccl/headers/include/cub/device/device_partition.cuh +668 -0
- cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3437 -0
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +2518 -0
- cuda/cccl/headers/include/cub/device/device_run_length_encode.cuh +370 -0
- cuda/cccl/headers/include/cub/device/device_scan.cuh +2212 -0
- cuda/cccl/headers/include/cub/device/device_segmented_radix_sort.cuh +1496 -0
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +1430 -0
- cuda/cccl/headers/include/cub/device/device_segmented_sort.cuh +2811 -0
- cuda/cccl/headers/include/cub/device/device_select.cuh +1228 -0
- cuda/cccl/headers/include/cub/device/device_topk.cuh +511 -0
- cuda/cccl/headers/include/cub/device/device_transform.cuh +668 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +315 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +719 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +43 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_copy_mdspan.cuh +79 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +198 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +1046 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +303 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +473 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +1744 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +1310 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +655 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +531 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +313 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +615 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +517 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +602 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +975 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +842 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +341 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +440 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +389 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +627 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +569 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +545 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +261 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/histogram.cuh +505 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/merge_sort.cuh +334 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/radix_sort.cuh +803 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +583 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +189 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +321 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_sort.cuh +522 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/three_way_partition.cuh +201 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +1028 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/unique_by_key.cuh +176 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +67 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +118 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +60 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +275 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +76 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +126 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +1065 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce.cuh +493 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +942 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +673 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +618 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +1010 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +398 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +1588 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +440 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +85 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +481 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +884 -0
- cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +227 -0
- cuda/cccl/headers/include/cub/grid/grid_mapping.cuh +106 -0
- cuda/cccl/headers/include/cub/grid/grid_queue.cuh +202 -0
- cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +254 -0
- cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +259 -0
- cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +250 -0
- cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +320 -0
- cuda/cccl/headers/include/cub/thread/thread_load.cuh +349 -0
- cuda/cccl/headers/include/cub/thread/thread_operators.cuh +688 -0
- cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +548 -0
- cuda/cccl/headers/include/cub/thread/thread_scan.cuh +498 -0
- cuda/cccl/headers/include/cub/thread/thread_search.cuh +199 -0
- cuda/cccl/headers/include/cub/thread/thread_simd.cuh +458 -0
- cuda/cccl/headers/include/cub/thread/thread_sort.cuh +102 -0
- cuda/cccl/headers/include/cub/thread/thread_store.cuh +365 -0
- cuda/cccl/headers/include/cub/util_allocator.cuh +921 -0
- cuda/cccl/headers/include/cub/util_arch.cuh +167 -0
- cuda/cccl/headers/include/cub/util_cpp_dialect.cuh +95 -0
- cuda/cccl/headers/include/cub/util_debug.cuh +207 -0
- cuda/cccl/headers/include/cub/util_device.cuh +800 -0
- cuda/cccl/headers/include/cub/util_macro.cuh +97 -0
- cuda/cccl/headers/include/cub/util_math.cuh +118 -0
- cuda/cccl/headers/include/cub/util_namespace.cuh +176 -0
- cuda/cccl/headers/include/cub/util_policy_wrapper_t.cuh +55 -0
- cuda/cccl/headers/include/cub/util_ptx.cuh +513 -0
- cuda/cccl/headers/include/cub/util_temporary_storage.cuh +122 -0
- cuda/cccl/headers/include/cub/util_type.cuh +1120 -0
- cuda/cccl/headers/include/cub/util_vsmem.cuh +253 -0
- cuda/cccl/headers/include/cub/version.cuh +89 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +329 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +177 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +737 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +408 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +952 -0
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +715 -0
- cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +405 -0
- cuda/cccl/headers/include/cub/warp/warp_load.cuh +614 -0
- cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +169 -0
- cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +829 -0
- cuda/cccl/headers/include/cub/warp/warp_scan.cuh +1890 -0
- cuda/cccl/headers/include/cub/warp/warp_store.cuh +521 -0
- cuda/cccl/headers/include/cub/warp/warp_utils.cuh +61 -0
- cuda/cccl/headers/include/cuda/__algorithm/common.h +68 -0
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +196 -0
- cuda/cccl/headers/include/cuda/__algorithm/fill.h +107 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/access_property.h +165 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/access_property_encoding.h +172 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr.h +217 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/annotated_ptr_base.h +100 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/apply_access_property.h +83 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/associate_access_property.h +128 -0
- cuda/cccl/headers/include/cuda/__annotated_ptr/createpolicy.h +210 -0
- cuda/cccl/headers/include/cuda/__atomic/atomic.h +145 -0
- cuda/cccl/headers/include/cuda/__barrier/async_contract_fulfillment.h +39 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier.h +65 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_arrive_tx.h +102 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +487 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_expect_tx.h +74 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_native_handle.h +45 -0
- cuda/cccl/headers/include/cuda/__barrier/barrier_thread_scope.h +60 -0
- cuda/cccl/headers/include/cuda/__bit/bit_reverse.h +171 -0
- cuda/cccl/headers/include/cuda/__bit/bitfield.h +122 -0
- cuda/cccl/headers/include/cuda/__bit/bitmask.h +90 -0
- cuda/cccl/headers/include/cuda/__cccl_config +37 -0
- cuda/cccl/headers/include/cuda/__cmath/ceil_div.h +124 -0
- cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +178 -0
- cuda/cccl/headers/include/cuda/__cmath/ilog.h +195 -0
- cuda/cccl/headers/include/cuda/__cmath/ipow.h +107 -0
- cuda/cccl/headers/include/cuda/__cmath/isqrt.h +80 -0
- cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
- cuda/cccl/headers/include/cuda/__cmath/neg.h +47 -0
- cuda/cccl/headers/include/cuda/__cmath/pow2.h +74 -0
- cuda/cccl/headers/include/cuda/__cmath/round_down.h +102 -0
- cuda/cccl/headers/include/cuda/__cmath/round_up.h +104 -0
- cuda/cccl/headers/include/cuda/__cmath/uabs.h +57 -0
- cuda/cccl/headers/include/cuda/__complex/complex.h +238 -0
- cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +89 -0
- cuda/cccl/headers/include/cuda/__complex/traits.h +64 -0
- cuda/cccl/headers/include/cuda/__complex_ +28 -0
- cuda/cccl/headers/include/cuda/__device/all_devices.h +140 -0
- cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +537 -0
- cuda/cccl/headers/include/cuda/__device/attributes.h +772 -0
- cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
- cuda/cccl/headers/include/cuda/__device/device_ref.h +156 -0
- cuda/cccl/headers/include/cuda/__device/physical_device.h +172 -0
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +835 -0
- cuda/cccl/headers/include/cuda/__event/event.h +171 -0
- cuda/cccl/headers/include/cuda/__event/event_ref.h +157 -0
- cuda/cccl/headers/include/cuda/__event/timed_event.h +120 -0
- cuda/cccl/headers/include/cuda/__execution/determinism.h +91 -0
- cuda/cccl/headers/include/cuda/__execution/output_ordering.h +89 -0
- cuda/cccl/headers/include/cuda/__execution/require.h +75 -0
- cuda/cccl/headers/include/cuda/__execution/tune.h +70 -0
- cuda/cccl/headers/include/cuda/__functional/address_stability.h +131 -0
- cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +321 -0
- cuda/cccl/headers/include/cuda/__functional/maximum.h +58 -0
- cuda/cccl/headers/include/cuda/__functional/minimum.h +58 -0
- cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +108 -0
- cuda/cccl/headers/include/cuda/__fwd/barrier.h +38 -0
- cuda/cccl/headers/include/cuda/__fwd/barrier_native_handle.h +42 -0
- cuda/cccl/headers/include/cuda/__fwd/complex.h +48 -0
- cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
- cuda/cccl/headers/include/cuda/__fwd/get_stream.h +38 -0
- cuda/cccl/headers/include/cuda/__fwd/pipeline.h +37 -0
- cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +58 -0
- cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +315 -0
- cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +483 -0
- cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +324 -0
- cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +456 -0
- cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +334 -0
- cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +418 -0
- cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +367 -0
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +528 -0
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +527 -0
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +486 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_function.h +112 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +557 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
- cuda/cccl/headers/include/cuda/__latch/latch.h +44 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +533 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +238 -0
- cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +152 -0
- cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +117 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/check_preconditions.h +79 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/completion_mechanism.h +47 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +60 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_fallback.h +72 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/cp_async_shared_global.h +148 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/dispatch_memcpy_async.h +165 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/is_local_smem_barrier.h +53 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async.h +179 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_barrier.h +99 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +104 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_completion.h +170 -0
- cuda/cccl/headers/include/cuda/__memcpy_async/try_get_barrier_handle.h +59 -0
- cuda/cccl/headers/include/cuda/__memory/address_space.h +227 -0
- cuda/cccl/headers/include/cuda/__memory/align_down.h +56 -0
- cuda/cccl/headers/include/cuda/__memory/align_up.h +56 -0
- cuda/cccl/headers/include/cuda/__memory/aligned_size.h +61 -0
- cuda/cccl/headers/include/cuda/__memory/check_address.h +111 -0
- cuda/cccl/headers/include/cuda/__memory/discard_memory.h +64 -0
- cuda/cccl/headers/include/cuda/__memory/get_device_address.h +58 -0
- cuda/cccl/headers/include/cuda/__memory/is_aligned.h +47 -0
- cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
- cuda/cccl/headers/include/cuda/__memory/ptr_rebind.h +75 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +82 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_property.h +153 -0
- cuda/cccl/headers/include/cuda/__memory_resource/properties.h +113 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource.h +125 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +652 -0
- cuda/cccl/headers/include/cuda/__numeric/add_overflow.h +306 -0
- cuda/cccl/headers/include/cuda/__numeric/narrow.h +108 -0
- cuda/cccl/headers/include/cuda/__numeric/overflow_cast.h +59 -0
- cuda/cccl/headers/include/cuda/__numeric/overflow_result.h +43 -0
- cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +120 -0
- cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2983 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/barrier_cluster.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/bfind.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/bmsk.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/clusterlaunchcontrol.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk.h +44 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_commit_group.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_tensor.h +45 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_bulk_wait_group.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_async_mbarrier_arrive.h +42 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk.h +60 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/cp_reduce_async_bulk_tensor.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/elect_sync.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/exit.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/fence.h +49 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/barrier_cluster.h +115 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bfind.h +190 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/bmsk.h +54 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/clusterlaunchcontrol.h +242 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk.h +197 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_commit_group.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_multicast.h +54 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor.h +997 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_gather_scatter.h +318 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_tensor_multicast.h +671 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_bulk_wait_group.h +46 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive.h +26 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_async_mbarrier_arrive_noinc.h +26 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk.h +1470 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_bf16.h +132 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_f16.h +132 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/cp_reduce_async_bulk_tensor.h +601 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/elect_sync.h +36 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/exit.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence.h +208 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_mbarrier_init.h +31 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_alias.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async.h +58 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_async_generic_sync_restrict.h +64 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_proxy_tensormap_generic.h +102 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/fence_sync_restrict.h +64 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/get_sreg.h +949 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/getctarank.h +32 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/ld.h +5542 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive.h +399 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_expect_tx.h +184 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_arrive_no_complete.h +34 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_expect_tx.h +102 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_init.h +27 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait.h +143 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_test_wait_parity.h +144 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait.h +286 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/mbarrier_try_wait_parity.h +290 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_ld_reduce.h +2202 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_red.h +1362 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/multimem_st.h +236 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/prmt.h +230 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/red_async.h +460 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shl.h +96 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/shr.h +168 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st.h +1490 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_async.h +123 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/st_bulk.h +31 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_alloc.h +132 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_commit.h +99 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_cp.h +765 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_fence.h +58 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_ld.h +4927 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma.h +4291 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_mma_ws.h +7110 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_shift.h +42 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_st.h +5063 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tcgen05_wait.h +56 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_cp_fenceproxy.h +71 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/tensormap_replace.h +1030 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/generated/trap.h +25 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/get_sreg.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/getctarank.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/ld.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_arrive.h +45 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_expect_tx.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_init.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/mbarrier_wait.h +46 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_ld_reduce.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_red.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/multimem_st.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/prmt.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/red_async.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/shfl_sync.h +244 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/shl.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/shr.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/st.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/st_async.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/st_bulk.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_alloc.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_commit.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_cp.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_fence.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_ld.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_mma_ws.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_shift.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_st.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tcgen05_wait.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_cp_fenceproxy.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/tensormap_replace.h +43 -0
- cuda/cccl/headers/include/cuda/__ptx/instructions/trap.h +41 -0
- cuda/cccl/headers/include/cuda/__ptx/pragmas/enable_smem_spilling.h +47 -0
- cuda/cccl/headers/include/cuda/__ptx/ptx_dot_variants.h +230 -0
- cuda/cccl/headers/include/cuda/__ptx/ptx_helper_functions.h +176 -0
- cuda/cccl/headers/include/cuda/__random/feistel_bijection.h +105 -0
- cuda/cccl/headers/include/cuda/__random/random_bijection.h +88 -0
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +99 -0
- cuda/cccl/headers/include/cuda/__runtime/types.h +41 -0
- cuda/cccl/headers/include/cuda/__semaphore/counting_semaphore.h +53 -0
- cuda/cccl/headers/include/cuda/__stream/get_stream.h +110 -0
- cuda/cccl/headers/include/cuda/__stream/stream.h +141 -0
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +303 -0
- cuda/cccl/headers/include/cuda/__type_traits/is_floating_point.h +47 -0
- cuda/cccl/headers/include/cuda/__type_traits/is_specialization_of.h +37 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/access.h +88 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/any_cast.h +83 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_base.h +148 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_from.h +96 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_fwd.h +128 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ptr.h +304 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_ref.h +337 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +590 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/conversions.h +169 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/dynamic_any_cast.h +107 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/interfaces.h +359 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/iset.h +142 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/overrides.h +64 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/rtti.h +257 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/semiregular.h +322 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/storage.h +79 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/tagged_ptr.h +58 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtcall.h +162 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_functions.h +184 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_ptrs.h +80 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/virtual_tables.h +155 -0
- cuda/cccl/headers/include/cuda/__utility/basic_any.h +507 -0
- cuda/cccl/headers/include/cuda/__utility/immovable.h +50 -0
- cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
- cuda/cccl/headers/include/cuda/__utility/inherit.h +36 -0
- cuda/cccl/headers/include/cuda/__utility/no_init.h +29 -0
- cuda/cccl/headers/include/cuda/__utility/static_for.h +79 -0
- cuda/cccl/headers/include/cuda/__warp/lane_mask.h +326 -0
- cuda/cccl/headers/include/cuda/__warp/warp_match_all.h +65 -0
- cuda/cccl/headers/include/cuda/__warp/warp_shuffle.h +251 -0
- cuda/cccl/headers/include/cuda/access_property +26 -0
- cuda/cccl/headers/include/cuda/algorithm +27 -0
- cuda/cccl/headers/include/cuda/annotated_ptr +29 -0
- cuda/cccl/headers/include/cuda/atomic +27 -0
- cuda/cccl/headers/include/cuda/barrier +267 -0
- cuda/cccl/headers/include/cuda/bit +29 -0
- cuda/cccl/headers/include/cuda/cmath +37 -0
- cuda/cccl/headers/include/cuda/devices +33 -0
- cuda/cccl/headers/include/cuda/discard_memory +32 -0
- cuda/cccl/headers/include/cuda/functional +32 -0
- cuda/cccl/headers/include/cuda/iterator +39 -0
- cuda/cccl/headers/include/cuda/latch +27 -0
- cuda/cccl/headers/include/cuda/mdspan +28 -0
- cuda/cccl/headers/include/cuda/memory +35 -0
- cuda/cccl/headers/include/cuda/memory_resource +35 -0
- cuda/cccl/headers/include/cuda/numeric +29 -0
- cuda/cccl/headers/include/cuda/pipeline +579 -0
- cuda/cccl/headers/include/cuda/ptx +129 -0
- cuda/cccl/headers/include/cuda/semaphore +31 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/adjacent_find.h +59 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/all_of.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/any_of.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/binary_search.h +53 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/clamp.h +48 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +58 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/comp_ref_type.h +85 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy.h +142 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy_backward.h +80 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy_if.h +47 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +73 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/count.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/count_if.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal.h +128 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +101 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/fill.h +58 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/fill_n.h +51 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find.h +62 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_end.h +225 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_first_of.h +73 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_if.h +46 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/find_if_not.h +46 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/for_each.h +42 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/for_each_n.h +48 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/generate.h +41 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/generate_n.h +46 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/half_positive.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/in_fun_result.h +55 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +90 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_heap.h +50 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_heap_until.h +83 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_partitioned.h +57 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_permutation.h +252 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted.h +49 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/is_sorted_until.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/iter_swap.h +82 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +185 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/lexicographical_compare.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +82 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/make_heap.h +70 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +88 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/max.h +62 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +67 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/merge.h +89 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/min.h +62 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +87 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax.h +66 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +139 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/mismatch.h +83 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/move.h +86 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/move_backward.h +84 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/next_permutation.h +88 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/none_of.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort.h +102 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +122 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partition.h +120 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partition_copy.h +59 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/partition_point.h +61 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +93 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/prev_permutation.h +88 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +100 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each.h +84 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_for_each_n.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_iterator_concept.h +65 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min.h +98 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/ranges_min_element.h +68 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove.h +55 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy.h +47 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove_copy_if.h +47 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/remove_if.h +56 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy.h +54 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace_copy_if.h +50 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/replace_if.h +45 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/reverse.h +81 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/reverse_copy.h +43 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/rotate.h +261 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/rotate_copy.h +40 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/search.h +185 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/search_n.h +163 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_difference.h +95 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_intersection.h +122 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_symmetric_difference.h +134 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/set_union.h +128 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/shift_left.h +84 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/shift_right.h +144 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/sift_down.h +139 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/sort_heap.h +70 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/swap_ranges.h +78 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/transform.h +59 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unique.h +76 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unique_copy.h +155 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +95 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_range.h +126 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +83 -0
- cuda/cccl/headers/include/cuda/std/__algorithm_ +26 -0
- cuda/cccl/headers/include/cuda/std/__atomic/api/common.h +192 -0
- cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +136 -0
- cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +118 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/common.h +58 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_local.h +208 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_derived.h +401 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +3971 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/cuda_ptx_generated_helper.h +177 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +211 -0
- cuda/cccl/headers/include/cuda/std/__atomic/functions.h +33 -0
- cuda/cccl/headers/include/cuda/std/__atomic/order.h +159 -0
- cuda/cccl/headers/include/cuda/std/__atomic/platform/msvc_to_builtins.h +654 -0
- cuda/cccl/headers/include/cuda/std/__atomic/platform.h +93 -0
- cuda/cccl/headers/include/cuda/std/__atomic/scopes.h +105 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/base.h +249 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/common.h +104 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/locked.h +225 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/reference.h +72 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types/small.h +228 -0
- cuda/cccl/headers/include/cuda/std/__atomic/types.h +52 -0
- cuda/cccl/headers/include/cuda/std/__atomic/wait/notify_wait.h +95 -0
- cuda/cccl/headers/include/cuda/std/__atomic/wait/polling.h +65 -0
- cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +227 -0
- cuda/cccl/headers/include/cuda/std/__barrier/empty_completion.h +37 -0
- cuda/cccl/headers/include/cuda/std/__barrier/poll_tester.h +82 -0
- cuda/cccl/headers/include/cuda/std/__bit/bit_cast.h +76 -0
- cuda/cccl/headers/include/cuda/std/__bit/byteswap.h +185 -0
- cuda/cccl/headers/include/cuda/std/__bit/countl.h +174 -0
- cuda/cccl/headers/include/cuda/std/__bit/countr.h +185 -0
- cuda/cccl/headers/include/cuda/std/__bit/endian.h +39 -0
- cuda/cccl/headers/include/cuda/std/__bit/has_single_bit.h +43 -0
- cuda/cccl/headers/include/cuda/std/__bit/integral.h +126 -0
- cuda/cccl/headers/include/cuda/std/__bit/popcount.h +154 -0
- cuda/cccl/headers/include/cuda/std/__bit/reference.h +1272 -0
- cuda/cccl/headers/include/cuda/std/__bit/rotate.h +94 -0
- cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/architecture.h +78 -0
- cuda/cccl/headers/include/cuda/std/__cccl/assert.h +161 -0
- cuda/cccl/headers/include/cuda/std/__cccl/attributes.h +206 -0
- cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +673 -0
- cuda/cccl/headers/include/cuda/std/__cccl/compiler.h +217 -0
- cuda/cccl/headers/include/cuda/std/__cccl/cuda_capabilities.h +51 -0
- cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +56 -0
- cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +88 -0
- cuda/cccl/headers/include/cuda/std/__cccl/diagnostic.h +131 -0
- cuda/cccl/headers/include/cuda/std/__cccl/dialect.h +123 -0
- cuda/cccl/headers/include/cuda/std/__cccl/epilogue.h +344 -0
- cuda/cccl/headers/include/cuda/std/__cccl/exceptions.h +91 -0
- cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +74 -0
- cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +160 -0
- cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
- cuda/cccl/headers/include/cuda/std/__cccl/is_non_narrowing_convertible.h +73 -0
- cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/os.h +54 -0
- cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +1286 -0
- cuda/cccl/headers/include/cuda/std/__cccl/prologue.h +281 -0
- cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +253 -0
- cuda/cccl/headers/include/cuda/std/__cccl/rtti.h +72 -0
- cuda/cccl/headers/include/cuda/std/__cccl/sequence_access.h +87 -0
- cuda/cccl/headers/include/cuda/std/__cccl/system_header.h +38 -0
- cuda/cccl/headers/include/cuda/std/__cccl/unreachable.h +31 -0
- cuda/cccl/headers/include/cuda/std/__cccl/version.h +26 -0
- cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +171 -0
- cuda/cccl/headers/include/cuda/std/__charconv/chars_format.h +81 -0
- cuda/cccl/headers/include/cuda/std/__charconv/from_chars.h +154 -0
- cuda/cccl/headers/include/cuda/std/__charconv/from_chars_result.h +56 -0
- cuda/cccl/headers/include/cuda/std/__charconv/to_chars.h +148 -0
- cuda/cccl/headers/include/cuda/std/__charconv/to_chars_result.h +56 -0
- cuda/cccl/headers/include/cuda/std/__charconv_ +31 -0
- cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +54 -0
- cuda/cccl/headers/include/cuda/std/__chrono/day.h +162 -0
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +503 -0
- cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +55 -0
- cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +46 -0
- cuda/cccl/headers/include/cuda/std/__chrono/month.h +187 -0
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +60 -0
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +80 -0
- cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +259 -0
- cuda/cccl/headers/include/cuda/std/__chrono/year.h +186 -0
- cuda/cccl/headers/include/cuda/std/__cmath/abs.h +127 -0
- cuda/cccl/headers/include/cuda/std/__cmath/copysign.h +88 -0
- cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +200 -0
- cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +784 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +118 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fma.h +125 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +231 -0
- cuda/cccl/headers/include/cuda/std/__cmath/gamma.h +205 -0
- cuda/cccl/headers/include/cuda/std/__cmath/hyperbolic_functions.h +286 -0
- cuda/cccl/headers/include/cuda/std/__cmath/hypot.h +221 -0
- cuda/cccl/headers/include/cuda/std/__cmath/inverse_hyperbolic_functions.h +286 -0
- cuda/cccl/headers/include/cuda/std/__cmath/inverse_trigonometric_functions.h +371 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isfinite.h +167 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isinf.h +205 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +186 -0
- cuda/cccl/headers/include/cuda/std/__cmath/isnormal.h +138 -0
- cuda/cccl/headers/include/cuda/std/__cmath/lerp.h +101 -0
- cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +534 -0
- cuda/cccl/headers/include/cuda/std/__cmath/min_max.h +287 -0
- cuda/cccl/headers/include/cuda/std/__cmath/modulo.h +208 -0
- cuda/cccl/headers/include/cuda/std/__cmath/nan.h +54 -0
- cuda/cccl/headers/include/cuda/std/__cmath/remainder.h +206 -0
- cuda/cccl/headers/include/cuda/std/__cmath/roots.h +199 -0
- cuda/cccl/headers/include/cuda/std/__cmath/rounding_functions.h +984 -0
- cuda/cccl/headers/include/cuda/std/__cmath/signbit.h +56 -0
- cuda/cccl/headers/include/cuda/std/__cmath/traits.h +238 -0
- cuda/cccl/headers/include/cuda/std/__cmath/trigonometric_functions.h +328 -0
- cuda/cccl/headers/include/cuda/std/__complex/arg.h +84 -0
- cuda/cccl/headers/include/cuda/std/__complex/complex.h +669 -0
- cuda/cccl/headers/include/cuda/std/__complex/exponential_functions.h +411 -0
- cuda/cccl/headers/include/cuda/std/__complex/hyperbolic_functions.h +117 -0
- cuda/cccl/headers/include/cuda/std/__complex/inverse_hyperbolic_functions.h +216 -0
- cuda/cccl/headers/include/cuda/std/__complex/inverse_trigonometric_functions.h +131 -0
- cuda/cccl/headers/include/cuda/std/__complex/literals.h +86 -0
- cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +303 -0
- cuda/cccl/headers/include/cuda/std/__complex/math.h +159 -0
- cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +323 -0
- cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +322 -0
- cuda/cccl/headers/include/cuda/std/__complex/roots.h +214 -0
- cuda/cccl/headers/include/cuda/std/__complex/trigonometric_functions.h +61 -0
- cuda/cccl/headers/include/cuda/std/__complex/tuple.h +107 -0
- cuda/cccl/headers/include/cuda/std/__complex/vector_support.h +130 -0
- cuda/cccl/headers/include/cuda/std/__concepts/arithmetic.h +56 -0
- cuda/cccl/headers/include/cuda/std/__concepts/assignable.h +64 -0
- cuda/cccl/headers/include/cuda/std/__concepts/boolean_testable.h +63 -0
- cuda/cccl/headers/include/cuda/std/__concepts/class_or_enum.h +45 -0
- cuda/cccl/headers/include/cuda/std/__concepts/common_reference_with.h +69 -0
- cuda/cccl/headers/include/cuda/std/__concepts/common_with.h +82 -0
- cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +341 -0
- cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +174 -0
- cuda/cccl/headers/include/cuda/std/__concepts/convertible_to.h +70 -0
- cuda/cccl/headers/include/cuda/std/__concepts/copyable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__concepts/derived_from.h +56 -0
- cuda/cccl/headers/include/cuda/std/__concepts/destructible.h +76 -0
- cuda/cccl/headers/include/cuda/std/__concepts/different_from.h +38 -0
- cuda/cccl/headers/include/cuda/std/__concepts/equality_comparable.h +100 -0
- cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +80 -0
- cuda/cccl/headers/include/cuda/std/__concepts/movable.h +58 -0
- cuda/cccl/headers/include/cuda/std/__concepts/predicate.h +54 -0
- cuda/cccl/headers/include/cuda/std/__concepts/regular.h +54 -0
- cuda/cccl/headers/include/cuda/std/__concepts/relation.h +77 -0
- cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +39 -0
- cuda/cccl/headers/include/cuda/std/__concepts/semiregular.h +54 -0
- cuda/cccl/headers/include/cuda/std/__concepts/swappable.h +206 -0
- cuda/cccl/headers/include/cuda/std/__concepts/totally_ordered.h +101 -0
- cuda/cccl/headers/include/cuda/std/__cstddef/byte.h +113 -0
- cuda/cccl/headers/include/cuda/std/__cstddef/types.h +52 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/abs.h +57 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/aligned_alloc.h +66 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/div.h +96 -0
- cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +70 -0
- cuda/cccl/headers/include/cuda/std/__cstring/memcpy.h +61 -0
- cuda/cccl/headers/include/cuda/std/__cstring/memset.h +46 -0
- cuda/cccl/headers/include/cuda/std/__cuda/api_wrapper.h +62 -0
- cuda/cccl/headers/include/cuda/std/__exception/cuda_error.h +139 -0
- cuda/cccl/headers/include/cuda/std/__exception/terminate.h +73 -0
- cuda/cccl/headers/include/cuda/std/__execution/env.h +455 -0
- cuda/cccl/headers/include/cuda/std/__execution/policy.h +88 -0
- cuda/cccl/headers/include/cuda/std/__expected/bad_expected_access.h +127 -0
- cuda/cccl/headers/include/cuda/std/__expected/expected.h +1941 -0
- cuda/cccl/headers/include/cuda/std/__expected/expected_base.h +1050 -0
- cuda/cccl/headers/include/cuda/std/__expected/unexpect.h +37 -0
- cuda/cccl/headers/include/cuda/std/__expected/unexpected.h +165 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/arithmetic.h +56 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/cast.h +812 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/cccl_fp.h +125 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/common_type.h +48 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +376 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/conversion_rank_order.h +124 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/cuda_fp_types.h +116 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/decompose.h +69 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/format.h +162 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +40 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +78 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/native_type.h +81 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/overflow_handler.h +139 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/properties.h +229 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/storage.h +248 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/traits.h +172 -0
- cuda/cccl/headers/include/cuda/std/__format/buffer.h +48 -0
- cuda/cccl/headers/include/cuda/std/__format/concepts.h +69 -0
- cuda/cccl/headers/include/cuda/std/__format/format_arg.h +282 -0
- cuda/cccl/headers/include/cuda/std/__format/format_arg_store.h +279 -0
- cuda/cccl/headers/include/cuda/std/__format/format_args.h +122 -0
- cuda/cccl/headers/include/cuda/std/__format/format_context.h +92 -0
- cuda/cccl/headers/include/cuda/std/__format/format_error.h +76 -0
- cuda/cccl/headers/include/cuda/std/__format/format_integral.h +237 -0
- cuda/cccl/headers/include/cuda/std/__format/format_parse_context.h +124 -0
- cuda/cccl/headers/include/cuda/std/__format/format_spec_parser.h +1230 -0
- cuda/cccl/headers/include/cuda/std/__format/formatter.h +59 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/bool.h +101 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/char.h +124 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/fp.h +101 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/int.h +174 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/ptr.h +104 -0
- cuda/cccl/headers/include/cuda/std/__format/formatters/str.h +178 -0
- cuda/cccl/headers/include/cuda/std/__format/output_utils.h +272 -0
- cuda/cccl/headers/include/cuda/std/__format/parse_arg_id.h +138 -0
- cuda/cccl/headers/include/cuda/std/__format_ +45 -0
- cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +63 -0
- cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +65 -0
- cuda/cccl/headers/include/cuda/std/__functional/bind.h +334 -0
- cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +80 -0
- cuda/cccl/headers/include/cuda/std/__functional/bind_front.h +73 -0
- cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +74 -0
- cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +74 -0
- cuda/cccl/headers/include/cuda/std/__functional/compose.h +68 -0
- cuda/cccl/headers/include/cuda/std/__functional/default_searcher.h +75 -0
- cuda/cccl/headers/include/cuda/std/__functional/function.h +1275 -0
- cuda/cccl/headers/include/cuda/std/__functional/hash.h +649 -0
- cuda/cccl/headers/include/cuda/std/__functional/identity.h +57 -0
- cuda/cccl/headers/include/cuda/std/__functional/invoke.h +296 -0
- cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +41 -0
- cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +66 -0
- cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +211 -0
- cuda/cccl/headers/include/cuda/std/__functional/not_fn.h +120 -0
- cuda/cccl/headers/include/cuda/std/__functional/operations.h +534 -0
- cuda/cccl/headers/include/cuda/std/__functional/perfect_forward.h +128 -0
- cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +64 -0
- cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +63 -0
- cuda/cccl/headers/include/cuda/std/__functional/ranges_operations.h +113 -0
- cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +113 -0
- cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +62 -0
- cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +65 -0
- cuda/cccl/headers/include/cuda/std/__functional/unwrap_ref.h +56 -0
- cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +262 -0
- cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +53 -0
- cuda/cccl/headers/include/cuda/std/__fwd/array.h +42 -0
- cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +74 -0
- cuda/cccl/headers/include/cuda/std/__fwd/complex.h +75 -0
- cuda/cccl/headers/include/cuda/std/__fwd/expected.h +46 -0
- cuda/cccl/headers/include/cuda/std/__fwd/format.h +84 -0
- cuda/cccl/headers/include/cuda/std/__fwd/fp.h +37 -0
- cuda/cccl/headers/include/cuda/std/__fwd/get.h +123 -0
- cuda/cccl/headers/include/cuda/std/__fwd/hash.h +34 -0
- cuda/cccl/headers/include/cuda/std/__fwd/iterator.h +43 -0
- cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +122 -0
- cuda/cccl/headers/include/cuda/std/__fwd/memory_resource.h +37 -0
- cuda/cccl/headers/include/cuda/std/__fwd/optional.h +39 -0
- cuda/cccl/headers/include/cuda/std/__fwd/pair.h +47 -0
- cuda/cccl/headers/include/cuda/std/__fwd/reference_wrapper.h +34 -0
- cuda/cccl/headers/include/cuda/std/__fwd/span.h +45 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string.h +112 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +91 -0
- cuda/cccl/headers/include/cuda/std/__fwd/subrange.h +55 -0
- cuda/cccl/headers/include/cuda/std/__fwd/tuple.h +34 -0
- cuda/cccl/headers/include/cuda/std/__fwd/unexpected.h +40 -0
- cuda/cccl/headers/include/cuda/std/__internal/cpp_dialect.h +44 -0
- cuda/cccl/headers/include/cuda/std/__internal/features.h +72 -0
- cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +143 -0
- cuda/cccl/headers/include/cuda/std/__iterator/access.h +128 -0
- cuda/cccl/headers/include/cuda/std/__iterator/advance.h +228 -0
- cuda/cccl/headers/include/cuda/std/__iterator/back_insert_iterator.h +163 -0
- cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +253 -0
- cuda/cccl/headers/include/cuda/std/__iterator/concepts.h +645 -0
- cuda/cccl/headers/include/cuda/std/__iterator/counted_iterator.h +464 -0
- cuda/cccl/headers/include/cuda/std/__iterator/data.h +61 -0
- cuda/cccl/headers/include/cuda/std/__iterator/default_sentinel.h +36 -0
- cuda/cccl/headers/include/cuda/std/__iterator/distance.h +126 -0
- cuda/cccl/headers/include/cuda/std/__iterator/empty.h +53 -0
- cuda/cccl/headers/include/cuda/std/__iterator/erase_if_container.h +53 -0
- cuda/cccl/headers/include/cuda/std/__iterator/front_insert_iterator.h +99 -0
- cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +143 -0
- cuda/cccl/headers/include/cuda/std/__iterator/indirectly_comparable.h +55 -0
- cuda/cccl/headers/include/cuda/std/__iterator/insert_iterator.h +107 -0
- cuda/cccl/headers/include/cuda/std/__iterator/istream_iterator.h +146 -0
- cuda/cccl/headers/include/cuda/std/__iterator/istreambuf_iterator.h +161 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iter_move.h +161 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iter_swap.h +163 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +44 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +847 -0
- cuda/cccl/headers/include/cuda/std/__iterator/mergeable.h +72 -0
- cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +432 -0
- cuda/cccl/headers/include/cuda/std/__iterator/move_sentinel.h +73 -0
- cuda/cccl/headers/include/cuda/std/__iterator/next.h +101 -0
- cuda/cccl/headers/include/cuda/std/__iterator/ostream_iterator.h +95 -0
- cuda/cccl/headers/include/cuda/std/__iterator/ostreambuf_iterator.h +100 -0
- cuda/cccl/headers/include/cuda/std/__iterator/permutable.h +54 -0
- cuda/cccl/headers/include/cuda/std/__iterator/prev.h +90 -0
- cuda/cccl/headers/include/cuda/std/__iterator/projected.h +61 -0
- cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +156 -0
- cuda/cccl/headers/include/cuda/std/__iterator/reverse_access.h +142 -0
- cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +371 -0
- cuda/cccl/headers/include/cuda/std/__iterator/size.h +69 -0
- cuda/cccl/headers/include/cuda/std/__iterator/sortable.h +55 -0
- cuda/cccl/headers/include/cuda/std/__iterator/unreachable_sentinel.h +84 -0
- cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +245 -0
- cuda/cccl/headers/include/cuda/std/__latch/latch.h +88 -0
- cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +617 -0
- cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +753 -0
- cuda/cccl/headers/include/cuda/std/__linalg/conj_if_needed.h +78 -0
- cuda/cccl/headers/include/cuda/std/__linalg/conjugate_transposed.h +54 -0
- cuda/cccl/headers/include/cuda/std/__linalg/conjugated.h +139 -0
- cuda/cccl/headers/include/cuda/std/__linalg/scaled.h +132 -0
- cuda/cccl/headers/include/cuda/std/__linalg/transposed.h +321 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/aligned_accessor.h +97 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +139 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/default_accessor.h +73 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/empty_base.h +352 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +759 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +314 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +307 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +605 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +512 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_extents.h +193 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_helper.h +189 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/submdspan_mapping.h +344 -0
- cuda/cccl/headers/include/cuda/std/__memory/addressof.h +67 -0
- cuda/cccl/headers/include/cuda/std/__memory/align.h +67 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocate_at_least.h +81 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocation_guard.h +100 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator.h +320 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +84 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator_destructor.h +59 -0
- cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +525 -0
- cuda/cccl/headers/include/cuda/std/__memory/assume_aligned.h +60 -0
- cuda/cccl/headers/include/cuda/std/__memory/builtin_new_allocator.h +87 -0
- cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +225 -0
- cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +246 -0
- cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +91 -0
- cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +46 -0
- cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +246 -0
- cuda/cccl/headers/include/cuda/std/__memory/runtime_assume_aligned.h +62 -0
- cuda/cccl/headers/include/cuda/std/__memory/temporary_buffer.h +92 -0
- cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +678 -0
- cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +765 -0
- cuda/cccl/headers/include/cuda/std/__memory/uses_allocator.h +54 -0
- cuda/cccl/headers/include/cuda/std/__memory/voidify.h +41 -0
- cuda/cccl/headers/include/cuda/std/__memory_ +34 -0
- cuda/cccl/headers/include/cuda/std/__new/allocate.h +126 -0
- cuda/cccl/headers/include/cuda/std/__new/bad_alloc.h +57 -0
- cuda/cccl/headers/include/cuda/std/__new/launder.h +53 -0
- cuda/cccl/headers/include/cuda/std/__new_ +29 -0
- cuda/cccl/headers/include/cuda/std/__numeric/accumulate.h +56 -0
- cuda/cccl/headers/include/cuda/std/__numeric/adjacent_difference.h +72 -0
- cuda/cccl/headers/include/cuda/std/__numeric/exclusive_scan.h +66 -0
- cuda/cccl/headers/include/cuda/std/__numeric/gcd_lcm.h +78 -0
- cuda/cccl/headers/include/cuda/std/__numeric/inclusive_scan.h +73 -0
- cuda/cccl/headers/include/cuda/std/__numeric/inner_product.h +62 -0
- cuda/cccl/headers/include/cuda/std/__numeric/iota.h +42 -0
- cuda/cccl/headers/include/cuda/std/__numeric/midpoint.h +97 -0
- cuda/cccl/headers/include/cuda/std/__numeric/partial_sum.h +69 -0
- cuda/cccl/headers/include/cuda/std/__numeric/reduce.h +60 -0
- cuda/cccl/headers/include/cuda/std/__numeric/transform_exclusive_scan.h +51 -0
- cuda/cccl/headers/include/cuda/std/__numeric/transform_inclusive_scan.h +65 -0
- cuda/cccl/headers/include/cuda/std/__numeric/transform_reduce.h +72 -0
- cuda/cccl/headers/include/cuda/std/__optional/bad_optional_access.h +74 -0
- cuda/cccl/headers/include/cuda/std/__optional/hash.h +53 -0
- cuda/cccl/headers/include/cuda/std/__optional/make_optional.h +61 -0
- cuda/cccl/headers/include/cuda/std/__optional/nullopt.h +43 -0
- cuda/cccl/headers/include/cuda/std/__optional/optional.h +859 -0
- cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +433 -0
- cuda/cccl/headers/include/cuda/std/__optional/optional_ref.h +324 -0
- cuda/cccl/headers/include/cuda/std/__random/generate_canonical.h +56 -0
- cuda/cccl/headers/include/cuda/std/__random/is_seed_sequence.h +39 -0
- cuda/cccl/headers/include/cuda/std/__random/is_valid.h +106 -0
- cuda/cccl/headers/include/cuda/std/__random/linear_congruential_engine.h +398 -0
- cuda/cccl/headers/include/cuda/std/__random/uniform_int_distribution.h +335 -0
- cuda/cccl/headers/include/cuda/std/__random/uniform_real_distribution.h +183 -0
- cuda/cccl/headers/include/cuda/std/__random_ +29 -0
- cuda/cccl/headers/include/cuda/std/__ranges/access.h +303 -0
- cuda/cccl/headers/include/cuda/std/__ranges/all.h +98 -0
- cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
- cuda/cccl/headers/include/cuda/std/__ranges/concepts.h +302 -0
- cuda/cccl/headers/include/cuda/std/__ranges/counted.h +90 -0
- cuda/cccl/headers/include/cuda/std/__ranges/dangling.h +54 -0
- cuda/cccl/headers/include/cuda/std/__ranges/data.h +136 -0
- cuda/cccl/headers/include/cuda/std/__ranges/empty.h +109 -0
- cuda/cccl/headers/include/cuda/std/__ranges/empty_view.h +77 -0
- cuda/cccl/headers/include/cuda/std/__ranges/enable_borrowed_range.h +41 -0
- cuda/cccl/headers/include/cuda/std/__ranges/enable_view.h +78 -0
- cuda/cccl/headers/include/cuda/std/__ranges/from_range.h +36 -0
- cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +266 -0
- cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +410 -0
- cuda/cccl/headers/include/cuda/std/__ranges/owning_view.h +162 -0
- cuda/cccl/headers/include/cuda/std/__ranges/range_adaptor.h +110 -0
- cuda/cccl/headers/include/cuda/std/__ranges/rbegin.h +175 -0
- cuda/cccl/headers/include/cuda/std/__ranges/ref_view.h +121 -0
- cuda/cccl/headers/include/cuda/std/__ranges/rend.h +182 -0
- cuda/cccl/headers/include/cuda/std/__ranges/repeat_view.h +345 -0
- cuda/cccl/headers/include/cuda/std/__ranges/single_view.h +155 -0
- cuda/cccl/headers/include/cuda/std/__ranges/size.h +201 -0
- cuda/cccl/headers/include/cuda/std/__ranges/subrange.h +513 -0
- cuda/cccl/headers/include/cuda/std/__ranges/take_view.h +476 -0
- cuda/cccl/headers/include/cuda/std/__ranges/take_while_view.h +259 -0
- cuda/cccl/headers/include/cuda/std/__ranges/transform_view.h +522 -0
- cuda/cccl/headers/include/cuda/std/__ranges/unwrap_end.h +53 -0
- cuda/cccl/headers/include/cuda/std/__ranges/view_interface.h +183 -0
- cuda/cccl/headers/include/cuda/std/__ranges/views.h +38 -0
- cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +234 -0
- cuda/cccl/headers/include/cuda/std/__semaphore/counting_semaphore.h +51 -0
- cuda/cccl/headers/include/cuda/std/__string/char_traits.h +191 -0
- cuda/cccl/headers/include/cuda/std/__string/constexpr_c_functions.h +581 -0
- cuda/cccl/headers/include/cuda/std/__string/helper_functions.h +296 -0
- cuda/cccl/headers/include/cuda/std/__string/string_view.h +244 -0
- cuda/cccl/headers/include/cuda/std/__string_ +29 -0
- cuda/cccl/headers/include/cuda/std/__system_error/errc.h +51 -0
- cuda/cccl/headers/include/cuda/std/__system_error_ +26 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +106 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +47 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +41 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +143 -0
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +87 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/ignore.h +51 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +120 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +260 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/structured_bindings.h +212 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_element.h +70 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_indices.h +44 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +84 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +68 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +79 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_types.h +35 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +290 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_const.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_cv.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_lvalue_reference.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_pointer.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_rvalue_reference.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/add_volatile.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/aligned_storage.h +149 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/aligned_union.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/alignment_of.h +41 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/always_false.h +35 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +68 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/common_reference.h +262 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/common_type.h +173 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/conditional.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/conjunction.h +67 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/copy_cv.h +50 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/copy_cvref.h +148 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +83 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/dependent_type.h +35 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/disjunction.h +77 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/enable_if.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/extent.h +68 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/fold.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +46 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_allocator.h +46 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_arithmetic.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_array.h +62 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +78 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_bounded_array.h +44 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_callable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_char_like_type.h +38 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +58 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_const.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_constant_evaluated.h +51 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +174 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +211 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_assignable.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_copy_constructible.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_core_convertible.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_corresponding_member.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_default_constructible.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_destructible.h +115 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_execution_policy.h +81 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_arithmetic.h +38 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +79 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_floating_point.h +53 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +61 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_fundamental.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_implicitly_default_constructible.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_integer.h +45 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_integral.h +123 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_layout_compatible.h +45 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +79 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_member_object_pointer.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_member_pointer.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_move_assignable.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_move_constructible.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +70 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +84 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_convertible.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_assignable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_copy_constructible.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_default_constructible.h +54 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_destructible.h +82 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_assignable.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_move_constructible.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_null_pointer.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_object.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_one_of.h +37 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_base_of.h +84 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pointer_interconvertible_with_class.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +121 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_reference.h +95 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_reference_wrapper.h +50 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_referenceable.h +55 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +88 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_scalar.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_scoped_enum.h +49 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_signed.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_signed_integer.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +202 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +46 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +45 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +58 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +45 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +44 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_unbounded_array.h +43 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +42 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned.h +66 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_unsigned_integer.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_valid_expansion.h +41 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_void.h +55 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_volatile.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/lazy.h +35 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_const_lvalue_ref.h +36 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_nbit_int.h +107 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +140 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +151 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/maybe_const.h +36 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/nat.h +39 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/negation.h +44 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +122 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +163 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/rank.h +60 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/reference_constructs_from_temporary.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/reference_converts_from_temporary.h +56 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_all_extents.h +66 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_const.h +59 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_const_ref.h +37 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_cv.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_cvref.h +57 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_extent.h +65 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_pointer.h +73 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_reference.h +72 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/remove_volatile.h +58 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +47 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/type_identity.h +40 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/type_list.h +1067 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/type_set.h +131 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +52 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/void_t.h +34 -0
- cuda/cccl/headers/include/cuda/std/__utility/as_const.h +52 -0
- cuda/cccl/headers/include/cuda/std/__utility/auto_cast.h +34 -0
- cuda/cccl/headers/include/cuda/std/__utility/cmp.h +116 -0
- cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +101 -0
- cuda/cccl/headers/include/cuda/std/__utility/declval.h +76 -0
- cuda/cccl/headers/include/cuda/std/__utility/exception_guard.h +161 -0
- cuda/cccl/headers/include/cuda/std/__utility/exchange.h +46 -0
- cuda/cccl/headers/include/cuda/std/__utility/forward.h +59 -0
- cuda/cccl/headers/include/cuda/std/__utility/forward_like.h +55 -0
- cuda/cccl/headers/include/cuda/std/__utility/in_place.h +86 -0
- cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +251 -0
- cuda/cccl/headers/include/cuda/std/__utility/monostate.h +99 -0
- cuda/cccl/headers/include/cuda/std/__utility/move.h +74 -0
- cuda/cccl/headers/include/cuda/std/__utility/pair.h +791 -0
- cuda/cccl/headers/include/cuda/std/__utility/piecewise_construct.h +37 -0
- cuda/cccl/headers/include/cuda/std/__utility/pod_tuple.h +527 -0
- cuda/cccl/headers/include/cuda/std/__utility/priority_tag.h +40 -0
- cuda/cccl/headers/include/cuda/std/__utility/rel_ops.h +63 -0
- cuda/cccl/headers/include/cuda/std/__utility/swap.h +64 -0
- cuda/cccl/headers/include/cuda/std/__utility/to_underlying.h +40 -0
- cuda/cccl/headers/include/cuda/std/__utility/typeid.h +421 -0
- cuda/cccl/headers/include/cuda/std/__utility/undefined.h +34 -0
- cuda/cccl/headers/include/cuda/std/__utility/unreachable.h +37 -0
- cuda/cccl/headers/include/cuda/std/array +518 -0
- cuda/cccl/headers/include/cuda/std/atomic +810 -0
- cuda/cccl/headers/include/cuda/std/barrier +42 -0
- cuda/cccl/headers/include/cuda/std/bit +35 -0
- cuda/cccl/headers/include/cuda/std/bitset +994 -0
- cuda/cccl/headers/include/cuda/std/cassert +28 -0
- cuda/cccl/headers/include/cuda/std/ccomplex +15 -0
- cuda/cccl/headers/include/cuda/std/cfloat +59 -0
- cuda/cccl/headers/include/cuda/std/chrono +26 -0
- cuda/cccl/headers/include/cuda/std/climits +61 -0
- cuda/cccl/headers/include/cuda/std/cmath +87 -0
- cuda/cccl/headers/include/cuda/std/complex +50 -0
- cuda/cccl/headers/include/cuda/std/concepts +48 -0
- cuda/cccl/headers/include/cuda/std/cstddef +28 -0
- cuda/cccl/headers/include/cuda/std/cstdint +178 -0
- cuda/cccl/headers/include/cuda/std/cstdlib +30 -0
- cuda/cccl/headers/include/cuda/std/cstring +110 -0
- cuda/cccl/headers/include/cuda/std/ctime +154 -0
- cuda/cccl/headers/include/cuda/std/detail/__config +45 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +207 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1721 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +2509 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/iosfwd +128 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/stdexcept +120 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +1365 -0
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +2144 -0
- cuda/cccl/headers/include/cuda/std/execution +29 -0
- cuda/cccl/headers/include/cuda/std/expected +30 -0
- cuda/cccl/headers/include/cuda/std/functional +56 -0
- cuda/cccl/headers/include/cuda/std/initializer_list +44 -0
- cuda/cccl/headers/include/cuda/std/inplace_vector +2170 -0
- cuda/cccl/headers/include/cuda/std/iterator +70 -0
- cuda/cccl/headers/include/cuda/std/latch +34 -0
- cuda/cccl/headers/include/cuda/std/limits +28 -0
- cuda/cccl/headers/include/cuda/std/linalg +30 -0
- cuda/cccl/headers/include/cuda/std/mdspan +38 -0
- cuda/cccl/headers/include/cuda/std/memory +39 -0
- cuda/cccl/headers/include/cuda/std/numbers +346 -0
- cuda/cccl/headers/include/cuda/std/numeric +41 -0
- cuda/cccl/headers/include/cuda/std/optional +31 -0
- cuda/cccl/headers/include/cuda/std/ranges +69 -0
- cuda/cccl/headers/include/cuda/std/ratio +416 -0
- cuda/cccl/headers/include/cuda/std/semaphore +31 -0
- cuda/cccl/headers/include/cuda/std/source_location +83 -0
- cuda/cccl/headers/include/cuda/std/span +628 -0
- cuda/cccl/headers/include/cuda/std/string_view +925 -0
- cuda/cccl/headers/include/cuda/std/tuple +26 -0
- cuda/cccl/headers/include/cuda/std/type_traits +177 -0
- cuda/cccl/headers/include/cuda/std/utility +70 -0
- cuda/cccl/headers/include/cuda/std/variant +25 -0
- cuda/cccl/headers/include/cuda/std/version +240 -0
- cuda/cccl/headers/include/cuda/stream +31 -0
- cuda/cccl/headers/include/cuda/stream_ref +59 -0
- cuda/cccl/headers/include/cuda/type_traits +27 -0
- cuda/cccl/headers/include/cuda/utility +28 -0
- cuda/cccl/headers/include/cuda/version +16 -0
- cuda/cccl/headers/include/cuda/warp +28 -0
- cuda/cccl/headers/include/cuda/work_stealing +26 -0
- cuda/cccl/headers/include/nv/detail/__preprocessor +169 -0
- cuda/cccl/headers/include/nv/detail/__target_macros +718 -0
- cuda/cccl/headers/include/nv/target +240 -0
- cuda/cccl/headers/include/thrust/addressof.h +22 -0
- cuda/cccl/headers/include/thrust/adjacent_difference.h +254 -0
- cuda/cccl/headers/include/thrust/advance.h +57 -0
- cuda/cccl/headers/include/thrust/allocate_unique.h +299 -0
- cuda/cccl/headers/include/thrust/binary_search.h +1910 -0
- cuda/cccl/headers/include/thrust/complex.h +858 -0
- cuda/cccl/headers/include/thrust/copy.h +506 -0
- cuda/cccl/headers/include/thrust/count.h +245 -0
- cuda/cccl/headers/include/thrust/detail/adjacent_difference.inl +95 -0
- cuda/cccl/headers/include/thrust/detail/alignment.h +81 -0
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +626 -0
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +192 -0
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +96 -0
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +81 -0
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +78 -0
- cuda/cccl/headers/include/thrust/detail/allocator/no_throw_allocator.h +76 -0
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +115 -0
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +116 -0
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +77 -0
- cuda/cccl/headers/include/thrust/detail/allocator_aware_execution_policy.h +99 -0
- cuda/cccl/headers/include/thrust/detail/binary_search.inl +525 -0
- cuda/cccl/headers/include/thrust/detail/caching_allocator.h +47 -0
- cuda/cccl/headers/include/thrust/detail/complex/arithmetic.h +255 -0
- cuda/cccl/headers/include/thrust/detail/complex/c99math.h +64 -0
- cuda/cccl/headers/include/thrust/detail/complex/catrig.h +875 -0
- cuda/cccl/headers/include/thrust/detail/complex/catrigf.h +589 -0
- cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +233 -0
- cuda/cccl/headers/include/thrust/detail/complex/ccoshf.h +161 -0
- cuda/cccl/headers/include/thrust/detail/complex/cexp.h +195 -0
- cuda/cccl/headers/include/thrust/detail/complex/cexpf.h +173 -0
- cuda/cccl/headers/include/thrust/detail/complex/clog.h +223 -0
- cuda/cccl/headers/include/thrust/detail/complex/clogf.h +210 -0
- cuda/cccl/headers/include/thrust/detail/complex/complex.inl +263 -0
- cuda/cccl/headers/include/thrust/detail/complex/cpow.h +50 -0
- cuda/cccl/headers/include/thrust/detail/complex/cproj.h +81 -0
- cuda/cccl/headers/include/thrust/detail/complex/csinh.h +228 -0
- cuda/cccl/headers/include/thrust/detail/complex/csinhf.h +168 -0
- cuda/cccl/headers/include/thrust/detail/complex/csqrt.h +178 -0
- cuda/cccl/headers/include/thrust/detail/complex/csqrtf.h +174 -0
- cuda/cccl/headers/include/thrust/detail/complex/ctanh.h +208 -0
- cuda/cccl/headers/include/thrust/detail/complex/ctanhf.h +133 -0
- cuda/cccl/headers/include/thrust/detail/complex/math_private.h +138 -0
- cuda/cccl/headers/include/thrust/detail/complex/stream.h +73 -0
- cuda/cccl/headers/include/thrust/detail/config/compiler.h +38 -0
- cuda/cccl/headers/include/thrust/detail/config/config.h +43 -0
- cuda/cccl/headers/include/thrust/detail/config/cpp_dialect.h +78 -0
- cuda/cccl/headers/include/thrust/detail/config/device_system.h +55 -0
- cuda/cccl/headers/include/thrust/detail/config/host_system.h +48 -0
- cuda/cccl/headers/include/thrust/detail/config/memory_resource.h +41 -0
- cuda/cccl/headers/include/thrust/detail/config/namespace.h +162 -0
- cuda/cccl/headers/include/thrust/detail/config/simple_defines.h +48 -0
- cuda/cccl/headers/include/thrust/detail/config.h +36 -0
- cuda/cccl/headers/include/thrust/detail/contiguous_storage.h +228 -0
- cuda/cccl/headers/include/thrust/detail/contiguous_storage.inl +273 -0
- cuda/cccl/headers/include/thrust/detail/copy.h +72 -0
- cuda/cccl/headers/include/thrust/detail/copy.inl +129 -0
- cuda/cccl/headers/include/thrust/detail/copy_if.h +62 -0
- cuda/cccl/headers/include/thrust/detail/copy_if.inl +102 -0
- cuda/cccl/headers/include/thrust/detail/count.h +55 -0
- cuda/cccl/headers/include/thrust/detail/count.inl +89 -0
- cuda/cccl/headers/include/thrust/detail/device_ptr.inl +48 -0
- cuda/cccl/headers/include/thrust/detail/equal.inl +93 -0
- cuda/cccl/headers/include/thrust/detail/event_error.h +160 -0
- cuda/cccl/headers/include/thrust/detail/execute_with_allocator.h +81 -0
- cuda/cccl/headers/include/thrust/detail/execute_with_allocator_fwd.h +61 -0
- cuda/cccl/headers/include/thrust/detail/execution_policy.h +120 -0
- cuda/cccl/headers/include/thrust/detail/extrema.inl +184 -0
- cuda/cccl/headers/include/thrust/detail/fill.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/find.inl +113 -0
- cuda/cccl/headers/include/thrust/detail/for_each.inl +84 -0
- cuda/cccl/headers/include/thrust/detail/function.h +49 -0
- cuda/cccl/headers/include/thrust/detail/functional/actor.h +214 -0
- cuda/cccl/headers/include/thrust/detail/functional/operators.h +386 -0
- cuda/cccl/headers/include/thrust/detail/gather.inl +173 -0
- cuda/cccl/headers/include/thrust/detail/generate.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/get_iterator_value.h +62 -0
- cuda/cccl/headers/include/thrust/detail/inner_product.inl +118 -0
- cuda/cccl/headers/include/thrust/detail/internal_functional.h +328 -0
- cuda/cccl/headers/include/thrust/detail/logical.inl +113 -0
- cuda/cccl/headers/include/thrust/detail/malloc_and_free.h +77 -0
- cuda/cccl/headers/include/thrust/detail/malloc_and_free_fwd.h +45 -0
- cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +209 -0
- cuda/cccl/headers/include/thrust/detail/merge.inl +276 -0
- cuda/cccl/headers/include/thrust/detail/mismatch.inl +94 -0
- cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +124 -0
- cuda/cccl/headers/include/thrust/detail/partition.inl +378 -0
- cuda/cccl/headers/include/thrust/detail/pointer.h +309 -0
- cuda/cccl/headers/include/thrust/detail/preprocessor.h +652 -0
- cuda/cccl/headers/include/thrust/detail/random_bijection.h +177 -0
- cuda/cccl/headers/include/thrust/detail/range/head_flags.h +116 -0
- cuda/cccl/headers/include/thrust/detail/range/tail_flags.h +130 -0
- cuda/cccl/headers/include/thrust/detail/raw_pointer_cast.h +52 -0
- cuda/cccl/headers/include/thrust/detail/raw_reference_cast.h +192 -0
- cuda/cccl/headers/include/thrust/detail/reduce.inl +377 -0
- cuda/cccl/headers/include/thrust/detail/reference.h +494 -0
- cuda/cccl/headers/include/thrust/detail/reference_forward_declaration.h +35 -0
- cuda/cccl/headers/include/thrust/detail/remove.inl +213 -0
- cuda/cccl/headers/include/thrust/detail/replace.inl +231 -0
- cuda/cccl/headers/include/thrust/detail/reverse.inl +88 -0
- cuda/cccl/headers/include/thrust/detail/scan.inl +518 -0
- cuda/cccl/headers/include/thrust/detail/scatter.inl +157 -0
- cuda/cccl/headers/include/thrust/detail/seq.h +66 -0
- cuda/cccl/headers/include/thrust/detail/sequence.inl +109 -0
- cuda/cccl/headers/include/thrust/detail/set_operations.inl +981 -0
- cuda/cccl/headers/include/thrust/detail/shuffle.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/sort.inl +373 -0
- cuda/cccl/headers/include/thrust/detail/static_assert.h +58 -0
- cuda/cccl/headers/include/thrust/detail/static_map.h +167 -0
- cuda/cccl/headers/include/thrust/detail/swap_ranges.inl +65 -0
- cuda/cccl/headers/include/thrust/detail/tabulate.inl +62 -0
- cuda/cccl/headers/include/thrust/detail/temporary_array.h +153 -0
- cuda/cccl/headers/include/thrust/detail/temporary_array.inl +120 -0
- cuda/cccl/headers/include/thrust/detail/temporary_buffer.h +81 -0
- cuda/cccl/headers/include/thrust/detail/transform_reduce.inl +69 -0
- cuda/cccl/headers/include/thrust/detail/transform_scan.inl +161 -0
- cuda/cccl/headers/include/thrust/detail/trivial_sequence.h +130 -0
- cuda/cccl/headers/include/thrust/detail/tuple_meta_transform.h +61 -0
- cuda/cccl/headers/include/thrust/detail/type_deduction.h +62 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/has_member_function.h +47 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/has_nested_type.h +43 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_call_possible.h +167 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_commutative.h +69 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_metafunction_defined.h +39 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/is_thrust_pointer.h +59 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_output_iterator.h +46 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/minimum_type.h +89 -0
- cuda/cccl/headers/include/thrust/detail/type_traits/pointer_traits.h +332 -0
- cuda/cccl/headers/include/thrust/detail/type_traits.h +136 -0
- cuda/cccl/headers/include/thrust/detail/uninitialized_copy.inl +90 -0
- cuda/cccl/headers/include/thrust/detail/uninitialized_fill.inl +86 -0
- cuda/cccl/headers/include/thrust/detail/unique.inl +373 -0
- cuda/cccl/headers/include/thrust/detail/use_default.h +34 -0
- cuda/cccl/headers/include/thrust/detail/vector_base.h +613 -0
- cuda/cccl/headers/include/thrust/detail/vector_base.inl +1210 -0
- cuda/cccl/headers/include/thrust/device_allocator.h +134 -0
- cuda/cccl/headers/include/thrust/device_delete.h +74 -0
- cuda/cccl/headers/include/thrust/device_free.h +85 -0
- cuda/cccl/headers/include/thrust/device_make_unique.h +56 -0
- cuda/cccl/headers/include/thrust/device_malloc.h +84 -0
- cuda/cccl/headers/include/thrust/device_malloc_allocator.h +190 -0
- cuda/cccl/headers/include/thrust/device_new.h +112 -0
- cuda/cccl/headers/include/thrust/device_new_allocator.h +179 -0
- cuda/cccl/headers/include/thrust/device_ptr.h +196 -0
- cuda/cccl/headers/include/thrust/device_reference.h +983 -0
- cuda/cccl/headers/include/thrust/device_vector.h +576 -0
- cuda/cccl/headers/include/thrust/distance.h +43 -0
- cuda/cccl/headers/include/thrust/equal.h +247 -0
- cuda/cccl/headers/include/thrust/execution_policy.h +251 -0
- cuda/cccl/headers/include/thrust/extrema.h +657 -0
- cuda/cccl/headers/include/thrust/fill.h +200 -0
- cuda/cccl/headers/include/thrust/find.h +382 -0
- cuda/cccl/headers/include/thrust/for_each.h +261 -0
- cuda/cccl/headers/include/thrust/functional.h +395 -0
- cuda/cccl/headers/include/thrust/gather.h +464 -0
- cuda/cccl/headers/include/thrust/generate.h +193 -0
- cuda/cccl/headers/include/thrust/host_vector.h +576 -0
- cuda/cccl/headers/include/thrust/inner_product.h +264 -0
- cuda/cccl/headers/include/thrust/iterator/constant_iterator.h +221 -0
- cuda/cccl/headers/include/thrust/iterator/counting_iterator.h +335 -0
- cuda/cccl/headers/include/thrust/iterator/detail/any_assign.h +48 -0
- cuda/cccl/headers/include/thrust/iterator/detail/any_system_tag.h +43 -0
- cuda/cccl/headers/include/thrust/iterator/detail/device_system_tag.h +38 -0
- cuda/cccl/headers/include/thrust/iterator/detail/host_system_tag.h +38 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_adaptor_base.h +81 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +60 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +65 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h +57 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +182 -0
- cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +58 -0
- cuda/cccl/headers/include/thrust/iterator/detail/normal_iterator.h +69 -0
- cuda/cccl/headers/include/thrust/iterator/detail/retag.h +104 -0
- cuda/cccl/headers/include/thrust/iterator/detail/tagged_iterator.h +81 -0
- cuda/cccl/headers/include/thrust/iterator/detail/tuple_of_iterator_references.h +174 -0
- cuda/cccl/headers/include/thrust/iterator/discard_iterator.h +163 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +251 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +211 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +659 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +334 -0
- cuda/cccl/headers/include/thrust/iterator/iterator_traversal_tags.h +64 -0
- cuda/cccl/headers/include/thrust/iterator/offset_iterator.h +194 -0
- cuda/cccl/headers/include/thrust/iterator/permutation_iterator.h +204 -0
- cuda/cccl/headers/include/thrust/iterator/retag.h +72 -0
- cuda/cccl/headers/include/thrust/iterator/reverse_iterator.h +51 -0
- cuda/cccl/headers/include/thrust/iterator/shuffle_iterator.h +185 -0
- cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +152 -0
- cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +152 -0
- cuda/cccl/headers/include/thrust/iterator/transform_input_output_iterator.h +226 -0
- cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +351 -0
- cuda/cccl/headers/include/thrust/iterator/transform_output_iterator.h +190 -0
- cuda/cccl/headers/include/thrust/iterator/zip_iterator.h +359 -0
- cuda/cccl/headers/include/thrust/logical.h +290 -0
- cuda/cccl/headers/include/thrust/memory.h +299 -0
- cuda/cccl/headers/include/thrust/merge.h +725 -0
- cuda/cccl/headers/include/thrust/mismatch.h +261 -0
- cuda/cccl/headers/include/thrust/mr/allocator.h +229 -0
- cuda/cccl/headers/include/thrust/mr/device_memory_resource.h +41 -0
- cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +528 -0
- cuda/cccl/headers/include/thrust/mr/disjoint_sync_pool.h +118 -0
- cuda/cccl/headers/include/thrust/mr/disjoint_tls_pool.h +67 -0
- cuda/cccl/headers/include/thrust/mr/fancy_pointer_resource.h +67 -0
- cuda/cccl/headers/include/thrust/mr/host_memory_resource.h +38 -0
- cuda/cccl/headers/include/thrust/mr/memory_resource.h +217 -0
- cuda/cccl/headers/include/thrust/mr/new.h +100 -0
- cuda/cccl/headers/include/thrust/mr/polymorphic_adaptor.h +63 -0
- cuda/cccl/headers/include/thrust/mr/pool.h +528 -0
- cuda/cccl/headers/include/thrust/mr/pool_options.h +174 -0
- cuda/cccl/headers/include/thrust/mr/sync_pool.h +114 -0
- cuda/cccl/headers/include/thrust/mr/tls_pool.h +64 -0
- cuda/cccl/headers/include/thrust/mr/universal_memory_resource.h +29 -0
- cuda/cccl/headers/include/thrust/mr/validator.h +56 -0
- cuda/cccl/headers/include/thrust/pair.h +99 -0
- cuda/cccl/headers/include/thrust/partition.h +1391 -0
- cuda/cccl/headers/include/thrust/per_device_resource.h +98 -0
- cuda/cccl/headers/include/thrust/random/detail/discard_block_engine.inl +184 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine.inl +155 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_congruential_engine_discard.h +104 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine.inl +151 -0
- cuda/cccl/headers/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h +53 -0
- cuda/cccl/headers/include/thrust/random/detail/mod.h +101 -0
- cuda/cccl/headers/include/thrust/random/detail/normal_distribution.inl +187 -0
- cuda/cccl/headers/include/thrust/random/detail/normal_distribution_base.h +160 -0
- cuda/cccl/headers/include/thrust/random/detail/random_core_access.h +63 -0
- cuda/cccl/headers/include/thrust/random/detail/subtract_with_carry_engine.inl +201 -0
- cuda/cccl/headers/include/thrust/random/detail/uniform_int_distribution.inl +198 -0
- cuda/cccl/headers/include/thrust/random/detail/uniform_real_distribution.inl +200 -0
- cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine.inl +183 -0
- cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +187 -0
- cuda/cccl/headers/include/thrust/random/discard_block_engine.h +240 -0
- cuda/cccl/headers/include/thrust/random/linear_congruential_engine.h +289 -0
- cuda/cccl/headers/include/thrust/random/linear_feedback_shift_engine.h +217 -0
- cuda/cccl/headers/include/thrust/random/normal_distribution.h +257 -0
- cuda/cccl/headers/include/thrust/random/subtract_with_carry_engine.h +247 -0
- cuda/cccl/headers/include/thrust/random/uniform_int_distribution.h +261 -0
- cuda/cccl/headers/include/thrust/random/uniform_real_distribution.h +258 -0
- cuda/cccl/headers/include/thrust/random/xor_combine_engine.h +255 -0
- cuda/cccl/headers/include/thrust/random.h +120 -0
- cuda/cccl/headers/include/thrust/reduce.h +1113 -0
- cuda/cccl/headers/include/thrust/remove.h +768 -0
- cuda/cccl/headers/include/thrust/replace.h +826 -0
- cuda/cccl/headers/include/thrust/reverse.h +215 -0
- cuda/cccl/headers/include/thrust/scan.h +1671 -0
- cuda/cccl/headers/include/thrust/scatter.h +446 -0
- cuda/cccl/headers/include/thrust/sequence.h +277 -0
- cuda/cccl/headers/include/thrust/set_operations.h +3026 -0
- cuda/cccl/headers/include/thrust/shuffle.h +182 -0
- cuda/cccl/headers/include/thrust/sort.h +1320 -0
- cuda/cccl/headers/include/thrust/swap.h +147 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/adjacent_difference.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/assign_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/binary_search.h +32 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/copy.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/copy_if.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/count.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/equal.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +109 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/extrema.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/find.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/for_each.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/gather.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/generate.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/get_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/inner_product.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/iter_swap.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/logical.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/malloc_and_free.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/memory.inl +60 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/merge.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/mismatch.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/partition.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/reduce.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/reduce_by_key.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/remove.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/replace.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/reverse.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/scan.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/scan_by_key.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/scatter.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/sequence.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/set_operations.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/sort.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/swap_ranges.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/tabulate.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/transform.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/transform_reduce.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/transform_scan.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_copy.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/uninitialized_fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/unique.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/detail/unique_by_key.h +30 -0
- cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +63 -0
- cuda/cccl/headers/include/thrust/system/cpp/memory.h +106 -0
- cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +72 -0
- cuda/cccl/headers/include/thrust/system/cpp/pointer.h +120 -0
- cuda/cccl/headers/include/thrust/system/cpp/vector.h +96 -0
- cuda/cccl/headers/include/thrust/system/cuda/config.h +126 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +219 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/assign_value.h +124 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/binary_search.h +29 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/cdp_dispatch.h +72 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +273 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +255 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/agent_launcher.h +289 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +191 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +593 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/count.h +75 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/cross_system.h +243 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/dispatch.h +233 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/equal.h +64 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/error.inl +96 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +264 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/extrema.h +476 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +100 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +170 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +83 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/gather.h +91 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +60 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/get_value.h +65 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/inner_product.h +75 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +80 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/logical.h +29 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/make_unsigned_special.h +61 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/malloc_and_free.h +121 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/memory.inl +57 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/merge.h +228 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +223 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/parallel_for.h +81 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +405 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/per_device_resource.h +72 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +785 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +1001 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/remove.h +107 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +122 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/reverse.h +87 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/scan.h +341 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +414 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +91 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/sequence.h +29 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +1734 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +469 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/swap_ranges.h +98 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/tabulate.h +61 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +132 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/terminate.h +53 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +429 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +143 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform_scan.h +119 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +117 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +105 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +289 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +310 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +253 -0
- cuda/cccl/headers/include/thrust/system/cuda/error.h +168 -0
- cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +15 -0
- cuda/cccl/headers/include/thrust/system/cuda/memory.h +122 -0
- cuda/cccl/headers/include/thrust/system/cuda/memory_resource.h +122 -0
- cuda/cccl/headers/include/thrust/system/cuda/pointer.h +160 -0
- cuda/cccl/headers/include/thrust/system/cuda/vector.h +108 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/adjacent_difference.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/assign_value.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/binary_search.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/copy.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/copy_if.h +52 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/count.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/equal.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/extrema.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/fill.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/find.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/for_each.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/gather.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/generate.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/get_value.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/inner_product.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/iter_swap.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/logical.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/malloc_and_free.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/merge.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/mismatch.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/partition.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/per_device_resource.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/reduce.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/reduce_by_key.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/remove.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/replace.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/reverse.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/scan.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/scan_by_key.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/scatter.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/sequence.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/set_operations.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/sort.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/swap_ranges.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/tabulate.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/temporary_buffer.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/transform.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/transform_reduce.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/transform_scan.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_copy.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/uninitialized_fill.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/unique.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/adl/unique_by_key.h +51 -0
- cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +61 -0
- cuda/cccl/headers/include/thrust/system/detail/errno.h +120 -0
- cuda/cccl/headers/include/thrust/system/detail/error_category.inl +302 -0
- cuda/cccl/headers/include/thrust/system/detail/error_code.inl +173 -0
- cuda/cccl/headers/include/thrust/system/detail/error_condition.inl +121 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +53 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +79 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +161 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +384 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +45 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +64 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +58 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +146 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/count.h +48 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +84 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +49 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +60 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +66 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +252 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +54 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/find.h +49 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +137 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +58 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +73 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +96 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +45 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/generate.inl +63 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +60 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +72 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +59 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +64 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +86 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +99 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +148 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +49 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +68 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +129 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +207 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +43 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +71 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +100 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +83 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +186 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +86 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +121 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +95 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +175 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +48 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +67 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +63 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +126 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +72 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +85 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +126 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +232 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +73 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +85 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +104 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/sequence.h +70 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +282 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +476 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +54 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +125 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +113 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +175 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +44 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +76 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +41 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +54 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +47 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +54 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +82 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform.h +395 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +50 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +56 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +80 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +113 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +45 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +166 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +45 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +115 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +71 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +113 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +81 -0
- cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +126 -0
- cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +117 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +70 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +42 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +136 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +49 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +119 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +49 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +71 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/count.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/equal.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +52 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +110 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +62 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +74 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/gather.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +123 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/generate.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +43 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/inner_product.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +141 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +45 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/logical.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +50 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +75 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +145 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/mismatch.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +301 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +64 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +98 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +179 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/replace.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/reverse.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +154 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +145 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/scatter.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/sequence.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +206 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +59 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/sort.inl +116 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +55 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +356 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +48 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +124 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +48 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +586 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/swap_ranges.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/tabulate.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/transform.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/transform_reduce.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/transform_scan.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +58 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_copy.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/uninitialized_fill.h +29 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +115 -0
- cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +106 -0
- cuda/cccl/headers/include/thrust/system/detail/system_error.inl +108 -0
- cuda/cccl/headers/include/thrust/system/error_code.h +512 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/adjacent_difference.h +54 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/assign_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/binary_search.h +77 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy.h +50 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy.inl +74 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.h +56 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/copy_if.inl +59 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/count.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.h +50 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/default_decomposition.inl +65 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/equal.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +127 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/extrema.h +66 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/fill.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/find.h +53 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/for_each.h +56 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/for_each.inl +87 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/gather.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/generate.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/get_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/inner_product.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/iter_swap.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/logical.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/malloc_and_free.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/memory.inl +93 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/merge.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/mismatch.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/partition.h +88 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/partition.inl +102 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/pragma_omp.h +54 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce.h +54 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce.inl +78 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.h +64 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_by_key.inl +65 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.h +59 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reduce_intervals.inl +103 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/remove.h +72 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/remove.inl +87 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/replace.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/reverse.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scan.h +73 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scan.inl +172 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scan_by_key.h +36 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/scatter.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/sequence.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/set_operations.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/sort.h +60 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/sort.inl +265 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/swap_ranges.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/tabulate.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/transform.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/transform_reduce.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/transform_scan.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_copy.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/uninitialized_fill.h +30 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique.h +60 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique.inl +71 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.h +67 -0
- cuda/cccl/headers/include/thrust/system/omp/detail/unique_by_key.inl +75 -0
- cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +62 -0
- cuda/cccl/headers/include/thrust/system/omp/memory.h +111 -0
- cuda/cccl/headers/include/thrust/system/omp/memory_resource.h +75 -0
- cuda/cccl/headers/include/thrust/system/omp/pointer.h +124 -0
- cuda/cccl/headers/include/thrust/system/omp/vector.h +99 -0
- cuda/cccl/headers/include/thrust/system/system_error.h +185 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/adjacent_difference.h +54 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/assign_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/binary_search.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy.h +50 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy.inl +73 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.h +47 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/copy_if.inl +136 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/count.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/equal.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +109 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/extrema.h +66 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/fill.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/find.h +49 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.h +51 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +91 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/gather.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/generate.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/get_value.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/inner_product.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/iter_swap.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/logical.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/malloc_and_free.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/memory.inl +94 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/merge.h +77 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/merge.inl +327 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/mismatch.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/partition.h +84 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/partition.inl +98 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/per_device_resource.h +29 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.h +54 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce.inl +137 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.h +61 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_by_key.inl +400 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reduce_intervals.h +140 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/remove.h +76 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/remove.inl +87 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/replace.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/reverse.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan.h +59 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan.inl +312 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scan_by_key.h +33 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/scatter.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/sequence.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/set_operations.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/sort.h +60 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/sort.inl +295 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/swap_ranges.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/tabulate.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/temporary_buffer.h +29 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform_reduce.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/transform_scan.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_copy.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/uninitialized_fill.h +30 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique.h +60 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique.inl +71 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.h +67 -0
- cuda/cccl/headers/include/thrust/system/tbb/detail/unique_by_key.inl +75 -0
- cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +62 -0
- cuda/cccl/headers/include/thrust/system/tbb/memory.h +111 -0
- cuda/cccl/headers/include/thrust/system/tbb/memory_resource.h +75 -0
- cuda/cccl/headers/include/thrust/system/tbb/pointer.h +124 -0
- cuda/cccl/headers/include/thrust/system/tbb/vector.h +99 -0
- cuda/cccl/headers/include/thrust/system_error.h +57 -0
- cuda/cccl/headers/include/thrust/tabulate.h +125 -0
- cuda/cccl/headers/include/thrust/transform.h +1045 -0
- cuda/cccl/headers/include/thrust/transform_reduce.h +190 -0
- cuda/cccl/headers/include/thrust/transform_scan.h +442 -0
- cuda/cccl/headers/include/thrust/tuple.h +139 -0
- cuda/cccl/headers/include/thrust/type_traits/integer_sequence.h +261 -0
- cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +154 -0
- cuda/cccl/headers/include/thrust/type_traits/is_execution_policy.h +65 -0
- cuda/cccl/headers/include/thrust/type_traits/is_operator_less_or_greater_function_object.h +184 -0
- cuda/cccl/headers/include/thrust/type_traits/is_operator_plus_function_object.h +116 -0
- cuda/cccl/headers/include/thrust/type_traits/is_trivially_relocatable.h +336 -0
- cuda/cccl/headers/include/thrust/type_traits/logical_metafunctions.h +42 -0
- cuda/cccl/headers/include/thrust/type_traits/unwrap_contiguous_iterator.h +63 -0
- cuda/cccl/headers/include/thrust/uninitialized_copy.h +300 -0
- cuda/cccl/headers/include/thrust/uninitialized_fill.h +268 -0
- cuda/cccl/headers/include/thrust/unique.h +1088 -0
- cuda/cccl/headers/include/thrust/universal_allocator.h +93 -0
- cuda/cccl/headers/include/thrust/universal_ptr.h +34 -0
- cuda/cccl/headers/include/thrust/universal_vector.h +71 -0
- cuda/cccl/headers/include/thrust/version.h +93 -0
- cuda/cccl/headers/include/thrust/zip_function.h +176 -0
- cuda/cccl/headers/include_paths.py +51 -0
- cuda/cccl/parallel/__init__.py +9 -0
- cuda/cccl/parallel/experimental/__init__.py +24 -0
- cuda/cccl/py.typed +0 -0
- cuda/compute/__init__.py +79 -0
- cuda/compute/_bindings.py +79 -0
- cuda/compute/_bindings.pyi +475 -0
- cuda/compute/_bindings_impl.pyx +2273 -0
- cuda/compute/_caching.py +71 -0
- cuda/compute/_cccl_interop.py +422 -0
- cuda/compute/_utils/__init__.py +0 -0
- cuda/compute/_utils/protocols.py +132 -0
- cuda/compute/_utils/temp_storage_buffer.py +86 -0
- cuda/compute/algorithms/__init__.py +54 -0
- cuda/compute/algorithms/_histogram.py +243 -0
- cuda/compute/algorithms/_merge_sort.py +225 -0
- cuda/compute/algorithms/_radix_sort.py +312 -0
- cuda/compute/algorithms/_reduce.py +182 -0
- cuda/compute/algorithms/_scan.py +331 -0
- cuda/compute/algorithms/_segmented_reduce.py +257 -0
- cuda/compute/algorithms/_three_way_partition.py +261 -0
- cuda/compute/algorithms/_transform.py +329 -0
- cuda/compute/algorithms/_unique_by_key.py +252 -0
- cuda/compute/cccl/.gitkeep +0 -0
- cuda/compute/cu12/_bindings_impl.cp313-win_amd64.pyd +0 -0
- cuda/compute/cu12/cccl/cccl.c.parallel.dll +0 -0
- cuda/compute/cu12/cccl/cccl.c.parallel.lib +0 -0
- cuda/compute/cu13/_bindings_impl.cp313-win_amd64.pyd +0 -0
- cuda/compute/cu13/cccl/cccl.c.parallel.dll +0 -0
- cuda/compute/cu13/cccl/cccl.c.parallel.lib +0 -0
- cuda/compute/iterators/__init__.py +21 -0
- cuda/compute/iterators/_factories.py +219 -0
- cuda/compute/iterators/_iterators.py +817 -0
- cuda/compute/iterators/_zip_iterator.py +199 -0
- cuda/compute/numba_utils.py +53 -0
- cuda/compute/op.py +3 -0
- cuda/compute/struct.py +272 -0
- cuda/compute/typing.py +37 -0
- cuda/coop/__init__.py +8 -0
- cuda/coop/_caching.py +48 -0
- cuda/coop/_common.py +275 -0
- cuda/coop/_nvrtc.py +92 -0
- cuda/coop/_scan_op.py +181 -0
- cuda/coop/_types.py +937 -0
- cuda/coop/_typing.py +107 -0
- cuda/coop/block/__init__.py +39 -0
- cuda/coop/block/_block_exchange.py +251 -0
- cuda/coop/block/_block_load_store.py +215 -0
- cuda/coop/block/_block_merge_sort.py +125 -0
- cuda/coop/block/_block_radix_sort.py +214 -0
- cuda/coop/block/_block_reduce.py +294 -0
- cuda/coop/block/_block_scan.py +983 -0
- cuda/coop/warp/__init__.py +9 -0
- cuda/coop/warp/_warp_merge_sort.py +92 -0
- cuda/coop/warp/_warp_reduce.py +153 -0
- cuda/coop/warp/_warp_scan.py +78 -0
- cuda_cccl-0.3.3.dist-info/METADATA +41 -0
- cuda_cccl-0.3.3.dist-info/RECORD +1968 -0
- cuda_cccl-0.3.3.dist-info/WHEEL +5 -0
- cuda_cccl-0.3.3.dist-info/licenses/LICENSE +1 -0
|
@@ -0,0 +1,2273 @@
|
|
|
1
|
+
# distutils: language = c++
|
|
2
|
+
# cython: language_level=3
|
|
3
|
+
# cython: linetrace=True
|
|
4
|
+
|
|
5
|
+
# Python signatures are declared in the companion Python stub file _bindings.pyi
|
|
6
|
+
# Make sure to update PYI with change to Python API to ensure that Python
|
|
7
|
+
# static type checker tools like mypy green-light cuda.compute
|
|
8
|
+
|
|
9
|
+
from libc.string cimport memset, memcpy
|
|
10
|
+
from libc.stdint cimport uint8_t, uint32_t, uint64_t, int64_t, uintptr_t
|
|
11
|
+
from cpython.bytes cimport PyBytes_FromStringAndSize
|
|
12
|
+
|
|
13
|
+
from cpython.buffer cimport (
|
|
14
|
+
Py_buffer, PyBUF_SIMPLE, PyBUF_ANY_CONTIGUOUS,
|
|
15
|
+
PyBuffer_Release, PyObject_CheckBuffer, PyObject_GetBuffer
|
|
16
|
+
)
|
|
17
|
+
from cpython.pycapsule cimport (
|
|
18
|
+
PyCapsule_CheckExact, PyCapsule_IsValid, PyCapsule_GetPointer
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
import ctypes
|
|
22
|
+
from enum import IntEnum
|
|
23
|
+
# Handle types taken from the CUDA driver header. The structs are declared
# without fields, so this file only ever refers to them through pointers.
cdef extern from "<cuda.h>":
    cdef struct OpaqueCUstream_st
    cdef struct OpaqueCUkernel_st
    cdef struct OpaqueCUlibrary_st

    # Driver status code, declared here as a plain int
    ctypedef int CUresult
    # Pointer-typed handles to the opaque structs above
    ctypedef OpaqueCUstream_st *CUstream
    ctypedef OpaqueCUkernel_st *CUkernel
    ctypedef OpaqueCUlibrary_st *CUlibrary
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# Types declared in the C header "cccl/c/types.h".
# For enumerators, the quoted string is the C identifier that the shorter
# Cython-side name maps to. Enums declared with `cpdef` are also made
# available as Python-level enumerations.
cdef extern from "cccl/c/types.h":
    # Tag identifying the element type described by a cccl_type_info
    cpdef enum cccl_type_enum:
        INT8 "CCCL_INT8"
        INT16 "CCCL_INT16"
        INT32 "CCCL_INT32"
        INT64 "CCCL_INT64"
        UINT8 "CCCL_UINT8"
        UINT16 "CCCL_UINT16"
        UINT32 "CCCL_UINT32"
        UINT64 "CCCL_UINT64"
        FLOAT16 "CCCL_FLOAT16"
        FLOAT32 "CCCL_FLOAT32"
        FLOAT64 "CCCL_FLOAT64"
        STORAGE "CCCL_STORAGE"
        BOOLEAN "CCCL_BOOLEAN"

    # Kind of an operation: user-provided (stateless/stateful) or one of the
    # named operations listed below
    cpdef enum cccl_op_kind_t:
        STATELESS "CCCL_STATELESS"
        STATEFUL "CCCL_STATEFUL"
        PLUS "CCCL_PLUS"
        MINUS "CCCL_MINUS"
        MULTIPLIES "CCCL_MULTIPLIES"
        DIVIDES "CCCL_DIVIDES"
        MODULUS "CCCL_MODULUS"
        EQUAL_TO "CCCL_EQUAL_TO"
        NOT_EQUAL_TO "CCCL_NOT_EQUAL_TO"
        GREATER "CCCL_GREATER"
        LESS "CCCL_LESS"
        GREATER_EQUAL "CCCL_GREATER_EQUAL"
        LESS_EQUAL "CCCL_LESS_EQUAL"
        LOGICAL_AND "CCCL_LOGICAL_AND"
        LOGICAL_OR "CCCL_LOGICAL_OR"
        LOGICAL_NOT "CCCL_LOGICAL_NOT"
        BIT_AND "CCCL_BIT_AND"
        BIT_OR "CCCL_BIT_OR"
        BIT_XOR "CCCL_BIT_XOR"
        BIT_NOT "CCCL_BIT_NOT"
        IDENTITY "CCCL_IDENTITY"
        NEGATE "CCCL_NEGATE"
        MINIMUM "CCCL_MINIMUM"
        MAXIMUM "CCCL_MAXIMUM"

    # Whether an iterator is a plain pointer or a user-defined iterator
    cpdef enum cccl_iterator_kind_t:
        POINTER "CCCL_POINTER"
        ITERATOR "CCCL_ITERATOR"

    # Size, alignment and type tag of a value type
    cdef struct cccl_type_info:
        size_t size
        size_t alignment
        cccl_type_enum type

    # Format of the code carried by a cccl_op_t
    cdef enum cccl_op_code_type:
        CCCL_OP_LTOIR
        CCCL_OP_CPP_SOURCE

    # Descriptor of an operation: its kind, name, code blob (with length and
    # format), and an optional state blob with its size and alignment
    cdef struct cccl_op_t:
        cccl_op_kind_t type
        const char* name
        const char* code
        size_t code_size
        cccl_op_code_type code_type
        size_t size
        size_t alignment
        void *state

    # A value: its type descriptor plus a pointer to its bytes
    cdef struct cccl_value_t:
        cccl_type_info type
        void *state

    # Increment passed to a host advance function, as signed or unsigned offset
    cdef union cccl_increment_t:
        int64_t signed_offset
        uint64_t unsigned_offset

    # Signature of a host-side function taking a state pointer and an increment;
    # it is declared callable without the GIL
    ctypedef void (*cccl_host_op_fn_ptr_t)(void *, cccl_increment_t) nogil

    # Descriptor of an iterator: state size/alignment, kind, the advance and
    # dereference operations, the value type, the state pointer, and an
    # optional host advance function
    cdef struct cccl_iterator_t:
        size_t size
        size_t alignment
        cccl_iterator_kind_t type
        cccl_op_t advance
        cccl_op_t dereference
        cccl_type_info value_type
        void *state
        cccl_host_op_fn_ptr_t host_advance

    cpdef enum cccl_sort_order_t:
        ASCENDING "CCCL_ASCENDING"
        DESCENDING "CCCL_DESCENDING"

    cpdef enum cccl_init_kind_t:
        VALUE_INIT "CCCL_VALUE_INIT"
        FUTURE_VALUE_INIT "CCCL_FUTURE_VALUE_INIT"
        NO_INIT "CCCL_NO_INIT"
|
|
127
|
+
|
|
128
|
+
cdef void arg_type_check(
    str arg_name,
    object expected_type,
    object arg
) except *:
    """
    Verify that `arg` is an instance of `expected_type`.

    Raises:
        TypeError: when the check fails; the message names the argument
            via `arg_name` and reports the expected and the actual type.
    """
    if isinstance(arg, expected_type):
        return
    raise TypeError(
        f"Expected {arg_name} to have type '{expected_type}', "
        f"got '{type(arg)}'"
    )
|
|
138
|
+
|
|
139
|
+
# Module-level aliases giving the enums declared in the extern block
# shorter, Python-style names.
OpKind = cccl_op_kind_t
TypeEnum = cccl_type_enum
IteratorKind = cccl_iterator_kind_t
SortOrder = cccl_sort_order_t
InitKind = cccl_init_kind_t
|
|
144
|
+
|
|
145
|
+
cdef void _validate_alignment(int alignment) except *:
    """
    Check that `alignment` is a positive integer and a power of two
    that can be represented by uint32_t type.

    Raises:
        ValueError: if `alignment` is smaller than one, or is not
            a power of two.
    """
    cdef uint32_t val
    if alignment < 1:
        # Zero is rejected as well, so the requirement is "positive",
        # not merely "non-negative".
        raise ValueError(
            "Alignment must be positive, "
            f"got {alignment}."
        )
    val = <uint32_t>alignment
    # A power of two has a single bit set; clearing the lowest set bit
    # must therefore leave zero.
    if (val & (val - 1)) != 0:
        raise ValueError(
            "Alignment must be a power of two, "
            f"got {alignment}"
        )
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
cdef class Op:
    """
    Represents CCCL Operation

    Args:
        name (str):
            Name of the operation
        operator_type (OpKind):
            Whether operator is stateless or stateful
        ltoir (bytes):
            The LTOIR for the operation compiled for device
        state (bytes, optional):
            State for the stateful operation.
        state_alignment (int, optional):
            Alignment of the state struct. Default: `1`.

    Raises:
        TypeError: if an argument does not have the documented type.
        ValueError: if `state_alignment` is not a positive power of two.
    """
    # Python owners of the memory that the raw pointers in `op_data` refer to
    # (operation name, code blob and state blob)
    cdef bytes op_encoded_name
    cdef bytes code_bytes
    cdef bytes state_bytes
    cdef cccl_op_t op_data


    cdef void _set_members(self, cccl_op_kind_t op_type, str name, bytes lto_ir, bytes state, int state_alignment):
        "Populate `op_data` from already validated arguments"
        memset(&self.op_data, 0, sizeof(cccl_op_t))
        # Reference Python objects in the class to ensure lifetime
        self.op_encoded_name = name.encode("utf-8")
        self.code_bytes = lto_ir
        self.state_bytes = state
        # set fields of op_data struct
        self.op_data.type = op_type
        self.op_data.name = <const char *>self.op_encoded_name
        self.op_data.code = <const char *>lto_ir
        self.op_data.code_size = len(lto_ir)
        self.op_data.code_type = cccl_op_code_type.CCCL_OP_LTOIR
        self.op_data.size = len(state)
        self.op_data.alignment = state_alignment
        self.op_data.state = <void *><const char *>state


    def __cinit__(self, /, *, name = None, operator_type = None, ltoir = None, state = None, state_alignment = 1):
        # With neither name nor code given, build an empty operation
        if name is None and ltoir is None:
            name = ""
            ltoir = b""
        if state is None:
            state = b""
        if operator_type is None:
            operator_type = OpKind.STATELESS
        arg_type_check(arg_name="name", expected_type=str, arg=name)
        arg_type_check(arg_name="ltoir", expected_type=bytes, arg=ltoir)
        arg_type_check(arg_name="state", expected_type=bytes, arg=state)
        arg_type_check(arg_name="state_alignment", expected_type=int, arg=state_alignment)
        if not isinstance(operator_type, OpKind):
            raise TypeError(
                "The operator_type argument should be an enumerator of operator kinds"
            )
        _validate_alignment(state_alignment)
        self._set_members(
            <cccl_op_kind_t> operator_type.value,
            <str> name,
            <bytes> ltoir,
            <bytes> state,
            <int> state_alignment
        )


    cdef void set_state(self, bytes state):
        "Replace the state blob, keeping pointer and size in `op_data` in sync"
        # Obtain pointer and length first, so that a failure leaves the
        # object untouched.
        cdef const char *state_ptr = <const char *>state
        cdef size_t state_size = len(state)
        self.state_bytes = state
        # The recorded size must follow the new blob; otherwise the struct
        # would describe the new pointer with the length of the old state.
        self.op_data.size = state_size
        self.op_data.state = <void *>state_ptr

    @property
    def state(self):
        return self.state_bytes

    @state.setter
    def state(self, bytes new_value):
        self.set_state(<bytes>new_value)

    @property
    def name(self):
        return self.op_encoded_name.decode("utf-8")

    @property
    def ltoir(self):
        # Backward compatibility property
        return self.code_bytes

    @property
    def code(self):
        return self.code_bytes

    @property
    def state_alignment(self):
        return self.op_data.alignment

    @property
    def state_typenum(self):
        # Returns the `type` field of the struct, i.e. the operation kind
        return self.op_data.type

    def as_bytes(self):
        "Debugging utility to view memory content of library struct"
        cdef uint8_t[:] mem_view = bytearray(sizeof(self.op_data))
        memcpy(&mem_view[0], &self.op_data, sizeof(self.op_data))
        return bytes(mem_view)
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
cdef class TypeInfo:
    """
    Python wrapper around the CCCL type info structure.

    Args:
        size (int):
            Size of the type in bytes.
        alignment (int):
            Alignment of the type in bytes.
        type_enum (TypeEnum):
            Enumeration member identifying the type.

    Raises:
        ValueError: if `size` is not positive, or `alignment` fails
            alignment validation.
    """
    cdef cccl_type_info type_info

    def __cinit__(self, int size, int alignment, cccl_type_enum type_enum):
        if size <= 0:
            raise ValueError("Size argument must be positive")
        _validate_alignment(alignment)
        self.type_info.type = type_enum
        self.type_info.alignment = alignment
        self.type_info.size = size

    @property
    def size(self):
        return self.type_info.size

    @property
    def alignment(self):
        return self.type_info.alignment

    @property
    def typenum(self):
        return self.type_info.type

    def as_bytes(self):
        "Debugging utility to view memory content of library struct"
        cdef size_t struct_nbytes = sizeof(self.type_info)
        cdef uint8_t[:] raw = bytearray(struct_nbytes)
        memcpy(&raw[0], &self.type_info, struct_nbytes)
        return bytes(raw)
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
cdef class Value:
    """
    Python wrapper around the CCCL value structure.

    Args:
        value_type (TypeInfo):
            type descriptor
        state (object):
            state of the value type. Object is expected to
            implement Python buffer protocol and be able to provide
            simple contiguous array of type `uint8_t`.
    """
    # The memoryview keeps the exporter of the state bytes referenced
    cdef uint8_t[::1] state_obj
    cdef TypeInfo value_type
    cdef cccl_value_t value_data

    def __cinit__(self, TypeInfo value_type, uint8_t[::1] state):
        self.value_type = value_type
        self.value_data.type = value_type.type_info
        self.state_obj = state
        self.value_data.state = <void *>&state[0]

    @property
    def type(self):
        return self.value_type

    @property
    def state(self):
        return self.state_obj

    @state.setter
    def state(self, uint8_t[::1] new_value):
        # Only a buffer of the same length may replace the current state
        if len(self.state_obj) != len(new_value):
            raise ValueError("Size mismatch")
        self.state_obj = new_value
        self.value_data.state = <void *>&self.state_obj[0]

    def as_bytes(self):
        "Debugging utility to view memory of native struct"
        cdef size_t struct_nbytes = sizeof(self.value_data)
        cdef uint8_t[:] raw = bytearray(struct_nbytes)
        memcpy(&raw[0], &self.value_data, struct_nbytes)
        return bytes(raw)
|
|
354
|
+
|
|
355
|
+
|
|
356
|
+
cdef void ensure_buffer(object o) except *:
    """
    Verify that `o` supports the Python buffer protocol.

    Raises:
        TypeError: if `o` does not export a buffer.
    """
    if PyObject_CheckBuffer(o):
        return
    raise TypeError(
        "Object with buffer protocol expected, "
        f"got {type(o)}"
    )
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
cdef void * get_buffer_pointer(object o, size_t *size):
    """
    Return the address of the contiguous buffer exported by `o`.

    If `size` is not NULL, the length of the buffer in bytes is stored
    in `size[0]` (zero if the buffer could not be acquired).

    The buffer view is released before returning, so validity of the
    returned pointer depends on the caller keeping `o` alive.

    Raises:
        RuntimeError: if a simple contiguous buffer can not be obtained.
    """
    cdef int status = 0
    cdef void *ptr = NULL
    cdef Py_buffer view

    status = PyObject_GetBuffer(o, &view, PyBUF_SIMPLE | PyBUF_ANY_CONTIGUOUS)
    if status != 0: # pragma: no cover
        # `size` is optional: guard the write just like the success path does
        if size is not NULL:
            size[0] = 0
        raise RuntimeError(
            "Can not access simple contiguous buffer"
        )

    ptr = view.buf
    if size is not NULL:
        size[0] = <size_t>view.len
    PyBuffer_Release(&view)

    return ptr
|
|
382
|
+
|
|
383
|
+
|
|
384
|
+
cdef void * ctypes_typed_pointer_payload_ptr(object ctypes_typed_ptr):
    "Get pointer to the value buffer represented by ctypes.pointer(ctypes_val)"
    cdef size_t nbytes = 0
    cdef size_t *stored_address = NULL

    ensure_buffer(ctypes_typed_ptr)
    # The object's own buffer holds the address it points to; read that address
    stored_address = <size_t *>get_buffer_pointer(ctypes_typed_ptr, &nbytes)
    return <void *>(stored_address[0])
|
|
391
|
+
|
|
392
|
+
|
|
393
|
+
cdef void * ctypes_value_ptr(object ctypes_cdata):
    "Get pointer to the value buffer behind ctypes_val"
    cdef size_t nbytes = 0
    cdef void *data_ptr = NULL

    ensure_buffer(ctypes_cdata)
    data_ptr = get_buffer_pointer(ctypes_cdata, &nbytes)
    return data_ptr
|
|
398
|
+
|
|
399
|
+
|
|
400
|
+
cdef inline void * int_as_ptr(size_t ptr_val):
    "Reinterpret an integer value as an untyped pointer"
    cdef void *as_pointer = <void *>ptr_val
    return as_pointer
|
|
402
|
+
|
|
403
|
+
|
|
404
|
+
cdef class StateBase:
    "Base class pairing a raw pointer with a Python object reference"
    cdef void *ptr
    cdef object ref

    def __cinit__(self):
        self.ref = None
        self.ptr = NULL

    cdef inline void set_state(self, void *ptr, object ref):
        "Store the pointer together with the associated Python object"
        self.ref = ref
        self.ptr = ptr

    @property
    def pointer(self):
        # Pointer value exposed as an integer
        cdef size_t address = <size_t>self.ptr
        return address

    @property
    def reference(self):
        return self.ref
|
|
423
|
+
|
|
424
|
+
|
|
425
|
+
cdef class Pointer(StateBase):
    "Represents the pointer value"

    def __cinit__(self, arg):
        # Each accepted argument kind yields a (pointer, reference) pair:
        # integers carry no reference, ctypes objects are kept as reference.
        if isinstance(arg, int):
            self.set_state(int_as_ptr(arg), None)
        elif isinstance(arg, ctypes._Pointer):
            self.set_state(ctypes_typed_pointer_payload_ptr(arg), arg)
        elif isinstance(arg, ctypes.c_void_p):
            self.set_state(int_as_ptr(arg.value), arg)
        else:
            raise TypeError(
                "Expect ctypes pointer, integers, or PointerProxy, "
                f"got type {type(arg)}"
            )
|
|
447
|
+
|
|
448
|
+
|
|
449
|
+
def make_pointer_object(ptr, owner):
    """
    Build a `Pointer` from an address and an owner object.

    Args:
        ptr: an integer, or an instance of `ctypes.c_void_p`
        owner: Python object stored as the reference of the result

    Raises:
        TypeError: if `ptr` is neither an integer nor `ctypes.c_void_p`.
    """
    cdef Pointer result
    cdef void *address = NULL

    if isinstance(ptr, int):
        address = int_as_ptr(ptr)
    elif isinstance(ptr, ctypes.c_void_p):
        address = int_as_ptr(ptr.value)
    else:
        raise TypeError(
            "First argument must be an integer, or ctypes.c_void_p, "
            f"got {type(ptr)}"
        )
    result = Pointer(0)
    result.set_state(address, owner)
    return result
|
|
463
|
+
|
|
464
|
+
|
|
465
|
+
cdef class IteratorState(StateBase):
    "Represents blob referenced by pointer"
    # Number of bytes in the referenced blob
    cdef size_t state_nbytes

    def __cinit__(self, arg):
        cdef size_t nbytes = 0
        cdef void *blob_ptr = NULL
        cdef object owner = None

        super().__init__()
        if isinstance(arg, ctypes._Pointer):
            # Typed ctypes pointer: reference the pointee, sized by ctypes
            blob_ptr = ctypes_typed_pointer_payload_ptr(arg)
            owner = arg.contents
            nbytes = ctypes.sizeof(owner)
        elif PyObject_CheckBuffer(arg):
            # Buffer exporter: reference the object itself
            blob_ptr = get_buffer_pointer(arg, &nbytes)
            owner = arg
        else:
            raise TypeError(
                "Expected a ctypes pointer with content, or object of type bytes or bytearray, "
                f"got type {type(arg)}"
            )
        self.state_nbytes = nbytes
        self.set_state(blob_ptr, owner)

    cdef inline size_t get_size(self):
        return self.state_nbytes

    @property
    def size(self):
        return self.state_nbytes

    def __getbuffer__(self, Py_buffer *buffer, int flags):
        # Export the blob as a writable one-dimensional buffer of unsigned bytes
        buffer.obj = self
        buffer.buf = <char *>self.ptr
        buffer.len = <Py_ssize_t>self.state_nbytes
        buffer.itemsize = 1
        buffer.readonly = 0
        buffer.format = "B"  # unsigned char
        buffer.ndim = 1
        buffer.shape = <Py_ssize_t *>&self.state_nbytes
        buffer.strides = &buffer.itemsize
        buffer.suboffsets = NULL
        buffer.internal = NULL

    def __releasebuffer__(self, Py_buffer *buffer):
        pass
|
|
513
|
+
|
|
514
|
+
|
|
515
|
+
# Name a capsule must carry to be accepted as a boxed host function pointer
cdef const char *function_ptr_capsule_name = "void (void *, cccl_increment_t)";
|
|
516
|
+
|
|
517
|
+
cdef bint is_function_pointer_capsule(object o) noexcept:
    """
    Tell whether the input is a valid capsule whose name is
    'void (void *, cccl_increment_t)'. Returns non-zero if so.
    """
    if not PyCapsule_CheckExact(o):
        return False
    return PyCapsule_IsValid(o, function_ptr_capsule_name)
|
|
526
|
+
|
|
527
|
+
|
|
528
|
+
cdef inline void* get_function_pointer_from_capsule(object cap) except *:
    "Extract the pointer stored in a capsule carrying the expected name"
    cdef void *boxed_ptr = PyCapsule_GetPointer(cap, function_ptr_capsule_name)
    return boxed_ptr
|
|
530
|
+
|
|
531
|
+
|
|
532
|
+
cdef cccl_host_op_fn_ptr_t unbox_host_advance_fn(object host_fn_obj) except *:
    """
    Convert a Python object into a host advance function pointer.

    Accepted inputs, checked in this order: ctypes function pointer,
    integer, `ctypes.c_void_p`, and a capsule with the expected name.

    Raises:
        TypeError: if `host_fn_obj` is none of the accepted kinds.
    """
    cdef void *raw_fn = NULL

    if isinstance(host_fn_obj, ctypes._CFuncPtr):
        # the _CFuncPtr object encapsulates a pointer to the function pointer
        raw_fn = ctypes_typed_pointer_payload_ptr(host_fn_obj)
    elif isinstance(host_fn_obj, int):
        raw_fn = <void *><uintptr_t>host_fn_obj
    elif isinstance(host_fn_obj, ctypes.c_void_p):
        raw_fn = <void *><uintptr_t>host_fn_obj.value
    elif is_function_pointer_capsule(host_fn_obj):
        raw_fn = get_function_pointer_from_capsule(host_fn_obj)
    else:
        raise TypeError(
            "Expected ctypes function pointer, ctypes.c_void_p, integer or a named capsule, "
            f"got {type(host_fn_obj)}"
        )
    return <cccl_host_op_fn_ptr_t>raw_fn
|
|
555
|
+
|
|
556
|
+
|
|
557
|
+
cdef class Iterator:
|
|
558
|
+
"""
|
|
559
|
+
Represents CCCL iterator.
|
|
560
|
+
|
|
561
|
+
Args:
|
|
562
|
+
alignment (int):
|
|
563
|
+
Alignment of the iterator state
|
|
564
|
+
iterator_type (IteratorKind):
|
|
565
|
+
The type of iterator, `IteratorKind.POINTER` or
|
|
566
|
+
`IteratorKind.ITERATOR`
|
|
567
|
+
advance_fn (Op):
|
|
568
|
+
Descriptor for user-defined `advance` function
|
|
569
|
+
compiled for device
|
|
570
|
+
dereference_fn (Op):
|
|
571
|
+
Descriptor for user-defined `dereference` or `assign`
|
|
572
|
+
function compiled for device
|
|
573
|
+
value_type (TypeInfo):
|
|
574
|
+
Descriptor of the type addressed by the iterator
|
|
575
|
+
state (object, optional):
|
|
576
|
+
Python object for the state of the iterator. For iterators of
|
|
577
|
+
type `ITERATOR` the state object is expected to implement Python
|
|
578
|
+
buffer protocol for SIMPLE 1d buffer of type unsigned byte.
|
|
579
|
+
For iterators of type `POINTER` the state may be an integer convertible
|
|
580
|
+
to `uintptr_t`, or a `ctypes` pointer (typed or untyped).
|
|
581
|
+
Value `None` represents absence of iterator state.
|
|
582
|
+
host_advance_fn (object, optional):
|
|
583
|
+
Python object for host callable function to advance state by a given
|
|
584
|
+
increment. The argument may only be set for iterators of type
|
|
585
|
+
`IteratorKind.ITERATOR` and raise an exception otherwise. Supported
|
|
586
|
+
types are `int` or `ctypes.c_void_p` (raw pointer), ctypes function
|
|
587
|
+
pointer, or a Python capsule with name `"void (void *, cccl_increment_t)"`.
|
|
588
|
+
"""
|
|
589
|
+
cdef Op advance
|
|
590
|
+
cdef Op dereference
|
|
591
|
+
cdef object state_obj
|
|
592
|
+
cdef object host_advance_obj
|
|
593
|
+
cdef cccl_iterator_t iter_data
|
|
594
|
+
|
|
595
|
+
def __cinit__(self,
|
|
596
|
+
int alignment,
|
|
597
|
+
cccl_iterator_kind_t iterator_type,
|
|
598
|
+
Op advance_fn,
|
|
599
|
+
Op dereference_fn,
|
|
600
|
+
TypeInfo value_type,
|
|
601
|
+
state=None,
|
|
602
|
+
host_advance_fn=None
|
|
603
|
+
):
|
|
604
|
+
cdef cccl_iterator_kind_t it_kind
|
|
605
|
+
_validate_alignment(alignment)
|
|
606
|
+
it_kind = iterator_type
|
|
607
|
+
if it_kind == cccl_iterator_kind_t.POINTER:
|
|
608
|
+
if state is None:
|
|
609
|
+
self.state_obj = None
|
|
610
|
+
self.iter_data.size = 0
|
|
611
|
+
self.iter_data.state = NULL
|
|
612
|
+
elif isinstance(state, int):
|
|
613
|
+
self.state_obj = None
|
|
614
|
+
self.iter_data.size = 0
|
|
615
|
+
self.iter_data.state = int_as_ptr(state)
|
|
616
|
+
elif isinstance(state, Pointer):
|
|
617
|
+
self.state_obj = state.reference
|
|
618
|
+
self.iter_data.size = 0
|
|
619
|
+
self.iter_data.state = (<Pointer>state).ptr
|
|
620
|
+
else:
|
|
621
|
+
raise TypeError(
|
|
622
|
+
"Expect for Iterator of kind POINTER, state must have type Pointer or int, "
|
|
623
|
+
f"got {type(state)}"
|
|
624
|
+
)
|
|
625
|
+
if host_advance_fn is not None:
|
|
626
|
+
raise ValueError(
|
|
627
|
+
"host_advance_fn must be set to None for iterators of kind POINTER"
|
|
628
|
+
)
|
|
629
|
+
self.iter_data.host_advance = NULL
|
|
630
|
+
self.host_advance_obj = None
|
|
631
|
+
elif it_kind == cccl_iterator_kind_t.ITERATOR:
|
|
632
|
+
if state is None:
|
|
633
|
+
self.state_obj = None
|
|
634
|
+
self.iter_data.size = 0
|
|
635
|
+
self.iter_data.state = NULL
|
|
636
|
+
elif isinstance(state, IteratorState):
|
|
637
|
+
self.state_obj = state.reference
|
|
638
|
+
self.iter_data.size = (<IteratorState>state).size
|
|
639
|
+
self.iter_data.state = (<IteratorState>state).ptr
|
|
640
|
+
else:
|
|
641
|
+
raise TypeError(
|
|
642
|
+
"For Iterator of kind ITERATOR, state must have type IteratorState, "
|
|
643
|
+
f"got type {type(state)}"
|
|
644
|
+
)
|
|
645
|
+
if host_advance_fn is not None:
|
|
646
|
+
self.iter_data.host_advance = unbox_host_advance_fn(host_advance_fn)
|
|
647
|
+
self.host_advance_obj = host_advance_fn
|
|
648
|
+
else:
|
|
649
|
+
self.iter_data.host_advance = NULL
|
|
650
|
+
self.host_advance_obj = None
|
|
651
|
+
else: # pragma: no cover
|
|
652
|
+
raise ValueError("Unrecognized iterator kind")
|
|
653
|
+
self.advance = advance_fn
|
|
654
|
+
self.dereference = dereference_fn
|
|
655
|
+
self.iter_data.alignment = alignment
|
|
656
|
+
self.iter_data.type = <cccl_iterator_kind_t> it_kind
|
|
657
|
+
self.iter_data.advance = self.advance.op_data
|
|
658
|
+
self.iter_data.dereference = self.dereference.op_data
|
|
659
|
+
self.iter_data.value_type = value_type.type_info
|
|
660
|
+
|
|
661
|
+
@property
|
|
662
|
+
def advance_op(self):
|
|
663
|
+
return self.advance
|
|
664
|
+
|
|
665
|
+
@property
|
|
666
|
+
def dereference_or_assign_op(self):
|
|
667
|
+
return self.dereference
|
|
668
|
+
|
|
669
|
+
@property
|
|
670
|
+
def state(self):
|
|
671
|
+
if self.iter_data.type == cccl_iterator_kind_t.POINTER:
|
|
672
|
+
return <size_t>self.iter_data.state
|
|
673
|
+
else:
|
|
674
|
+
return self.state_obj
|
|
675
|
+
|
|
676
|
+
@state.setter
|
|
677
|
+
def state(self, new_value):
|
|
678
|
+
cdef ssize_t state_sz = 0
|
|
679
|
+
cdef size_t ptr = 0
|
|
680
|
+
cdef cccl_iterator_kind_t it_kind = self.iter_data.type
|
|
681
|
+
if it_kind == cccl_iterator_kind_t.POINTER:
|
|
682
|
+
if isinstance(new_value, Pointer):
|
|
683
|
+
self.state_obj = (<Pointer>new_value).ref
|
|
684
|
+
self.iter_data.size = state_sz
|
|
685
|
+
self.iter_data.state = (<Pointer>new_value).ptr
|
|
686
|
+
elif isinstance(new_value, int):
|
|
687
|
+
self.state_obj = None
|
|
688
|
+
self.iter_data.size = state_sz
|
|
689
|
+
self.iter_data.state = int_as_ptr(new_value)
|
|
690
|
+
elif new_value is None:
|
|
691
|
+
self.state_obj = None
|
|
692
|
+
self.iter_data.size = 0
|
|
693
|
+
self.iter_data.state = NULL
|
|
694
|
+
else:
|
|
695
|
+
raise TypeError(
|
|
696
|
+
"For iterator with type POINTER, state value must have type int or type Pointer, "
|
|
697
|
+
f"got type {type(new_value)}"
|
|
698
|
+
)
|
|
699
|
+
elif it_kind == cccl_iterator_kind_t.ITERATOR:
|
|
700
|
+
if isinstance(new_value, IteratorState):
|
|
701
|
+
self.state_obj = new_value.reference
|
|
702
|
+
self.iter_data.size = (<IteratorState>new_value).size
|
|
703
|
+
self.iter_data.state = (<IteratorState>new_value).ptr
|
|
704
|
+
elif isinstance(new_value, Pointer):
|
|
705
|
+
self.state_obj = new_value.reference
|
|
706
|
+
if self.iter_data.size == 0:
|
|
707
|
+
raise ValueError("Assigning incomplete state value to iterator without state size information")
|
|
708
|
+
self.iter_data.state = (<Pointer>new_value).ptr
|
|
709
|
+
elif PyObject_CheckBuffer(new_value):
|
|
710
|
+
self.iter_data.state = get_buffer_pointer(new_value, &self.iter_data.size)
|
|
711
|
+
self.state_obj = new_value
|
|
712
|
+
elif new_value is None:
|
|
713
|
+
self.state_obj = None
|
|
714
|
+
self.iter_data.size = 0
|
|
715
|
+
self.iter_data.state = NULL
|
|
716
|
+
else:
|
|
717
|
+
raise TypeError(
|
|
718
|
+
"For iterator with type ITERATOR, state value must have type IteratorState or type bytes, "
|
|
719
|
+
f"got type {type(new_value)}"
|
|
720
|
+
)
|
|
721
|
+
else:
|
|
722
|
+
raise TypeError("The new value should be an integer for iterators of POINTER kind, and bytes for ITERATOR kind")
|
|
723
|
+
|
|
724
|
+
@property
|
|
725
|
+
def type(self):
|
|
726
|
+
cdef cccl_iterator_kind_t it_kind = self.iter_data.type
|
|
727
|
+
if it_kind == cccl_iterator_kind_t.POINTER:
|
|
728
|
+
return IteratorKind.POINTER
|
|
729
|
+
else:
|
|
730
|
+
return IteratorKind.ITERATOR
|
|
731
|
+
|
|
732
|
+
@property
|
|
733
|
+
def value_type(self):
|
|
734
|
+
cdef cccl_type_info type_info = self.iter_data.value_type
|
|
735
|
+
return TypeInfo(type_info.size, type_info.alignment, type_info.type)
|
|
736
|
+
|
|
737
|
+
def is_kind_pointer(self):
|
|
738
|
+
cdef cccl_iterator_kind_t it_kind = self.iter_data.type
|
|
739
|
+
return (it_kind == cccl_iterator_kind_t.POINTER)
|
|
740
|
+
|
|
741
|
+
def is_kind_iterator(self):
|
|
742
|
+
cdef cccl_iterator_kind_t it_kind = self.iter_data.type
|
|
743
|
+
return (it_kind == cccl_iterator_kind_t.ITERATOR)
|
|
744
|
+
|
|
745
|
+
def as_bytes(self):
|
|
746
|
+
"Debugging ulitity to get memory view into library struct"
|
|
747
|
+
cdef uint8_t[:] mem_view = bytearray(sizeof(self.iter_data))
|
|
748
|
+
memcpy(&mem_view[0], &self.iter_data, sizeof(self.iter_data))
|
|
749
|
+
return bytes(mem_view)
|
|
750
|
+
|
|
751
|
+
@property
|
|
752
|
+
def host_advance_fn(self):
|
|
753
|
+
return self.host_advance_obj
|
|
754
|
+
|
|
755
|
+
@host_advance_fn.setter
|
|
756
|
+
def host_advance_fn(self, func):
|
|
757
|
+
if (self.iter_data.type == cccl_iterator_kind_t.ITERATOR):
|
|
758
|
+
if func is not None:
|
|
759
|
+
self.iter_data.host_advance = unbox_host_advance_fn(func)
|
|
760
|
+
self.host_advance_obj = func
|
|
761
|
+
else:
|
|
762
|
+
self.iter_data.host_advance = NULL
|
|
763
|
+
self.host_advance_obj = None
|
|
764
|
+
else:
|
|
765
|
+
raise ValueError
|
|
766
|
+
|
|
767
|
+
|
|
768
|
+
cdef class CommonData:
    """
    Bundle of build parameters shared by the ``*BuildResult`` classes:
    a compute capability (major, minor) and four path strings
    (cub, thrust, libcudacxx, ctk).

    The paths are stored UTF-8 encoded so that C string pointers into them
    can be handed out by the ``*_get_c_str`` helpers.
    """
    cdef int cc_major
    cdef int cc_minor
    cdef bytes encoded_cub_path
    cdef bytes encoded_thrust_path
    cdef bytes encoded_libcudacxx_path
    cdef bytes encoded_ctk_path

    def __cinit__(self, int cc_major, int cc_minor, str cub_path, str thrust_path, str libcudacxx_path, str ctk_path):
        # Encode the paths once; the bytes objects own the buffers that the
        # C-string getters below point into.
        self.encoded_cub_path = cub_path.encode("utf-8")
        self.encoded_thrust_path = thrust_path.encode("utf-8")
        self.encoded_libcudacxx_path = libcudacxx_path.encode("utf-8")
        self.encoded_ctk_path = ctk_path.encode("utf-8")
        self.cc_major = cc_major
        self.cc_minor = cc_minor

    cdef inline int get_cc_major(self):
        # Major component of the compute capability.
        return self.cc_major

    cdef inline int get_cc_minor(self):
        # Minor component of the compute capability.
        return self.cc_minor

    cdef inline const char * cub_path_get_c_str(self):
        # NULL stands in for an empty path.
        if self.encoded_cub_path:
            return <const char *>self.encoded_cub_path
        return NULL

    cdef inline const char * thrust_path_get_c_str(self):
        # NULL stands in for an empty path.
        if self.encoded_thrust_path:
            return <const char *>self.encoded_thrust_path
        return NULL

    cdef inline const char * libcudacxx_path_get_c_str(self):
        # NULL stands in for an empty path.
        if self.encoded_libcudacxx_path:
            return <const char *>self.encoded_libcudacxx_path
        return NULL

    cdef inline const char * ctk_path_get_c_str(self):
        # NULL stands in for an empty path.
        if self.encoded_ctk_path:
            return <const char *>self.encoded_ctk_path
        return NULL

    @property
    def compute_capability(self):
        """Tuple ``(cc_major, cc_minor)``."""
        return self.cc_major, self.cc_minor

    @property
    def cub_path(self):
        """The cub path as ``str``."""
        return self.encoded_cub_path.decode("utf-8")

    @property
    def ctk_path(self):
        """The ctk path as ``str``."""
        return self.encoded_ctk_path.decode("utf-8")

    @property
    def thrust_path(self):
        """The thrust path as ``str``."""
        return self.encoded_thrust_path.decode("utf-8")

    @property
    def libcudacxx_path(self):
        """The libcudacxx path as ``str``."""
        return self.encoded_libcudacxx_path.decode("utf-8")
|
|
821
|
+
|
|
822
|
+
# --------------
|
|
823
|
+
# DeviceReduce
|
|
824
|
+
# --------------
|
|
825
|
+
|
|
826
|
+
cdef extern from "cccl/c/reduce.h":
    # Build result; `cubin` / `cubin_size` are read by
    # DeviceReduceBuildResult._get_cubin.
    cdef struct cccl_device_reduce_build_result_t 'cccl_device_reduce_build_result_t':
        const char* cubin
        size_t cubin_size

    # Parameter names mirror the arguments passed by
    # DeviceReduceBuildResult.__cinit__.
    cdef CUresult cccl_device_reduce_build(
        cccl_device_reduce_build_result_t *build_ptr,
        cccl_iterator_t d_in,
        cccl_iterator_t d_out,
        cccl_op_t op,
        cccl_value_t init,
        int cc_major,
        int cc_minor,
        const char *cub_path,
        const char *thrust_path,
        const char *libcudacxx_path,
        const char *ctk_path
    ) nogil

    # Parameter names mirror the arguments passed by
    # DeviceReduceBuildResult.compute.
    cdef CUresult cccl_device_reduce(
        cccl_device_reduce_build_result_t build,
        void *temp_storage,
        size_t *temp_storage_bytes,
        cccl_iterator_t d_in,
        cccl_iterator_t d_out,
        uint64_t num_items,
        cccl_op_t op,
        cccl_value_t init,
        CUstream stream
    ) nogil

    cdef CUresult cccl_device_reduce_cleanup(
        cccl_device_reduce_build_result_t *build_ptr
    ) nogil
|
|
855
|
+
|
|
856
|
+
|
|
857
|
+
cdef class DeviceReduceBuildResult:
    """
    Wraps a ``cccl_device_reduce_build_result_t``.

    ``__cinit__`` fills the struct via ``cccl_device_reduce_build``;
    ``__dealloc__`` hands it to ``cccl_device_reduce_cleanup``.
    """
    cdef cccl_device_reduce_build_result_t build_data

    def __cinit__(
        DeviceReduceBuildResult self,
        Iterator d_in,
        Iterator d_out,
        Op op,
        Value h_init,
        CommonData common_data
    ):
        """
        Build the reduction.

        Raises:
            RuntimeError: if ``cccl_device_reduce_build`` returns non-zero.
        """
        cdef CUresult status = -1
        cdef int major = common_data.get_cc_major()
        cdef int minor = common_data.get_cc_minor()
        cdef const char *c_cub = common_data.cub_path_get_c_str()
        cdef const char *c_thrust = common_data.thrust_path_get_c_str()
        cdef const char *c_libcudacxx = common_data.libcudacxx_path_get_c_str()
        cdef const char *c_ctk = common_data.ctk_path_get_c_str()

        # Start from an all-zero struct before the library writes into it.
        memset(&self.build_data, 0, sizeof(cccl_device_reduce_build_result_t))

        # The GIL is released for the duration of the build call.
        with nogil:
            status = cccl_device_reduce_build(
                &self.build_data,
                d_in.iter_data,
                d_out.iter_data,
                op.op_data,
                h_init.value_data,
                major,
                minor,
                c_cub,
                c_thrust,
                c_libcudacxx,
                c_ctk,
            )
        if status != 0:
            raise RuntimeError(
                f"Failed building reduce, error code: {status}"
            )

    def __dealloc__(DeviceReduceBuildResult self):
        cdef CUresult status = -1
        with nogil:
            status = cccl_device_reduce_cleanup(&self.build_data)
        # A failed cleanup is only reported, not raised.
        if (status != 0):
            print(f"Return code {status} encountered during reduce result cleanup")

    cpdef int compute(
        DeviceReduceBuildResult self,
        temp_storage_ptr,
        temp_storage_bytes,
        Iterator d_in,
        Iterator d_out,
        size_t num_items,
        Op op,
        Value h_init,
        stream
    ):
        """
        Call ``cccl_device_reduce`` with this build result.

        ``temp_storage_ptr`` and ``stream`` are converted to C pointers; a
        falsy value becomes NULL. ``temp_storage_bytes`` initialises a size
        variable whose address is passed to the library; that variable's
        value after the call is returned.

        Raises:
            RuntimeError: if ``cccl_device_reduce`` returns non-zero.
        """
        # NOTE(review): the declared return type is `int` while the returned
        # variable is `size_t` -- confirm large sizes cannot be narrowed.
        cdef CUresult status = -1
        cdef void *tmp_ptr = NULL
        cdef size_t tmp_bytes
        cdef CUstream cu_stream = NULL

        if temp_storage_ptr:
            tmp_ptr = <void *><uintptr_t>temp_storage_ptr
        tmp_bytes = <size_t>temp_storage_bytes
        if stream:
            cu_stream = <CUstream><uintptr_t>(stream)

        with nogil:
            status = cccl_device_reduce(
                self.build_data,
                tmp_ptr,
                &tmp_bytes,
                d_in.iter_data,
                d_out.iter_data,
                <uint64_t>num_items,
                op.op_data,
                h_init.value_data,
                cu_stream
            )
        if status != 0:
            raise RuntimeError(
                f"Failed executing reduce, error code: {status}"
            )
        return tmp_bytes

    def _get_cubin(self):
        """Return a ``bytes`` copy of ``cubin_size`` bytes starting at ``cubin``."""
        return PyBytes_FromStringAndSize(
            <const char*>self.build_data.cubin,
            self.build_data.cubin_size
        )
|
|
942
|
+
|
|
943
|
+
# ------------
|
|
944
|
+
# DeviceScan
|
|
945
|
+
# ------------
|
|
946
|
+
|
|
947
|
+
|
|
948
|
+
cdef extern from "cccl/c/scan.h":
    ctypedef bint _Bool

    # Build result; `cubin` / `cubin_size` are read by
    # DeviceScanBuildResult._get_cubin.
    cdef struct cccl_device_scan_build_result_t 'cccl_device_scan_build_result_t':
        const char* cubin
        size_t cubin_size

    # Parameter names below mirror the arguments passed at the call sites in
    # DeviceScanBuildResult.
    cdef CUresult cccl_device_scan_build(
        cccl_device_scan_build_result_t *build_ptr,
        cccl_iterator_t d_in,
        cccl_iterator_t d_out,
        cccl_op_t op,
        cccl_type_info init_type,
        _Bool force_inclusive,
        cccl_init_kind_t init_kind,
        int cc_major,
        int cc_minor,
        const char *cub_path,
        const char *thrust_path,
        const char *libcudacxx_path,
        const char *ctk_path
    ) nogil

    cdef CUresult cccl_device_exclusive_scan(
        cccl_device_scan_build_result_t build,
        void *temp_storage,
        size_t *temp_storage_bytes,
        cccl_iterator_t d_in,
        cccl_iterator_t d_out,
        uint64_t num_items,
        cccl_op_t op,
        cccl_value_t init,
        CUstream stream
    ) nogil

    cdef CUresult cccl_device_inclusive_scan(
        cccl_device_scan_build_result_t build,
        void *temp_storage,
        size_t *temp_storage_bytes,
        cccl_iterator_t d_in,
        cccl_iterator_t d_out,
        uint64_t num_items,
        cccl_op_t op,
        cccl_value_t init,
        CUstream stream
    ) nogil

    # The *_future_value variants take the initial value as an iterator.
    cdef CUresult cccl_device_exclusive_scan_future_value(
        cccl_device_scan_build_result_t build,
        void *temp_storage,
        size_t *temp_storage_bytes,
        cccl_iterator_t d_in,
        cccl_iterator_t d_out,
        uint64_t num_items,
        cccl_op_t op,
        cccl_iterator_t init,
        CUstream stream
    ) nogil

    cdef CUresult cccl_device_inclusive_scan_future_value(
        cccl_device_scan_build_result_t build,
        void *temp_storage,
        size_t *temp_storage_bytes,
        cccl_iterator_t d_in,
        cccl_iterator_t d_out,
        uint64_t num_items,
        cccl_op_t op,
        cccl_iterator_t init,
        CUstream stream
    ) nogil

    # The no_init variant takes no initial value argument.
    cdef CUresult cccl_device_inclusive_scan_no_init(
        cccl_device_scan_build_result_t build,
        void *temp_storage,
        size_t *temp_storage_bytes,
        cccl_iterator_t d_in,
        cccl_iterator_t d_out,
        uint64_t num_items,
        cccl_op_t op,
        CUstream stream
    ) nogil

    cdef CUresult cccl_device_scan_cleanup(
        cccl_device_scan_build_result_t *build_ptr
    ) nogil
|
|
1028
|
+
|
|
1029
|
+
|
|
1030
|
+
cdef class DeviceScanBuildResult:
    """
    Wraps a ``cccl_device_scan_build_result_t``.

    ``__cinit__`` fills the struct via ``cccl_device_scan_build``;
    ``__dealloc__`` hands it to ``cccl_device_scan_cleanup``. Each
    ``compute_*`` method forwards to the matching ``cccl_device_*_scan*``
    function.

    All ``compute_*`` methods share the same conventions:
    ``temp_storage_ptr`` and ``stream`` are converted to C pointers (a falsy
    value becomes NULL); ``temp_storage_bytes`` initialises a size variable
    whose address is passed to the library, and that variable's value after
    the call is returned. A non-zero status raises ``RuntimeError``.
    """
    # NOTE(review): the compute_* methods are declared `cpdef int` while the
    # returned variable is `size_t` -- confirm large sizes cannot be narrowed.
    cdef cccl_device_scan_build_result_t build_data

    def __cinit__(
        DeviceScanBuildResult self,
        Iterator d_in,
        Iterator d_out,
        Op op,
        TypeInfo init_type,
        bint force_inclusive,
        cccl_init_kind_t init_kind,
        CommonData common_data
    ):
        """
        Build the scan.

        Raises:
            RuntimeError: if ``cccl_device_scan_build`` returns non-zero.
        """
        cdef CUresult status = -1
        cdef int major = common_data.get_cc_major()
        cdef int minor = common_data.get_cc_minor()
        cdef const char *c_cub = common_data.cub_path_get_c_str()
        cdef const char *c_thrust = common_data.thrust_path_get_c_str()
        cdef const char *c_libcudacxx = common_data.libcudacxx_path_get_c_str()
        cdef const char *c_ctk = common_data.ctk_path_get_c_str()

        # Start from an all-zero struct before the library writes into it.
        memset(&self.build_data, 0, sizeof(cccl_device_scan_build_result_t))

        # The GIL is released for the duration of the build call.
        with nogil:
            status = cccl_device_scan_build(
                &self.build_data,
                d_in.iter_data,
                d_out.iter_data,
                op.op_data,
                init_type.type_info,
                force_inclusive,
                init_kind,
                major,
                minor,
                c_cub,
                c_thrust,
                c_libcudacxx,
                c_ctk,
            )
        if status != 0:
            raise RuntimeError(f"Error {status} building scan")

    def __dealloc__(DeviceScanBuildResult self):
        cdef CUresult status = -1
        with nogil:
            status = cccl_device_scan_cleanup(&self.build_data)
        # A failed cleanup is only reported, not raised.
        if (status != 0):
            print(f"Return code {status} encountered during scan result cleanup")

    cpdef int compute_inclusive(
        DeviceScanBuildResult self,
        temp_storage_ptr,
        temp_storage_bytes,
        Iterator d_in,
        Iterator d_out,
        size_t num_items,
        Op op,
        Value init_value,
        stream
    ):
        """Call ``cccl_device_inclusive_scan`` with a ``Value`` initial value."""
        cdef CUresult status = -1
        cdef void *tmp_ptr = NULL
        cdef size_t tmp_bytes
        cdef CUstream cu_stream = NULL

        if temp_storage_ptr:
            tmp_ptr = <void *><uintptr_t>temp_storage_ptr
        tmp_bytes = <size_t>temp_storage_bytes
        if stream:
            cu_stream = <CUstream><uintptr_t>(stream)

        with nogil:
            status = cccl_device_inclusive_scan(
                self.build_data,
                tmp_ptr,
                &tmp_bytes,
                d_in.iter_data,
                d_out.iter_data,
                <uint64_t>num_items,
                op.op_data,
                init_value.value_data,
                cu_stream
            )
        if status != 0:
            raise RuntimeError(
                f"Failed executing inclusive scan, error code: {status}"
            )
        return tmp_bytes

    cpdef int compute_exclusive(
        DeviceScanBuildResult self,
        temp_storage_ptr,
        temp_storage_bytes,
        Iterator d_in,
        Iterator d_out,
        size_t num_items,
        Op op,
        Value init_value,
        stream
    ):
        """Call ``cccl_device_exclusive_scan`` with a ``Value`` initial value."""
        cdef CUresult status = -1
        cdef void *tmp_ptr = NULL
        cdef size_t tmp_bytes
        cdef CUstream cu_stream = NULL

        if temp_storage_ptr:
            tmp_ptr = <void *><uintptr_t>temp_storage_ptr
        tmp_bytes = <size_t>temp_storage_bytes
        if stream:
            cu_stream = <CUstream><uintptr_t>(stream)

        with nogil:
            status = cccl_device_exclusive_scan(
                self.build_data,
                tmp_ptr,
                &tmp_bytes,
                d_in.iter_data,
                d_out.iter_data,
                <uint64_t>num_items,
                op.op_data,
                init_value.value_data,
                cu_stream
            )
        if status != 0:
            raise RuntimeError(
                f"Failed executing exclusive scan, error code: {status}"
            )
        return tmp_bytes

    cpdef int compute_inclusive_future_value(
        DeviceScanBuildResult self,
        temp_storage_ptr,
        temp_storage_bytes,
        Iterator d_in,
        Iterator d_out,
        size_t num_items,
        Op op,
        Iterator init_value,
        stream
    ):
        """
        Call ``cccl_device_inclusive_scan_future_value``; the initial value
        is supplied as an ``Iterator``.
        """
        cdef CUresult status = -1
        cdef void *tmp_ptr = NULL
        cdef size_t tmp_bytes
        cdef CUstream cu_stream = NULL

        if temp_storage_ptr:
            tmp_ptr = <void *><uintptr_t>temp_storage_ptr
        tmp_bytes = <size_t>temp_storage_bytes
        if stream:
            cu_stream = <CUstream><uintptr_t>(stream)

        with nogil:
            status = cccl_device_inclusive_scan_future_value(
                self.build_data,
                tmp_ptr,
                &tmp_bytes,
                d_in.iter_data,
                d_out.iter_data,
                <uint64_t>num_items,
                op.op_data,
                init_value.iter_data,
                cu_stream
            )
        if status != 0:
            raise RuntimeError(
                f"Failed executing inclusive scan, error code: {status}"
            )
        return tmp_bytes

    cpdef int compute_exclusive_future_value(
        DeviceScanBuildResult self,
        temp_storage_ptr,
        temp_storage_bytes,
        Iterator d_in,
        Iterator d_out,
        size_t num_items,
        Op op,
        Iterator init_value,
        stream
    ):
        """
        Call ``cccl_device_exclusive_scan_future_value``; the initial value
        is supplied as an ``Iterator``.
        """
        cdef CUresult status = -1
        cdef void *tmp_ptr = NULL
        cdef size_t tmp_bytes
        cdef CUstream cu_stream = NULL

        if temp_storage_ptr:
            tmp_ptr = <void *><uintptr_t>temp_storage_ptr
        tmp_bytes = <size_t>temp_storage_bytes
        if stream:
            cu_stream = <CUstream><uintptr_t>(stream)

        with nogil:
            status = cccl_device_exclusive_scan_future_value(
                self.build_data,
                tmp_ptr,
                &tmp_bytes,
                d_in.iter_data,
                d_out.iter_data,
                <uint64_t>num_items,
                op.op_data,
                init_value.iter_data,
                cu_stream
            )
        if status != 0:
            raise RuntimeError(
                f"Failed executing exclusive scan, error code: {status}"
            )
        return tmp_bytes

    cpdef int compute_inclusive_no_init(
        DeviceScanBuildResult self,
        temp_storage_ptr,
        temp_storage_bytes,
        Iterator d_in,
        Iterator d_out,
        size_t num_items,
        Op op,
        object init_value,
        stream
    ):
        """
        Call ``cccl_device_inclusive_scan_no_init``.

        ``init_value`` is accepted so the signature lines up with the other
        ``compute_*`` methods, but it is not forwarded to the library.
        """
        cdef CUresult status = -1
        cdef void *tmp_ptr = NULL
        cdef size_t tmp_bytes
        cdef CUstream cu_stream = NULL

        if temp_storage_ptr:
            tmp_ptr = <void *><uintptr_t>temp_storage_ptr
        tmp_bytes = <size_t>temp_storage_bytes
        if stream:
            cu_stream = <CUstream><uintptr_t>(stream)

        with nogil:
            status = cccl_device_inclusive_scan_no_init(
                self.build_data,
                tmp_ptr,
                &tmp_bytes,
                d_in.iter_data,
                d_out.iter_data,
                <uint64_t>num_items,
                op.op_data,
                cu_stream
            )
        if status != 0:
            raise RuntimeError(
                f"Failed executing inclusive scan, error code: {status}"
            )
        return tmp_bytes

    def _get_cubin(self):
        """Return a ``bytes`` copy of ``cubin_size`` bytes starting at ``cubin``."""
        return PyBytes_FromStringAndSize(
            <const char*>self.build_data.cubin,
            self.build_data.cubin_size
        )
|
|
1252
|
+
|
|
1253
|
+
# -----------------------
|
|
1254
|
+
# DeviceSegmentedReduce
|
|
1255
|
+
# -----------------------
|
|
1256
|
+
|
|
1257
|
+
|
|
1258
|
+
cdef extern from "cccl/c/segmented_reduce.h":
    # Build result; `cubin` / `cubin_size` are read by
    # DeviceSegmentedReduceBuildResult._get_cubin.
    cdef struct cccl_device_segmented_reduce_build_result_t 'cccl_device_segmented_reduce_build_result_t':
        const char* cubin
        size_t cubin_size

    # Parameter names below mirror the arguments passed at the call sites in
    # DeviceSegmentedReduceBuildResult.
    cdef CUresult cccl_device_segmented_reduce_build(
        cccl_device_segmented_reduce_build_result_t *build_ptr,
        cccl_iterator_t d_in,
        cccl_iterator_t d_out,
        cccl_iterator_t start_offsets,
        cccl_iterator_t end_offsets,
        cccl_op_t op,
        cccl_value_t init,
        int cc_major,
        int cc_minor,
        const char *cub_path,
        const char *thrust_path,
        const char *libcudacxx_path,
        const char *ctk_path
    ) nogil

    cdef CUresult cccl_device_segmented_reduce(
        cccl_device_segmented_reduce_build_result_t build,
        void *temp_storage,
        size_t *temp_storage_bytes,
        cccl_iterator_t d_in,
        cccl_iterator_t d_out,
        uint64_t num_items,
        cccl_iterator_t start_offsets,
        cccl_iterator_t end_offsets,
        cccl_op_t op,
        cccl_value_t init,
        CUstream stream
    ) nogil

    cdef CUresult cccl_device_segmented_reduce_cleanup(
        cccl_device_segmented_reduce_build_result_t* bld_ptr
    ) nogil
|
|
1291
|
+
|
|
1292
|
+
|
|
1293
|
+
cdef class DeviceSegmentedReduceBuildResult:
    """
    Wraps a ``cccl_device_segmented_reduce_build_result_t``.

    ``__cinit__`` fills the struct via ``cccl_device_segmented_reduce_build``;
    ``__dealloc__`` hands it to ``cccl_device_segmented_reduce_cleanup``.
    """
    cdef cccl_device_segmented_reduce_build_result_t build_data

    def __cinit__(
        DeviceSegmentedReduceBuildResult self,
        Iterator d_in,
        Iterator d_out,
        Iterator start_offsets,
        Iterator end_offsets,
        Op op,
        Value h_init,
        CommonData common_data
    ):
        """
        Build the segmented reduction.

        Raises:
            RuntimeError: if ``cccl_device_segmented_reduce_build`` returns
                non-zero.
        """
        cdef CUresult status = -1
        cdef int major = common_data.get_cc_major()
        cdef int minor = common_data.get_cc_minor()
        cdef const char *c_cub = common_data.cub_path_get_c_str()
        cdef const char *c_thrust = common_data.thrust_path_get_c_str()
        cdef const char *c_libcudacxx = common_data.libcudacxx_path_get_c_str()
        cdef const char *c_ctk = common_data.ctk_path_get_c_str()

        # Start from an all-zero struct before the library writes into it.
        memset(&self.build_data, 0, sizeof(cccl_device_segmented_reduce_build_result_t))

        # The GIL is released for the duration of the build call.
        with nogil:
            status = cccl_device_segmented_reduce_build(
                &self.build_data,
                d_in.iter_data,
                d_out.iter_data,
                start_offsets.iter_data,
                end_offsets.iter_data,
                op.op_data,
                h_init.value_data,
                major,
                minor,
                c_cub,
                c_thrust,
                c_libcudacxx,
                c_ctk,
            )
        if status != 0:
            raise RuntimeError(
                f"Failed building segmented_reduce, error code: {status}"
            )

    def __dealloc__(DeviceSegmentedReduceBuildResult self):
        cdef CUresult status = -1
        with nogil:
            status = cccl_device_segmented_reduce_cleanup(&self.build_data)
        # A failed cleanup is only reported, not raised.
        if (status != 0):
            print(f"Return code {status} encountered during segmented_reduce result cleanup")

    cpdef int compute(
        DeviceSegmentedReduceBuildResult self,
        temp_storage_ptr,
        temp_storage_bytes,
        Iterator d_in,
        Iterator d_out,
        size_t num_items,
        Iterator start_offsets,
        Iterator end_offsets,
        Op op,
        Value h_init,
        stream
    ):
        """
        Call ``cccl_device_segmented_reduce`` with this build result.

        ``temp_storage_ptr`` and ``stream`` are converted to C pointers; a
        falsy value becomes NULL. ``temp_storage_bytes`` initialises a size
        variable whose address is passed to the library; that variable's
        value after the call is returned.

        Raises:
            RuntimeError: if ``cccl_device_segmented_reduce`` returns non-zero.
        """
        # NOTE(review): the declared return type is `int` while the returned
        # variable is `size_t` -- confirm large sizes cannot be narrowed.
        cdef CUresult status = -1
        cdef void *tmp_ptr = NULL
        cdef size_t tmp_bytes
        cdef CUstream cu_stream = NULL

        if temp_storage_ptr:
            tmp_ptr = <void *><uintptr_t>temp_storage_ptr
        tmp_bytes = <size_t>temp_storage_bytes
        if stream:
            cu_stream = <CUstream><uintptr_t>(stream)

        with nogil:
            status = cccl_device_segmented_reduce(
                self.build_data,
                tmp_ptr,
                &tmp_bytes,
                d_in.iter_data,
                d_out.iter_data,
                <uint64_t>num_items,
                start_offsets.iter_data,
                end_offsets.iter_data,
                op.op_data,
                h_init.value_data,
                cu_stream
            )
        if status != 0:
            raise RuntimeError(
                f"Failed executing segmented_reduce, error code: {status}"
            )
        return tmp_bytes

    def _get_cubin(self):
        """Return a ``bytes`` copy of ``cubin_size`` bytes starting at ``cubin``."""
        return PyBytes_FromStringAndSize(
            <const char*>self.build_data.cubin,
            self.build_data.cubin_size
        )
|
|
1386
|
+
|
|
1387
|
+
# -----------------
|
|
1388
|
+
# DeviceMergeSort
|
|
1389
|
+
# -----------------
|
|
1390
|
+
|
|
1391
|
+
|
|
1392
|
+
cdef extern from "cccl/c/merge_sort.h":
    # Build result; `cubin` / `cubin_size` are read by
    # DeviceMergeSortBuildResult._get_cubin.
    cdef struct cccl_device_merge_sort_build_result_t 'cccl_device_merge_sort_build_result_t':
        const char* cubin
        size_t cubin_size

    # Names of the previously unnamed parameters mirror the arguments passed
    # at the call sites in DeviceMergeSortBuildResult.
    cdef CUresult cccl_device_merge_sort_build(
        cccl_device_merge_sort_build_result_t *bld_ptr,
        cccl_iterator_t d_in_keys,
        cccl_iterator_t d_in_items,
        cccl_iterator_t d_out_keys,
        cccl_iterator_t d_out_items,
        cccl_op_t op,
        int cc_major,
        int cc_minor,
        const char *cub_path,
        const char *thrust_path,
        const char *libcudacxx_path,
        const char *ctk_path
    ) nogil

    cdef CUresult cccl_device_merge_sort(
        cccl_device_merge_sort_build_result_t build,
        void *temp_storage,
        size_t *temp_storage_bytes,
        cccl_iterator_t d_in_keys,
        cccl_iterator_t d_in_items,
        cccl_iterator_t d_out_keys,
        cccl_iterator_t d_out_items,
        uint64_t num_items,
        cccl_op_t op,
        CUstream stream
    ) nogil

    cdef CUresult cccl_device_merge_sort_cleanup(
        cccl_device_merge_sort_build_result_t* bld_ptr
    ) nogil
|
|
1423
|
+
|
|
1424
|
+
|
|
1425
|
+
cdef class DeviceMergeSortBuildResult:
    """
    Wraps a ``cccl_device_merge_sort_build_result_t``.

    ``__cinit__`` fills the struct via ``cccl_device_merge_sort_build``;
    ``__dealloc__`` hands it to ``cccl_device_merge_sort_cleanup``.
    """
    cdef cccl_device_merge_sort_build_result_t build_data

    def __cinit__(
        DeviceMergeSortBuildResult self,
        Iterator d_in_keys,
        Iterator d_in_items,
        Iterator d_out_keys,
        Iterator d_out_items,
        Op op,
        CommonData common_data
    ):
        """
        Build the merge sort.

        Raises:
            RuntimeError: if ``cccl_device_merge_sort_build`` returns non-zero.
        """
        cdef CUresult status = -1
        cdef int major = common_data.get_cc_major()
        cdef int minor = common_data.get_cc_minor()
        cdef const char *c_cub = common_data.cub_path_get_c_str()
        cdef const char *c_thrust = common_data.thrust_path_get_c_str()
        cdef const char *c_libcudacxx = common_data.libcudacxx_path_get_c_str()
        cdef const char *c_ctk = common_data.ctk_path_get_c_str()

        # Start from an all-zero struct before the library writes into it.
        memset(&self.build_data, 0, sizeof(cccl_device_merge_sort_build_result_t))

        # The GIL is released for the duration of the build call.
        with nogil:
            status = cccl_device_merge_sort_build(
                &self.build_data,
                d_in_keys.iter_data,
                d_in_items.iter_data,
                d_out_keys.iter_data,
                d_out_items.iter_data,
                op.op_data,
                major,
                minor,
                c_cub,
                c_thrust,
                c_libcudacxx,
                c_ctk,
            )
        if status != 0:
            raise RuntimeError(
                f"Failed building merge_sort, error code: {status}"
            )

    def __dealloc__(DeviceMergeSortBuildResult self):
        cdef CUresult status = -1
        with nogil:
            status = cccl_device_merge_sort_cleanup(&self.build_data)
        # A failed cleanup is only reported, not raised.
        if (status != 0):
            print(f"Return code {status} encountered during merge_sort result cleanup")

    cpdef int compute(
        DeviceMergeSortBuildResult self,
        temp_storage_ptr,
        temp_storage_bytes,
        Iterator d_in_keys,
        Iterator d_in_items,
        Iterator d_out_keys,
        Iterator d_out_items,
        size_t num_items,
        Op op,
        stream
    ):
        """
        Call ``cccl_device_merge_sort`` with this build result.

        ``temp_storage_ptr`` and ``stream`` are converted to C pointers; a
        falsy value becomes NULL. ``temp_storage_bytes`` initialises a size
        variable whose address is passed to the library; that variable's
        value after the call is returned.

        Raises:
            RuntimeError: if ``cccl_device_merge_sort`` returns non-zero.
        """
        # NOTE(review): the declared return type is `int` while the returned
        # variable is `size_t` -- confirm large sizes cannot be narrowed.
        cdef CUresult status = -1
        cdef void *tmp_ptr = NULL
        cdef size_t tmp_bytes
        cdef CUstream cu_stream = NULL

        if temp_storage_ptr:
            tmp_ptr = <void *><uintptr_t>temp_storage_ptr
        tmp_bytes = <size_t>temp_storage_bytes
        if stream:
            cu_stream = <CUstream><uintptr_t>(stream)

        with nogil:
            status = cccl_device_merge_sort(
                self.build_data,
                tmp_ptr,
                &tmp_bytes,
                d_in_keys.iter_data,
                d_in_items.iter_data,
                d_out_keys.iter_data,
                d_out_items.iter_data,
                <uint64_t>num_items,
                op.op_data,
                cu_stream
            )
        if status != 0:
            raise RuntimeError(
                f"Failed executing merge_sort, error code: {status}"
            )
        return tmp_bytes

    def _get_cubin(self):
        """Return a ``bytes`` copy of ``cubin_size`` bytes starting at ``cubin``."""
        return PyBytes_FromStringAndSize(
            <const char*>self.build_data.cubin,
            self.build_data.cubin_size
        )
|
|
1514
|
+
|
|
1515
|
+
|
|
1516
|
+
# -------------------
|
|
1517
|
+
# DeviceUniqueByKey
|
|
1518
|
+
# -------------------
|
|
1519
|
+
|
|
1520
|
+
cdef extern from "cccl/c/unique_by_key.h":
    # Build-result fields accessed from this module.
    cdef struct cccl_device_unique_by_key_build_result_t 'cccl_device_unique_by_key_build_result_t':
        const char* cubin
        size_t cubin_size

    # Trailing parameters are named after the arguments passed at the call
    # site in DeviceUniqueByKeyBuildResult.__cinit__.
    CUresult cccl_device_unique_by_key_build(
        cccl_device_unique_by_key_build_result_t *build_ptr,
        cccl_iterator_t d_keys_in,
        cccl_iterator_t d_values_in,
        cccl_iterator_t d_keys_out,
        cccl_iterator_t d_values_out,
        cccl_iterator_t d_num_selected_out,
        cccl_op_t comparison_op,
        int cc_major,
        int cc_minor,
        const char *cub_path,
        const char *thrust_path,
        const char *libcudacxx_path,
        const char *ctk_path
    ) nogil

    CUresult cccl_device_unique_by_key(
        cccl_device_unique_by_key_build_result_t build,
        void *d_storage_ptr,
        size_t *d_storage_nbytes,
        cccl_iterator_t d_keys_in,
        cccl_iterator_t d_values_in,
        cccl_iterator_t d_keys_out,
        cccl_iterator_t d_values_out,
        cccl_iterator_t d_num_selected_out,
        cccl_op_t comparison_op,
        size_t num_items,
        CUstream stream
    ) nogil

    CUresult cccl_device_unique_by_key_cleanup(
        cccl_device_unique_by_key_build_result_t *build_ptr,
    ) nogil
|
|
1554
|
+
|
|
1555
|
+
|
|
1556
|
+
cdef class DeviceUniqueByKeyBuildResult:
    """Wrapper around a ``cccl_device_unique_by_key_build_result_t``.

    Construction calls ``cccl_device_unique_by_key_build``, ``compute`` calls
    ``cccl_device_unique_by_key`` and deallocation calls
    ``cccl_device_unique_by_key_cleanup`` on the stored build result.
    """
    cdef cccl_device_unique_by_key_build_result_t build_data

    def __cinit__(
        DeviceUniqueByKeyBuildResult self,
        Iterator d_keys_in,
        Iterator d_values_in,
        Iterator d_keys_out,
        Iterator d_values_out,
        Iterator d_num_selected_out,
        Op comparison_op,
        CommonData common_data
    ):
        # Populates ``build_data`` through ``cccl_device_unique_by_key_build``.
        # Raises RuntimeError when that call returns a non-zero status.
        cdef CUresult status = -1
        cdef int cc_major = common_data.get_cc_major()
        cdef int cc_minor = common_data.get_cc_minor()
        cdef const char *cub_path = common_data.cub_path_get_c_str()
        cdef const char *thrust_path = common_data.thrust_path_get_c_str()
        cdef const char *libcudacxx_path = common_data.libcudacxx_path_get_c_str()
        cdef const char *ctk_path = common_data.ctk_path_get_c_str()

        # Hand the build call an all-zero struct.
        memset(&self.build_data, 0, sizeof(cccl_device_unique_by_key_build_result_t))
        # The C entry point is declared ``nogil``; release the GIL around it.
        with nogil:
            status = cccl_device_unique_by_key_build(
                &self.build_data,
                d_keys_in.iter_data,
                d_values_in.iter_data,
                d_keys_out.iter_data,
                d_values_out.iter_data,
                d_num_selected_out.iter_data,
                comparison_op.op_data,
                cc_major,
                cc_minor,
                cub_path,
                thrust_path,
                libcudacxx_path,
                ctk_path,
            )
        if status != 0:
            raise RuntimeError(
                f"Failed building unique_by_key, error code: {status}"
            )

    def __dealloc__(DeviceUniqueByKeyBuildResult self):
        # Cleanup failures are only reported via print; nothing is raised
        # from the deallocator.
        cdef CUresult status = -1
        with nogil:
            status = cccl_device_unique_by_key_cleanup(&self.build_data)
        if (status != 0):
            print(f"Return code {status} encountered during unique_by_key result cleanup")

    cpdef size_t compute(
        DeviceUniqueByKeyBuildResult self,
        temp_storage_ptr,
        temp_storage_bytes,
        Iterator d_keys_in,
        Iterator d_values_in,
        Iterator d_keys_out,
        Iterator d_values_out,
        Iterator d_num_selected_out,
        Op comparison_op,
        size_t num_items,
        stream
    ):
        """Call ``cccl_device_unique_by_key`` with the GIL released.

        ``temp_storage_ptr`` and ``stream`` are converted from integer
        addresses; a falsy value is passed to the C call as ``NULL``.
        ``temp_storage_bytes`` is passed to the C call by pointer.

        Returns the storage size as left by the C call. The return type is
        ``size_t`` (previously ``int``) so that sizes above ``INT_MAX`` are
        not truncated.

        Raises ``RuntimeError`` if the C call returns a non-zero status.
        """
        cdef CUresult status = -1
        cdef void *storage_ptr = (<void *><uintptr_t>temp_storage_ptr) if temp_storage_ptr else NULL
        cdef size_t storage_sz = <size_t>temp_storage_bytes
        cdef CUstream c_stream = <CUstream><uintptr_t>(stream) if stream else NULL

        with nogil:
            status = cccl_device_unique_by_key(
                self.build_data,
                storage_ptr,
                &storage_sz,
                d_keys_in.iter_data,
                d_values_in.iter_data,
                d_keys_out.iter_data,
                d_values_out.iter_data,
                d_num_selected_out.iter_data,
                comparison_op.op_data,
                <uint64_t>num_items,
                c_stream
            )

        if status != 0:
            raise RuntimeError(
                f"Failed executing unique_by_key, error code: {status}"
            )
        return storage_sz

    def _get_cubin(self):
        """Return ``build_data.cubin`` as ``bytes`` of length ``build_data.cubin_size``."""
        return PyBytes_FromStringAndSize(
            <const char*>self.build_data.cubin,
            self.build_data.cubin_size
        )
|
|
1650
|
+
|
|
1651
|
+
# -----------------
|
|
1652
|
+
# DeviceRadixSort
|
|
1653
|
+
# -----------------
|
|
1654
|
+
|
|
1655
|
+
cdef extern from "cccl/c/radix_sort.h":
    # Build-result fields accessed from this module.
    cdef struct cccl_device_radix_sort_build_result_t 'cccl_device_radix_sort_build_result_t':
        const char* cubin
        size_t cubin_size

    # Trailing parameters are named after the arguments passed at the call
    # site in DeviceRadixSortBuildResult.__cinit__.
    CUresult cccl_device_radix_sort_build(
        cccl_device_radix_sort_build_result_t *build_ptr,
        cccl_sort_order_t sort_order,
        cccl_iterator_t d_keys_in,
        cccl_iterator_t d_values_in,
        cccl_op_t decomposer,
        const char* decomposer_return_type,
        int cc_major,
        int cc_minor,
        const char *cub_path,
        const char *thrust_path,
        const char *libcudacxx_path,
        const char *ctk_path
    ) nogil

    CUresult cccl_device_radix_sort(
        cccl_device_radix_sort_build_result_t build,
        void *d_storage_ptr,
        size_t *d_storage_nbytes,
        cccl_iterator_t d_keys_in,
        cccl_iterator_t d_keys_out,
        cccl_iterator_t d_values_in,
        cccl_iterator_t d_values_out,
        cccl_op_t decomposer,
        size_t num_items,
        int begin_bit,
        int end_bit,
        bint is_overwrite_okay,
        int* selector,
        CUstream stream
    ) nogil

    CUresult cccl_device_radix_sort_cleanup(
        cccl_device_radix_sort_build_result_t *build_ptr,
    ) nogil
|
|
1690
|
+
|
|
1691
|
+
|
|
1692
|
+
cdef class DeviceRadixSortBuildResult:
    """Wrapper around a ``cccl_device_radix_sort_build_result_t``.

    Construction calls ``cccl_device_radix_sort_build`` with the requested
    sort order, ``compute`` calls ``cccl_device_radix_sort`` and deallocation
    calls ``cccl_device_radix_sort_cleanup`` on the stored build result.
    """
    cdef cccl_device_radix_sort_build_result_t build_data

    def __dealloc__(DeviceRadixSortBuildResult self):
        # Cleanup failures are only reported via print; nothing is raised
        # from the deallocator.
        cdef CUresult status = -1
        with nogil:
            status = cccl_device_radix_sort_cleanup(&self.build_data)
        if (status != 0):
            print(f"Return code {status} encountered during radix_sort result cleanup")

    def __cinit__(
        DeviceRadixSortBuildResult self,
        cccl_sort_order_t order,
        Iterator d_keys_in,
        Iterator d_values_in,
        Op decomposer_op,
        const char* decomposer_return_type,
        CommonData common_data
    ):
        # Populates ``build_data`` through ``cccl_device_radix_sort_build``.
        # Raises RuntimeError when that call returns a non-zero status.
        cdef CUresult status = -1
        cdef int cc_major = common_data.get_cc_major()
        cdef int cc_minor = common_data.get_cc_minor()
        cdef const char *cub_path = common_data.cub_path_get_c_str()
        cdef const char *thrust_path = common_data.thrust_path_get_c_str()
        cdef const char *libcudacxx_path = common_data.libcudacxx_path_get_c_str()
        cdef const char *ctk_path = common_data.ctk_path_get_c_str()

        # Hand the build call an all-zero struct.
        memset(&self.build_data, 0, sizeof(cccl_device_radix_sort_build_result_t))
        # The C entry point is declared ``nogil``; release the GIL around it.
        with nogil:
            status = cccl_device_radix_sort_build(
                &self.build_data,
                order,
                d_keys_in.iter_data,
                d_values_in.iter_data,
                decomposer_op.op_data,
                decomposer_return_type,
                cc_major,
                cc_minor,
                cub_path,
                thrust_path,
                libcudacxx_path,
                ctk_path,
            )
        if status != 0:
            raise RuntimeError(
                f"Failed building radix_sort, error code: {status}"
            )

    cpdef tuple compute(
        DeviceRadixSortBuildResult self,
        temp_storage_ptr,
        temp_storage_bytes,
        Iterator d_keys_in,
        Iterator d_keys_out,
        Iterator d_values_in,
        Iterator d_values_out,
        Op decomposer_op,
        size_t num_items,
        int begin_bit,
        int end_bit,
        bint is_overwrite_okay,
        selector,
        stream
    ):
        """Call ``cccl_device_radix_sort`` with the GIL released.

        ``temp_storage_ptr`` and ``stream`` are converted from integer
        addresses; a falsy value is passed to the C call as ``NULL``.
        ``temp_storage_bytes`` and ``selector`` are passed to the C call by
        pointer.

        Returns a ``(storage_size, selector)`` tuple holding the values of
        those two variables after the call.

        Raises ``RuntimeError`` if the C call returns a non-zero status.
        """
        cdef CUresult status = -1
        # uintptr_t matches the pointer casts used by the sibling classes.
        cdef void *storage_ptr = (<void *><uintptr_t>temp_storage_ptr) if temp_storage_ptr else NULL
        cdef size_t storage_sz = <size_t>temp_storage_bytes
        cdef int selector_int = <int>selector
        cdef CUstream c_stream = <CUstream><uintptr_t>(stream) if stream else NULL

        with nogil:
            status = cccl_device_radix_sort(
                self.build_data,
                storage_ptr,
                &storage_sz,
                d_keys_in.iter_data,
                d_keys_out.iter_data,
                d_values_in.iter_data,
                d_values_out.iter_data,
                decomposer_op.op_data,
                <uint64_t>num_items,
                begin_bit,
                end_bit,
                is_overwrite_okay,
                &selector_int,
                c_stream
            )

        if status != 0:
            # The sort order is fixed at build time and is not known here,
            # so the message must not claim "ascending".
            raise RuntimeError(
                f"Failed executing radix_sort, error code: {status}"
            )
        return <object>storage_sz, <object>selector_int

    def _get_cubin(self):
        """Return ``build_data.cubin`` as ``bytes`` of length ``build_data.cubin_size``."""
        return PyBytes_FromStringAndSize(
            <const char*>self.build_data.cubin,
            self.build_data.cubin_size
        )
|
|
1792
|
+
|
|
1793
|
+
# --------------------------------------------
|
|
1794
|
+
# DeviceUnaryTransform/DeviceBinaryTransform
|
|
1795
|
+
# --------------------------------------------
|
|
1796
|
+
cdef extern from "cccl/c/transform.h":
    # Build-result fields accessed from this module; the same struct type is
    # used by the unary and the binary transform declarations below.
    cdef struct cccl_device_transform_build_result_t:
        const char* cubin
        size_t cubin_size

    # Trailing parameters of the two build functions are named after the
    # arguments passed at the call sites in the transform classes.
    CUresult cccl_device_unary_transform_build(
        cccl_device_transform_build_result_t *build_ptr,
        cccl_iterator_t d_in,
        cccl_iterator_t d_out,
        cccl_op_t op,
        int cc_major,
        int cc_minor,
        const char *cub_path,
        const char *thrust_path,
        const char *libcudacxx_path,
        const char *ctk_path
    ) nogil

    CUresult cccl_device_unary_transform(
        cccl_device_transform_build_result_t build,
        cccl_iterator_t d_in,
        cccl_iterator_t d_out,
        uint64_t num_items,
        cccl_op_t op,
        CUstream stream
    ) nogil

    CUresult cccl_device_binary_transform_build(
        cccl_device_transform_build_result_t* build_ptr,
        cccl_iterator_t d_in1,
        cccl_iterator_t d_in2,
        cccl_iterator_t d_out,
        cccl_op_t op,
        int cc_major,
        int cc_minor,
        const char *cub_path,
        const char *thrust_path,
        const char *libcudacxx_path,
        const char *ctk_path
    ) nogil

    CUresult cccl_device_binary_transform(
        cccl_device_transform_build_result_t build,
        cccl_iterator_t d_in1,
        cccl_iterator_t d_in2,
        cccl_iterator_t d_out,
        uint64_t num_items,
        cccl_op_t op,
        CUstream stream
    ) nogil

    CUresult cccl_device_transform_cleanup(
        cccl_device_transform_build_result_t *build_ptr,
    ) nogil
|
|
1838
|
+
|
|
1839
|
+
|
|
1840
|
+
cdef class DeviceUnaryTransform:
    """Wrapper around a ``cccl_device_transform_build_result_t`` for the unary case.

    Construction calls ``cccl_device_unary_transform_build``, ``compute``
    calls ``cccl_device_unary_transform`` and deallocation calls
    ``cccl_device_transform_cleanup`` on the stored build result.
    """
    cdef cccl_device_transform_build_result_t build_data

    def __cinit__(
        self,
        Iterator d_in,
        Iterator d_out,
        Op op,
        CommonData common_data
    ):
        # Zero the struct first, then gather the build parameters.
        memset(&self.build_data, 0, sizeof(cccl_device_transform_build_result_t))

        cdef CUresult rc = -1
        cdef int major = common_data.get_cc_major()
        cdef int minor = common_data.get_cc_minor()
        cdef const char *cub_dir = common_data.cub_path_get_c_str()
        cdef const char *thrust_dir = common_data.thrust_path_get_c_str()
        cdef const char *libcudacxx_dir = common_data.libcudacxx_path_get_c_str()
        cdef const char *ctk_dir = common_data.ctk_path_get_c_str()

        # The C entry point is declared ``nogil``; release the GIL around it.
        with nogil:
            rc = cccl_device_unary_transform_build(
                &self.build_data,
                d_in.iter_data,
                d_out.iter_data,
                op.op_data,
                major,
                minor,
                cub_dir,
                thrust_dir,
                libcudacxx_dir,
                ctk_dir,
            )
        if rc != 0:
            raise RuntimeError("Failed to build unary transform")

    def __dealloc__(DeviceUnaryTransform self):
        # Cleanup failures are only reported via print.
        cdef CUresult rc = -1
        with nogil:
            rc = cccl_device_transform_cleanup(&self.build_data)
        if rc != 0:
            print(f"Return code {rc} encountered during unary transform result cleanup")

    cpdef void compute(
        DeviceUnaryTransform self,
        Iterator d_in,
        Iterator d_out,
        size_t num_items,
        Op op,
        stream
    ):
        """Call ``cccl_device_unary_transform`` with the GIL released.

        ``stream`` is converted from an integer address; a falsy value is
        passed to the C call as ``NULL``.

        Raises ``RuntimeError`` if the C call returns a non-zero status.
        """
        cdef CUresult rc = -1
        cdef CUstream stream_handle = NULL
        if stream:
            stream_handle = <CUstream><uintptr_t>(stream)
        with nogil:
            rc = cccl_device_unary_transform(
                self.build_data,
                d_in.iter_data,
                d_out.iter_data,
                <uint64_t>num_items,
                op.op_data,
                stream_handle
            )
        if rc != 0:
            raise RuntimeError("Failed to compute unary transform")

    def _get_cubin(self):
        """Return ``build_data.cubin`` as ``bytes`` of length ``build_data.cubin_size``."""
        cdef const char *cubin_ptr = <const char*>self.build_data.cubin
        return PyBytes_FromStringAndSize(cubin_ptr, self.build_data.cubin_size)
|
|
1911
|
+
|
|
1912
|
+
|
|
1913
|
+
cdef class DeviceBinaryTransform:
    """Wrapper around a ``cccl_device_transform_build_result_t`` for the binary case.

    Construction calls ``cccl_device_binary_transform_build``, ``compute``
    calls ``cccl_device_binary_transform`` and deallocation calls
    ``cccl_device_transform_cleanup`` on the stored build result.
    """
    cdef cccl_device_transform_build_result_t build_data

    def __cinit__(
        self,
        Iterator d_in1,
        Iterator d_in2,
        Iterator d_out,
        Op op,
        CommonData common_data
    ):
        # Zero the struct first, then gather the build parameters.
        memset(&self.build_data, 0, sizeof(cccl_device_transform_build_result_t))

        cdef CUresult rc = -1
        cdef int major = common_data.get_cc_major()
        cdef int minor = common_data.get_cc_minor()
        cdef const char *cub_dir = common_data.cub_path_get_c_str()
        cdef const char *thrust_dir = common_data.thrust_path_get_c_str()
        cdef const char *libcudacxx_dir = common_data.libcudacxx_path_get_c_str()
        cdef const char *ctk_dir = common_data.ctk_path_get_c_str()

        # The C entry point is declared ``nogil``; release the GIL around it.
        with nogil:
            rc = cccl_device_binary_transform_build(
                &self.build_data,
                d_in1.iter_data,
                d_in2.iter_data,
                d_out.iter_data,
                op.op_data,
                major,
                minor,
                cub_dir,
                thrust_dir,
                libcudacxx_dir,
                ctk_dir,
            )
        if rc != 0:
            raise RuntimeError("Failed to build binary transform")

    def __dealloc__(DeviceBinaryTransform self):
        # Cleanup failures are only reported via print.
        cdef CUresult rc = -1
        with nogil:
            rc = cccl_device_transform_cleanup(&self.build_data)
        if rc != 0:
            print(f"Return code {rc} encountered during binary transform result cleanup")

    cpdef void compute(
        DeviceBinaryTransform self,
        Iterator d_in1,
        Iterator d_in2,
        Iterator d_out,
        size_t num_items,
        Op op,
        stream
    ):
        """Call ``cccl_device_binary_transform`` with the GIL released.

        ``stream`` is converted from an integer address; a falsy value is
        passed to the C call as ``NULL``.

        Raises ``RuntimeError`` if the C call returns a non-zero status.
        """
        cdef CUresult rc = -1
        cdef CUstream stream_handle = NULL
        if stream:
            stream_handle = <CUstream><uintptr_t>(stream)
        with nogil:
            rc = cccl_device_binary_transform(
                self.build_data,
                d_in1.iter_data,
                d_in2.iter_data,
                d_out.iter_data,
                <uint64_t>num_items,
                op.op_data,
                stream_handle
            )
        if rc != 0:
            raise RuntimeError("Failed to compute binary transform")

    def _get_cubin(self):
        """Return ``build_data.cubin`` as ``bytes`` of length ``build_data.cubin_size``."""
        cdef const char *cubin_ptr = <const char*>self.build_data.cubin
        return PyBytes_FromStringAndSize(cubin_ptr, self.build_data.cubin_size)
|
|
1987
|
+
|
|
1988
|
+
|
|
1989
|
+
# -----------------
|
|
1990
|
+
# DeviceHistogram
|
|
1991
|
+
# -----------------
|
|
1992
|
+
cdef extern from "cccl/c/histogram.h":
    # Build-result fields accessed from this module.
    cdef struct cccl_device_histogram_build_result_t 'cccl_device_histogram_build_result_t':
        const char* cubin
        size_t cubin_size

    # Trailing parameters are named after the arguments passed at the call
    # site in DeviceHistogramBuildResult.__cinit__.
    CUresult cccl_device_histogram_build(
        cccl_device_histogram_build_result_t *build_ptr,
        int num_channels,
        int num_active_channels,
        cccl_iterator_t d_samples,
        int num_output_levels_val,
        cccl_iterator_t d_output_histograms,
        cccl_value_t h_levels,
        int64_t num_rows,
        int64_t row_stride_samples,
        bint is_evenly_segmented,
        int cc_major,
        int cc_minor,
        const char *cub_path,
        const char *thrust_path,
        const char *libcudacxx_path,
        const char *ctk_path
    ) nogil

    CUresult cccl_device_histogram_even(
        cccl_device_histogram_build_result_t build,
        void *d_storage_ptr,
        size_t *d_storage_nbytes,
        cccl_iterator_t d_samples,
        cccl_iterator_t d_output_histograms,
        cccl_value_t num_output_levels,
        cccl_value_t lower_level,
        cccl_value_t upper_level,
        int64_t num_row_pixels,
        int64_t num_rows,
        int64_t row_stride_samples,
        CUstream stream
    ) nogil

    CUresult cccl_device_histogram_cleanup(
        cccl_device_histogram_build_result_t *build_ptr,
    ) nogil
|
|
2029
|
+
|
|
2030
|
+
|
|
2031
|
+
cdef class DeviceHistogramBuildResult:
    """Wrapper around a ``cccl_device_histogram_build_result_t``.

    Construction calls ``cccl_device_histogram_build``, ``compute_even`` calls
    ``cccl_device_histogram_even`` and deallocation calls
    ``cccl_device_histogram_cleanup`` on the stored build result.
    """
    cdef cccl_device_histogram_build_result_t build_data

    def __dealloc__(DeviceHistogramBuildResult self):
        # Cleanup failures are only reported via print; nothing is raised
        # from the deallocator.
        cdef CUresult status = -1
        with nogil:
            status = cccl_device_histogram_cleanup(&self.build_data)
        if (status != 0):
            print(f"Return code {status} encountered during histogram result cleanup")

    def __cinit__(
        DeviceHistogramBuildResult self,
        int num_channels,
        int num_active_channels,
        Iterator d_samples,
        int num_levels,
        Iterator d_histogram,
        Value h_levels,
        int64_t num_rows,
        int64_t row_stride_samples,
        bint is_evenly_segmented,
        CommonData common_data
    ):
        # Populates ``build_data`` through ``cccl_device_histogram_build``.
        # ``num_rows`` and ``row_stride_samples`` are typed int64_t to match
        # the declared C parameters; typing them as ``int`` rejected values
        # the C function is declared to accept.
        # Raises RuntimeError when the build call returns a non-zero status.
        cdef CUresult status = -1
        cdef int cc_major = common_data.get_cc_major()
        cdef int cc_minor = common_data.get_cc_minor()
        cdef const char *cub_path = common_data.cub_path_get_c_str()
        cdef const char *thrust_path = common_data.thrust_path_get_c_str()
        cdef const char *libcudacxx_path = common_data.libcudacxx_path_get_c_str()
        cdef const char *ctk_path = common_data.ctk_path_get_c_str()

        # Hand the build call an all-zero struct.
        memset(&self.build_data, 0, sizeof(cccl_device_histogram_build_result_t))
        # The C entry point is declared ``nogil``; release the GIL around it.
        with nogil:
            status = cccl_device_histogram_build(
                &self.build_data,
                num_channels,
                num_active_channels,
                d_samples.iter_data,
                num_levels,
                d_histogram.iter_data,
                h_levels.value_data,
                num_rows,
                row_stride_samples,
                is_evenly_segmented,
                cc_major,
                cc_minor,
                cub_path,
                thrust_path,
                libcudacxx_path,
                ctk_path,
            )
        if status != 0:
            raise RuntimeError(
                f"Failed building histogram, error code: {status}"
            )

    cpdef size_t compute_even(
        DeviceHistogramBuildResult self,
        temp_storage_ptr,
        temp_storage_bytes,
        Iterator d_samples,
        Iterator d_histogram,
        Value h_num_output_levels,
        Value h_lower_level,
        Value h_upper_level,
        int64_t num_row_pixels,
        int64_t num_rows,
        int64_t row_stride_samples,
        stream
    ):
        """Call ``cccl_device_histogram_even`` with the GIL released.

        ``temp_storage_ptr`` and ``stream`` are converted from integer
        addresses; a falsy value is passed to the C call as ``NULL``.
        ``temp_storage_bytes`` is passed to the C call by pointer.
        ``num_row_pixels``, ``num_rows`` and ``row_stride_samples`` are typed
        ``int64_t`` to match the declared C parameters.

        Returns the storage size as left by the C call. The return type is
        ``size_t`` (previously ``int``) so that sizes above ``INT_MAX`` are
        not truncated.

        Raises ``RuntimeError`` if the C call returns a non-zero status.
        """
        cdef CUresult status = -1
        cdef void *storage_ptr = (<void *><size_t>temp_storage_ptr) if temp_storage_ptr else NULL
        cdef size_t storage_sz = <size_t>temp_storage_bytes
        cdef CUstream c_stream = <CUstream><size_t>(stream) if stream else NULL

        with nogil:
            status = cccl_device_histogram_even(
                self.build_data,
                storage_ptr,
                &storage_sz,
                d_samples.iter_data,
                d_histogram.iter_data,
                h_num_output_levels.value_data,
                h_lower_level.value_data,
                h_upper_level.value_data,
                num_row_pixels,
                num_rows,
                row_stride_samples,
                c_stream
            )
        if status != 0:
            raise RuntimeError(
                f"Failed executing histogram, error code: {status}"
            )
        return storage_sz

    def _get_cubin(self):
        """Return ``build_data.cubin`` as ``bytes`` of length ``build_data.cubin_size``."""
        return PyBytes_FromStringAndSize(
            <const char*>self.build_data.cubin,
            self.build_data.cubin_size
        )
|
|
2134
|
+
|
|
2135
|
+
|
|
2136
|
+
# ----------------------------------
|
|
2137
|
+
# DeviceThreeWayPartitionBuildResult
|
|
2138
|
+
# ----------------------------------
|
|
2139
|
+
cdef extern from "cccl/c/three_way_partition.h":
    # Build-result fields accessed from this module.
    cdef struct cccl_device_three_way_partition_build_result_t 'cccl_device_three_way_partition_build_result_t':
        const char* cubin
        size_t cubin_size

    # Trailing parameters are named after the arguments passed at the call
    # site in DeviceThreeWayPartitionBuildResult.__cinit__.
    CUresult cccl_device_three_way_partition_build(
        cccl_device_three_way_partition_build_result_t *build_ptr,
        cccl_iterator_t d_in,
        cccl_iterator_t d_first_part_out,
        cccl_iterator_t d_second_part_out,
        cccl_iterator_t d_unselected_out,
        cccl_iterator_t d_num_selected_out,
        cccl_op_t select_first_part_op,
        cccl_op_t select_second_part_op,
        int cc_major,
        int cc_minor,
        const char *cub_path,
        const char *thrust_path,
        const char *libcudacxx_path,
        const char *ctk_path
    ) nogil

    CUresult cccl_device_three_way_partition(
        cccl_device_three_way_partition_build_result_t build,
        void* d_temp_storage,
        size_t* temp_storage_bytes,
        cccl_iterator_t d_in,
        cccl_iterator_t d_first_part_out,
        cccl_iterator_t d_second_part_out,
        cccl_iterator_t d_unselected_out,
        cccl_iterator_t d_num_selected_out,
        cccl_op_t select_first_part_op,
        cccl_op_t select_second_part_op,
        int64_t num_items,
        CUstream stream
    ) nogil

    CUresult cccl_device_three_way_partition_cleanup(
        cccl_device_three_way_partition_build_result_t *build_ptr
    ) nogil
|
|
2174
|
+
|
|
2175
|
+
|
|
2176
|
+
cdef class DeviceThreeWayPartitionBuildResult:
    """Wrapper around a ``cccl_device_three_way_partition_build_result_t``.

    Construction calls ``cccl_device_three_way_partition_build``, ``compute``
    calls ``cccl_device_three_way_partition`` and deallocation calls
    ``cccl_device_three_way_partition_cleanup`` on the stored build result.
    """
    cdef cccl_device_three_way_partition_build_result_t build_data

    def __dealloc__(DeviceThreeWayPartitionBuildResult self):
        # Cleanup failures are only reported via print; nothing is raised
        # from the deallocator.
        cdef CUresult status = -1
        with nogil:
            status = cccl_device_three_way_partition_cleanup(&self.build_data)
        if (status != 0):
            print(f"Return code {status} encountered during three_way_partition result cleanup")

    def __cinit__(
        DeviceThreeWayPartitionBuildResult self,
        Iterator d_in,
        Iterator d_first_part_out,
        Iterator d_second_part_out,
        Iterator d_unselected_out,
        Iterator d_num_selected_out,
        Op select_first_part_op,
        Op select_second_part_op,
        CommonData common_data
    ):
        # Populates ``build_data`` through
        # ``cccl_device_three_way_partition_build``.
        # Raises RuntimeError when that call returns a non-zero status.
        cdef CUresult status = -1
        cdef int cc_major = common_data.get_cc_major()
        cdef int cc_minor = common_data.get_cc_minor()
        cdef const char *cub_path = common_data.cub_path_get_c_str()
        cdef const char *thrust_path = common_data.thrust_path_get_c_str()
        cdef const char *libcudacxx_path = common_data.libcudacxx_path_get_c_str()
        cdef const char *ctk_path = common_data.ctk_path_get_c_str()

        # Hand the build call an all-zero struct.
        memset(&self.build_data, 0, sizeof(cccl_device_three_way_partition_build_result_t))
        # The C entry point is declared ``nogil``; release the GIL around it.
        with nogil:
            status = cccl_device_three_way_partition_build(
                &self.build_data,
                d_in.iter_data,
                d_first_part_out.iter_data,
                d_second_part_out.iter_data,
                d_unselected_out.iter_data,
                d_num_selected_out.iter_data,
                select_first_part_op.op_data,
                select_second_part_op.op_data,
                cc_major,
                cc_minor,
                cub_path,
                thrust_path,
                libcudacxx_path,
                ctk_path,
            )
        if status != 0:
            raise RuntimeError(
                f"Failed building three_way_partition, error code: {status}"
            )

    cpdef size_t compute(
        DeviceThreeWayPartitionBuildResult self,
        temp_storage_ptr,
        temp_storage_bytes,
        Iterator d_in,
        Iterator d_first_part_out,
        Iterator d_second_part_out,
        Iterator d_unselected_out,
        Iterator d_num_selected_out,
        Op select_first_part_op,
        Op select_second_part_op,
        size_t num_items,
        stream
    ):
        """Call ``cccl_device_three_way_partition`` with the GIL released.

        ``temp_storage_ptr`` and ``stream`` are converted from integer
        addresses; a falsy value is passed to the C call as ``NULL``.
        ``temp_storage_bytes`` is passed to the C call by pointer.

        Returns the storage size as left by the C call. The return type is
        ``size_t`` (previously ``int``) so that sizes above ``INT_MAX`` are
        not truncated.

        Raises ``RuntimeError`` if the C call returns a non-zero status.
        """
        cdef CUresult status = -1
        cdef void *storage_ptr = (<void *><uintptr_t>temp_storage_ptr) if temp_storage_ptr else NULL
        cdef size_t storage_sz = <size_t>temp_storage_bytes
        cdef CUstream c_stream = <CUstream><uintptr_t>(stream) if stream else NULL

        with nogil:
            status = cccl_device_three_way_partition(
                self.build_data,
                storage_ptr,
                &storage_sz,
                d_in.iter_data,
                d_first_part_out.iter_data,
                d_second_part_out.iter_data,
                d_unselected_out.iter_data,
                d_num_selected_out.iter_data,
                select_first_part_op.op_data,
                select_second_part_op.op_data,
                # Cast matches the int64_t parameter of the extern declaration.
                <int64_t>num_items,
                c_stream
            )
        if status != 0:
            raise RuntimeError(
                f"Failed executing three_way_partition, error code: {status}"
            )
        return storage_sz

    def _get_cubin(self):
        """Return ``build_data.cubin`` as ``bytes`` of length ``build_data.cubin_size``."""
        return PyBytes_FromStringAndSize(
            <const char*>self.build_data.cubin,
            self.build_data.cubin_size
        )
|