cuda-cccl 0.1.3.2.0.dev271__cp310-cp310-manylinux_2_26_x86_64.whl → 0.2.1__cp310-cp310-manylinux_2_26_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/_cuda_version_utils.py +0 -22
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +6 -2
- cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +3 -1
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +5 -2
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +4 -2
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +3 -2
- cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +6 -3
- cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +4 -2
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +6 -3
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +3 -2
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +5 -1
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +8 -2
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +4 -1
- cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +5 -4
- cuda/cccl/headers/include/cub/block/block_exchange.cuh +3 -1
- cuda/cccl/headers/include/cub/block/block_histogram.cuh +1 -1
- cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +23 -24
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +5 -3
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +5 -2
- cuda/cccl/headers/include/cub/block/block_reduce.cuh +2 -1
- cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +0 -3
- cuda/cccl/headers/include/cub/block/block_scan.cuh +2 -1
- cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +9 -5
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +1 -1
- cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +2 -2
- cuda/cccl/headers/include/cub/detail/array_utils.cuh +6 -5
- cuda/cccl/headers/include/cub/detail/choose_offset.cuh +8 -2
- cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +11 -4
- cuda/cccl/headers/include/cub/detail/integer_utils.cuh +7 -3
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +4 -4
- cuda/cccl/headers/include/cub/detail/ptx-json/json.h +5 -4
- cuda/cccl/headers/include/cub/detail/ptx-json/object.h +1 -1
- cuda/cccl/headers/include/cub/detail/ptx-json/string.h +0 -18
- cuda/cccl/headers/include/cub/detail/ptx-json/value.h +4 -3
- cuda/cccl/headers/include/cub/detail/rfa.cuh +9 -2
- cuda/cccl/headers/include/cub/detail/type_traits.cuh +15 -7
- cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +3 -2
- cuda/cccl/headers/include/cub/device/device_for.cuh +5 -2
- cuda/cccl/headers/include/cub/device/device_histogram.cuh +3 -1
- cuda/cccl/headers/include/cub/device/device_memcpy.cuh +1 -1
- cuda/cccl/headers/include/cub/device/device_merge.cuh +2 -1
- cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3 -1
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +10 -1
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +9 -2
- cuda/cccl/headers/include/cub/device/device_select.cuh +5 -1
- cuda/cccl/headers/include/cub/device/device_transform.cuh +109 -26
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +2 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +3 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +4 -3
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +0 -2
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +2 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +7 -3
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +3 -2
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +6 -2
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +13 -2
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +79 -40
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +4 -3
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +5 -2
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +6 -3
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +6 -2
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +5 -4
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +6 -2
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +3 -2
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +6 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +8 -12
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +3 -1
- cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +11 -3
- cuda/cccl/headers/include/cub/device/dispatch/kernels/radix_sort.cuh +5 -1
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +7 -17
- cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +0 -2
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +26 -4
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +3 -2
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +4 -2
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +2 -1
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +10 -4
- cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +3 -2
- cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +3 -5
- cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +6 -7
- cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +4 -6
- cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +4 -6
- cuda/cccl/headers/include/cub/thread/thread_load.cuh +3 -1
- cuda/cccl/headers/include/cub/thread/thread_operators.cuh +7 -3
- cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +11 -17
- cuda/cccl/headers/include/cub/thread/thread_search.cuh +2 -1
- cuda/cccl/headers/include/cub/thread/thread_simd.cuh +7 -13
- cuda/cccl/headers/include/cub/thread/thread_sort.cuh +3 -2
- cuda/cccl/headers/include/cub/thread/thread_store.cuh +2 -1
- cuda/cccl/headers/include/cub/util_device.cuh +9 -9
- cuda/cccl/headers/include/cub/util_macro.cuh +0 -2
- cuda/cccl/headers/include/cub/util_math.cuh +4 -1
- cuda/cccl/headers/include/cub/util_type.cuh +18 -29
- cuda/cccl/headers/include/cub/util_vsmem.cuh +5 -3
- cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +1 -1
- cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +1 -1
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +9 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -1
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +8 -6
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +5 -3
- cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +1 -1
- cuda/cccl/headers/include/cub/warp/warp_load.cuh +1 -1
- cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +2 -2
- cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +5 -3
- cuda/cccl/headers/include/cub/warp/warp_scan.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -1
- cuda/cccl/headers/include/cub/warp/warp_utils.cuh +3 -3
- cuda/cccl/headers/include/cuda/__complex/complex.h +238 -0
- cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +93 -0
- cuda/cccl/headers/include/cuda/__complex/traits.h +64 -0
- cuda/cccl/headers/include/cuda/__complex_ +28 -0
- cuda/cccl/headers/include/cuda/__execution/output_ordering.h +89 -0
- cuda/cccl/headers/include/cuda/__fwd/complex.h +48 -0
- cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +98 -60
- cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +132 -114
- cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +46 -36
- cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +86 -56
- cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +66 -29
- cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +123 -63
- cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +71 -62
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +95 -99
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +98 -99
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +101 -99
- cuda/cccl/headers/include/cuda/__iterator/zip_function.h +14 -6
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +198 -103
- cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +3 -3
- cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +1 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +1 -1
- cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +27 -0
- cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +8 -0
- cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +3 -3
- cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +200 -0
- cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +60 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +118 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +15 -0
- cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +105 -153
- cuda/cccl/headers/include/cuda/std/__cmath/nan.h +54 -0
- cuda/cccl/headers/include/cuda/std/__complex/complex.h +5 -7
- cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +1 -0
- cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +186 -119
- cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +5 -3
- cuda/cccl/headers/include/cuda/std/__floating_point/decompose.h +69 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +6 -0
- cuda/cccl/headers/include/cuda/std/__fwd/complex.h +32 -0
- cuda/cccl/headers/include/cuda/std/__internal/features.h +6 -0
- cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +3 -3
- cuda/cccl/headers/include/cuda/std/__iterator/advance.h +2 -2
- cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +2 -3
- cuda/cccl/headers/include/cuda/std/__iterator/distance.h +1 -1
- cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +26 -100
- cuda/cccl/headers/include/cuda/std/__iterator/next.h +2 -2
- cuda/cccl/headers/include/cuda/std/__iterator/prev.h +2 -2
- cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +2 -2
- cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +2 -3
- cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +6 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +10 -12
- cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +1 -1
- cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +8 -8
- cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +0 -1
- cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +0 -2
- cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +0 -1
- cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +0 -1
- cuda/cccl/headers/include/cuda/std/cmath +63 -1
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +1 -2
- cuda/cccl/headers/include/cuda/std/inplace_vector +9 -9
- cuda/cccl/headers/include/cuda/std/numbers +0 -1
- cuda/cccl/headers/include/thrust/detail/pointer.h +1 -1
- cuda/cccl/headers/include/thrust/detail/vector_base.h +2 -2
- cuda/cccl/headers/include/thrust/detail/vector_base.inl +2 -4
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +9 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +4 -1
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +39 -56
- cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +6 -10
- cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +1 -2
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +12 -1
- cuda/cccl/headers/include/thrust/iterator/{detail/iterator_traversal_tags.h → iterator_traversal_tags.h} +14 -0
- cuda/cccl/headers/include/thrust/iterator/retag.h +5 -5
- cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +7 -7
- cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +2 -2
- cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +77 -107
- cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +2 -5
- cuda/cccl/headers/include/thrust/system/cpp/detail/par.h +2 -5
- cuda/cccl/headers/include/thrust/system/cpp/memory.h +2 -5
- cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +2 -5
- cuda/cccl/headers/include/thrust/system/cpp/pointer.h +2 -5
- cuda/cccl/headers/include/thrust/system/cpp/vector.h +2 -5
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +0 -16
- cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +36 -18
- cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +26 -51
- cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +25 -14
- cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +1 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +18 -21
- cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +0 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +19 -23
- cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +0 -11
- cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +7 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +15 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +0 -1
- cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +2 -5
- cuda/cccl/headers/include/thrust/system/detail/errno.h +2 -7
- cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +3 -10
- cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/count.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/find.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +2 -17
- cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +2 -17
- cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +2 -9
- cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +3 -9
- cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +2 -10
- cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +3 -9
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +3 -9
- cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +2 -8
- cuda/cccl/headers/include_paths.py +6 -9
- cuda/cccl/parallel/experimental/__init__.py +2 -4
- cuda/cccl/parallel/experimental/_cccl_interop.py +53 -27
- cuda/cccl/parallel/experimental/algorithms/_histogram.py +2 -2
- cuda/cccl/parallel/experimental/algorithms/_merge_sort.py +4 -4
- cuda/cccl/parallel/experimental/algorithms/_radix_sort.py +4 -4
- cuda/cccl/parallel/experimental/algorithms/_reduce.py +2 -2
- cuda/cccl/parallel/experimental/algorithms/_scan.py +4 -4
- cuda/cccl/parallel/experimental/algorithms/_segmented_reduce.py +4 -4
- cuda/cccl/parallel/experimental/algorithms/_transform.py +5 -5
- cuda/cccl/parallel/experimental/algorithms/_unique_by_key.py +5 -5
- cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-310-x86_64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-310-x86_64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda/cccl/parallel/experimental/iterators/__init__.py +2 -4
- cuda/cccl/parallel/experimental/iterators/_factories.py +28 -51
- cuda/cccl/parallel/experimental/iterators/_iterators.py +189 -204
- cuda/cccl/parallel/experimental/iterators/_zip_iterator.py +4 -12
- cuda/cccl/parallel/experimental/numba_utils.py +47 -0
- {cuda_cccl-0.1.3.2.0.dev271.dist-info → cuda_cccl-0.2.1.dist-info}/METADATA +8 -5
- {cuda_cccl-0.1.3.2.0.dev271.dist-info → cuda_cccl-0.2.1.dist-info}/RECORD +339 -332
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/cmath +0 -520
- cuda/cccl/headers/include/thrust/detail/mpl/math.h +0 -164
- cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_discard_iterator.h +0 -44
- {cuda_cccl-0.1.3.2.0.dev271.dist-info → cuda_cccl-0.2.1.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.1.3.2.0.dev271.dist-info → cuda_cccl-0.2.1.dist-info}/licenses/LICENSE +0 -0
cuda/cccl/_cuda_version_utils.py
CHANGED
|
@@ -6,9 +6,6 @@
|
|
|
6
6
|
CUDA version detection utilities shared across the cccl package.
|
|
7
7
|
"""
|
|
8
8
|
|
|
9
|
-
import os
|
|
10
|
-
import shutil
|
|
11
|
-
from pathlib import Path
|
|
12
9
|
from typing import Optional
|
|
13
10
|
|
|
14
11
|
import cuda.bindings
|
|
@@ -19,25 +16,6 @@ def detect_cuda_version() -> Optional[int]:
|
|
|
19
16
|
return int(cuda_version.split(".")[0])
|
|
20
17
|
|
|
21
18
|
|
|
22
|
-
def get_cuda_path() -> Optional[Path]:
|
|
23
|
-
"""Get the CUDA installation path."""
|
|
24
|
-
cuda_path_str = os.environ.get("CUDA_PATH")
|
|
25
|
-
if cuda_path_str:
|
|
26
|
-
cuda_path = Path(cuda_path_str)
|
|
27
|
-
if cuda_path.exists():
|
|
28
|
-
return cuda_path
|
|
29
|
-
|
|
30
|
-
nvcc_path = shutil.which("nvcc")
|
|
31
|
-
if nvcc_path:
|
|
32
|
-
return Path(nvcc_path).parent.parent
|
|
33
|
-
|
|
34
|
-
default_path = Path("/usr/local/cuda")
|
|
35
|
-
if default_path.exists():
|
|
36
|
-
return default_path
|
|
37
|
-
|
|
38
|
-
return None
|
|
39
|
-
|
|
40
|
-
|
|
41
19
|
def get_recommended_extra(cuda_version: Optional[int]) -> str:
|
|
42
20
|
"""Get the recommended pip extra for the detected CUDA version."""
|
|
43
21
|
if cuda_version == 13:
|
|
@@ -52,9 +52,13 @@
|
|
|
52
52
|
#include <cub/util_ptx.cuh>
|
|
53
53
|
#include <cub/util_type.cuh>
|
|
54
54
|
|
|
55
|
-
#include <cuda/
|
|
55
|
+
#include <cuda/__cmath/ceil_div.h>
|
|
56
|
+
#include <cuda/__cmath/round_up.h>
|
|
57
|
+
#include <cuda/std/__functional/operations.h>
|
|
58
|
+
#include <cuda/std/__type_traits/conditional.h>
|
|
59
|
+
#include <cuda/std/__type_traits/enable_if.h>
|
|
60
|
+
#include <cuda/std/__type_traits/type_identity.h>
|
|
56
61
|
#include <cuda/std/cstdint>
|
|
57
|
-
#include <cuda/std/type_traits>
|
|
58
62
|
|
|
59
63
|
CUB_NAMESPACE_BEGIN
|
|
60
64
|
|
|
@@ -49,7 +49,9 @@
|
|
|
49
49
|
#include <cub/iterator/cache_modified_input_iterator.cuh>
|
|
50
50
|
#include <cub/util_type.cuh>
|
|
51
51
|
|
|
52
|
-
#include <cuda/std/
|
|
52
|
+
#include <cuda/std/__type_traits/conditional.h>
|
|
53
|
+
#include <cuda/std/__type_traits/integral_constant.h>
|
|
54
|
+
#include <cuda/std/__type_traits/is_pointer.h>
|
|
53
55
|
|
|
54
56
|
CUB_NAMESPACE_BEGIN
|
|
55
57
|
|
|
@@ -50,8 +50,11 @@
|
|
|
50
50
|
#include <cub/util_math.cuh>
|
|
51
51
|
#include <cub/util_type.cuh>
|
|
52
52
|
|
|
53
|
-
#include <cuda/
|
|
54
|
-
#include <cuda/
|
|
53
|
+
#include <cuda/__cmath/ceil_div.h>
|
|
54
|
+
#include <cuda/__ptx/instructions/get_sreg.h>
|
|
55
|
+
#include <cuda/std/__algorithm/max.h>
|
|
56
|
+
#include <cuda/std/__algorithm/min.h>
|
|
57
|
+
#include <cuda/std/__functional/operations.h>
|
|
55
58
|
|
|
56
59
|
CUB_NAMESPACE_BEGIN
|
|
57
60
|
|
|
@@ -49,8 +49,10 @@
|
|
|
49
49
|
#include <cub/util_ptx.cuh>
|
|
50
50
|
#include <cub/util_type.cuh>
|
|
51
51
|
|
|
52
|
-
#include <cuda/
|
|
53
|
-
#include <cuda/std/
|
|
52
|
+
#include <cuda/__ptx/instructions/get_sreg.h>
|
|
53
|
+
#include <cuda/std/__type_traits/conditional.h>
|
|
54
|
+
#include <cuda/std/__type_traits/integral_constant.h>
|
|
55
|
+
#include <cuda/std/__type_traits/is_same.h>
|
|
54
56
|
|
|
55
57
|
CUB_NAMESPACE_BEGIN
|
|
56
58
|
|
|
@@ -52,8 +52,9 @@
|
|
|
52
52
|
#include <cub/util_type.cuh>
|
|
53
53
|
#include <cub/warp/warp_reduce.cuh>
|
|
54
54
|
|
|
55
|
-
#include <cuda/
|
|
56
|
-
#include <cuda/std/
|
|
55
|
+
#include <cuda/__ptx/instructions/get_sreg.h>
|
|
56
|
+
#include <cuda/std/__algorithm/max.h>
|
|
57
|
+
#include <cuda/std/__algorithm/min.h>
|
|
57
58
|
|
|
58
59
|
CUB_NAMESPACE_BEGIN
|
|
59
60
|
|
|
@@ -50,9 +50,12 @@
|
|
|
50
50
|
#include <cub/util_device.cuh>
|
|
51
51
|
#include <cub/util_type.cuh>
|
|
52
52
|
|
|
53
|
-
#include <cuda/
|
|
54
|
-
#include <cuda/std/
|
|
55
|
-
#include <cuda/std/
|
|
53
|
+
#include <cuda/__memory/is_aligned.h>
|
|
54
|
+
#include <cuda/std/__algorithm/min.h>
|
|
55
|
+
#include <cuda/std/__functional/identity.h>
|
|
56
|
+
#include <cuda/std/__functional/operations.h>
|
|
57
|
+
#include <cuda/std/__type_traits/conditional.h>
|
|
58
|
+
#include <cuda/std/__type_traits/is_pointer.h>
|
|
56
59
|
|
|
57
60
|
CUB_NAMESPACE_BEGIN
|
|
58
61
|
|
|
@@ -50,8 +50,10 @@
|
|
|
50
50
|
#include <cub/block/block_store.cuh>
|
|
51
51
|
#include <cub/iterator/cache_modified_input_iterator.cuh>
|
|
52
52
|
|
|
53
|
-
#include <cuda/std/
|
|
54
|
-
|
|
53
|
+
#include <cuda/std/__functional/operations.h>
|
|
54
|
+
#include <cuda/std/__type_traits/conditional.h>
|
|
55
|
+
#include <cuda/std/__type_traits/is_pointer.h>
|
|
56
|
+
#include <cuda/std/__type_traits/is_same.h>
|
|
55
57
|
CUB_NAMESPACE_BEGIN
|
|
56
58
|
|
|
57
59
|
/******************************************************************************
|
|
@@ -50,11 +50,14 @@
|
|
|
50
50
|
#include <cub/block/block_load.cuh>
|
|
51
51
|
#include <cub/block/block_scan.cuh>
|
|
52
52
|
#include <cub/block/block_store.cuh>
|
|
53
|
-
#include <cub/grid/grid_queue.cuh>
|
|
54
53
|
#include <cub/iterator/cache_modified_input_iterator.cuh>
|
|
55
54
|
|
|
56
|
-
#include <cuda/
|
|
57
|
-
#include <cuda/std/
|
|
55
|
+
#include <cuda/__ptx/instructions/get_sreg.h>
|
|
56
|
+
#include <cuda/std/__functional/operations.h>
|
|
57
|
+
#include <cuda/std/__type_traits/conditional.h>
|
|
58
|
+
#include <cuda/std/__type_traits/integral_constant.h>
|
|
59
|
+
#include <cuda/std/__type_traits/is_pointer.h>
|
|
60
|
+
#include <cuda/std/__type_traits/is_same.h>
|
|
58
61
|
|
|
59
62
|
CUB_NAMESPACE_BEGIN
|
|
60
63
|
|
|
@@ -47,10 +47,11 @@
|
|
|
47
47
|
#include <cub/block/block_load.cuh>
|
|
48
48
|
#include <cub/block/block_scan.cuh>
|
|
49
49
|
#include <cub/block/block_store.cuh>
|
|
50
|
-
#include <cub/grid/grid_queue.cuh>
|
|
51
50
|
#include <cub/iterator/cache_modified_input_iterator.cuh>
|
|
52
51
|
|
|
53
|
-
#include <cuda/std/
|
|
52
|
+
#include <cuda/std/__type_traits/conditional.h>
|
|
53
|
+
#include <cuda/std/__type_traits/is_pointer.h>
|
|
54
|
+
#include <cuda/std/__type_traits/is_same.h>
|
|
54
55
|
|
|
55
56
|
CUB_NAMESPACE_BEGIN
|
|
56
57
|
|
|
@@ -50,7 +50,11 @@
|
|
|
50
50
|
#include <cub/iterator/cache_modified_input_iterator.cuh>
|
|
51
51
|
#include <cub/util_type.cuh>
|
|
52
52
|
|
|
53
|
-
#include <cuda/std/
|
|
53
|
+
#include <cuda/std/__type_traits/conditional.h>
|
|
54
|
+
#include <cuda/std/__type_traits/enable_if.h>
|
|
55
|
+
#include <cuda/std/__type_traits/integral_constant.h>
|
|
56
|
+
#include <cuda/std/__type_traits/is_pointer.h>
|
|
57
|
+
#include <cuda/std/__type_traits/is_same.h>
|
|
54
58
|
|
|
55
59
|
CUB_NAMESPACE_BEGIN
|
|
56
60
|
|
|
@@ -50,11 +50,17 @@
|
|
|
50
50
|
#include <cub/block/block_scan.cuh>
|
|
51
51
|
#include <cub/block/block_store.cuh>
|
|
52
52
|
#include <cub/device/dispatch/dispatch_common.cuh>
|
|
53
|
-
#include <cub/grid/grid_queue.cuh>
|
|
54
53
|
#include <cub/iterator/cache_modified_input_iterator.cuh>
|
|
55
54
|
#include <cub/util_type.cuh>
|
|
56
55
|
|
|
57
|
-
#include <cuda/std/
|
|
56
|
+
#include <cuda/std/__functional/operations.h>
|
|
57
|
+
#include <cuda/std/__type_traits/conditional.h>
|
|
58
|
+
#include <cuda/std/__type_traits/enable_if.h>
|
|
59
|
+
#include <cuda/std/__type_traits/integral_constant.h>
|
|
60
|
+
#include <cuda/std/__type_traits/is_callable.h>
|
|
61
|
+
#include <cuda/std/__type_traits/is_pointer.h>
|
|
62
|
+
#include <cuda/std/__type_traits/is_same.h>
|
|
63
|
+
#include <cuda/std/cstdint>
|
|
58
64
|
|
|
59
65
|
CUB_NAMESPACE_BEGIN
|
|
60
66
|
|
|
@@ -45,7 +45,10 @@
|
|
|
45
45
|
#include <cub/block/block_store.cuh>
|
|
46
46
|
#include <cub/iterator/cache_modified_input_iterator.cuh>
|
|
47
47
|
|
|
48
|
-
#include <cuda/std/
|
|
48
|
+
#include <cuda/std/__functional/operations.h>
|
|
49
|
+
#include <cuda/std/__type_traits/conditional.h>
|
|
50
|
+
#include <cuda/std/__type_traits/enable_if.h>
|
|
51
|
+
#include <cuda/std/__type_traits/is_pointer.h>
|
|
49
52
|
|
|
50
53
|
CUB_NAMESPACE_BEGIN
|
|
51
54
|
|
|
@@ -51,7 +51,8 @@
|
|
|
51
51
|
#include <cub/util_temporary_storage.cuh>
|
|
52
52
|
#include <cub/warp/warp_reduce.cuh>
|
|
53
53
|
|
|
54
|
-
#include <cuda/std/
|
|
54
|
+
#include <cuda/std/__type_traits/conditional.h>
|
|
55
|
+
#include <cuda/std/__type_traits/enable_if.h>
|
|
55
56
|
|
|
56
57
|
#include <nv/target>
|
|
57
58
|
|
|
@@ -1178,13 +1179,13 @@ struct ReduceByKeyScanTileState<ValueT, KeyT, true>
|
|
|
1178
1179
|
#endif // _CCCL_DOXYGEN_INVOKED
|
|
1179
1180
|
|
|
1180
1181
|
/******************************************************************************
|
|
1181
|
-
* Prefix
|
|
1182
|
+
* Prefix callback operator for coupling local block scan within a
|
|
1182
1183
|
* block-cooperative scan
|
|
1183
1184
|
******************************************************************************/
|
|
1184
1185
|
|
|
1185
1186
|
/**
|
|
1186
|
-
* Stateful block-scan prefix functor. Provides the
|
|
1187
|
-
* the current tile by using the
|
|
1187
|
+
* Stateful block-scan prefix functor. Provides the running prefix for
|
|
1188
|
+
* the current tile by using the callback warp to wait for
|
|
1188
1189
|
* aggregates/prefixes from predecessor tiles to become available.
|
|
1189
1190
|
*
|
|
1190
1191
|
* @tparam DelayConstructorT
|
|
@@ -47,7 +47,9 @@
|
|
|
47
47
|
#include <cub/util_type.cuh>
|
|
48
48
|
#include <cub/warp/warp_exchange.cuh>
|
|
49
49
|
|
|
50
|
-
#include <cuda/
|
|
50
|
+
#include <cuda/__ptx/instructions/get_sreg.h>
|
|
51
|
+
#include <cuda/std/__algorithm/min.h>
|
|
52
|
+
#include <cuda/std/__type_traits/integral_constant.h>
|
|
51
53
|
|
|
52
54
|
CUB_NAMESPACE_BEGIN
|
|
53
55
|
|
|
@@ -43,9 +43,8 @@
|
|
|
43
43
|
#include <cub/util_ptx.cuh>
|
|
44
44
|
#include <cub/util_type.cuh>
|
|
45
45
|
|
|
46
|
-
#include <cuda/std/__algorithm/max.h>
|
|
47
46
|
#include <cuda/std/__algorithm/min.h>
|
|
48
|
-
#include <cuda/std/
|
|
47
|
+
#include <cuda/std/__type_traits/is_same.h>
|
|
49
48
|
|
|
50
49
|
CUB_NAMESPACE_BEGIN
|
|
51
50
|
|
|
@@ -384,7 +383,7 @@ public:
|
|
|
384
383
|
int valid_items,
|
|
385
384
|
KeyT oob_default)
|
|
386
385
|
{
|
|
387
|
-
if (IS_LAST_TILE)
|
|
386
|
+
if constexpr (IS_LAST_TILE)
|
|
388
387
|
{
|
|
389
388
|
// if last tile, find valid max_key
|
|
390
389
|
// and fill the remaining keys with it
|
|
@@ -418,8 +417,8 @@ public:
|
|
|
418
417
|
for (int target_merged_threads_number = 2; target_merged_threads_number <= NUM_THREADS;
|
|
419
418
|
target_merged_threads_number *= 2)
|
|
420
419
|
{
|
|
421
|
-
int merged_threads_number = target_merged_threads_number / 2;
|
|
422
|
-
int mask = target_merged_threads_number - 1;
|
|
420
|
+
const int merged_threads_number = target_merged_threads_number / 2;
|
|
421
|
+
const int mask = target_merged_threads_number - 1;
|
|
423
422
|
|
|
424
423
|
Sync();
|
|
425
424
|
|
|
@@ -436,23 +435,23 @@ public:
|
|
|
436
435
|
|
|
437
436
|
int indices[ITEMS_PER_THREAD];
|
|
438
437
|
|
|
439
|
-
int first_thread_idx_in_thread_group_being_merged = ~mask & linear_tid;
|
|
440
|
-
int start = ITEMS_PER_THREAD * first_thread_idx_in_thread_group_being_merged;
|
|
441
|
-
int size = ITEMS_PER_THREAD * merged_threads_number;
|
|
438
|
+
const int first_thread_idx_in_thread_group_being_merged = ~mask & linear_tid;
|
|
439
|
+
const int start = ITEMS_PER_THREAD * first_thread_idx_in_thread_group_being_merged;
|
|
440
|
+
const int size = ITEMS_PER_THREAD * merged_threads_number;
|
|
442
441
|
|
|
443
|
-
int thread_idx_in_thread_group_being_merged = mask & linear_tid;
|
|
442
|
+
const int thread_idx_in_thread_group_being_merged = mask & linear_tid;
|
|
444
443
|
|
|
445
|
-
int diag = (::cuda::std::min) (valid_items, ITEMS_PER_THREAD * thread_idx_in_thread_group_being_merged);
|
|
444
|
+
const int diag = (::cuda::std::min) (valid_items, ITEMS_PER_THREAD * thread_idx_in_thread_group_being_merged);
|
|
446
445
|
|
|
447
|
-
int keys1_beg = (::cuda::std::min) (valid_items, start);
|
|
448
|
-
int keys1_end = (::cuda::std::min) (valid_items, keys1_beg + size);
|
|
449
|
-
int keys2_beg = keys1_end;
|
|
450
|
-
int keys2_end = (::cuda::std::min) (valid_items, keys2_beg + size);
|
|
446
|
+
const int keys1_beg = (::cuda::std::min) (valid_items, start);
|
|
447
|
+
const int keys1_end = (::cuda::std::min) (valid_items, keys1_beg + size);
|
|
448
|
+
const int keys2_beg = keys1_end;
|
|
449
|
+
const int keys2_end = (::cuda::std::min) (valid_items, keys2_beg + size);
|
|
451
450
|
|
|
452
|
-
int keys1_count = keys1_end - keys1_beg;
|
|
453
|
-
int keys2_count = keys2_end - keys2_beg;
|
|
451
|
+
const int keys1_count = keys1_end - keys1_beg;
|
|
452
|
+
const int keys2_count = keys2_end - keys2_beg;
|
|
454
453
|
|
|
455
|
-
int partition_diag = MergePath(
|
|
454
|
+
const int partition_diag = MergePath(
|
|
456
455
|
&temp_storage.keys_shared[keys1_beg],
|
|
457
456
|
&temp_storage.keys_shared[keys2_beg],
|
|
458
457
|
keys1_count,
|
|
@@ -460,12 +459,12 @@ public:
|
|
|
460
459
|
diag,
|
|
461
460
|
compare_op);
|
|
462
461
|
|
|
463
|
-
int keys1_beg_loc = keys1_beg + partition_diag;
|
|
464
|
-
int keys1_end_loc = keys1_end;
|
|
465
|
-
int keys2_beg_loc = keys2_beg + diag - partition_diag;
|
|
466
|
-
int keys2_end_loc = keys2_end;
|
|
467
|
-
int keys1_count_loc = keys1_end_loc - keys1_beg_loc;
|
|
468
|
-
int keys2_count_loc = keys2_end_loc - keys2_beg_loc;
|
|
462
|
+
const int keys1_beg_loc = keys1_beg + partition_diag;
|
|
463
|
+
const int keys1_end_loc = keys1_end;
|
|
464
|
+
const int keys2_beg_loc = keys2_beg + diag - partition_diag;
|
|
465
|
+
const int keys2_end_loc = keys2_end;
|
|
466
|
+
const int keys1_count_loc = keys1_end_loc - keys1_beg_loc;
|
|
467
|
+
const int keys2_count_loc = keys2_end_loc - keys2_beg_loc;
|
|
469
468
|
SerialMerge(
|
|
470
469
|
&temp_storage.keys_shared[0],
|
|
471
470
|
keys1_beg_loc,
|
|
@@ -477,7 +476,7 @@ public:
|
|
|
477
476
|
compare_op,
|
|
478
477
|
oob_default);
|
|
479
478
|
|
|
480
|
-
if (!KEYS_ONLY)
|
|
479
|
+
if constexpr (!KEYS_ONLY)
|
|
481
480
|
{
|
|
482
481
|
Sync();
|
|
483
482
|
|
|
@@ -48,12 +48,14 @@
|
|
|
48
48
|
#include <cub/util_ptx.cuh>
|
|
49
49
|
#include <cub/util_type.cuh>
|
|
50
50
|
|
|
51
|
-
#include <cuda/
|
|
52
|
-
#include <cuda/std/
|
|
51
|
+
#include <cuda/__ptx/instructions/get_sreg.h>
|
|
52
|
+
#include <cuda/std/__algorithm/max.h>
|
|
53
|
+
#include <cuda/std/__functional/operations.h>
|
|
54
|
+
#include <cuda/std/__type_traits/conditional.h>
|
|
55
|
+
#include <cuda/std/__type_traits/is_same.h>
|
|
53
56
|
#include <cuda/std/cstdint>
|
|
54
57
|
#include <cuda/std/limits>
|
|
55
58
|
#include <cuda/std/span>
|
|
56
|
-
#include <cuda/std/type_traits>
|
|
57
59
|
|
|
58
60
|
CUB_NAMESPACE_BEGIN
|
|
59
61
|
|
|
@@ -50,8 +50,11 @@
|
|
|
50
50
|
#include <cub/util_ptx.cuh>
|
|
51
51
|
#include <cub/util_type.cuh>
|
|
52
52
|
|
|
53
|
-
#include <cuda/std/
|
|
54
|
-
#include <cuda/std/
|
|
53
|
+
#include <cuda/std/__algorithm/min.h>
|
|
54
|
+
#include <cuda/std/__type_traits/enable_if.h>
|
|
55
|
+
#include <cuda/std/__type_traits/integral_constant.h>
|
|
56
|
+
#include <cuda/std/__type_traits/is_convertible.h>
|
|
57
|
+
#include <cuda/std/__type_traits/is_same.h>
|
|
55
58
|
|
|
56
59
|
CUB_NAMESPACE_BEGIN
|
|
57
60
|
|
|
@@ -49,14 +49,18 @@
|
|
|
49
49
|
|
|
50
50
|
#include <thrust/type_traits/integer_sequence.h>
|
|
51
51
|
|
|
52
|
-
#include <cuda/
|
|
53
|
-
#include <cuda/
|
|
54
|
-
#include <cuda/
|
|
52
|
+
#include <cuda/__bit/bitfield.h>
|
|
53
|
+
#include <cuda/__type_traits/is_floating_point.h>
|
|
54
|
+
#include <cuda/__utility/static_for.h>
|
|
55
55
|
#include <cuda/std/__algorithm/min.h>
|
|
56
|
+
#include <cuda/std/__functional/invoke.h>
|
|
57
|
+
#include <cuda/std/__type_traits/enable_if.h>
|
|
58
|
+
#include <cuda/std/__type_traits/integral_constant.h>
|
|
59
|
+
#include <cuda/std/__type_traits/is_same.h>
|
|
60
|
+
#include <cuda/std/__type_traits/remove_cv.h>
|
|
61
|
+
#include <cuda/std/__type_traits/void_t.h>
|
|
56
62
|
#include <cuda/std/cstdint>
|
|
57
63
|
#include <cuda/std/tuple>
|
|
58
|
-
#include <cuda/std/type_traits>
|
|
59
|
-
#include <cuda/type_traits>
|
|
60
64
|
|
|
61
65
|
CUB_NAMESPACE_BEGIN
|
|
62
66
|
|
|
@@ -38,11 +38,12 @@
|
|
|
38
38
|
#include <cub/detail/type_traits.cuh> // static_size_v
|
|
39
39
|
#include <cub/util_namespace.cuh>
|
|
40
40
|
|
|
41
|
-
#include <cuda/std/
|
|
42
|
-
#include <cuda/std/
|
|
43
|
-
#include <cuda/std/
|
|
44
|
-
#include <cuda/std/
|
|
45
|
-
#include <cuda/std/
|
|
41
|
+
#include <cuda/std/__iterator/iterator_traits.h>
|
|
42
|
+
#include <cuda/std/__type_traits/conditional.h>
|
|
43
|
+
#include <cuda/std/__type_traits/is_same.h>
|
|
44
|
+
#include <cuda/std/__utility/integer_sequence.h>
|
|
45
|
+
#include <cuda/std/array>
|
|
46
|
+
#include <cuda/std/cstddef>
|
|
46
47
|
|
|
47
48
|
CUB_NAMESPACE_BEGIN
|
|
48
49
|
namespace detail
|
|
@@ -37,9 +37,15 @@
|
|
|
37
37
|
# pragma system_header
|
|
38
38
|
#endif // no system header
|
|
39
39
|
|
|
40
|
+
#include <cuda/std/__iterator/iterator_traits.h>
|
|
41
|
+
#include <cuda/std/__type_traits/common_type.h>
|
|
42
|
+
#include <cuda/std/__type_traits/conditional.h>
|
|
43
|
+
#include <cuda/std/__type_traits/is_integral.h>
|
|
44
|
+
#include <cuda/std/__type_traits/is_same.h>
|
|
45
|
+
#include <cuda/std/__type_traits/is_unsigned.h>
|
|
46
|
+
#include <cuda/std/__type_traits/remove_cv.h>
|
|
40
47
|
#include <cuda/std/cstdint>
|
|
41
|
-
#include <cuda/std/
|
|
42
|
-
#include <cuda/std/type_traits>
|
|
48
|
+
#include <cuda/std/limits>
|
|
43
49
|
|
|
44
50
|
CUB_NAMESPACE_BEGIN
|
|
45
51
|
|
|
@@ -40,12 +40,19 @@
|
|
|
40
40
|
#include <cub/detail/type_traits.cuh> // implicit_prom_t
|
|
41
41
|
#include <cub/util_type.cuh> // _CCCL_HAS_INT128()
|
|
42
42
|
|
|
43
|
-
#include <cuda/
|
|
44
|
-
#include <cuda/std/
|
|
43
|
+
#include <cuda/__cmath/ceil_div.h>
|
|
44
|
+
#include <cuda/std/__bit/has_single_bit.h>
|
|
45
|
+
#include <cuda/std/__bit/integral.h>
|
|
46
|
+
#include <cuda/std/__type_traits/conditional.h>
|
|
47
|
+
#include <cuda/std/__type_traits/enable_if.h>
|
|
48
|
+
#include <cuda/std/__type_traits/integral_constant.h>
|
|
49
|
+
#include <cuda/std/__type_traits/is_integral.h>
|
|
50
|
+
#include <cuda/std/__type_traits/is_same.h>
|
|
51
|
+
#include <cuda/std/__type_traits/is_signed.h>
|
|
52
|
+
#include <cuda/std/__type_traits/make_unsigned.h>
|
|
45
53
|
#include <cuda/std/climits> // CHAR_BIT
|
|
46
54
|
#include <cuda/std/cstdint> // uint64_t
|
|
47
|
-
#include <cuda/std/limits>
|
|
48
|
-
#include <cuda/std/type_traits> // ::cuda::std::is_integral
|
|
55
|
+
#include <cuda/std/limits>
|
|
49
56
|
|
|
50
57
|
#if defined(CCCL_ENABLE_DEVICE_ASSERTIONS)
|
|
51
58
|
_CCCL_BEGIN_NV_DIAG_SUPPRESS(186) // pointless comparison of unsigned integer with zero
|
|
@@ -15,9 +15,13 @@
|
|
|
15
15
|
#include <cub/detail/unsafe_bitcast.cuh>
|
|
16
16
|
#include <cub/thread/thread_operators.cuh> // is_cuda_minimum_maximum_v
|
|
17
17
|
|
|
18
|
-
#include <cuda/
|
|
19
|
-
#include <cuda/std/
|
|
20
|
-
#include <cuda/
|
|
18
|
+
#include <cuda/__type_traits/is_floating_point.h>
|
|
19
|
+
#include <cuda/std/__cmath/isnan.h>
|
|
20
|
+
#include <cuda/std/__type_traits/is_integer.h>
|
|
21
|
+
#include <cuda/std/__type_traits/is_signed.h>
|
|
22
|
+
#include <cuda/std/__type_traits/make_nbit_int.h>
|
|
23
|
+
#include <cuda/std/__type_traits/make_unsigned.h>
|
|
24
|
+
#include <cuda/std/limits>
|
|
21
25
|
|
|
22
26
|
CUB_NAMESPACE_BEGIN
|
|
23
27
|
|
|
@@ -39,11 +39,11 @@
|
|
|
39
39
|
|
|
40
40
|
#include <cub/detail/fast_modulo_division.cuh> // fast_div_mod
|
|
41
41
|
|
|
42
|
-
#include <cuda/std/
|
|
43
|
-
#include <cuda/std/
|
|
42
|
+
#include <cuda/std/__type_traits/make_unsigned.h>
|
|
43
|
+
#include <cuda/std/__utility/integer_sequence.h>
|
|
44
|
+
#include <cuda/std/array>
|
|
45
|
+
#include <cuda/std/cstddef>
|
|
44
46
|
#include <cuda/std/mdspan>
|
|
45
|
-
#include <cuda/std/type_traits> // make_unsigned_t
|
|
46
|
-
#include <cuda/std/utility> // index_sequence
|
|
47
47
|
|
|
48
48
|
CUB_NAMESPACE_BEGIN
|
|
49
49
|
|
|
@@ -32,9 +32,9 @@
|
|
|
32
32
|
#include <cub/detail/ptx-json/string.h>
|
|
33
33
|
#include <cub/detail/ptx-json/value.h>
|
|
34
34
|
|
|
35
|
+
#include <cuda/std/__type_traits/enable_if.h>
|
|
36
|
+
#include <cuda/std/__utility/integer_sequence.h>
|
|
35
37
|
#include <cuda/std/cstddef>
|
|
36
|
-
#include <cuda/std/type_traits>
|
|
37
|
-
#include <cuda/std/utility>
|
|
38
38
|
|
|
39
39
|
namespace ptx_json
|
|
40
40
|
{
|
|
@@ -47,9 +47,10 @@ struct tagged_json<T, cuda::std::index_sequence<Is...>>
|
|
|
47
47
|
template <typename V, typename = cuda::std::enable_if_t<is_object<V>::value || is_array<V>::value>>
|
|
48
48
|
__noinline__ __device__ void operator=(V)
|
|
49
49
|
{
|
|
50
|
-
|
|
50
|
+
static constexpr char str[]{T.str[Is]...};
|
|
51
|
+
asm volatile("cccl.ptx_json.begin(%0)\n\n" ::"C"(str) : "memory");
|
|
51
52
|
V::emit();
|
|
52
|
-
asm volatile("\ncccl.ptx_json.end(%0)" ::"C"(
|
|
53
|
+
asm volatile("\ncccl.ptx_json.end(%0)" ::"C"(str) : "memory");
|
|
53
54
|
}
|
|
54
55
|
};
|
|
55
56
|
|
|
@@ -50,22 +50,4 @@ __forceinline__ __device__ void comma()
|
|
|
50
50
|
{
|
|
51
51
|
asm volatile("," ::: "memory");
|
|
52
52
|
}
|
|
53
|
-
|
|
54
|
-
#pragma nv_diag_suppress 177
|
|
55
|
-
template <char... Cs>
|
|
56
|
-
struct storage_helper
|
|
57
|
-
{
|
|
58
|
-
// This, and the dance to invoke this through value_traits elsewhere, is necessary because the "C" inline assembly
|
|
59
|
-
// constraint supported by NVCC requires that its argument is a pointer to a constant array of type char; NVCC also
|
|
60
|
-
// doesn't allow passing raw character literals as pointer template arguments; and *also* it seems to look at the type
|
|
61
|
-
// of a containing object, not a subobject it is given, when passed in a pointer to an array inside a literal type.
|
|
62
|
-
// All of this means that we can't just pass strings, and *also* we can't just use the string<N>::array member above
|
|
63
|
-
// as the string literal; therefore, using the fact that the length of the string is a core constant expression in the
|
|
64
|
-
// definition of value_traits, we can generate a variadic pack that allows us to expand the contents of
|
|
65
|
-
// string<N>::array into a comma separated list of N chars. We can then plug that in as template arguments to
|
|
66
|
-
// storage_helper, which then can, as below, turn that into its own char array that NVCC accepts as an argument for a
|
|
67
|
-
// "C" inline assembly constraint.
|
|
68
|
-
static const constexpr char value[] = {Cs...};
|
|
69
|
-
};
|
|
70
|
-
#pragma nv_diag_default 177
|
|
71
53
|
} // namespace ptx_json
|