cuda-cccl 0.1.3.2.0.dev271__cp313-cp313-manylinux_2_26_x86_64.whl → 0.1.3.2.0.dev438__cp313-cp313-manylinux_2_26_x86_64.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/_cuda_version_utils.py +0 -22
- cuda/cccl/cooperative/experimental/_common.py +3 -1
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +6 -2
- cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +3 -1
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +68 -62
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +5 -2
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +4 -2
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +3 -2
- cuda/cccl/headers/include/cub/agent/agent_reduce.cuh +7 -20
- cuda/cccl/headers/include/cub/agent/agent_reduce_by_key.cuh +4 -2
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +6 -3
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +25 -1
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +5 -1
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +8 -2
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +18 -1
- cuda/cccl/headers/include/cub/agent/agent_topk.cuh +764 -0
- cuda/cccl/headers/include/cub/agent/single_pass_scan_operators.cuh +99 -17
- cuda/cccl/headers/include/cub/block/block_exchange.cuh +3 -1
- cuda/cccl/headers/include/cub/block/block_histogram.cuh +1 -1
- cuda/cccl/headers/include/cub/block/block_merge_sort.cuh +43 -30
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +5 -3
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +5 -4
- cuda/cccl/headers/include/cub/block/block_reduce.cuh +2 -1
- cuda/cccl/headers/include/cub/block/block_run_length_decode.cuh +0 -3
- cuda/cccl/headers/include/cub/block/block_scan.cuh +2 -1
- cuda/cccl/headers/include/cub/block/radix_rank_sort_operations.cuh +9 -5
- cuda/cccl/headers/include/cub/block/specializations/block_reduce_warp_reductions.cuh +1 -1
- cuda/cccl/headers/include/cub/block/specializations/block_scan_warp_scans.cuh +2 -2
- cuda/cccl/headers/include/cub/cub.cuh +8 -0
- cuda/cccl/headers/include/cub/detail/array_utils.cuh +6 -5
- cuda/cccl/headers/include/cub/detail/choose_offset.cuh +8 -2
- cuda/cccl/headers/include/cub/detail/detect_cuda_runtime.cuh +13 -32
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +61 -0
- cuda/cccl/headers/include/cub/detail/fast_modulo_division.cuh +11 -4
- cuda/cccl/headers/include/cub/detail/integer_utils.cuh +7 -3
- cuda/cccl/headers/include/cub/detail/launcher/cuda_driver.cuh +3 -3
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +4 -4
- cuda/cccl/headers/include/cub/detail/ptx-json/json.h +5 -4
- cuda/cccl/headers/include/cub/detail/ptx-json/object.h +1 -1
- cuda/cccl/headers/include/cub/detail/ptx-json/string.h +0 -18
- cuda/cccl/headers/include/cub/detail/ptx-json/value.h +7 -5
- cuda/cccl/headers/include/cub/detail/rfa.cuh +9 -2
- cuda/cccl/headers/include/cub/detail/type_traits.cuh +15 -7
- cuda/cccl/headers/include/cub/detail/uninitialized_copy.cuh +3 -2
- cuda/cccl/headers/include/cub/device/device_for.cuh +7 -12
- cuda/cccl/headers/include/cub/device/device_histogram.cuh +11 -9
- cuda/cccl/headers/include/cub/device/device_memcpy.cuh +1 -1
- cuda/cccl/headers/include/cub/device/device_merge.cuh +2 -1
- cuda/cccl/headers/include/cub/device/device_radix_sort.cuh +3 -1
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +785 -164
- cuda/cccl/headers/include/cub/device/device_scan.cuh +306 -0
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +10 -2
- cuda/cccl/headers/include/cub/device/device_select.cuh +5 -1
- cuda/cccl/headers/include/cub/device/device_topk.cuh +511 -0
- cuda/cccl/headers/include/cub/device/device_transform.cuh +118 -26
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_adjacent_difference.cuh +2 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +3 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_batch_memcpy.cuh +4 -3
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_common.cuh +0 -2
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_for.cuh +2 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_histogram.cuh +7 -3
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +43 -44
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge_sort.cuh +3 -2
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +6 -2
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +14 -3
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_by_key.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +79 -40
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +4 -3
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_rle.cuh +5 -2
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan.cuh +6 -3
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_scan_by_key.cuh +6 -2
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +5 -4
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_select_if.cuh +6 -2
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +3 -2
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_three_way_partition.cuh +106 -172
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +629 -0
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +15 -12
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_unique_by_key.cuh +3 -1
- cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +11 -3
- cuda/cccl/headers/include/cub/device/dispatch/kernels/radix_sort.cuh +5 -1
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +28 -41
- cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +0 -2
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +4 -10
- cuda/cccl/headers/include/cub/device/dispatch/kernels/three_way_partition.cuh +80 -0
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +28 -6
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +3 -15
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +3 -2
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +14 -2
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_select_if.cuh +2 -1
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +36 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_topk.cuh +85 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +10 -4
- cuda/cccl/headers/include/cub/grid/grid_even_share.cuh +3 -2
- cuda/cccl/headers/include/cub/iterator/arg_index_input_iterator.cuh +3 -5
- cuda/cccl/headers/include/cub/iterator/cache_modified_input_iterator.cuh +6 -7
- cuda/cccl/headers/include/cub/iterator/cache_modified_output_iterator.cuh +4 -6
- cuda/cccl/headers/include/cub/iterator/tex_obj_input_iterator.cuh +4 -6
- cuda/cccl/headers/include/cub/thread/thread_load.cuh +3 -1
- cuda/cccl/headers/include/cub/thread/thread_operators.cuh +7 -3
- cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +11 -17
- cuda/cccl/headers/include/cub/thread/thread_search.cuh +2 -1
- cuda/cccl/headers/include/cub/thread/thread_simd.cuh +7 -13
- cuda/cccl/headers/include/cub/thread/thread_sort.cuh +3 -2
- cuda/cccl/headers/include/cub/thread/thread_store.cuh +2 -1
- cuda/cccl/headers/include/cub/util_device.cuh +30 -25
- cuda/cccl/headers/include/cub/util_macro.cuh +0 -2
- cuda/cccl/headers/include/cub/util_math.cuh +4 -1
- cuda/cccl/headers/include/cub/util_ptx.cuh +8 -8
- cuda/cccl/headers/include/cub/util_type.cuh +33 -49
- cuda/cccl/headers/include/cub/util_vsmem.cuh +5 -3
- cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_shfl.cuh +1 -1
- cuda/cccl/headers/include/cub/warp/specializations/warp_exchange_smem.cuh +1 -1
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +9 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -1
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +8 -6
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_smem.cuh +5 -3
- cuda/cccl/headers/include/cub/warp/warp_exchange.cuh +1 -1
- cuda/cccl/headers/include/cub/warp/warp_load.cuh +1 -1
- cuda/cccl/headers/include/cub/warp/warp_merge_sort.cuh +2 -2
- cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +5 -3
- cuda/cccl/headers/include/cub/warp/warp_scan.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -1
- cuda/cccl/headers/include/cub/warp/warp_utils.cuh +3 -3
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +63 -10
- cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +3 -1
- cuda/cccl/headers/include/cuda/__complex/complex.h +238 -0
- cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +93 -0
- cuda/cccl/headers/include/cuda/__complex/traits.h +64 -0
- cuda/cccl/headers/include/cuda/__complex_ +28 -0
- cuda/cccl/headers/include/cuda/__device/device_ref.h +9 -0
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +51 -13
- cuda/cccl/headers/include/cuda/__event/timed_event.h +1 -1
- cuda/cccl/headers/include/cuda/__execution/output_ordering.h +89 -0
- cuda/cccl/headers/include/cuda/__functional/for_each_canceled.h +97 -52
- cuda/cccl/headers/include/cuda/__functional/proclaim_return_type.h +5 -6
- cuda/cccl/headers/include/cuda/__fwd/complex.h +48 -0
- cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +103 -60
- cuda/cccl/headers/include/cuda/__iterator/counting_iterator.h +136 -113
- cuda/cccl/headers/include/cuda/__iterator/discard_iterator.h +46 -36
- cuda/cccl/headers/include/cuda/__iterator/permutation_iterator.h +92 -60
- cuda/cccl/headers/include/cuda/__iterator/shuffle_iterator.h +71 -29
- cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +129 -64
- cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +71 -62
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +117 -120
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +129 -124
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +115 -106
- cuda/cccl/headers/include/cuda/__iterator/zip_function.h +24 -6
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +275 -141
- cuda/cccl/headers/include/cuda/__memcpy_async/memcpy_async_tx.h +1 -1
- cuda/cccl/headers/include/cuda/__memory/address_space.h +28 -12
- cuda/cccl/headers/include/cuda/__memory/check_address.h +34 -29
- cuda/cccl/headers/include/cuda/__nvtx/nvtx.h +2 -2
- cuda/cccl/headers/include/cuda/__runtime/types.h +41 -0
- cuda/cccl/headers/include/cuda/__utility/__basic_any/basic_any_value.h +161 -92
- cuda/cccl/headers/include/cuda/__utility/__basic_any/storage.h +3 -2
- cuda/cccl/headers/include/cuda/pipeline +2 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/comp.h +0 -6
- cuda/cccl/headers/include/cuda/std/__algorithm/copy_n.h +3 -3
- cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/iterator_operations.h +3 -3
- cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/max_element.h +1 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/pop_heap.h +2 -3
- cuda/cccl/headers/include/cuda/std/__algorithm/push_heap.h +2 -3
- cuda/cccl/headers/include/cuda/std/__algorithm/unwrap_iter.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__atomic/api/owned.h +6 -8
- cuda/cccl/headers/include/cuda/std/__atomic/api/reference.h +3 -3
- cuda/cccl/headers/include/cuda/std/__atomic/functions/host.h +4 -4
- cuda/cccl/headers/include/cuda/std/__atomic/order.h +1 -1
- cuda/cccl/headers/include/cuda/std/__barrier/barrier.h +1 -1
- cuda/cccl/headers/include/cuda/std/__cccl/assert.h +18 -7
- cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +4 -115
- cuda/cccl/headers/include/cuda/std/__cccl/cuda_toolkit.h +8 -5
- cuda/cccl/headers/include/cuda/std/__cccl/deprecated.h +22 -3
- cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +4 -4
- cuda/cccl/headers/include/cuda/std/__cccl/extended_data_types.h +27 -0
- cuda/cccl/headers/include/cuda/std/__cccl/os.h +6 -0
- cuda/cccl/headers/include/cuda/std/__cccl/preprocessor.h +8 -0
- cuda/cccl/headers/include/cuda/std/__cccl/ptx_isa.h +3 -3
- cuda/cccl/headers/include/cuda/std/__cccl/visibility.h +25 -0
- cuda/cccl/headers/include/cuda/std/__chrono/calendar.h +54 -0
- cuda/cccl/headers/include/cuda/std/__chrono/day.h +162 -0
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +503 -0
- cuda/cccl/headers/include/cuda/std/__chrono/file_clock.h +55 -0
- cuda/cccl/headers/include/cuda/std/__chrono/high_resolution_clock.h +46 -0
- cuda/cccl/headers/include/cuda/std/__chrono/month.h +187 -0
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +60 -0
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +80 -0
- cuda/cccl/headers/include/cuda/std/__chrono/time_point.h +258 -0
- cuda/cccl/headers/include/cuda/std/__chrono/year.h +186 -0
- cuda/cccl/headers/include/cuda/std/__cmath/error_functions.h +200 -0
- cuda/cccl/headers/include/cuda/std/__cmath/exponential_functions.h +60 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fdim.h +118 -0
- cuda/cccl/headers/include/cuda/std/__cmath/fpclassify.h +15 -0
- cuda/cccl/headers/include/cuda/std/__cmath/logarithms.h +105 -153
- cuda/cccl/headers/include/cuda/std/__cmath/nan.h +54 -0
- cuda/cccl/headers/include/cuda/std/__complex/complex.h +5 -7
- cuda/cccl/headers/include/cuda/std/__complex/logarithms.h +1 -0
- cuda/cccl/headers/include/cuda/std/__concepts/concept_macros.h +186 -119
- cuda/cccl/headers/include/cuda/std/__concepts/constructible.h +67 -0
- cuda/cccl/headers/include/cuda/std/__concepts/same_as.h +1 -4
- cuda/cccl/headers/include/cuda/std/__cstring/memcpy.h +12 -9
- cuda/cccl/headers/include/cuda/std/__cuda/ensure_current_device.h +1 -1
- cuda/cccl/headers/include/cuda/std/__expected/expected.h +31 -38
- cuda/cccl/headers/include/cuda/std/__floating_point/constants.h +5 -3
- cuda/cccl/headers/include/cuda/std/__floating_point/decompose.h +69 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -0
- cuda/cccl/headers/include/cuda/std/__floating_point/mask.h +6 -0
- cuda/cccl/headers/include/cuda/std/__functional/binary_function.h +4 -4
- cuda/cccl/headers/include/cuda/std/__functional/binary_negate.h +2 -2
- cuda/cccl/headers/include/cuda/std/__functional/bind.h +2 -2
- cuda/cccl/headers/include/cuda/std/__functional/bind_back.h +4 -4
- cuda/cccl/headers/include/cuda/std/__functional/binder1st.h +2 -2
- cuda/cccl/headers/include/cuda/std/__functional/binder2nd.h +2 -2
- cuda/cccl/headers/include/cuda/std/__functional/function.h +10 -11
- cuda/cccl/headers/include/cuda/std/__functional/hash.h +5 -6
- cuda/cccl/headers/include/cuda/std/__functional/identity.h +4 -8
- cuda/cccl/headers/include/cuda/std/__functional/is_transparent.h +2 -4
- cuda/cccl/headers/include/cuda/std/__functional/mem_fun_ref.h +16 -18
- cuda/cccl/headers/include/cuda/std/__functional/pointer_to_binary_function.h +2 -3
- cuda/cccl/headers/include/cuda/std/__functional/pointer_to_unary_function.h +2 -3
- cuda/cccl/headers/include/cuda/std/__functional/unary_function.h +3 -3
- cuda/cccl/headers/include/cuda/std/__functional/unary_negate.h +2 -2
- cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +16 -25
- cuda/cccl/headers/include/cuda/std/__fwd/complex.h +32 -0
- cuda/cccl/headers/include/cuda/std/__internal/features.h +6 -0
- cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +3 -3
- cuda/cccl/headers/include/cuda/std/__iterator/advance.h +3 -3
- cuda/cccl/headers/include/cuda/std/__iterator/bounded_iter.h +3 -4
- cuda/cccl/headers/include/cuda/std/__iterator/distance.h +1 -1
- cuda/cccl/headers/include/cuda/std/__iterator/incrementable_traits.h +21 -28
- cuda/cccl/headers/include/cuda/std/__iterator/iterator.h +1 -1
- cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +41 -126
- cuda/cccl/headers/include/cuda/std/__iterator/move_iterator.h +3 -4
- cuda/cccl/headers/include/cuda/std/__iterator/next.h +2 -2
- cuda/cccl/headers/include/cuda/std/__iterator/prev.h +2 -3
- cuda/cccl/headers/include/cuda/std/__iterator/readable_traits.h +12 -41
- cuda/cccl/headers/include/cuda/std/__iterator/reverse_iterator.h +2 -2
- cuda/cccl/headers/include/cuda/std/__iterator/wrap_iter.h +4 -6
- cuda/cccl/headers/include/cuda/std/__limits/numeric_limits.h +31 -31
- cuda/cccl/headers/include/cuda/std/__limits/numeric_limits_ext.h +45 -45
- cuda/cccl/headers/include/cuda/std/__mdspan/concepts.h +6 -0
- cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +3 -2
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_left.h +1 -1
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_right.h +1 -1
- cuda/cccl/headers/include/cuda/std/__mdspan/layout_stride.h +1 -1
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +10 -12
- cuda/cccl/headers/include/cuda/std/__memory/addressof.h +5 -2
- cuda/cccl/headers/include/cuda/std/__memory/allocator.h +30 -30
- cuda/cccl/headers/include/cuda/std/__memory/allocator_arg_t.h +1 -1
- cuda/cccl/headers/include/cuda/std/__memory/allocator_traits.h +66 -86
- cuda/cccl/headers/include/cuda/std/__memory/compressed_pair.h +2 -2
- cuda/cccl/headers/include/cuda/std/__memory/destruct_n.h +3 -3
- cuda/cccl/headers/include/cuda/std/__memory/is_sufficiently_aligned.h +5 -2
- cuda/cccl/headers/include/cuda/std/__memory/pointer_traits.h +30 -45
- cuda/cccl/headers/include/cuda/std/__memory/uninitialized_algorithms.h +8 -12
- cuda/cccl/headers/include/cuda/std/__memory/unique_ptr.h +21 -23
- cuda/cccl/headers/include/cuda/std/__new/launder.h +4 -0
- cuda/cccl/headers/include/cuda/std/__optional/hash.h +2 -2
- cuda/cccl/headers/include/cuda/std/__ranges/iota_view.h +8 -8
- cuda/cccl/headers/include/cuda/std/__semaphore/atomic_semaphore.h +2 -1
- cuda/cccl/headers/include/cuda/std/__thread/threading_support.h +2 -1
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_cuda.h +1 -1
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_external.h +1 -1
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_pthread.h +1 -1
- cuda/cccl/headers/include/cuda/std/__thread/threading_support_win32.h +1 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/sfinae_helpers.h +4 -13
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +12 -22
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +9 -18
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_size.h +2 -2
- cuda/cccl/headers/include/cuda/std/__tuple_dir/vector_types.h +0 -1
- cuda/cccl/headers/include/cuda/std/__type_traits/can_extract_key.h +3 -4
- cuda/cccl/headers/include/cuda/std/__type_traits/decay.h +2 -2
- cuda/cccl/headers/include/cuda/std/__type_traits/has_unique_object_representation.h +7 -8
- cuda/cccl/headers/include/cuda/std/__type_traits/has_virtual_destructor.h +4 -13
- cuda/cccl/headers/include/cuda/std/__type_traits/integral_constant.h +1 -1
- cuda/cccl/headers/include/cuda/std/__type_traits/is_abstract.h +2 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_aggregate.h +3 -5
- cuda/cccl/headers/include/cuda/std/__type_traits/is_assignable.h +1 -1
- cuda/cccl/headers/include/cuda/std/__type_traits/is_base_of.h +3 -44
- cuda/cccl/headers/include/cuda/std/__type_traits/is_class.h +2 -28
- cuda/cccl/headers/include/cuda/std/__type_traits/is_compound.h +9 -5
- cuda/cccl/headers/include/cuda/std/__type_traits/is_constructible.h +3 -3
- cuda/cccl/headers/include/cuda/std/__type_traits/is_convertible.h +1 -4
- cuda/cccl/headers/include/cuda/std/__type_traits/is_empty.h +3 -34
- cuda/cccl/headers/include/cuda/std/__type_traits/is_enum.h +3 -29
- cuda/cccl/headers/include/cuda/std/__type_traits/is_extended_floating_point.h +0 -2
- cuda/cccl/headers/include/cuda/std/__type_traits/is_final.h +2 -16
- cuda/cccl/headers/include/cuda/std/__type_traits/is_function.h +1 -1
- cuda/cccl/headers/include/cuda/std/__type_traits/is_literal_type.h +4 -21
- cuda/cccl/headers/include/cuda/std/__type_traits/is_member_function_pointer.h +1 -1
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_assignable.h +1 -1
- cuda/cccl/headers/include/cuda/std/__type_traits/is_nothrow_constructible.h +3 -3
- cuda/cccl/headers/include/cuda/std/__type_traits/is_pod.h +4 -24
- cuda/cccl/headers/include/cuda/std/__type_traits/is_polymorphic.h +3 -24
- cuda/cccl/headers/include/cuda/std/__type_traits/is_same.h +13 -9
- cuda/cccl/headers/include/cuda/std/__type_traits/is_standard_layout.h +3 -18
- cuda/cccl/headers/include/cuda/std/__type_traits/is_swappable.h +21 -20
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivial.h +3 -17
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_assignable.h +4 -31
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_constructible.h +3 -42
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_assignable.h +5 -19
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copy_constructible.h +3 -19
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_copyable.h +3 -17
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_default_constructible.h +2 -15
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_destructible.h +13 -28
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_assignable.h +2 -17
- cuda/cccl/headers/include/cuda/std/__type_traits/is_trivially_move_constructible.h +2 -16
- cuda/cccl/headers/include/cuda/std/__type_traits/is_union.h +3 -18
- cuda/cccl/headers/include/cuda/std/__type_traits/make_signed.h +1 -1
- cuda/cccl/headers/include/cuda/std/__type_traits/make_unsigned.h +1 -1
- cuda/cccl/headers/include/cuda/std/__type_traits/num_bits.h +0 -1
- cuda/cccl/headers/include/cuda/std/__type_traits/promote.h +1 -2
- cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +2 -2
- cuda/cccl/headers/include/cuda/std/__type_traits/underlying_type.h +2 -16
- cuda/cccl/headers/include/cuda/std/__utility/convert_to_integral.h +2 -2
- cuda/cccl/headers/include/cuda/std/__utility/declval.h +17 -4
- cuda/cccl/headers/include/cuda/std/__utility/exchange.h +1 -1
- cuda/cccl/headers/include/cuda/std/__utility/forward.h +1 -1
- cuda/cccl/headers/include/cuda/std/__utility/in_place.h +29 -0
- cuda/cccl/headers/include/cuda/std/__utility/integer_sequence.h +2 -2
- cuda/cccl/headers/include/cuda/std/__utility/move.h +1 -1
- cuda/cccl/headers/include/cuda/std/__utility/pair.h +8 -9
- cuda/cccl/headers/include/cuda/std/array +2 -2
- cuda/cccl/headers/include/cuda/std/atomic +20 -28
- cuda/cccl/headers/include/cuda/std/bitset +1 -1
- cuda/cccl/headers/include/cuda/std/cmath +63 -1
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +1 -32
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/algorithm +3 -4
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/chrono +16 -1137
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/tuple +12 -12
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +4 -4
- cuda/cccl/headers/include/cuda/std/inplace_vector +9 -9
- cuda/cccl/headers/include/cuda/std/numbers +0 -1
- cuda/cccl/headers/include/cuda/std/ratio +3 -4
- cuda/cccl/headers/include/cuda/std/version +2 -4
- cuda/cccl/headers/include/thrust/advance.h +6 -8
- cuda/cccl/headers/include/thrust/detail/execution_policy.h +61 -21
- cuda/cccl/headers/include/thrust/detail/internal_functional.h +37 -2
- cuda/cccl/headers/include/thrust/detail/overlapped_copy.h +1 -1
- cuda/cccl/headers/include/thrust/detail/pointer.h +1 -1
- cuda/cccl/headers/include/thrust/detail/reference.h +10 -16
- cuda/cccl/headers/include/thrust/detail/seq.h +37 -25
- cuda/cccl/headers/include/thrust/detail/vector_base.h +2 -4
- cuda/cccl/headers/include/thrust/detail/vector_base.inl +2 -4
- cuda/cccl/headers/include/thrust/distance.h +3 -3
- cuda/cccl/headers/include/thrust/execution_policy.h +202 -335
- cuda/cccl/headers/include/thrust/functional.h +1 -2
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_system.h +9 -0
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_category_to_traversal.h +4 -1
- cuda/cccl/headers/include/thrust/iterator/detail/iterator_facade_category.h +39 -56
- cuda/cccl/headers/include/thrust/iterator/detail/minimum_system.h +6 -1
- cuda/cccl/headers/include/thrust/iterator/iterator_adaptor.h +1 -1
- cuda/cccl/headers/include/thrust/iterator/iterator_categories.h +6 -10
- cuda/cccl/headers/include/thrust/iterator/iterator_facade.h +1 -2
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +36 -24
- cuda/cccl/headers/include/thrust/iterator/{detail/iterator_traversal_tags.h → iterator_traversal_tags.h} +14 -0
- cuda/cccl/headers/include/thrust/iterator/retag.h +5 -5
- cuda/cccl/headers/include/thrust/iterator/strided_iterator.h +7 -7
- cuda/cccl/headers/include/thrust/iterator/transform_iterator.h +2 -2
- cuda/cccl/headers/include/thrust/random/detail/xor_combine_engine_max.h +77 -107
- cuda/cccl/headers/include/thrust/system/cpp/detail/execution_policy.h +70 -51
- cuda/cccl/headers/include/thrust/system/cpp/execution_policy.h +4 -99
- cuda/cccl/headers/include/thrust/system/cpp/memory.h +2 -5
- cuda/cccl/headers/include/thrust/system/cpp/memory_resource.h +2 -5
- cuda/cccl/headers/include/thrust/system/cpp/pointer.h +2 -5
- cuda/cccl/headers/include/thrust/system/cpp/vector.h +2 -5
- cuda/cccl/headers/include/thrust/system/cuda/config.h +7 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/adjacent_difference.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +151 -40
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy_if.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/triple_chevron_launch.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/core/util.h +0 -16
- cuda/cccl/headers/include/thrust/system/cuda/detail/execution_policy.h +199 -48
- cuda/cccl/headers/include/thrust/system/cuda/detail/fill.h +36 -18
- cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/for_each.h +2 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/generate.h +26 -51
- cuda/cccl/headers/include/thrust/system/cuda/detail/iter_swap.h +25 -14
- cuda/cccl/headers/include/thrust/system/cuda/detail/partition.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/reduce.h +16 -13
- cuda/cccl/headers/include/thrust/system/cuda/detail/reduce_by_key.h +40 -40
- cuda/cccl/headers/include/thrust/system/cuda/detail/replace.h +12 -42
- cuda/cccl/headers/include/thrust/system/cuda/detail/scan_by_key.h +1 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/scatter.h +16 -4
- cuda/cccl/headers/include/thrust/system/cuda/detail/set_operations.h +49 -53
- cuda/cccl/headers/include/thrust/system/cuda/detail/sort.h +0 -12
- cuda/cccl/headers/include/thrust/system/cuda/detail/temporary_buffer.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform.h +29 -15
- cuda/cccl/headers/include/thrust/system/cuda/detail/transform_reduce.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_copy.h +8 -5
- cuda/cccl/headers/include/thrust/system/cuda/detail/uninitialized_fill.h +16 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/unique.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/unique_by_key.h +1 -2
- cuda/cccl/headers/include/thrust/system/cuda/detail/util.h +7 -5
- cuda/cccl/headers/include/thrust/system/cuda/execution_policy.h +3 -27
- cuda/cccl/headers/include/thrust/system/detail/bad_alloc.h +2 -5
- cuda/cccl/headers/include/thrust/system/detail/errno.h +2 -7
- cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/adjacent_difference.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/binary_search.inl +3 -10
- cuda/cccl/headers/include/thrust/system/detail/generic/copy.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/copy.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/copy_if.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/count.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/count.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/equal.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/equal.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/extrema.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/extrema.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/fill.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/find.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/find.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/for_each.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/gather.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/gather.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/generate.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/inner_product.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/logical.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/memory.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/memory.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/merge.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/merge.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/mismatch.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/partition.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/partition.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/per_device_resource.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/reduce_by_key.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/remove.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/remove.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/replace.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/replace.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/reverse.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/reverse.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.h +2 -17
- cuda/cccl/headers/include/thrust/system/detail/generic/scalar/binary_search.inl +2 -17
- cuda/cccl/headers/include/thrust/system/detail/generic/scan.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/scan.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/scan_by_key.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/scatter.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/scatter.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/select_system.h +10 -2
- cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/set_operations.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/shuffle.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/sort.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/sort.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/swap_ranges.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/tabulate.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/tag.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/temporary_buffer.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_reduce.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/transform_scan.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_copy.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/uninitialized_fill.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/unique.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/unique.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/generic/unique_by_key.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/internal/decompose.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/adjacent_difference.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/assign_value.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/binary_search.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy_backward.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/copy_if.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/execution_policy.h +18 -44
- cuda/cccl/headers/include/thrust/system/detail/sequential/extrema.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/find.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/for_each.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/general_copy.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/get_value.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/insertion_sort.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/iter_swap.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/malloc_and_free.h +3 -9
- cuda/cccl/headers/include/thrust/system/detail/sequential/merge.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/merge.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/partition.h +2 -10
- cuda/cccl/headers/include/thrust/system/detail/sequential/reduce.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/reduce_by_key.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/remove.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/scan.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/scan_by_key.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/set_operations.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/sort.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_merge_sort.inl +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_primitive_sort.inl +3 -9
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/stable_radix_sort.inl +3 -9
- cuda/cccl/headers/include/thrust/system/detail/sequential/trivial_copy.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/unique.h +2 -8
- cuda/cccl/headers/include/thrust/system/detail/sequential/unique_by_key.h +2 -8
- cuda/cccl/headers/include/thrust/system/omp/detail/execution_policy.h +75 -61
- cuda/cccl/headers/include/thrust/system/omp/execution_policy.h +4 -99
- cuda/cccl/headers/include/thrust/system/tbb/detail/execution_policy.h +68 -51
- cuda/cccl/headers/include/thrust/system/tbb/detail/for_each.inl +2 -2
- cuda/cccl/headers/include/thrust/system/tbb/execution_policy.h +4 -99
- cuda/cccl/headers/include_paths.py +6 -9
- cuda/cccl/parallel/experimental/__init__.py +2 -4
- cuda/cccl/parallel/experimental/_bindings.py +38 -15
- cuda/cccl/parallel/experimental/_bindings_impl.pyx +36 -9
- cuda/cccl/parallel/experimental/_cccl_interop.py +56 -30
- cuda/cccl/parallel/experimental/algorithms/_histogram.py +2 -2
- cuda/cccl/parallel/experimental/algorithms/_merge_sort.py +4 -4
- cuda/cccl/parallel/experimental/algorithms/_radix_sort.py +4 -4
- cuda/cccl/parallel/experimental/algorithms/_reduce.py +2 -2
- cuda/cccl/parallel/experimental/algorithms/_scan.py +4 -4
- cuda/cccl/parallel/experimental/algorithms/_segmented_reduce.py +4 -4
- cuda/cccl/parallel/experimental/algorithms/_transform.py +5 -5
- cuda/cccl/parallel/experimental/algorithms/_unique_by_key.py +5 -5
- cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-313-x86_64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-313-x86_64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda/cccl/parallel/experimental/iterators/__init__.py +2 -4
- cuda/cccl/parallel/experimental/iterators/_factories.py +28 -51
- cuda/cccl/parallel/experimental/iterators/_iterators.py +189 -204
- cuda/cccl/parallel/experimental/iterators/_zip_iterator.py +4 -12
- cuda/cccl/parallel/experimental/numba_utils.py +47 -0
- {cuda_cccl-0.1.3.2.0.dev271.dist-info → cuda_cccl-0.1.3.2.0.dev438.dist-info}/METADATA +8 -6
- {cuda_cccl-0.1.3.2.0.dev271.dist-info → cuda_cccl-0.1.3.2.0.dev438.dist-info}/RECORD +545 -530
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/cmath +0 -520
- cuda/cccl/headers/include/thrust/detail/mpl/math.h +0 -164
- cuda/cccl/headers/include/thrust/detail/type_traits/iterator/is_discard_iterator.h +0 -44
- cuda/cccl/headers/include/thrust/detail/util/align.h +0 -59
- cuda/cccl/headers/include/thrust/system/cpp/detail/par.h +0 -62
- cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_cross_system.h +0 -204
- cuda/cccl/headers/include/thrust/system/cuda/detail/internal/copy_device_to_device.h +0 -92
- cuda/cccl/headers/include/thrust/system/cuda/detail/par.h +0 -237
- cuda/cccl/headers/include/thrust/system/cuda/detail/par_to_seq.h +0 -95
- cuda/cccl/headers/include/thrust/system/omp/detail/par.h +0 -62
- cuda/cccl/headers/include/thrust/system/tbb/detail/par.h +0 -62
- {cuda_cccl-0.1.3.2.0.dev271.dist-info → cuda_cccl-0.1.3.2.0.dev438.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.1.3.2.0.dev271.dist-info → cuda_cccl-0.1.3.2.0.dev438.dist-info}/licenses/LICENSE +0 -0
cuda/cccl/_cuda_version_utils.py
CHANGED
|
@@ -6,9 +6,6 @@
|
|
|
6
6
|
CUDA version detection utilities shared across the cccl package.
|
|
7
7
|
"""
|
|
8
8
|
|
|
9
|
-
import os
|
|
10
|
-
import shutil
|
|
11
|
-
from pathlib import Path
|
|
12
9
|
from typing import Optional
|
|
13
10
|
|
|
14
11
|
import cuda.bindings
|
|
@@ -19,25 +16,6 @@ def detect_cuda_version() -> Optional[int]:
|
|
|
19
16
|
return int(cuda_version.split(".")[0])
|
|
20
17
|
|
|
21
18
|
|
|
22
|
-
def get_cuda_path() -> Optional[Path]:
|
|
23
|
-
"""Get the CUDA installation path."""
|
|
24
|
-
cuda_path_str = os.environ.get("CUDA_PATH")
|
|
25
|
-
if cuda_path_str:
|
|
26
|
-
cuda_path = Path(cuda_path_str)
|
|
27
|
-
if cuda_path.exists():
|
|
28
|
-
return cuda_path
|
|
29
|
-
|
|
30
|
-
nvcc_path = shutil.which("nvcc")
|
|
31
|
-
if nvcc_path:
|
|
32
|
-
return Path(nvcc_path).parent.parent
|
|
33
|
-
|
|
34
|
-
default_path = Path("/usr/local/cuda")
|
|
35
|
-
if default_path.exists():
|
|
36
|
-
return default_path
|
|
37
|
-
|
|
38
|
-
return None
|
|
39
|
-
|
|
40
|
-
|
|
41
19
|
def get_recommended_extra(cuda_version: Optional[int]) -> str:
|
|
42
20
|
"""Get the recommended pip extra for the detected CUDA version."""
|
|
43
21
|
if cuda_version == 13:
|
|
@@ -58,7 +58,9 @@ def make_binary_tempfile(content: bytes, suffix: str) -> BinaryIO:
|
|
|
58
58
|
|
|
59
59
|
:return: A binary file-like object representing the temporary file.
|
|
60
60
|
"""
|
|
61
|
-
tmp = tempfile.NamedTemporaryFile(
|
|
61
|
+
tmp = tempfile.NamedTemporaryFile(
|
|
62
|
+
mode="w+b", suffix=suffix, buffering=0, delete=False
|
|
63
|
+
)
|
|
62
64
|
tmp.write(content)
|
|
63
65
|
return tmp
|
|
64
66
|
|
|
@@ -52,9 +52,13 @@
|
|
|
52
52
|
#include <cub/util_ptx.cuh>
|
|
53
53
|
#include <cub/util_type.cuh>
|
|
54
54
|
|
|
55
|
-
#include <cuda/
|
|
55
|
+
#include <cuda/__cmath/ceil_div.h>
|
|
56
|
+
#include <cuda/__cmath/round_up.h>
|
|
57
|
+
#include <cuda/std/__functional/operations.h>
|
|
58
|
+
#include <cuda/std/__type_traits/conditional.h>
|
|
59
|
+
#include <cuda/std/__type_traits/enable_if.h>
|
|
60
|
+
#include <cuda/std/__type_traits/type_identity.h>
|
|
56
61
|
#include <cuda/std/cstdint>
|
|
57
|
-
#include <cuda/std/type_traits>
|
|
58
62
|
|
|
59
63
|
CUB_NAMESPACE_BEGIN
|
|
60
64
|
|
|
@@ -49,7 +49,9 @@
|
|
|
49
49
|
#include <cub/iterator/cache_modified_input_iterator.cuh>
|
|
50
50
|
#include <cub/util_type.cuh>
|
|
51
51
|
|
|
52
|
-
#include <cuda/std/
|
|
52
|
+
#include <cuda/std/__type_traits/conditional.h>
|
|
53
|
+
#include <cuda/std/__type_traits/integral_constant.h>
|
|
54
|
+
#include <cuda/std/__type_traits/is_pointer.h>
|
|
53
55
|
|
|
54
56
|
CUB_NAMESPACE_BEGIN
|
|
55
57
|
|
|
@@ -25,22 +25,15 @@
|
|
|
25
25
|
#include <cuda/std/__algorithm/min.h>
|
|
26
26
|
|
|
27
27
|
CUB_NAMESPACE_BEGIN
|
|
28
|
-
namespace detail
|
|
28
|
+
namespace detail::merge
|
|
29
29
|
{
|
|
30
|
-
|
|
31
|
-
{
|
|
32
|
-
template <int ThreadsPerBlock,
|
|
33
|
-
int ItemsPerThread,
|
|
34
|
-
BlockLoadAlgorithm LoadAlgorithm,
|
|
35
|
-
CacheLoadModifier LoadCacheModifier,
|
|
36
|
-
BlockStoreAlgorithm StoreAlgorithm>
|
|
30
|
+
template <int ThreadsPerBlock, int ItemsPerThread, CacheLoadModifier LoadCacheModifier, BlockStoreAlgorithm StoreAlgorithm>
|
|
37
31
|
struct agent_policy_t
|
|
38
32
|
{
|
|
39
33
|
// do not change data member names, policy_wrapper_t depends on it
|
|
40
34
|
static constexpr int BLOCK_THREADS = ThreadsPerBlock;
|
|
41
35
|
static constexpr int ITEMS_PER_THREAD = ItemsPerThread;
|
|
42
36
|
static constexpr int ITEMS_PER_TILE = BLOCK_THREADS * ITEMS_PER_THREAD;
|
|
43
|
-
static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = LoadAlgorithm;
|
|
44
37
|
static constexpr CacheLoadModifier LOAD_MODIFIER = LoadCacheModifier;
|
|
45
38
|
static constexpr BlockStoreAlgorithm STORE_ALGORITHM = StoreAlgorithm;
|
|
46
39
|
};
|
|
@@ -68,34 +61,27 @@ struct agent_t
|
|
|
68
61
|
using items_load_it1 = try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, ItemsIt1>;
|
|
69
62
|
using items_load_it2 = try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, ItemsIt2>;
|
|
70
63
|
|
|
71
|
-
using block_load_keys1 = typename BlockLoadType<Policy, keys_load_it1>::type;
|
|
72
|
-
using block_load_keys2 = typename BlockLoadType<Policy, keys_load_it2>::type;
|
|
73
|
-
using block_load_items1 = typename BlockLoadType<Policy, items_load_it1>::type;
|
|
74
|
-
using block_load_items2 = typename BlockLoadType<Policy, items_load_it2>::type;
|
|
75
|
-
|
|
76
64
|
using block_store_keys = typename BlockStoreType<Policy, KeysOutputIt, key_type>::type;
|
|
77
65
|
using block_store_items = typename BlockStoreType<Policy, ItemsOutputIt, item_type>::type;
|
|
78
66
|
|
|
67
|
+
static constexpr int items_per_thread = Policy::ITEMS_PER_THREAD;
|
|
68
|
+
static constexpr int threads_per_block = Policy::BLOCK_THREADS;
|
|
69
|
+
static constexpr int items_per_tile = Policy::ITEMS_PER_TILE;
|
|
70
|
+
|
|
79
71
|
union temp_storages
|
|
80
72
|
{
|
|
81
|
-
typename block_load_keys1::TempStorage load_keys1;
|
|
82
|
-
typename block_load_keys2::TempStorage load_keys2;
|
|
83
|
-
typename block_load_items1::TempStorage load_items1;
|
|
84
|
-
typename block_load_items2::TempStorage load_items2;
|
|
85
73
|
typename block_store_keys::TempStorage store_keys;
|
|
86
74
|
typename block_store_items::TempStorage store_items;
|
|
87
75
|
|
|
88
|
-
|
|
89
|
-
|
|
76
|
+
// We could change SerialMerge to avoid reading one item out of bounds and drop the + 1 here. But that would
|
|
77
|
+
// introduce more branches (about 10% slower on 2^16 problem sizes on RTX 5090 in a first attempt)
|
|
78
|
+
key_type keys_shared[items_per_tile + 1];
|
|
79
|
+
item_type items_shared[items_per_tile + 1];
|
|
90
80
|
};
|
|
91
81
|
|
|
92
82
|
struct TempStorage : Uninitialized<temp_storages>
|
|
93
83
|
{};
|
|
94
84
|
|
|
95
|
-
static constexpr int items_per_thread = Policy::ITEMS_PER_THREAD;
|
|
96
|
-
static constexpr int threads_per_block = Policy::BLOCK_THREADS;
|
|
97
|
-
static constexpr Offset items_per_tile = Policy::ITEMS_PER_TILE;
|
|
98
|
-
|
|
99
85
|
// Per thread data
|
|
100
86
|
temp_storages& storage;
|
|
101
87
|
keys_load_it1 keys1_in;
|
|
@@ -107,61 +93,86 @@ struct agent_t
|
|
|
107
93
|
KeysOutputIt keys_out;
|
|
108
94
|
ItemsOutputIt items_out;
|
|
109
95
|
CompareOp compare_op;
|
|
110
|
-
Offset*
|
|
96
|
+
Offset* key1_beg_offsets;
|
|
111
97
|
|
|
112
98
|
template <bool IsFullTile>
|
|
113
99
|
_CCCL_DEVICE _CCCL_FORCEINLINE void consume_tile(Offset tile_idx, Offset tile_base, int num_remaining)
|
|
114
100
|
{
|
|
115
|
-
const Offset partition_beg = merge_partitions[tile_idx + 0];
|
|
116
|
-
const Offset partition_end = merge_partitions[tile_idx + 1];
|
|
117
|
-
|
|
118
101
|
const Offset diag0 = items_per_tile * tile_idx;
|
|
119
|
-
|
|
102
|
+
Offset diag1 = diag0 + items_per_tile;
|
|
103
|
+
if constexpr (IsFullTile)
|
|
104
|
+
{
|
|
105
|
+
_CCCL_ASSERT(diag1 <= keys1_count + keys2_count, "");
|
|
106
|
+
}
|
|
107
|
+
else
|
|
108
|
+
{
|
|
109
|
+
diag1 = keys1_count + keys2_count;
|
|
110
|
+
}
|
|
120
111
|
|
|
121
112
|
// compute bounding box for keys1 & keys2
|
|
122
|
-
const Offset keys1_beg =
|
|
123
|
-
const Offset keys1_end =
|
|
113
|
+
const Offset keys1_beg = key1_beg_offsets[tile_idx + 0];
|
|
114
|
+
const Offset keys1_end = key1_beg_offsets[tile_idx + 1];
|
|
124
115
|
const Offset keys2_beg = diag0 - keys1_beg;
|
|
125
116
|
const Offset keys2_end = diag1 - keys1_end;
|
|
126
117
|
|
|
127
118
|
// number of keys per tile
|
|
128
|
-
const int
|
|
129
|
-
const int
|
|
119
|
+
const int keys1_count_tile = static_cast<int>(keys1_end - keys1_beg);
|
|
120
|
+
const int keys2_count_tile = static_cast<int>(keys2_end - keys2_beg);
|
|
121
|
+
if constexpr (IsFullTile)
|
|
122
|
+
{
|
|
123
|
+
_CCCL_ASSERT(keys1_count_tile + keys2_count_tile == items_per_tile, "");
|
|
124
|
+
}
|
|
125
|
+
else
|
|
126
|
+
{
|
|
127
|
+
_CCCL_ASSERT(keys1_count_tile + keys2_count_tile == num_remaining, "");
|
|
128
|
+
}
|
|
130
129
|
|
|
131
130
|
key_type keys_loc[items_per_thread];
|
|
132
131
|
merge_sort::gmem_to_reg<threads_per_block, IsFullTile>(
|
|
133
|
-
keys_loc, keys1_in + keys1_beg, keys2_in + keys2_beg,
|
|
132
|
+
keys_loc, keys1_in + keys1_beg, keys2_in + keys2_beg, keys1_count_tile, keys2_count_tile);
|
|
134
133
|
merge_sort::reg_to_shared<threads_per_block>(&storage.keys_shared[0], keys_loc);
|
|
135
134
|
__syncthreads();
|
|
136
135
|
|
|
137
|
-
//
|
|
136
|
+
// now find the merge path for each of thread.
|
|
138
137
|
// we can use int type here, because the number of items in shared memory is limited
|
|
139
|
-
|
|
138
|
+
int diag0_thread = items_per_thread * static_cast<int>(threadIdx.x);
|
|
139
|
+
if constexpr (IsFullTile)
|
|
140
|
+
{
|
|
141
|
+
_CCCL_ASSERT(num_remaining == items_per_tile, "");
|
|
142
|
+
_CCCL_ASSERT(diag0_thread < num_remaining, "");
|
|
143
|
+
}
|
|
144
|
+
else
|
|
145
|
+
{ // for partial tiles, clamp the thread diagonal to the valid items
|
|
146
|
+
diag0_thread = (::cuda::std::min) (diag0_thread, num_remaining);
|
|
147
|
+
}
|
|
140
148
|
|
|
141
|
-
const int
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
149
|
+
const int keys1_beg_thread = MergePath(
|
|
150
|
+
&storage.keys_shared[0],
|
|
151
|
+
&storage.keys_shared[keys1_count_tile],
|
|
152
|
+
keys1_count_tile,
|
|
153
|
+
keys2_count_tile,
|
|
154
|
+
diag0_thread,
|
|
155
|
+
compare_op);
|
|
156
|
+
const int keys2_beg_thread = diag0_thread - keys1_beg_thread;
|
|
146
157
|
|
|
147
|
-
const int
|
|
148
|
-
const int
|
|
158
|
+
const int keys1_count_thread = keys1_count_tile - keys1_beg_thread;
|
|
159
|
+
const int keys2_count_thread = keys2_count_tile - keys2_beg_thread;
|
|
149
160
|
|
|
150
161
|
// perform serial merge
|
|
151
162
|
int indices[items_per_thread];
|
|
152
|
-
|
|
163
|
+
SerialMerge(
|
|
153
164
|
&storage.keys_shared[0],
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
165
|
+
keys1_beg_thread,
|
|
166
|
+
keys2_beg_thread + keys1_count_tile,
|
|
167
|
+
keys1_count_thread,
|
|
168
|
+
keys2_count_thread,
|
|
158
169
|
keys_loc,
|
|
159
170
|
indices,
|
|
160
171
|
compare_op);
|
|
161
|
-
__syncthreads();
|
|
162
172
|
|
|
163
173
|
// write keys
|
|
164
|
-
|
|
174
|
+
__syncthreads(); // sync after reading from SMEM before so block store can use SMEM again
|
|
175
|
+
if constexpr (IsFullTile)
|
|
165
176
|
{
|
|
166
177
|
block_store_keys{storage.store_keys}.Store(keys_out + tile_base, keys_loc);
|
|
167
178
|
}
|
|
@@ -176,9 +187,8 @@ struct agent_t
|
|
|
176
187
|
{
|
|
177
188
|
item_type items_loc[items_per_thread];
|
|
178
189
|
merge_sort::gmem_to_reg<threads_per_block, IsFullTile>(
|
|
179
|
-
items_loc, items1_in + keys1_beg, items2_in + keys2_beg,
|
|
180
|
-
__syncthreads(); // block_store_keys above uses
|
|
181
|
-
// to it
|
|
190
|
+
items_loc, items1_in + keys1_beg, items2_in + keys2_beg, keys1_count_tile, keys2_count_tile);
|
|
191
|
+
__syncthreads(); // block_store_keys above uses SMEM, so make sure all threads are done before we write to it
|
|
182
192
|
merge_sort::reg_to_shared<threads_per_block>(&storage.items_shared[0], items_loc);
|
|
183
193
|
__syncthreads();
|
|
184
194
|
|
|
@@ -191,7 +201,7 @@ struct agent_t
|
|
|
191
201
|
__syncthreads();
|
|
192
202
|
|
|
193
203
|
// write from reg to gmem
|
|
194
|
-
if (IsFullTile)
|
|
204
|
+
if constexpr (IsFullTile)
|
|
195
205
|
{
|
|
196
206
|
block_store_items{storage.store_items}.Store(items_out + tile_base, items_loc);
|
|
197
207
|
}
|
|
@@ -204,23 +214,19 @@ struct agent_t
|
|
|
204
214
|
|
|
205
215
|
_CCCL_DEVICE _CCCL_FORCEINLINE void operator()()
|
|
206
216
|
{
|
|
207
|
-
|
|
208
|
-
// TODO(bgruber): is the above still true?
|
|
209
|
-
const int tile_idx = static_cast<int>(blockIdx.x);
|
|
217
|
+
const Offset tile_idx = blockIdx.x;
|
|
210
218
|
const Offset tile_base = tile_idx * items_per_tile;
|
|
211
|
-
// TODO(bgruber): random mixing of int and Offset
|
|
212
219
|
const int items_in_tile =
|
|
213
220
|
static_cast<int>((::cuda::std::min) (static_cast<Offset>(items_per_tile), keys1_count + keys2_count - tile_base));
|
|
214
221
|
if (items_in_tile == items_per_tile)
|
|
215
222
|
{
|
|
216
|
-
consume_tile
|
|
223
|
+
consume_tile</* IsFullTile */ true>(tile_idx, tile_base, items_per_tile);
|
|
217
224
|
}
|
|
218
225
|
else
|
|
219
226
|
{
|
|
220
|
-
consume_tile
|
|
227
|
+
consume_tile</* IsFullTile */ false>(tile_idx, tile_base, items_in_tile);
|
|
221
228
|
}
|
|
222
229
|
}
|
|
223
230
|
};
|
|
224
|
-
} // namespace merge
|
|
225
|
-
} // namespace detail
|
|
231
|
+
} // namespace detail::merge
|
|
226
232
|
CUB_NAMESPACE_END
|
|
@@ -50,8 +50,11 @@
|
|
|
50
50
|
#include <cub/util_math.cuh>
|
|
51
51
|
#include <cub/util_type.cuh>
|
|
52
52
|
|
|
53
|
-
#include <cuda/
|
|
54
|
-
#include <cuda/
|
|
53
|
+
#include <cuda/__cmath/ceil_div.h>
|
|
54
|
+
#include <cuda/__ptx/instructions/get_sreg.h>
|
|
55
|
+
#include <cuda/std/__algorithm/max.h>
|
|
56
|
+
#include <cuda/std/__algorithm/min.h>
|
|
57
|
+
#include <cuda/std/__functional/operations.h>
|
|
55
58
|
|
|
56
59
|
CUB_NAMESPACE_BEGIN
|
|
57
60
|
|
|
@@ -49,8 +49,10 @@
|
|
|
49
49
|
#include <cub/util_ptx.cuh>
|
|
50
50
|
#include <cub/util_type.cuh>
|
|
51
51
|
|
|
52
|
-
#include <cuda/
|
|
53
|
-
#include <cuda/std/
|
|
52
|
+
#include <cuda/__ptx/instructions/get_sreg.h>
|
|
53
|
+
#include <cuda/std/__type_traits/conditional.h>
|
|
54
|
+
#include <cuda/std/__type_traits/integral_constant.h>
|
|
55
|
+
#include <cuda/std/__type_traits/is_same.h>
|
|
54
56
|
|
|
55
57
|
CUB_NAMESPACE_BEGIN
|
|
56
58
|
|
|
@@ -52,8 +52,9 @@
|
|
|
52
52
|
#include <cub/util_type.cuh>
|
|
53
53
|
#include <cub/warp/warp_reduce.cuh>
|
|
54
54
|
|
|
55
|
-
#include <cuda/
|
|
56
|
-
#include <cuda/std/
|
|
55
|
+
#include <cuda/__ptx/instructions/get_sreg.h>
|
|
56
|
+
#include <cuda/std/__algorithm/max.h>
|
|
57
|
+
#include <cuda/std/__algorithm/min.h>
|
|
57
58
|
|
|
58
59
|
CUB_NAMESPACE_BEGIN
|
|
59
60
|
|
|
@@ -50,9 +50,12 @@
|
|
|
50
50
|
#include <cub/util_device.cuh>
|
|
51
51
|
#include <cub/util_type.cuh>
|
|
52
52
|
|
|
53
|
-
#include <cuda/
|
|
54
|
-
#include <cuda/std/
|
|
55
|
-
#include <cuda/std/
|
|
53
|
+
#include <cuda/std/__algorithm/min.h>
|
|
54
|
+
#include <cuda/std/__functional/identity.h>
|
|
55
|
+
#include <cuda/std/__functional/operations.h>
|
|
56
|
+
#include <cuda/std/__memory/is_sufficiently_aligned.h>
|
|
57
|
+
#include <cuda/std/__type_traits/conditional.h>
|
|
58
|
+
#include <cuda/std/__type_traits/is_pointer.h>
|
|
56
59
|
|
|
57
60
|
CUB_NAMESPACE_BEGIN
|
|
58
61
|
|
|
@@ -172,9 +175,6 @@ namespace detail::reduce
|
|
|
172
175
|
* @tparam InputIteratorT
|
|
173
176
|
* Random-access iterator type for input
|
|
174
177
|
*
|
|
175
|
-
* @tparam OutputIteratorT
|
|
176
|
-
* Random-access iterator type for output
|
|
177
|
-
*
|
|
178
178
|
* @tparam OffsetT
|
|
179
179
|
* Signed integer type for global offsets
|
|
180
180
|
*
|
|
@@ -199,7 +199,6 @@ namespace detail::reduce
|
|
|
199
199
|
*/
|
|
200
200
|
template <typename AgentReducePolicy,
|
|
201
201
|
typename InputIteratorT,
|
|
202
|
-
typename OutputIteratorT,
|
|
203
202
|
typename OffsetT,
|
|
204
203
|
typename ReductionOp,
|
|
205
204
|
typename AccumT,
|
|
@@ -271,7 +270,7 @@ struct AgentReduceImpl
|
|
|
271
270
|
{
|
|
272
271
|
if constexpr (AttemptVectorization)
|
|
273
272
|
{
|
|
274
|
-
return ::cuda::
|
|
273
|
+
return ::cuda::std::is_sufficiently_aligned<alignof(VectorT)>(d_in);
|
|
275
274
|
}
|
|
276
275
|
else
|
|
277
276
|
{
|
|
@@ -503,9 +502,6 @@ private:
|
|
|
503
502
|
* @tparam InputIteratorT
|
|
504
503
|
* Random-access iterator type for input
|
|
505
504
|
*
|
|
506
|
-
* @tparam OutputIteratorT
|
|
507
|
-
* Random-access iterator type for output
|
|
508
|
-
*
|
|
509
505
|
* @tparam OffsetT
|
|
510
506
|
* Signed integer type for global offsets
|
|
511
507
|
*
|
|
@@ -521,7 +517,6 @@ private:
|
|
|
521
517
|
*/
|
|
522
518
|
template <typename AgentReducePolicy,
|
|
523
519
|
typename InputIteratorT,
|
|
524
|
-
typename OutputIteratorT,
|
|
525
520
|
typename OffsetT,
|
|
526
521
|
typename ReductionOp,
|
|
527
522
|
typename AccumT,
|
|
@@ -529,7 +524,6 @@ template <typename AgentReducePolicy,
|
|
|
529
524
|
struct AgentReduce
|
|
530
525
|
: AgentReduceImpl<AgentReducePolicy,
|
|
531
526
|
InputIteratorT,
|
|
532
|
-
OutputIteratorT,
|
|
533
527
|
OffsetT,
|
|
534
528
|
ReductionOp,
|
|
535
529
|
AccumT,
|
|
@@ -540,7 +534,6 @@ struct AgentReduce
|
|
|
540
534
|
using base_t =
|
|
541
535
|
AgentReduceImpl<AgentReducePolicy,
|
|
542
536
|
InputIteratorT,
|
|
543
|
-
OutputIteratorT,
|
|
544
537
|
OffsetT,
|
|
545
538
|
ReductionOp,
|
|
546
539
|
AccumT,
|
|
@@ -571,9 +564,6 @@ struct AgentReduce
|
|
|
571
564
|
* @tparam InputIteratorT
|
|
572
565
|
* Random-access iterator type for input
|
|
573
566
|
*
|
|
574
|
-
* @tparam OutputIteratorT
|
|
575
|
-
* Random-access iterator type for output
|
|
576
|
-
*
|
|
577
567
|
* @tparam OffsetT
|
|
578
568
|
* Signed integer type for global offsets
|
|
579
569
|
*
|
|
@@ -589,7 +579,6 @@ struct AgentReduce
|
|
|
589
579
|
*/
|
|
590
580
|
template <typename AgentReducePolicy,
|
|
591
581
|
typename InputIteratorT,
|
|
592
|
-
typename OutputIteratorT,
|
|
593
582
|
typename OffsetT,
|
|
594
583
|
typename ReductionOp,
|
|
595
584
|
typename AccumT,
|
|
@@ -597,7 +586,6 @@ template <typename AgentReducePolicy,
|
|
|
597
586
|
struct AgentWarpReduce
|
|
598
587
|
: AgentReduceImpl<AgentReducePolicy,
|
|
599
588
|
InputIteratorT,
|
|
600
|
-
OutputIteratorT,
|
|
601
589
|
OffsetT,
|
|
602
590
|
ReductionOp,
|
|
603
591
|
AccumT,
|
|
@@ -609,7 +597,6 @@ struct AgentWarpReduce
|
|
|
609
597
|
using base_t =
|
|
610
598
|
AgentReduceImpl<AgentReducePolicy,
|
|
611
599
|
InputIteratorT,
|
|
612
|
-
OutputIteratorT,
|
|
613
600
|
OffsetT,
|
|
614
601
|
ReductionOp,
|
|
615
602
|
AccumT,
|
|
@@ -50,8 +50,10 @@
|
|
|
50
50
|
#include <cub/block/block_store.cuh>
|
|
51
51
|
#include <cub/iterator/cache_modified_input_iterator.cuh>
|
|
52
52
|
|
|
53
|
-
#include <cuda/std/
|
|
54
|
-
|
|
53
|
+
#include <cuda/std/__functional/operations.h>
|
|
54
|
+
#include <cuda/std/__type_traits/conditional.h>
|
|
55
|
+
#include <cuda/std/__type_traits/is_pointer.h>
|
|
56
|
+
#include <cuda/std/__type_traits/is_same.h>
|
|
55
57
|
CUB_NAMESPACE_BEGIN
|
|
56
58
|
|
|
57
59
|
/******************************************************************************
|
|
@@ -50,11 +50,14 @@
|
|
|
50
50
|
#include <cub/block/block_load.cuh>
|
|
51
51
|
#include <cub/block/block_scan.cuh>
|
|
52
52
|
#include <cub/block/block_store.cuh>
|
|
53
|
-
#include <cub/grid/grid_queue.cuh>
|
|
54
53
|
#include <cub/iterator/cache_modified_input_iterator.cuh>
|
|
55
54
|
|
|
56
|
-
#include <cuda/
|
|
57
|
-
#include <cuda/std/
|
|
55
|
+
#include <cuda/__ptx/instructions/get_sreg.h>
|
|
56
|
+
#include <cuda/std/__functional/operations.h>
|
|
57
|
+
#include <cuda/std/__type_traits/conditional.h>
|
|
58
|
+
#include <cuda/std/__type_traits/integral_constant.h>
|
|
59
|
+
#include <cuda/std/__type_traits/is_pointer.h>
|
|
60
|
+
#include <cuda/std/__type_traits/is_same.h>
|
|
58
61
|
|
|
59
62
|
CUB_NAMESPACE_BEGIN
|
|
60
63
|
|
|
@@ -49,8 +49,11 @@
|
|
|
49
49
|
#include <cub/block/block_store.cuh>
|
|
50
50
|
#include <cub/grid/grid_queue.cuh>
|
|
51
51
|
#include <cub/iterator/cache_modified_input_iterator.cuh>
|
|
52
|
+
#include <cub/util_device.cuh>
|
|
52
53
|
|
|
53
|
-
#include <cuda/std/
|
|
54
|
+
#include <cuda/std/__type_traits/conditional.h>
|
|
55
|
+
#include <cuda/std/__type_traits/is_pointer.h>
|
|
56
|
+
#include <cuda/std/__type_traits/is_same.h>
|
|
54
57
|
|
|
55
58
|
CUB_NAMESPACE_BEGIN
|
|
56
59
|
|
|
@@ -109,6 +112,27 @@ struct AgentScanPolicy : ScalingType
|
|
|
109
112
|
};
|
|
110
113
|
};
|
|
111
114
|
|
|
115
|
+
#if defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
|
|
116
|
+
namespace detail
|
|
117
|
+
{
|
|
118
|
+
// Only define this when needed.
|
|
119
|
+
// Because of overload woes, this depends on C++20 concepts. util_device.h checks that concepts are available when
|
|
120
|
+
// either runtime policies or PTX JSON information are enabled, so if they are, this is always valid. The generic
|
|
121
|
+
// version is always defined, and that's the only one needed for regular CUB operations.
|
|
122
|
+
//
|
|
123
|
+
// TODO: enable this unconditionally once concepts are always available
|
|
124
|
+
CUB_DETAIL_POLICY_WRAPPER_DEFINE(
|
|
125
|
+
ScanAgentPolicy,
|
|
126
|
+
(GenericAgentPolicy),
|
|
127
|
+
(BLOCK_THREADS, BlockThreads, int),
|
|
128
|
+
(ITEMS_PER_THREAD, ItemsPerThread, int),
|
|
129
|
+
(LOAD_ALGORITHM, LoadAlgorithm, cub::BlockLoadAlgorithm),
|
|
130
|
+
(LOAD_MODIFIER, LoadModifier, cub::CacheLoadModifier),
|
|
131
|
+
(STORE_ALGORITHM, StoreAlgorithm, cub::BlockStoreAlgorithm),
|
|
132
|
+
(SCAN_ALGORITHM, ScanAlgorithm, cub::BlockScanAlgorithm))
|
|
133
|
+
} // namespace detail
|
|
134
|
+
#endif // defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
|
|
135
|
+
|
|
112
136
|
/******************************************************************************
|
|
113
137
|
* Thread block abstractions
|
|
114
138
|
******************************************************************************/
|
|
@@ -50,7 +50,11 @@
|
|
|
50
50
|
#include <cub/iterator/cache_modified_input_iterator.cuh>
|
|
51
51
|
#include <cub/util_type.cuh>
|
|
52
52
|
|
|
53
|
-
#include <cuda/std/
|
|
53
|
+
#include <cuda/std/__type_traits/conditional.h>
|
|
54
|
+
#include <cuda/std/__type_traits/enable_if.h>
|
|
55
|
+
#include <cuda/std/__type_traits/integral_constant.h>
|
|
56
|
+
#include <cuda/std/__type_traits/is_pointer.h>
|
|
57
|
+
#include <cuda/std/__type_traits/is_same.h>
|
|
54
58
|
|
|
55
59
|
CUB_NAMESPACE_BEGIN
|
|
56
60
|
|
|
@@ -50,11 +50,17 @@
|
|
|
50
50
|
#include <cub/block/block_scan.cuh>
|
|
51
51
|
#include <cub/block/block_store.cuh>
|
|
52
52
|
#include <cub/device/dispatch/dispatch_common.cuh>
|
|
53
|
-
#include <cub/grid/grid_queue.cuh>
|
|
54
53
|
#include <cub/iterator/cache_modified_input_iterator.cuh>
|
|
55
54
|
#include <cub/util_type.cuh>
|
|
56
55
|
|
|
57
|
-
#include <cuda/std/
|
|
56
|
+
#include <cuda/std/__functional/operations.h>
|
|
57
|
+
#include <cuda/std/__type_traits/conditional.h>
|
|
58
|
+
#include <cuda/std/__type_traits/enable_if.h>
|
|
59
|
+
#include <cuda/std/__type_traits/integral_constant.h>
|
|
60
|
+
#include <cuda/std/__type_traits/is_callable.h>
|
|
61
|
+
#include <cuda/std/__type_traits/is_pointer.h>
|
|
62
|
+
#include <cuda/std/__type_traits/is_same.h>
|
|
63
|
+
#include <cuda/std/cstdint>
|
|
58
64
|
|
|
59
65
|
CUB_NAMESPACE_BEGIN
|
|
60
66
|
|
|
@@ -44,8 +44,12 @@
|
|
|
44
44
|
#include <cub/block/block_scan.cuh>
|
|
45
45
|
#include <cub/block/block_store.cuh>
|
|
46
46
|
#include <cub/iterator/cache_modified_input_iterator.cuh>
|
|
47
|
+
#include <cub/util_device.cuh>
|
|
47
48
|
|
|
48
|
-
#include <cuda/std/
|
|
49
|
+
#include <cuda/std/__functional/operations.h>
|
|
50
|
+
#include <cuda/std/__type_traits/conditional.h>
|
|
51
|
+
#include <cuda/std/__type_traits/enable_if.h>
|
|
52
|
+
#include <cuda/std/__type_traits/is_pointer.h>
|
|
49
53
|
|
|
50
54
|
CUB_NAMESPACE_BEGIN
|
|
51
55
|
|
|
@@ -73,9 +77,22 @@ struct AgentThreeWayPartitionPolicy
|
|
|
73
77
|
};
|
|
74
78
|
};
|
|
75
79
|
|
|
80
|
+
#if defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
|
|
76
81
|
namespace detail
|
|
77
82
|
{
|
|
83
|
+
CUB_DETAIL_POLICY_WRAPPER_DEFINE(
|
|
84
|
+
ThreeWayPartitionAgentPolicy,
|
|
85
|
+
(GenericAgentPolicy),
|
|
86
|
+
(BLOCK_THREADS, BlockThreads, int),
|
|
87
|
+
(ITEMS_PER_THREAD, ItemsPerThread, int),
|
|
88
|
+
(LOAD_ALGORITHM, LoadAlgorithm, cub::BlockLoadAlgorithm),
|
|
89
|
+
(LOAD_MODIFIER, LoadModifier, cub::CacheLoadModifier),
|
|
90
|
+
(SCAN_ALGORITHM, ScanAlgorithm, cub::BlockScanAlgorithm))
|
|
91
|
+
} // namespace detail
|
|
92
|
+
#endif // defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
|
|
78
93
|
|
|
94
|
+
namespace detail
|
|
95
|
+
{
|
|
79
96
|
namespace three_way_partition
|
|
80
97
|
{
|
|
81
98
|
|