cuda-cccl 0.3.0-cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.2-cp312-cp312-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic.
- cuda/cccl/cooperative/__init__.py +7 -1
- cuda/cccl/cooperative/experimental/__init__.py +21 -5
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
- cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
- cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
- cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
- cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +4 -2
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
- cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
- cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
- cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
- cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
- cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +170 -260
- cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +6 -7
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +12 -29
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +2 -7
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
- cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
- cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
- cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
- cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
- cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
- cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
- cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
- cuda/cccl/headers/include/cuda/__cccl_config +1 -0
- cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
- cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
- cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
- cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
- cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +247 -323
- cuda/cccl/headers/include/cuda/__device/attributes.h +174 -123
- cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
- cuda/cccl/headers/include/cuda/__device/device_ref.h +27 -49
- cuda/cccl/headers/include/cuda/__device/physical_device.h +100 -96
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
- cuda/cccl/headers/include/cuda/__event/event.h +27 -26
- cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
- cuda/cccl/headers/include/cuda/__event/timed_event.h +10 -7
- cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
- cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
- cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
- cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
- cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +21 -137
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
- cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
- cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
- cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
- cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
- cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
- cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
- cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +9 -7
- cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -16
- cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
- cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
- cuda/cccl/headers/include/cuda/algorithm +1 -1
- cuda/cccl/headers/include/cuda/cmath +1 -0
- cuda/cccl/headers/include/cuda/devices +13 -0
- cuda/cccl/headers/include/cuda/iterator +1 -0
- cuda/cccl/headers/include/cuda/memory +1 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
- cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
- cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
- cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
- cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
- cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
- cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
- cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
- cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
- cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
- cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
- cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
- cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
- cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
- cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
- cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
- cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
- cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
- cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
- cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
- cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
- cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
- cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
- cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
- cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
- cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
- cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
- cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
- cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
- cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
- cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
- cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
- cuda/cccl/headers/include/cuda/std/bitset +1 -1
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
- cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
- cuda/cccl/headers/include/cuda/std/numbers +5 -0
- cuda/cccl/headers/include/cuda/std/string_view +155 -13
- cuda/cccl/headers/include/cuda/std/version +1 -4
- cuda/cccl/headers/include/cuda/stream_ref +5 -0
- cuda/cccl/headers/include/cuda/utility +1 -0
- cuda/cccl/headers/include/nv/target +7 -2
- cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
- cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
- cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
- cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
- cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
- cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
- cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
- cuda/cccl/headers/include/thrust/device_delete.h +18 -3
- cuda/cccl/headers/include/thrust/device_free.h +16 -3
- cuda/cccl/headers/include/thrust/device_new.h +29 -8
- cuda/cccl/headers/include/thrust/host_vector.h +1 -1
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
- cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
- cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
- cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
- cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
- cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
- cuda/cccl/parallel/experimental/__init__.py +21 -74
- cuda/compute/__init__.py +79 -0
- cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +43 -1
- cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +157 -8
- cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
- cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +112 -40
- cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +36 -15
- cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
- cuda/compute/cu12/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/cu13/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +2 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +36 -8
- cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +206 -1
- cuda/{cccl/parallel/experimental → compute}/numba_utils.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/typing.py +2 -0
- cuda/coop/__init__.py +8 -0
- cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
- cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
- cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
- cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
- cuda/coop/warp/__init__.py +9 -0
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +275 -276
- cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
- cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
- cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
- cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
- cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
- cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
- cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
- cuda/cccl/parallel/experimental/.gitignore +0 -4
- cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-312-aarch64-linux-gnu.so +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
@@ -53,14 +53,8 @@ struct agent_t
 using policy = Policy;

 // key and value type are taken from the first input sequence (consistent with old Thrust behavior)
-using key_type
-using item_type
-
-using keys_load_it1 = try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, KeysIt1>;
-using keys_load_it2 = try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, KeysIt2>;
-using items_load_it1 = try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, ItemsIt1>;
-using items_load_it2 = try_make_cache_modified_iterator_t<Policy::LOAD_MODIFIER, ItemsIt2>;
-
+using key_type = it_value_t<KeysIt1>;
+using item_type = it_value_t<ItemsIt1>;
 using block_store_keys = typename BlockStoreType<Policy, KeysOutputIt, key_type>::type;
 using block_store_items = typename BlockStoreType<Policy, ItemsOutputIt, item_type>::type;

@@ -84,11 +78,11 @@ struct agent_t

 // Per thread data
 temp_storages& storage;
-
-
+KeysIt1 keys1_in;
+ItemsIt1 items1_in;
 Offset keys1_count;
-
-
+KeysIt2 keys2_in;
+ItemsIt2 items2_in;
 Offset keys2_count;
 KeysOutputIt keys_out;
 ItemsOutputIt items_out;
@@ -128,10 +122,14 @@ struct agent_t
 }

 key_type keys_loc[items_per_thread];
-
-
-
-
+{
+auto keys1_in_cm = try_make_cache_modified_iterator<Policy::LOAD_MODIFIER>(keys1_in);
+auto keys2_in_cm = try_make_cache_modified_iterator<Policy::LOAD_MODIFIER>(keys2_in);
+merge_sort::gmem_to_reg<threads_per_block, IsFullTile>(
+keys_loc, keys1_in_cm + keys1_beg, keys2_in_cm + keys2_beg, keys1_count_tile, keys2_count_tile);
+merge_sort::reg_to_shared<threads_per_block>(&storage.keys_shared[0], keys_loc);
+__syncthreads();
+}

 // now find the merge path for each of thread.
 // we can use int type here, because the number of items in shared memory is limited
@@ -186,11 +184,15 @@ struct agent_t
 if constexpr (have_items)
 {
 item_type items_loc[items_per_thread];
-
-
-
-
-
+{
+auto items1_in_cm = try_make_cache_modified_iterator<Policy::LOAD_MODIFIER>(items1_in);
+auto items2_in_cm = try_make_cache_modified_iterator<Policy::LOAD_MODIFIER>(items2_in);
+merge_sort::gmem_to_reg<threads_per_block, IsFullTile>(
+items_loc, items1_in_cm + keys1_beg, items2_in_cm + keys2_beg, keys1_count_tile, keys2_count_tile);
+__syncthreads(); // block_store_keys above uses SMEM, so make sure all threads are done before we write to it
+merge_sort::reg_to_shared<threads_per_block>(&storage.items_shared[0], items_loc);
+__syncthreads();
+}

 // gather items from shared mem
 _CCCL_PRAGMA_UNROLL_FULL()
@@ -66,9 +66,28 @@ struct AgentMergeSortPolicy
 static constexpr cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;
 };

+#if defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
 namespace detail
 {
-
+// Only define this when needed.
+// Because of overload woes, this depends on C++20 concepts. util_device.h checks that concepts are available when
+// either runtime policies or PTX JSON information are enabled, so if they are, this is always valid. The generic
+// version is always defined, and that's the only one needed for regular CUB operations.
+//
+// TODO: enable this unconditionally once concepts are always available
+CUB_DETAIL_POLICY_WRAPPER_DEFINE(
+MergeSortAgentPolicy,
+(GenericAgentPolicy),
+(BLOCK_THREADS, BlockThreads, int),
+(ITEMS_PER_THREAD, ItemsPerThread, int),
+(ITEMS_PER_TILE, ItemsPerTile, int),
+(LOAD_ALGORITHM, LoadAlgorithm, cub::BlockLoadAlgorithm),
+(LOAD_MODIFIER, LoadModifier, cub::CacheLoadModifier),
+(STORE_ALGORITHM, StoreAlgorithm, cub::BlockStoreAlgorithm))
+} // namespace detail
+#endif // defined(CUB_DEFINE_RUNTIME_POLICIES
+
+namespace detail::merge_sort
 {

 template <typename Policy,
@@ -724,7 +743,6 @@ struct AgentMerge
 }
 };

-} // namespace merge_sort
-} // namespace detail
+} // namespace detail::merge_sort

 CUB_NAMESPACE_END
@@ -146,9 +146,7 @@ CUB_DETAIL_POLICY_WRAPPER_DEFINE(
  * Thread block abstractions
  ******************************************************************************/

-namespace detail
-{
-namespace radix_sort
+namespace detail::radix_sort
 {

 /**
@@ -783,7 +781,6 @@ struct AgentRadixSortDownsweep
 }
 };

-} // namespace radix_sort
-} // namespace detail
+} // namespace detail::radix_sort

 CUB_NAMESPACE_END
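Note: the hunk above and many of the hunks that follow apply the same mechanical change — a namespace opened as "namespace detail { namespace radix_sort {" (or rle, select, scan_by_key, and so on) is collapsed into a C++17 nested namespace definition. Both spellings open the same namespace; the sketch below is illustrative only (ExampleA and ExampleB are not CUB types), not code taken from this diff.

// Pre-C++17 spelling, as removed by these hunks:
namespace detail
{
namespace radix_sort
{
struct ExampleA
{};
} // namespace radix_sort
} // namespace detail

// C++17 nested namespace definition, as added; it reopens the very same namespace:
namespace detail::radix_sort
{
struct ExampleB
{};
} // namespace detail::radix_sort

// Both illustrative types resolve inside detail::radix_sort:
static_assert(sizeof(detail::radix_sort::ExampleA) == sizeof(detail::radix_sort::ExampleB), "same namespace");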
@@ -85,9 +85,7 @@ struct AgentRadixSortExclusiveSumPolicy
 };
 };

-namespace detail
-{
-namespace radix_sort
+namespace detail::radix_sort
 {

 template <typename AgentRadixSortHistogramPolicy,
@@ -283,7 +281,6 @@ struct AgentRadixSortHistogram
 }
 };

-} // namespace radix_sort
-} // namespace detail
+} // namespace detail::radix_sort

 CUB_NAMESPACE_END
@@ -100,9 +100,7 @@ struct AgentRadixSortOnesweepPolicy : ScalingType
 static constexpr RadixSortStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;
 };

-namespace detail
-{
-namespace radix_sort
+namespace detail::radix_sort
 {

 template <typename AgentRadixSortOnesweepPolicy,
@@ -700,7 +698,6 @@ struct AgentRadixSortOnesweep
 }
 };

-} // namespace radix_sort
-} // namespace detail
+} // namespace detail::radix_sort

 CUB_NAMESPACE_END
@@ -103,9 +103,7 @@ struct AgentRadixSortUpsweepPolicy : ScalingType
 * Thread block abstractions
 ******************************************************************************/

-namespace detail
-{
-namespace radix_sort
+namespace detail::radix_sort
 {

 /**
@@ -552,7 +550,6 @@ struct AgentRadixSortUpsweep
 }
 };

-} // namespace radix_sort
-} // namespace detail
+} // namespace detail::radix_sort

 CUB_NAMESPACE_END
@@ -134,9 +134,7 @@ struct AgentRlePolicy
 * Thread block abstractions
 ******************************************************************************/

-namespace detail
-{
-namespace rle
+namespace detail::rle
 {

 /**
@@ -1121,7 +1119,6 @@ struct AgentRle
 }
 };

-} // namespace rle
-} // namespace detail
+} // namespace detail::rle

 CUB_NAMESPACE_END
@@ -51,6 +51,10 @@
 #include <cub/iterator/cache_modified_input_iterator.cuh>
 #include <cub/util_device.cuh>

+#if defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
+# include <cub/agent/agent_unique_by_key.cuh> // for UniqueByKeyAgentPolicy
+#endif
+
 #include <cuda/std/__type_traits/conditional.h>
 #include <cuda/std/__type_traits/is_pointer.h>
 #include <cuda/std/__type_traits/is_same.h>
@@ -123,7 +127,7 @@ namespace detail
 // TODO: enable this unconditionally once concepts are always available
 CUB_DETAIL_POLICY_WRAPPER_DEFINE(
 ScanAgentPolicy,
-(
+(UniqueByKeyAgentPolicy),
 (BLOCK_THREADS, BlockThreads, int),
 (ITEMS_PER_THREAD, ItemsPerThread, int),
 (LOAD_ALGORITHM, LoadAlgorithm, cub::BlockLoadAlgorithm),
@@ -96,9 +96,7 @@ struct AgentScanByKeyPolicy
 * Thread block abstractions
 ******************************************************************************/

-namespace detail
-{
-namespace scan_by_key
+namespace detail::scan_by_key
 {

 /**
@@ -471,7 +469,6 @@ struct AgentScanByKey
 }
 };

-} // namespace scan_by_key
-} // namespace detail
+} // namespace detail::scan_by_key

 CUB_NAMESPACE_END
@@ -45,9 +45,7 @@

 CUB_NAMESPACE_BEGIN

-namespace detail
-{
-namespace radix_sort
+namespace detail::radix_sort
 {

 /**
@@ -286,7 +284,6 @@ struct AgentSegmentedRadixSort
 }
 };

-} // namespace radix_sort
-} // namespace detail
+} // namespace detail::radix_sort

 CUB_NAMESPACE_END
@@ -126,9 +126,7 @@ struct AgentSelectIfPolicy
 * Thread block abstractions
 ******************************************************************************/

-namespace detail
-{
-namespace select
+namespace detail::select
 {

 template <typename EqualityOpT>
@@ -1114,7 +1112,6 @@ struct AgentSelectIf
 }
 };

-} // namespace select
-} // namespace detail
+} // namespace detail::select

 CUB_NAMESPACE_END
@@ -84,9 +84,7 @@ CUB_DETAIL_POLICY_WRAPPER_DEFINE(
 } // namespace detail
 #endif // defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)

-namespace detail
-{
-namespace sub_warp_merge_sort
+namespace detail::sub_warp_merge_sort
 {

 /**
@@ -343,7 +341,6 @@ private:
 }
 };

-} // namespace sub_warp_merge_sort
-} // namespace detail
+} // namespace detail::sub_warp_merge_sort

 CUB_NAMESPACE_END
@@ -91,9 +91,7 @@ CUB_DETAIL_POLICY_WRAPPER_DEFINE(
 } // namespace detail
 #endif // defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)

-namespace detail
-{
-namespace three_way_partition
+namespace detail::three_way_partition
 {

 template <class OffsetT>
@@ -603,7 +601,6 @@ struct AgentThreeWayPartition
 }
 };

-} // namespace three_way_partition
-} // namespace detail
+} // namespace detail::three_way_partition

 CUB_NAMESPACE_END
@@ -85,13 +85,31 @@ struct AgentUniqueByKeyPolicy
 };
 };

+#if defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
+namespace detail
+{
+// Only define this when needed.
+// Because of overload woes, this depends on C++20 concepts. util_device.h checks that concepts are available when
+// either runtime policies or PTX JSON information are enabled, so if they are, this is always valid. The generic
+// version is always defined, and that's the only one needed for regular CUB operations.
+//
+// TODO: enable this unconditionally once concepts are always available
+CUB_DETAIL_POLICY_WRAPPER_DEFINE(
+UniqueByKeyAgentPolicy,
+(GenericAgentPolicy),
+(BLOCK_THREADS, BlockThreads, int),
+(ITEMS_PER_THREAD, ItemsPerThread, int),
+(LOAD_ALGORITHM, LoadAlgorithm, cub::BlockLoadAlgorithm),
+(LOAD_MODIFIER, LoadModifier, cub::CacheLoadModifier),
+(SCAN_ALGORITHM, ScanAlgorithm, cub::BlockScanAlgorithm))
+} // namespace detail
+#endif // defined(CUB_DEFINE_RUNTIME_POLICIES) || defined(CUB_ENABLE_POLICY_PTX_JSON)
+
 /******************************************************************************
 * Thread block abstractions
 ******************************************************************************/

-namespace detail
-{
-namespace unique_by_key
+namespace detail::unique_by_key
 {

 /**
@@ -608,7 +626,6 @@ struct AgentUniqueByKey
 }
 };

-} // namespace unique_by_key
-} // namespace detail
+} // namespace detail::unique_by_key

 CUB_NAMESPACE_END
@@ -111,10 +111,9 @@ CUB_NAMESPACE_BEGIN
 //! // Collectively compute adjacent_difference
 //! int result[4];
 //!
-//! BlockAdjacentDifferenceT(temp_storage).SubtractLeft(
-//!
-//!
-//! CustomDifference());
+//! BlockAdjacentDifferenceT(temp_storage).SubtractLeft(thread_data, result,
+//! CustomDifference());
+//! }
 //!
 //! Suppose the set of input `thread_data` across the block of threads is
 //! ``{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }``.
@@ -283,10 +282,9 @@ public:
 //! ...
 //!
 //! // Collectively compute adjacent_difference
-//! BlockAdjacentDifferenceT(temp_storage).SubtractLeft(
-//!
-//!
-//! CustomDifference());
+//! BlockAdjacentDifferenceT(temp_storage).SubtractLeft(thread_data, thread_data,
+//! CustomDifference());
+//! }
 //!
 //! Suppose the set of input ``thread_data`` across the block of threads is
 //! ``{ [4,2,1,1], [1,1,1,1], [2,3,3,3], [3,4,1,4], ... }``.
@@ -96,6 +96,7 @@ CUB_NAMESPACE_BEGIN
 //! // Collectively compute head flags for discontinuities in the segment
 //! int head_flags[4];
 //! BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
+//! }
 //!
 //! Suppose the set of input ``thread_data`` across the block of threads is
 //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }``.
@@ -387,6 +388,7 @@ public:
 //! // Collectively compute head flags for discontinuities in the segment
 //! int head_flags[4];
 //! BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
+//! }
 //!
 //! Suppose the set of input ``thread_data`` across the block of threads is
 //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }``.
@@ -463,8 +465,9 @@ public:
 //!
 //! // Collectively compute head flags for discontinuities in the segment
 //! int head_flags[4];
-//! BlockDiscontinuity(temp_storage).FlagHeads(
-//!
+//! BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data,
+//! cub::Inequality(), tile_predecessor_item);
+//! }
 //!
 //! Suppose the set of input ``thread_data`` across the block of threads is
 //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }``,
@@ -549,6 +552,7 @@ public:
 //! // Collectively compute tail flags for discontinuities in the segment
 //! int tail_flags[4];
 //! BlockDiscontinuity(temp_storage).FlagTails(tail_flags, thread_data, cub::Inequality());
+//! }
 //!
 //! Suppose the set of input ``thread_data`` across the block of threads is
 //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``.
@@ -640,8 +644,9 @@ public:
 //!
 //! // Collectively compute tail flags for discontinuities in the segment
 //! int tail_flags[4];
-//! BlockDiscontinuity(temp_storage).FlagTails(
-//!
+//! BlockDiscontinuity(temp_storage).FlagTails(tail_flags, thread_data,
+//! cub::Inequality(), tile_successor_item);
+//! }
 //!
 //! Suppose the set of input ``thread_data`` across the block of threads is
 //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``
@@ -742,8 +747,9 @@ public:
 //! // Collectively compute head and flags for discontinuities in the segment
 //! int head_flags[4];
 //! int tail_flags[4];
-//! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
-//!
+//! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(head_flags, tail_flags, thread_data,
+//! cub::Inequality());
+//! }
 //!
 //! Suppose the set of input ``thread_data`` across the block of threads is
 //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``
@@ -864,8 +870,10 @@ public:
 //! // Collectively compute head and flags for discontinuities in the segment
 //! int head_flags[4];
 //! int tail_flags[4];
-//! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
-//!
+//! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(head_flags, tail_flags,
+//! tile_successor_item, thread_data,
+//! cub::Inequality());
+//! }
 //!
 //! Suppose the set of input ``thread_data`` across the block of threads is
 //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``
@@ -997,9 +1005,10 @@ public:
 //! // Collectively compute head and flags for discontinuities in the segment
 //! int head_flags[4];
 //! int tail_flags[4];
-//! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
-//!
-//!
+//! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(head_flags, tile_predecessor_item,
+//! tail_flags, tile_successor_item,
+//! thread_data, cub::Inequality());
+//! }
 //!
 //! Suppose the set of input ``thread_data`` across the block of threads is
 //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``,
@@ -1126,9 +1135,10 @@ public:
 //! // Collectively compute head and flags for discontinuities in the segment
 //! int head_flags[4];
 //! int tail_flags[4];
-//! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
-//!
-//!
+//! BlockDiscontinuity(temp_storage).FlagHeadsAndTails(head_flags, tile_predecessor_item,
+//! tail_flags, tile_successor_item,
+//! thread_data, cub::Inequality());
+//! }
 //!
 //! Suppose the set of input ``thread_data`` across the block of threads is
 //! ``{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }``,
@@ -101,6 +101,7 @@ CUB_NAMESPACE_BEGIN
 //!
 //! // Collectively exchange data into a blocked arrangement across threads
 //! BlockExchange(temp_storage).StripedToBlocked(thread_data);
+//! }
 //!
 //! Suppose the set of striped input ``thread_data`` across the block of threads is ``{ [0,128,256,384],
 //! [1,129,257,385], ..., [127,255,383,511] }``. The corresponding output ``thread_data`` in those threads will be
@@ -883,6 +884,7 @@ public:
 //!
 //! // Collectively exchange data into a blocked arrangement across threads
 //! BlockExchange(temp_storage).StripedToBlocked(thread_data, thread_data);
+//! }
 //!
 //! Suppose the set of striped input ``thread_data`` across the block of threads is ``{ [0,128,256,384],
 //! [1,129,257,385], ..., [127,255,383,511] }`` after loading from device-accessible memory. The corresponding output
@@ -933,6 +935,7 @@ public:
 //!
 //! // Store data striped across block threads into an ordered tile
 //! cub::StoreDirectStriped<STORE_DEFAULT, 128>(threadIdx.x, d_data, thread_data);
+//! }
 //!
 //! Suppose the set of blocked input ``thread_data`` across the block of threads is ``{ [0,1,2,3], [4,5,6,7],
 //! [8,9,10,11], ..., [508,509,510,511] }``. The corresponding output ``thread_data`` in those threads will be
@@ -983,6 +986,7 @@ public:
 //!
 //! // Collectively exchange data into a blocked arrangement across threads
 //! BlockExchange(temp_storage).WarpStripedToBlocked(thread_data);
+//! }
 //!
 //! Suppose the set of warp-striped input ``thread_data`` across the block of threads is ``{ [0,32,64,96],
 //! [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }`` after loading from device-accessible memory. (The first 128
@@ -1037,6 +1041,7 @@ public:
 //!
 //! // Store data striped across warp threads into an ordered tile
 //! cub::StoreDirectStriped<STORE_DEFAULT, 128>(threadIdx.x, d_data, thread_data);
+//! }
 //!
 //! Suppose the set of blocked input ``thread_data`` across the block of threads is ``{ [0,1,2,3], [4,5,6,7],
 //! [8,9,10,11], ..., [508,509,510,511] }``. The corresponding output ``thread_data`` in those threads will be
@@ -140,6 +140,7 @@ enum BlockHistogramAlgorithm
 //!
 //! // Compute the block-wide histogram
 //! BlockHistogram(temp_storage).Histogram(data, smem_histogram);
+//! }
 //!
 //! Performance and Usage Considerations
 //! +++++++++++++++++++++++++++++++++++++++++++++
@@ -281,6 +282,7 @@ public:
 //!
 //! // Update the block-wide histogram
 //! BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram);
+//! }
 //!
 //! @endrst
 //!
@@ -338,6 +340,7 @@ public:
 //!
 //! // Compute the block-wide histogram
 //! BlockHistogram(temp_storage).Histogram(thread_samples, smem_histogram);
+//! }
 //!
 //! @endrst
 //!
@@ -399,6 +402,7 @@ public:
 //!
 //! // Update the block-wide histogram
 //! BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram);
+//! }
 //!
 //! @endrst
 //!
@@ -771,6 +771,7 @@ enum BlockLoadAlgorithm
 //! // Load a segment of consecutive items that are blocked across threads
 //! int thread_data[4];
 //! BlockLoad(temp_storage).Load(d_data, thread_data);
+//! }
 //!
 //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``. The set of ``thread_data`` across the block of threads in
 //! those threads will be ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``.
@@ -1123,6 +1124,7 @@ public:
 //! // Load a segment of consecutive items that are blocked across threads
 //! int thread_data[4];
 //! BlockLoad(temp_storage).Load(d_data, thread_data);
+//! }
 //!
 //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, ...``. The set of ``thread_data`` across the block of threads
 //! in those threads will be ``{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }``.
@@ -1170,6 +1172,7 @@ public:
 //! // Load a segment of consecutive items that are blocked across threads
 //! int thread_data[4];
 //! BlockLoad(temp_storage).Load(d_data, thread_data, block_items_end);
+//! }
 //!
 //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, 6...`` and ``block_items_end`` is ``5``. The set of
 //! ``thread_data`` across the block of threads in those threads will be ``{ [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }``,
@@ -1222,6 +1225,7 @@ public:
 //! // Load a segment of consecutive items that are blocked across threads
 //! int thread_data[4];
 //! BlockLoad(temp_storage).Load(d_data, thread_data, block_items_end, -1);
+//! }
 //!
 //! Suppose the input ``d_data`` is ``0, 1, 2, 3, 4, 5, 6...``, ``block_items_end`` is ``5``, and the out-of-bounds
 //! default is ``-1``. The set of ``thread_data`` across the block of threads in those threads will be
@@ -50,6 +50,7 @@

 #include <cuda/__ptx/instructions/get_sreg.h>
 #include <cuda/std/__algorithm/max.h>
+#include <cuda/std/__bit/integral.h>
 #include <cuda/std/__functional/operations.h>
 #include <cuda/std/__type_traits/conditional.h>
 #include <cuda/std/__type_traits/is_same.h>
@@ -168,6 +169,7 @@ struct warp_in_block_matcher_t<Bits, 0, PartialWarpId>
 //! block_radix_rank(temp_storage).RankKeys(keys, ranks, extractor);
 //!
 //! ...
+//! }
 //!
 //! Suppose the set of input ``keys`` across the block of threads is ``{ [16,10], [9,11] }``.
 //! The corresponding output ``ranks`` in those threads will be ``{ [3,1], [0,2] }``.
@@ -1072,7 +1074,7 @@ struct BlockRadixRankMatchEarlyCounts
 atomicOr(p_match_mask, lane_mask);
 __syncwarp(WARP_MASK);
 int bin_mask = *p_match_mask;
-int leader = (
+int leader = ::cuda::std::__bit_log2(static_cast<unsigned>(bin_mask));
 int warp_offset = 0;
 int popc = __popc(bin_mask & ::cuda::ptx::get_sreg_lanemask_le());
 if (lane == leader)
@@ -1102,7 +1104,7 @@ struct BlockRadixRankMatchEarlyCounts
 ::cuda::std::uint32_t bin = Digit(keys[u]);
 int bin_mask =
 detail::warp_in_block_matcher_t<RADIX_BITS, PARTIAL_WARP_THREADS, BLOCK_WARPS - 1>::match_any(bin, warp);
-int leader = (
+int leader = ::cuda::std::__bit_log2(static_cast<unsigned>(bin_mask));
 int warp_offset = 0;
 int popc = __popc(bin_mask & ::cuda::ptx::get_sreg_lanemask_le());
 if (lane == leader)
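Note: the last two hunks replace the leader computation (truncated in this rendering) with ::cuda::std::__bit_log2(static_cast<unsigned>(bin_mask)), i.e. the leader lane is the index of the highest set bit in the warp match mask. The host-side sketch below restates that idea with the standard <bit> header; __bit_log2 is a libcu++ internal, so std::bit_width stands in for it here. This is an illustration under that assumption, not code from the diff.

#include <bit>
#include <cassert>
#include <cstdint>

// For a non-zero mask of lanes that produced the same radix digit, pick the
// highest-numbered lane in the mask as the leader: floor(log2(mask)).
int leader_lane(std::uint32_t bin_mask)
{
  assert(bin_mask != 0);
  return std::bit_width(bin_mask) - 1; // floor-log2, the value __bit_log2 is expected to compute
}

int main()
{
  // Lanes 1, 4 and 9 matched; lane 9 becomes the leader.
  assert(leader_lane(0b1000010010u) == 9);
  return 0;
}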