cuda-cccl 0.3.0__cp310-cp310-manylinux_2_24_aarch64.whl → 0.3.2__cp310-cp310-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/cooperative/__init__.py +7 -1
- cuda/cccl/cooperative/experimental/__init__.py +21 -5
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
- cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
- cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
- cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
- cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +4 -2
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
- cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
- cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
- cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
- cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
- cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +170 -260
- cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +6 -7
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +12 -29
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +2 -7
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
- cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
- cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
- cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
- cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
- cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
- cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
- cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
- cuda/cccl/headers/include/cuda/__cccl_config +1 -0
- cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
- cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
- cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
- cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
- cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +247 -323
- cuda/cccl/headers/include/cuda/__device/attributes.h +174 -123
- cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
- cuda/cccl/headers/include/cuda/__device/device_ref.h +27 -49
- cuda/cccl/headers/include/cuda/__device/physical_device.h +100 -96
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
- cuda/cccl/headers/include/cuda/__event/event.h +27 -26
- cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
- cuda/cccl/headers/include/cuda/__event/timed_event.h +10 -7
- cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
- cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
- cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
- cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
- cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +21 -137
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
- cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
- cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
- cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
- cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
- cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
- cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
- cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +9 -7
- cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -16
- cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
- cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
- cuda/cccl/headers/include/cuda/algorithm +1 -1
- cuda/cccl/headers/include/cuda/cmath +1 -0
- cuda/cccl/headers/include/cuda/devices +13 -0
- cuda/cccl/headers/include/cuda/iterator +1 -0
- cuda/cccl/headers/include/cuda/memory +1 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
- cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
- cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
- cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
- cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
- cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
- cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
- cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
- cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
- cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
- cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
- cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
- cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
- cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
- cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
- cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
- cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
- cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
- cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
- cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
- cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
- cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
- cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
- cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
- cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
- cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
- cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
- cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
- cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
- cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
- cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
- cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
- cuda/cccl/headers/include/cuda/std/bitset +1 -1
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
- cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
- cuda/cccl/headers/include/cuda/std/numbers +5 -0
- cuda/cccl/headers/include/cuda/std/string_view +155 -13
- cuda/cccl/headers/include/cuda/std/version +1 -4
- cuda/cccl/headers/include/cuda/stream_ref +5 -0
- cuda/cccl/headers/include/cuda/utility +1 -0
- cuda/cccl/headers/include/nv/target +7 -2
- cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
- cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
- cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
- cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
- cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
- cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
- cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
- cuda/cccl/headers/include/thrust/device_delete.h +18 -3
- cuda/cccl/headers/include/thrust/device_free.h +16 -3
- cuda/cccl/headers/include/thrust/device_new.h +29 -8
- cuda/cccl/headers/include/thrust/host_vector.h +1 -1
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
- cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
- cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
- cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
- cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
- cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
- cuda/cccl/parallel/experimental/__init__.py +21 -74
- cuda/compute/__init__.py +79 -0
- cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +43 -1
- cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +157 -8
- cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
- cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +112 -40
- cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +36 -15
- cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
- cuda/compute/cu12/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/cu13/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +2 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +36 -8
- cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +206 -1
- cuda/{cccl/parallel/experimental → compute}/numba_utils.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/typing.py +2 -0
- cuda/coop/__init__.py +8 -0
- cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
- cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
- cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
- cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
- cuda/coop/warp/__init__.py +9 -0
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +275 -276
- cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
- cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
- cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
- cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
- cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
- cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
- cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
- cuda/cccl/parallel/experimental/.gitignore +0 -4
- cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-310-aarch64-linux-gnu.so +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
|
|
5
5
|
# Python signatures are declared in the companion Python stub file _bindings.pyi
|
|
6
6
|
# Make sure to update PYI with change to Python API to ensure that Python
|
|
7
|
-
# static type checker tools like mypy green-lights cuda.
|
|
7
|
+
# static type checker tools like mypy green-light cuda.compute
|
|
8
8
|
|
|
9
9
|
from libc.string cimport memset, memcpy
|
|
10
10
|
from libc.stdint cimport uint8_t, uint32_t, uint64_t, int64_t, uintptr_t
|
|
@@ -120,6 +120,10 @@ cdef extern from "cccl/c/types.h":
|
|
|
120
120
|
ASCENDING "CCCL_ASCENDING"
|
|
121
121
|
DESCENDING "CCCL_DESCENDING"
|
|
122
122
|
|
|
123
|
+
cpdef enum cccl_init_kind_t:
|
|
124
|
+
VALUE_INIT "CCCL_VALUE_INIT"
|
|
125
|
+
FUTURE_VALUE_INIT "CCCL_FUTURE_VALUE_INIT"
|
|
126
|
+
NO_INIT "CCCL_NO_INIT"
|
|
123
127
|
|
|
124
128
|
cdef void arg_type_check(
|
|
125
129
|
str arg_name,
|
|
@@ -136,6 +140,7 @@ OpKind = cccl_op_kind_t
|
|
|
136
140
|
TypeEnum = cccl_type_enum
|
|
137
141
|
IteratorKind = cccl_iterator_kind_t
|
|
138
142
|
SortOrder = cccl_sort_order_t
|
|
143
|
+
InitKind = cccl_init_kind_t
|
|
139
144
|
|
|
140
145
|
cdef void _validate_alignment(int alignment) except *:
|
|
141
146
|
"""
|
|
@@ -724,6 +729,11 @@ cdef class Iterator:
|
|
|
724
729
|
else:
|
|
725
730
|
return IteratorKind.ITERATOR
|
|
726
731
|
|
|
732
|
+
@property
|
|
733
|
+
def value_type(self):
|
|
734
|
+
cdef cccl_type_info type_info = self.iter_data.value_type
|
|
735
|
+
return TypeInfo(type_info.size, type_info.alignment, type_info.type)
|
|
736
|
+
|
|
727
737
|
def is_kind_pointer(self):
|
|
728
738
|
cdef cccl_iterator_kind_t it_kind = self.iter_data.type
|
|
729
739
|
return (it_kind == cccl_iterator_kind_t.POINTER)
|
|
@@ -947,8 +957,9 @@ cdef extern from "cccl/c/scan.h":
|
|
|
947
957
|
cccl_iterator_t,
|
|
948
958
|
cccl_iterator_t,
|
|
949
959
|
cccl_op_t,
|
|
950
|
-
|
|
960
|
+
cccl_type_info,
|
|
951
961
|
_Bool,
|
|
962
|
+
cccl_init_kind_t,
|
|
952
963
|
int, int, const char*, const char*, const char*, const char*
|
|
953
964
|
) nogil
|
|
954
965
|
|
|
@@ -976,6 +987,41 @@ cdef extern from "cccl/c/scan.h":
|
|
|
976
987
|
CUstream
|
|
977
988
|
) nogil
|
|
978
989
|
|
|
990
|
+
cdef CUresult cccl_device_exclusive_scan_future_value(
|
|
991
|
+
cccl_device_scan_build_result_t,
|
|
992
|
+
void *,
|
|
993
|
+
size_t *,
|
|
994
|
+
cccl_iterator_t,
|
|
995
|
+
cccl_iterator_t,
|
|
996
|
+
uint64_t,
|
|
997
|
+
cccl_op_t,
|
|
998
|
+
cccl_iterator_t,
|
|
999
|
+
CUstream
|
|
1000
|
+
) nogil
|
|
1001
|
+
|
|
1002
|
+
cdef CUresult cccl_device_inclusive_scan_future_value(
|
|
1003
|
+
cccl_device_scan_build_result_t,
|
|
1004
|
+
void *,
|
|
1005
|
+
size_t *,
|
|
1006
|
+
cccl_iterator_t,
|
|
1007
|
+
cccl_iterator_t,
|
|
1008
|
+
uint64_t,
|
|
1009
|
+
cccl_op_t,
|
|
1010
|
+
cccl_iterator_t,
|
|
1011
|
+
CUstream
|
|
1012
|
+
) nogil
|
|
1013
|
+
|
|
1014
|
+
cdef CUresult cccl_device_inclusive_scan_no_init(
|
|
1015
|
+
cccl_device_scan_build_result_t,
|
|
1016
|
+
void *,
|
|
1017
|
+
size_t *,
|
|
1018
|
+
cccl_iterator_t,
|
|
1019
|
+
cccl_iterator_t,
|
|
1020
|
+
uint64_t,
|
|
1021
|
+
cccl_op_t,
|
|
1022
|
+
CUstream
|
|
1023
|
+
) nogil
|
|
1024
|
+
|
|
979
1025
|
cdef CUresult cccl_device_scan_cleanup(
|
|
980
1026
|
cccl_device_scan_build_result_t*
|
|
981
1027
|
) nogil
|
|
@@ -989,8 +1035,9 @@ cdef class DeviceScanBuildResult:
|
|
|
989
1035
|
Iterator d_in,
|
|
990
1036
|
Iterator d_out,
|
|
991
1037
|
Op op,
|
|
992
|
-
|
|
1038
|
+
TypeInfo init_type,
|
|
993
1039
|
bint force_inclusive,
|
|
1040
|
+
cccl_init_kind_t init_kind,
|
|
994
1041
|
CommonData common_data
|
|
995
1042
|
):
|
|
996
1043
|
cdef CUresult status = -1
|
|
@@ -1008,8 +1055,9 @@ cdef class DeviceScanBuildResult:
|
|
|
1008
1055
|
d_in.iter_data,
|
|
1009
1056
|
d_out.iter_data,
|
|
1010
1057
|
op.op_data,
|
|
1011
|
-
|
|
1058
|
+
init_type.type_info,
|
|
1012
1059
|
force_inclusive,
|
|
1060
|
+
init_kind,
|
|
1013
1061
|
cc_major,
|
|
1014
1062
|
cc_minor,
|
|
1015
1063
|
cub_path,
|
|
@@ -1035,7 +1083,7 @@ cdef class DeviceScanBuildResult:
|
|
|
1035
1083
|
Iterator d_out,
|
|
1036
1084
|
size_t num_items,
|
|
1037
1085
|
Op op,
|
|
1038
|
-
Value
|
|
1086
|
+
Value init_value,
|
|
1039
1087
|
stream
|
|
1040
1088
|
):
|
|
1041
1089
|
cdef CUresult status = -1
|
|
@@ -1052,7 +1100,7 @@ cdef class DeviceScanBuildResult:
|
|
|
1052
1100
|
d_out.iter_data,
|
|
1053
1101
|
<uint64_t>num_items,
|
|
1054
1102
|
op.op_data,
|
|
1055
|
-
|
|
1103
|
+
init_value.value_data,
|
|
1056
1104
|
c_stream
|
|
1057
1105
|
)
|
|
1058
1106
|
if status != 0:
|
|
@@ -1069,7 +1117,7 @@ cdef class DeviceScanBuildResult:
|
|
|
1069
1117
|
Iterator d_out,
|
|
1070
1118
|
size_t num_items,
|
|
1071
1119
|
Op op,
|
|
1072
|
-
Value
|
|
1120
|
+
Value init_value,
|
|
1073
1121
|
stream
|
|
1074
1122
|
):
|
|
1075
1123
|
cdef CUresult status = -1
|
|
@@ -1086,7 +1134,7 @@ cdef class DeviceScanBuildResult:
|
|
|
1086
1134
|
d_out.iter_data,
|
|
1087
1135
|
<uint64_t>num_items,
|
|
1088
1136
|
op.op_data,
|
|
1089
|
-
|
|
1137
|
+
init_value.value_data,
|
|
1090
1138
|
c_stream
|
|
1091
1139
|
)
|
|
1092
1140
|
if status != 0:
|
|
@@ -1095,6 +1143,107 @@ cdef class DeviceScanBuildResult:
|
|
|
1095
1143
|
)
|
|
1096
1144
|
return storage_sz
|
|
1097
1145
|
|
|
1146
|
+
cpdef int compute_inclusive_future_value(
|
|
1147
|
+
DeviceScanBuildResult self,
|
|
1148
|
+
temp_storage_ptr,
|
|
1149
|
+
temp_storage_bytes,
|
|
1150
|
+
Iterator d_in,
|
|
1151
|
+
Iterator d_out,
|
|
1152
|
+
size_t num_items,
|
|
1153
|
+
Op op,
|
|
1154
|
+
Iterator init_value,
|
|
1155
|
+
stream
|
|
1156
|
+
):
|
|
1157
|
+
cdef CUresult status = -1
|
|
1158
|
+
cdef void *storage_ptr = (<void *><uintptr_t>temp_storage_ptr) if temp_storage_ptr else NULL
|
|
1159
|
+
cdef size_t storage_sz = <size_t>temp_storage_bytes
|
|
1160
|
+
cdef CUstream c_stream = <CUstream><uintptr_t>(stream) if stream else NULL
|
|
1161
|
+
|
|
1162
|
+
with nogil:
|
|
1163
|
+
status = cccl_device_inclusive_scan_future_value(
|
|
1164
|
+
self.build_data,
|
|
1165
|
+
storage_ptr,
|
|
1166
|
+
&storage_sz,
|
|
1167
|
+
d_in.iter_data,
|
|
1168
|
+
d_out.iter_data,
|
|
1169
|
+
<uint64_t>num_items,
|
|
1170
|
+
op.op_data,
|
|
1171
|
+
init_value.iter_data,
|
|
1172
|
+
c_stream
|
|
1173
|
+
)
|
|
1174
|
+
if status != 0:
|
|
1175
|
+
raise RuntimeError(
|
|
1176
|
+
f"Failed executing inclusive scan, error code: {status}"
|
|
1177
|
+
)
|
|
1178
|
+
return storage_sz
|
|
1179
|
+
|
|
1180
|
+
cpdef int compute_exclusive_future_value(
|
|
1181
|
+
DeviceScanBuildResult self,
|
|
1182
|
+
temp_storage_ptr,
|
|
1183
|
+
temp_storage_bytes,
|
|
1184
|
+
Iterator d_in,
|
|
1185
|
+
Iterator d_out,
|
|
1186
|
+
size_t num_items,
|
|
1187
|
+
Op op,
|
|
1188
|
+
Iterator init_value,
|
|
1189
|
+
stream
|
|
1190
|
+
):
|
|
1191
|
+
cdef CUresult status = -1
|
|
1192
|
+
cdef void *storage_ptr = (<void *><uintptr_t>temp_storage_ptr) if temp_storage_ptr else NULL
|
|
1193
|
+
cdef size_t storage_sz = <size_t>temp_storage_bytes
|
|
1194
|
+
cdef CUstream c_stream = <CUstream><uintptr_t>(stream) if stream else NULL
|
|
1195
|
+
|
|
1196
|
+
with nogil:
|
|
1197
|
+
status = cccl_device_exclusive_scan_future_value(
|
|
1198
|
+
self.build_data,
|
|
1199
|
+
storage_ptr,
|
|
1200
|
+
&storage_sz,
|
|
1201
|
+
d_in.iter_data,
|
|
1202
|
+
d_out.iter_data,
|
|
1203
|
+
<uint64_t>num_items,
|
|
1204
|
+
op.op_data,
|
|
1205
|
+
init_value.iter_data,
|
|
1206
|
+
c_stream
|
|
1207
|
+
)
|
|
1208
|
+
if status != 0:
|
|
1209
|
+
raise RuntimeError(
|
|
1210
|
+
f"Failed executing exclusive scan, error code: {status}"
|
|
1211
|
+
)
|
|
1212
|
+
return storage_sz
|
|
1213
|
+
|
|
1214
|
+
cpdef int compute_inclusive_no_init(
|
|
1215
|
+
DeviceScanBuildResult self,
|
|
1216
|
+
temp_storage_ptr,
|
|
1217
|
+
temp_storage_bytes,
|
|
1218
|
+
Iterator d_in,
|
|
1219
|
+
Iterator d_out,
|
|
1220
|
+
size_t num_items,
|
|
1221
|
+
Op op,
|
|
1222
|
+
object init_value,
|
|
1223
|
+
stream
|
|
1224
|
+
):
|
|
1225
|
+
cdef CUresult status = -1
|
|
1226
|
+
cdef void *storage_ptr = (<void *><uintptr_t>temp_storage_ptr) if temp_storage_ptr else NULL
|
|
1227
|
+
cdef size_t storage_sz = <size_t>temp_storage_bytes
|
|
1228
|
+
cdef CUstream c_stream = <CUstream><uintptr_t>(stream) if stream else NULL
|
|
1229
|
+
|
|
1230
|
+
with nogil:
|
|
1231
|
+
status = cccl_device_inclusive_scan_no_init(
|
|
1232
|
+
self.build_data,
|
|
1233
|
+
storage_ptr,
|
|
1234
|
+
&storage_sz,
|
|
1235
|
+
d_in.iter_data,
|
|
1236
|
+
d_out.iter_data,
|
|
1237
|
+
<uint64_t>num_items,
|
|
1238
|
+
op.op_data,
|
|
1239
|
+
c_stream
|
|
1240
|
+
)
|
|
1241
|
+
if status != 0:
|
|
1242
|
+
raise RuntimeError(
|
|
1243
|
+
f"Failed executing inclusive scan, error code: {status}"
|
|
1244
|
+
)
|
|
1245
|
+
return storage_sz
|
|
1246
|
+
|
|
1098
1247
|
def _get_cubin(self):
|
|
1099
1248
|
return PyBytes_FromStringAndSize(
|
|
1100
1249
|
<const char*>self.build_data.cubin,
|
|
@@ -148,7 +148,7 @@ def make_histogram_even(
|
|
|
148
148
|
Example:
|
|
149
149
|
Below, ``make_histogram_even`` is used to create a histogram object that can be reused.
|
|
150
150
|
|
|
151
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
151
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/histogram/histogram_object.py
|
|
152
152
|
:language: python
|
|
153
153
|
:start-after: # example-begin
|
|
154
154
|
|
|
@@ -190,7 +190,7 @@ def histogram_even(
|
|
|
190
190
|
Example:
|
|
191
191
|
Below, ``histogram_even`` is used to compute a histogram with evenly-spaced bins.
|
|
192
192
|
|
|
193
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
193
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/histogram/histogram_even_basic.py
|
|
194
194
|
:language: python
|
|
195
195
|
:start-after: # example-begin
|
|
196
196
|
:caption: Basic histogram example.
|
|
@@ -166,7 +166,7 @@ def make_merge_sort(
|
|
|
166
166
|
Example:
|
|
167
167
|
Below, ``make_merge_sort`` is used to create a merge sort object that can be reused.
|
|
168
168
|
|
|
169
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
169
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/sort/merge_sort_object.py
|
|
170
170
|
:language: python
|
|
171
171
|
:start-after: # example-begin
|
|
172
172
|
|
|
@@ -201,7 +201,7 @@ def merge_sort(
|
|
|
201
201
|
Example:
|
|
202
202
|
Below, ``merge_sort`` is used to sort a sequence of keys inplace. It also rearranges the items according to the keys' order.
|
|
203
203
|
|
|
204
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
204
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/sort/merge_sort_basic.py
|
|
205
205
|
:language: python
|
|
206
206
|
:start-after: # example-begin
|
|
207
207
|
|
|
@@ -222,7 +222,7 @@ def make_radix_sort(
|
|
|
222
222
|
Example:
|
|
223
223
|
Below, ``make_radix_sort`` is used to create a radix sort object that can be reused.
|
|
224
224
|
|
|
225
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
225
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/sort/radix_sort_object.py
|
|
226
226
|
:language: python
|
|
227
227
|
:start-after: # example-begin
|
|
228
228
|
|
|
@@ -259,14 +259,14 @@ def radix_sort(
|
|
|
259
259
|
Example:
|
|
260
260
|
Below, ``radix_sort`` is used to sort a sequence of keys. It also rearranges the values according to the keys' order.
|
|
261
261
|
|
|
262
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
262
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/sort/radix_sort_basic.py
|
|
263
263
|
:language: python
|
|
264
264
|
:start-after: # example-begin
|
|
265
265
|
|
|
266
266
|
|
|
267
267
|
In the following example, ``radix_sort`` is used to sort a sequence of keys with a ``DoubleBuffer`` for reduced temporary storage.
|
|
268
268
|
|
|
269
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
269
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/sort/radix_sort_buffer.py
|
|
270
270
|
:language: python
|
|
271
271
|
:start-after: # example-begin
|
|
272
272
|
|
|
@@ -130,7 +130,7 @@ def make_reduce_into(
|
|
|
130
130
|
Example:
|
|
131
131
|
Below, ``make_reduce_into`` is used to create a reduction object that can be reused.
|
|
132
132
|
|
|
133
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
133
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/reduction/reduce_object.py
|
|
134
134
|
:language: python
|
|
135
135
|
:start-after: # example-begin
|
|
136
136
|
|
|
@@ -163,7 +163,7 @@ def reduce_into(
|
|
|
163
163
|
Example:
|
|
164
164
|
Below, ``reduce_into`` is used to compute the sum of a sequence of integers.
|
|
165
165
|
|
|
166
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
166
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/reduction/sum_reduction.py
|
|
167
167
|
:language: python
|
|
168
168
|
:start-after: # example-begin
|
|
169
169
|
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
#
|
|
4
4
|
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
5
5
|
|
|
6
|
-
from typing import Callable, Union
|
|
6
|
+
from typing import Callable, Union, cast
|
|
7
7
|
|
|
8
8
|
import numba
|
|
9
9
|
import numpy as np
|
|
@@ -20,14 +20,27 @@ from ..op import OpKind
|
|
|
20
20
|
from ..typing import DeviceArrayLike, GpuStruct
|
|
21
21
|
|
|
22
22
|
|
|
23
|
+
def get_init_kind(
|
|
24
|
+
init_value: np.ndarray | DeviceArrayLike | GpuStruct | None,
|
|
25
|
+
) -> _bindings.InitKind:
|
|
26
|
+
match init_value:
|
|
27
|
+
case None:
|
|
28
|
+
return _bindings.InitKind.NO_INIT
|
|
29
|
+
case _ if isinstance(init_value, DeviceArrayLike):
|
|
30
|
+
return _bindings.InitKind.FUTURE_VALUE_INIT
|
|
31
|
+
case _:
|
|
32
|
+
return _bindings.InitKind.VALUE_INIT
|
|
33
|
+
|
|
34
|
+
|
|
23
35
|
class _Scan:
|
|
24
36
|
__slots__ = [
|
|
25
37
|
"build_result",
|
|
26
38
|
"d_in_cccl",
|
|
27
39
|
"d_out_cccl",
|
|
28
|
-
"
|
|
40
|
+
"init_value_cccl",
|
|
29
41
|
"op_wrapper",
|
|
30
42
|
"device_scan_fn",
|
|
43
|
+
"init_kind",
|
|
31
44
|
]
|
|
32
45
|
|
|
33
46
|
# TODO: constructor shouldn't require concrete `d_in`, `d_out`:
|
|
@@ -36,36 +49,74 @@ class _Scan:
|
|
|
36
49
|
d_in: DeviceArrayLike | IteratorBase,
|
|
37
50
|
d_out: DeviceArrayLike | IteratorBase,
|
|
38
51
|
op: Callable | OpKind,
|
|
39
|
-
|
|
52
|
+
init_value: np.ndarray | DeviceArrayLike | GpuStruct | None,
|
|
40
53
|
force_inclusive: bool,
|
|
41
54
|
):
|
|
42
55
|
self.d_in_cccl = cccl.to_cccl_input_iter(d_in)
|
|
43
56
|
self.d_out_cccl = cccl.to_cccl_output_iter(d_out)
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
57
|
+
|
|
58
|
+
self.init_kind = get_init_kind(init_value)
|
|
59
|
+
|
|
60
|
+
self.init_value_cccl: _bindings.Iterator | _bindings.Value | None
|
|
61
|
+
|
|
62
|
+
match self.init_kind:
|
|
63
|
+
case _bindings.InitKind.NO_INIT:
|
|
64
|
+
# TODO: we just need to extract the dtype from the input iterator
|
|
65
|
+
if not isinstance(d_in, DeviceArrayLike):
|
|
66
|
+
raise ValueError(
|
|
67
|
+
"No init value not supported for non-DeviceArrayLike input"
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
self.init_value_cccl = None
|
|
71
|
+
value_type = numba.from_dtype(protocols.get_dtype(d_in))
|
|
72
|
+
init_value_type_info = self.d_in_cccl.value_type
|
|
73
|
+
|
|
74
|
+
case _bindings.InitKind.FUTURE_VALUE_INIT:
|
|
75
|
+
self.init_value_cccl = cccl.to_cccl_input_iter(init_value)
|
|
76
|
+
value_type = numba.from_dtype(
|
|
77
|
+
protocols.get_dtype(cast(DeviceArrayLike, init_value))
|
|
78
|
+
)
|
|
79
|
+
init_value_type_info = self.init_value_cccl.value_type
|
|
80
|
+
|
|
81
|
+
case _bindings.InitKind.VALUE_INIT:
|
|
82
|
+
self.init_value_cccl = cccl.to_cccl_value(init_value)
|
|
83
|
+
value_type = (
|
|
84
|
+
numba.from_dtype(init_value.dtype)
|
|
85
|
+
if isinstance(init_value, np.ndarray)
|
|
86
|
+
else numba.typeof(init_value)
|
|
87
|
+
)
|
|
88
|
+
init_value_type_info = self.init_value_cccl.type
|
|
49
89
|
|
|
50
90
|
# For well-known operations, we don't need a signature
|
|
51
91
|
if isinstance(op, OpKind):
|
|
52
92
|
self.op_wrapper = cccl.to_cccl_op(op, None)
|
|
53
93
|
else:
|
|
54
94
|
self.op_wrapper = cccl.to_cccl_op(op, value_type(value_type, value_type))
|
|
95
|
+
|
|
55
96
|
self.build_result = call_build(
|
|
56
97
|
_bindings.DeviceScanBuildResult,
|
|
57
98
|
self.d_in_cccl,
|
|
58
99
|
self.d_out_cccl,
|
|
59
100
|
self.op_wrapper,
|
|
60
|
-
|
|
101
|
+
init_value_type_info,
|
|
61
102
|
force_inclusive,
|
|
103
|
+
self.init_kind,
|
|
62
104
|
)
|
|
63
105
|
|
|
64
|
-
self.
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
106
|
+
match (force_inclusive, self.init_kind):
|
|
107
|
+
case (True, _bindings.InitKind.FUTURE_VALUE_INIT):
|
|
108
|
+
self.device_scan_fn = self.build_result.compute_inclusive_future_value
|
|
109
|
+
case (True, _bindings.InitKind.VALUE_INIT):
|
|
110
|
+
self.device_scan_fn = self.build_result.compute_inclusive
|
|
111
|
+
case (True, _bindings.InitKind.NO_INIT):
|
|
112
|
+
self.device_scan_fn = self.build_result.compute_inclusive_no_init
|
|
113
|
+
|
|
114
|
+
case (False, _bindings.InitKind.FUTURE_VALUE_INIT):
|
|
115
|
+
self.device_scan_fn = self.build_result.compute_exclusive_future_value
|
|
116
|
+
case (False, _bindings.InitKind.VALUE_INIT):
|
|
117
|
+
self.device_scan_fn = self.build_result.compute_exclusive
|
|
118
|
+
case (False, _bindings.InitKind.NO_INIT):
|
|
119
|
+
raise ValueError("Exclusive scan with No init value is not supported")
|
|
69
120
|
|
|
70
121
|
def __call__(
|
|
71
122
|
self,
|
|
@@ -73,13 +124,25 @@ class _Scan:
|
|
|
73
124
|
d_in,
|
|
74
125
|
d_out,
|
|
75
126
|
num_items: int,
|
|
76
|
-
|
|
127
|
+
init_value: np.ndarray | DeviceArrayLike | GpuStruct | None,
|
|
77
128
|
stream=None,
|
|
78
129
|
):
|
|
79
130
|
set_cccl_iterator_state(self.d_in_cccl, d_in)
|
|
80
131
|
set_cccl_iterator_state(self.d_out_cccl, d_out)
|
|
81
132
|
|
|
82
|
-
self.
|
|
133
|
+
match self.init_kind:
|
|
134
|
+
case _bindings.InitKind.FUTURE_VALUE_INIT:
|
|
135
|
+
# We know that the init_value_cccl is an Iterator, so this cast
|
|
136
|
+
# tells MyPy what the actual type is. cast() is a no-op at runtime,
|
|
137
|
+
# which makes it better than isinstance() since this is a hot path
|
|
138
|
+
# and we have to minimize the work we do prior to calling the
|
|
139
|
+
# kernel.
|
|
140
|
+
self.init_value_cccl = cast(_bindings.Iterator, self.init_value_cccl)
|
|
141
|
+
set_cccl_iterator_state(self.init_value_cccl, init_value)
|
|
142
|
+
|
|
143
|
+
case _bindings.InitKind.VALUE_INIT:
|
|
144
|
+
self.init_value_cccl = cast(_bindings.Value, self.init_value_cccl)
|
|
145
|
+
self.init_value_cccl.state = to_cccl_value_state(init_value)
|
|
83
146
|
|
|
84
147
|
stream_handle = validate_and_get_stream(stream)
|
|
85
148
|
|
|
@@ -97,7 +160,7 @@ class _Scan:
|
|
|
97
160
|
self.d_out_cccl,
|
|
98
161
|
num_items,
|
|
99
162
|
self.op_wrapper,
|
|
100
|
-
self.
|
|
163
|
+
self.init_value_cccl,
|
|
101
164
|
stream_handle,
|
|
102
165
|
)
|
|
103
166
|
return temp_storage_bytes
|
|
@@ -107,7 +170,7 @@ def make_cache_key(
|
|
|
107
170
|
d_in: DeviceArrayLike | IteratorBase,
|
|
108
171
|
d_out: DeviceArrayLike | IteratorBase,
|
|
109
172
|
op: Callable | OpKind,
|
|
110
|
-
|
|
173
|
+
init_value: np.ndarray | DeviceArrayLike | GpuStruct | None,
|
|
111
174
|
):
|
|
112
175
|
d_in_key = (
|
|
113
176
|
d_in.kind if isinstance(d_in, IteratorBase) else protocols.get_dtype(d_in)
|
|
@@ -123,8 +186,17 @@ def make_cache_key(
|
|
|
123
186
|
else:
|
|
124
187
|
op_key = CachableFunction(op)
|
|
125
188
|
|
|
126
|
-
|
|
127
|
-
|
|
189
|
+
init_kind_key = get_init_kind(init_value)
|
|
190
|
+
match init_kind_key:
|
|
191
|
+
case _bindings.InitKind.NO_INIT:
|
|
192
|
+
init_value_key = None
|
|
193
|
+
case _bindings.InitKind.FUTURE_VALUE_INIT:
|
|
194
|
+
init_value_key = protocols.get_dtype(cast(DeviceArrayLike, init_value))
|
|
195
|
+
case _bindings.InitKind.VALUE_INIT:
|
|
196
|
+
init_value = cast(np.ndarray | GpuStruct, init_value)
|
|
197
|
+
init_value_key = init_value.dtype
|
|
198
|
+
|
|
199
|
+
return (d_in_key, d_out_key, op_key, init_value_key, init_kind_key)
|
|
128
200
|
|
|
129
201
|
|
|
130
202
|
# TODO Figure out `sum` without operator and initial value
|
|
@@ -134,14 +206,14 @@ def make_exclusive_scan(
|
|
|
134
206
|
d_in: DeviceArrayLike | IteratorBase,
|
|
135
207
|
d_out: DeviceArrayLike | IteratorBase,
|
|
136
208
|
op: Callable | OpKind,
|
|
137
|
-
|
|
209
|
+
init_value: np.ndarray | DeviceArrayLike | GpuStruct | None,
|
|
138
210
|
):
|
|
139
211
|
"""Computes a device-wide scan using the specified binary ``op`` and initial value ``init_value``.
|
|
140
212
|
|
|
141
213
|
Example:
|
|
142
214
|
Below, ``make_exclusive_scan`` is used to create an exclusive scan object that can be reused.
|
|
143
215
|
|
|
144
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
216
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/scan/exclusive_scan_object.py
|
|
145
217
|
:language: python
|
|
146
218
|
:start-after: # example-begin
|
|
147
219
|
|
|
@@ -150,19 +222,19 @@ def make_exclusive_scan(
|
|
|
150
222
|
d_in: Device array or iterator containing the input sequence of data items
|
|
151
223
|
d_out: Device array that will store the result of the scan
|
|
152
224
|
op: Callable or OpKind representing the binary operator to apply
|
|
153
|
-
|
|
225
|
+
init_value: Numpy array, device array, or GPU struct storing the initial value of the scan. ``None`` (no initial value) is not supported for exclusive scans and raises ``ValueError``
|
|
154
226
|
|
|
155
227
|
Returns:
|
|
156
228
|
A callable object that can be used to perform the scan
|
|
157
229
|
"""
|
|
158
|
-
return _Scan(d_in, d_out, op,
|
|
230
|
+
return _Scan(d_in, d_out, op, init_value, False)
|
|
159
231
|
|
|
160
232
|
|
|
161
233
|
def exclusive_scan(
|
|
162
234
|
d_in: DeviceArrayLike | IteratorBase,
|
|
163
235
|
d_out: DeviceArrayLike | IteratorBase,
|
|
164
236
|
op: Callable | OpKind,
|
|
165
|
-
|
|
237
|
+
init_value: np.ndarray | DeviceArrayLike | GpuStruct | None,
|
|
166
238
|
num_items: int,
|
|
167
239
|
stream=None,
|
|
168
240
|
):
|
|
@@ -174,7 +246,7 @@ def exclusive_scan(
|
|
|
174
246
|
Example:
|
|
175
247
|
Below, ``exclusive_scan`` is used to compute an exclusive scan with max operation.
|
|
176
248
|
|
|
177
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
249
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/scan/exclusive_scan_max.py
|
|
178
250
|
:language: python
|
|
179
251
|
:start-after: # example-begin
|
|
180
252
|
|
|
@@ -183,14 +255,14 @@ def exclusive_scan(
|
|
|
183
255
|
d_in: Device array or iterator containing the input sequence of data items
|
|
184
256
|
d_out: Device array or iterator to store the result of the scan
|
|
185
257
|
op: Binary scan operator
|
|
186
|
-
|
|
258
|
+
init_value: Initial value for the scan
|
|
187
259
|
num_items: Number of items to scan
|
|
188
260
|
stream: CUDA stream for the operation (optional)
|
|
189
261
|
"""
|
|
190
|
-
scanner = make_exclusive_scan(d_in, d_out, op,
|
|
191
|
-
tmp_storage_bytes = scanner(None, d_in, d_out, num_items,
|
|
262
|
+
scanner = make_exclusive_scan(d_in, d_out, op, init_value)
|
|
263
|
+
tmp_storage_bytes = scanner(None, d_in, d_out, num_items, init_value, stream)
|
|
192
264
|
tmp_storage = TempStorageBuffer(tmp_storage_bytes, stream)
|
|
193
|
-
scanner(tmp_storage, d_in, d_out, num_items,
|
|
265
|
+
scanner(tmp_storage, d_in, d_out, num_items, init_value, stream)
|
|
194
266
|
|
|
195
267
|
|
|
196
268
|
# TODO Figure out `sum` without operator and initial value
|
|
@@ -200,14 +272,14 @@ def make_inclusive_scan(
|
|
|
200
272
|
d_in: DeviceArrayLike | IteratorBase,
|
|
201
273
|
d_out: DeviceArrayLike | IteratorBase,
|
|
202
274
|
op: Callable | OpKind,
|
|
203
|
-
|
|
275
|
+
init_value: np.ndarray | DeviceArrayLike | GpuStruct | None,
|
|
204
276
|
):
|
|
205
277
|
"""Computes a device-wide scan using the specified binary ``op`` and initial value ``init_value``.
|
|
206
278
|
|
|
207
279
|
Example:
|
|
208
280
|
Below, ``make_inclusive_scan`` is used to create an inclusive scan object that can be reused.
|
|
209
281
|
|
|
210
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
282
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/scan/inclusive_scan_object.py
|
|
211
283
|
:language: python
|
|
212
284
|
:start-after: # example-begin
|
|
213
285
|
|
|
@@ -216,19 +288,19 @@ def make_inclusive_scan(
|
|
|
216
288
|
d_in: Device array or iterator containing the input sequence of data items
|
|
217
289
|
d_out: Device array that will store the result of the scan
|
|
218
290
|
op: Callable or OpKind representing the binary operator to apply
|
|
219
|
-
|
|
291
|
+
init_value: Numpy array, device array, or GPU struct storing initial value of the scan, or None for no initial value
|
|
220
292
|
|
|
221
293
|
Returns:
|
|
222
294
|
A callable object that can be used to perform the scan
|
|
223
295
|
"""
|
|
224
|
-
return _Scan(d_in, d_out, op,
|
|
296
|
+
return _Scan(d_in, d_out, op, init_value, True)
|
|
225
297
|
|
|
226
298
|
|
|
227
299
|
def inclusive_scan(
|
|
228
300
|
d_in: DeviceArrayLike | IteratorBase,
|
|
229
301
|
d_out: DeviceArrayLike | IteratorBase,
|
|
230
302
|
op: Callable | OpKind,
|
|
231
|
-
|
|
303
|
+
init_value: np.ndarray | DeviceArrayLike | GpuStruct | None,
|
|
232
304
|
num_items: int,
|
|
233
305
|
stream=None,
|
|
234
306
|
):
|
|
@@ -240,7 +312,7 @@ def inclusive_scan(
|
|
|
240
312
|
Example:
|
|
241
313
|
Below, ``inclusive_scan`` is used to compute an inclusive scan (prefix sum).
|
|
242
314
|
|
|
243
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
315
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/scan/inclusive_scan_custom.py
|
|
244
316
|
:language: python
|
|
245
317
|
:start-after: # example-begin
|
|
246
318
|
|
|
@@ -249,11 +321,11 @@ def inclusive_scan(
|
|
|
249
321
|
d_in: Device array or iterator containing the input sequence of data items
|
|
250
322
|
d_out: Device array or iterator to store the result of the scan
|
|
251
323
|
op: Binary scan operator
|
|
252
|
-
|
|
324
|
+
init_value: Initial value for the scan
|
|
253
325
|
num_items: Number of items to scan
|
|
254
326
|
stream: CUDA stream for the operation (optional)
|
|
255
327
|
"""
|
|
256
|
-
scanner = make_inclusive_scan(d_in, d_out, op,
|
|
257
|
-
tmp_storage_bytes = scanner(None, d_in, d_out, num_items,
|
|
328
|
+
scanner = make_inclusive_scan(d_in, d_out, op, init_value)
|
|
329
|
+
tmp_storage_bytes = scanner(None, d_in, d_out, num_items, init_value, stream)
|
|
258
330
|
tmp_storage = TempStorageBuffer(tmp_storage_bytes, stream)
|
|
259
|
-
scanner(tmp_storage, d_in, d_out, num_items,
|
|
331
|
+
scanner(tmp_storage, d_in, d_out, num_items, init_value, stream)
|
|
@@ -179,7 +179,7 @@ def make_segmented_reduce(
|
|
|
179
179
|
Example:
|
|
180
180
|
Below, ``make_segmented_reduce`` is used to create a segmented reduction object that can be reused.
|
|
181
181
|
|
|
182
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
182
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/segmented/segmented_reduce_object.py
|
|
183
183
|
:language: python
|
|
184
184
|
:start-after: # example-begin
|
|
185
185
|
|
|
@@ -216,7 +216,7 @@ def segmented_reduce(
|
|
|
216
216
|
Example:
|
|
217
217
|
Below, ``segmented_reduce`` is used to compute the minimum value of segments in a sequence of integers.
|
|
218
218
|
|
|
219
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
219
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/segmented/segmented_reduce_basic.py
|
|
220
220
|
:language: python
|
|
221
221
|
:start-after: # example-begin
|
|
222
222
|
|