cuda-cccl 0.3.0__cp313-cp313-manylinux_2_24_aarch64.whl → 0.3.2__cp313-cp313-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/cooperative/__init__.py +7 -1
- cuda/cccl/cooperative/experimental/__init__.py +21 -5
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
- cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
- cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
- cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
- cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +4 -2
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
- cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
- cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
- cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
- cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
- cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +170 -260
- cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +6 -7
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +12 -29
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +2 -7
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
- cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
- cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
- cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
- cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
- cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
- cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
- cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
- cuda/cccl/headers/include/cuda/__cccl_config +1 -0
- cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
- cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
- cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
- cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
- cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +247 -323
- cuda/cccl/headers/include/cuda/__device/attributes.h +174 -123
- cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
- cuda/cccl/headers/include/cuda/__device/device_ref.h +27 -49
- cuda/cccl/headers/include/cuda/__device/physical_device.h +100 -96
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
- cuda/cccl/headers/include/cuda/__event/event.h +27 -26
- cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
- cuda/cccl/headers/include/cuda/__event/timed_event.h +10 -7
- cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
- cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
- cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
- cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
- cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +21 -137
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
- cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
- cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
- cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
- cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
- cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
- cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
- cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +9 -7
- cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -16
- cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
- cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
- cuda/cccl/headers/include/cuda/algorithm +1 -1
- cuda/cccl/headers/include/cuda/cmath +1 -0
- cuda/cccl/headers/include/cuda/devices +13 -0
- cuda/cccl/headers/include/cuda/iterator +1 -0
- cuda/cccl/headers/include/cuda/memory +1 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
- cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
- cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
- cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
- cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
- cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
- cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
- cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
- cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
- cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
- cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
- cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
- cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
- cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
- cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
- cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
- cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
- cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
- cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
- cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
- cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
- cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
- cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
- cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
- cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
- cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
- cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
- cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
- cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
- cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
- cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
- cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
- cuda/cccl/headers/include/cuda/std/bitset +1 -1
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
- cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
- cuda/cccl/headers/include/cuda/std/numbers +5 -0
- cuda/cccl/headers/include/cuda/std/string_view +155 -13
- cuda/cccl/headers/include/cuda/std/version +1 -4
- cuda/cccl/headers/include/cuda/stream_ref +5 -0
- cuda/cccl/headers/include/cuda/utility +1 -0
- cuda/cccl/headers/include/nv/target +7 -2
- cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
- cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
- cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
- cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
- cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
- cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
- cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
- cuda/cccl/headers/include/thrust/device_delete.h +18 -3
- cuda/cccl/headers/include/thrust/device_free.h +16 -3
- cuda/cccl/headers/include/thrust/device_new.h +29 -8
- cuda/cccl/headers/include/thrust/host_vector.h +1 -1
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
- cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
- cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
- cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
- cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
- cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
- cuda/cccl/parallel/experimental/__init__.py +21 -74
- cuda/compute/__init__.py +79 -0
- cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +43 -1
- cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +157 -8
- cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
- cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +112 -40
- cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +36 -15
- cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
- cuda/compute/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +2 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +36 -8
- cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +206 -1
- cuda/{cccl/parallel/experimental → compute}/numba_utils.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/typing.py +2 -0
- cuda/coop/__init__.py +8 -0
- cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
- cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
- cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
- cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
- cuda/coop/warp/__init__.py +9 -0
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +275 -276
- cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
- cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
- cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
- cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
- cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
- cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
- cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
- cuda/cccl/parallel/experimental/.gitignore +0 -4
- cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -48,6 +48,13 @@
|
|
|
48
48
|
#include <thrust/system/cuda/detail/util.h>
|
|
49
49
|
#include <thrust/type_traits/is_trivially_relocatable.h>
|
|
50
50
|
|
|
51
|
+
#if _CCCL_HAS_CUDA_COMPILER()
|
|
52
|
+
# include <cub/device/dispatch/tuning/tuning_transform.cuh>
|
|
53
|
+
#endif // _CCCL_HAS_CUDA_COMPILER()
|
|
54
|
+
|
|
55
|
+
#include <cuda/__fwd/zip_iterator.h>
|
|
56
|
+
#include <cuda/std/tuple>
|
|
57
|
+
|
|
51
58
|
THRUST_NAMESPACE_BEGIN
|
|
52
59
|
namespace cuda_cub
|
|
53
60
|
{
|
|
@@ -61,6 +68,21 @@ template <class Derived, class InputIt, class OutputIt, class TransformOp>
|
|
|
61
68
|
OutputIt _CCCL_API _CCCL_FORCEINLINE
|
|
62
69
|
transform(execution_policy<Derived>& policy, InputIt first, InputIt last, OutputIt result, TransformOp transform_op);
|
|
63
70
|
|
|
71
|
+
// Forward declare to work around a cyclic include, since "cuda/detail/transform.h" includes this header
|
|
72
|
+
// We want this to unwrap zip_transform_iterator
|
|
73
|
+
namespace __transform
|
|
74
|
+
{
|
|
75
|
+
_CCCL_EXEC_CHECK_DISABLE
|
|
76
|
+
template <class Derived, class Offset, class... InputIts, class OutputIt, class TransformOp, class Predicate>
|
|
77
|
+
OutputIt _CCCL_API _CCCL_FORCEINLINE cub_transform_many(
|
|
78
|
+
execution_policy<Derived>& policy,
|
|
79
|
+
::cuda::std::tuple<InputIts...> firsts,
|
|
80
|
+
OutputIt result,
|
|
81
|
+
Offset num_items,
|
|
82
|
+
TransformOp transform_op,
|
|
83
|
+
Predicate pred);
|
|
84
|
+
} // namespace __transform
|
|
85
|
+
|
|
64
86
|
namespace __copy
|
|
65
87
|
{
|
|
66
88
|
template <class H, class D, class T, class Size>
|
|
@@ -190,6 +212,17 @@ device_to_device(execution_policy<Derived>& policy, InputIt first, InputIt last,
|
|
|
190
212
|
|
|
191
213
|
return result + n;
|
|
192
214
|
}
|
|
215
|
+
else if constexpr (::cuda::__is_zip_transform_iterator<InputIt>)
|
|
216
|
+
{
|
|
217
|
+
const auto n = ::cuda::std::distance(first, last);
|
|
218
|
+
return cuda_cub::__transform::cub_transform_many(
|
|
219
|
+
policy,
|
|
220
|
+
::cuda::std::move(first).__base(),
|
|
221
|
+
result,
|
|
222
|
+
n,
|
|
223
|
+
::cuda::std::move(first).__pred(),
|
|
224
|
+
cub::detail::transform::always_true_predicate{});
|
|
225
|
+
}
|
|
193
226
|
else
|
|
194
227
|
{
|
|
195
228
|
return cuda_cub::transform(
|
|
@@ -39,11 +39,13 @@
|
|
|
39
39
|
#if _CCCL_HAS_CUDA_COMPILER()
|
|
40
40
|
# include <thrust/system/cuda/config.h>
|
|
41
41
|
|
|
42
|
-
# include <thrust/distance.h>
|
|
43
|
-
# include <thrust/iterator/counting_iterator.h>
|
|
44
|
-
# include <thrust/iterator/transform_iterator.h>
|
|
45
42
|
# include <thrust/system/cuda/detail/execution_policy.h>
|
|
46
43
|
|
|
44
|
+
# include <cuda/__iterator/counting_iterator.h>
|
|
45
|
+
# include <cuda/__iterator/transform_iterator.h>
|
|
46
|
+
# include <cuda/__iterator/zip_iterator.h>
|
|
47
|
+
# include <cuda/std/__iterator/distance.h>
|
|
48
|
+
|
|
47
49
|
THRUST_NAMESPACE_BEGIN
|
|
48
50
|
namespace cuda_cub
|
|
49
51
|
{
|
|
@@ -62,7 +64,6 @@ InputIt _CCCL_HOST_DEVICE find(execution_policy<Derived>& policy, InputIt first,
|
|
|
62
64
|
}; // namespace cuda_cub
|
|
63
65
|
THRUST_NAMESPACE_END
|
|
64
66
|
|
|
65
|
-
# include <thrust/iterator/zip_iterator.h>
|
|
66
67
|
# include <thrust/system/cuda/detail/reduce.h>
|
|
67
68
|
|
|
68
69
|
THRUST_NAMESPACE_BEGIN
|
|
@@ -92,109 +93,13 @@ struct functor
|
|
|
92
93
|
}
|
|
93
94
|
}
|
|
94
95
|
};
|
|
95
|
-
|
|
96
|
-
template <class ValueType, class InputIt, class UnaryOp>
|
|
97
|
-
struct transform_input_iterator_t
|
|
98
|
-
{
|
|
99
|
-
using self_t = transform_input_iterator_t;
|
|
100
|
-
using difference_type = thrust::detail::it_difference_t<InputIt>;
|
|
101
|
-
using value_type = ValueType;
|
|
102
|
-
using pointer = void;
|
|
103
|
-
using reference = value_type;
|
|
104
|
-
using iterator_category = ::cuda::std::random_access_iterator_tag;
|
|
105
|
-
|
|
106
|
-
InputIt input;
|
|
107
|
-
mutable UnaryOp op;
|
|
108
|
-
|
|
109
|
-
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE transform_input_iterator_t(InputIt input, UnaryOp op)
|
|
110
|
-
: input(input)
|
|
111
|
-
, op(op)
|
|
112
|
-
{}
|
|
113
|
-
|
|
114
|
-
transform_input_iterator_t(const self_t&) = default;
|
|
115
|
-
|
|
116
|
-
// UnaryOp might not be copy assignable, such as when it is a lambda. Define
|
|
117
|
-
// an explicit copy assignment operator that doesn't try to assign it.
|
|
118
|
-
_CCCL_HOST_DEVICE self_t& operator=(const self_t& o)
|
|
119
|
-
{
|
|
120
|
-
input = o.input;
|
|
121
|
-
return *this;
|
|
122
|
-
}
|
|
123
|
-
|
|
124
|
-
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_t operator++(int)
|
|
125
|
-
{
|
|
126
|
-
self_t retval = *this;
|
|
127
|
-
++input;
|
|
128
|
-
return retval;
|
|
129
|
-
}
|
|
130
|
-
|
|
131
|
-
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_t operator++()
|
|
132
|
-
{
|
|
133
|
-
++input;
|
|
134
|
-
return *this;
|
|
135
|
-
}
|
|
136
|
-
|
|
137
|
-
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE reference operator*() const
|
|
138
|
-
{
|
|
139
|
-
thrust::detail::it_value_t<InputIt> x = *input;
|
|
140
|
-
return op(x);
|
|
141
|
-
}
|
|
142
|
-
|
|
143
|
-
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE reference operator*()
|
|
144
|
-
{
|
|
145
|
-
thrust::detail::it_value_t<InputIt> x = *input;
|
|
146
|
-
return op(x);
|
|
147
|
-
}
|
|
148
|
-
|
|
149
|
-
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_t operator+(difference_type n) const
|
|
150
|
-
{
|
|
151
|
-
return self_t(input + n, op);
|
|
152
|
-
}
|
|
153
|
-
|
|
154
|
-
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_t& operator+=(difference_type n)
|
|
155
|
-
{
|
|
156
|
-
input += n;
|
|
157
|
-
return *this;
|
|
158
|
-
}
|
|
159
|
-
|
|
160
|
-
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_t operator-(difference_type n) const
|
|
161
|
-
{
|
|
162
|
-
return self_t(input - n, op);
|
|
163
|
-
}
|
|
164
|
-
|
|
165
|
-
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_t& operator-=(difference_type n)
|
|
166
|
-
{
|
|
167
|
-
input -= n;
|
|
168
|
-
return *this;
|
|
169
|
-
}
|
|
170
|
-
|
|
171
|
-
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE difference_type operator-(self_t other) const
|
|
172
|
-
{
|
|
173
|
-
return input - other.input;
|
|
174
|
-
}
|
|
175
|
-
|
|
176
|
-
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE reference operator[](difference_type n) const
|
|
177
|
-
{
|
|
178
|
-
return op(input[n]);
|
|
179
|
-
}
|
|
180
|
-
|
|
181
|
-
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator==(const self_t& rhs) const
|
|
182
|
-
{
|
|
183
|
-
return (input == rhs.input);
|
|
184
|
-
}
|
|
185
|
-
|
|
186
|
-
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator!=(const self_t& rhs) const
|
|
187
|
-
{
|
|
188
|
-
return (input != rhs.input);
|
|
189
|
-
}
|
|
190
|
-
};
|
|
191
96
|
} // namespace __find_if
|
|
192
97
|
|
|
193
98
|
template <class Derived, class InputIt, class Size, class Predicate>
|
|
194
99
|
InputIt _CCCL_HOST_DEVICE
|
|
195
100
|
find_if_n(execution_policy<Derived>& policy, InputIt first, Size num_items, Predicate predicate)
|
|
196
101
|
{
|
|
197
|
-
using result_type =
|
|
102
|
+
using result_type = ::cuda::std::tuple<bool, Size>;
|
|
198
103
|
|
|
199
104
|
// empty sequence
|
|
200
105
|
if (num_items == 0)
|
|
@@ -212,27 +117,20 @@ find_if_n(execution_policy<Derived>& policy, InputIt first, Size num_items, Pred
|
|
|
212
117
|
const Size interval_threshold = 1 << 20;
|
|
213
118
|
const Size interval_size = (::cuda::std::min) (interval_threshold, num_items);
|
|
214
119
|
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
using IteratorTuple = thrust::tuple<XfrmIterator, counting_iterator<Size>>;
|
|
220
|
-
using ZipIterator = thrust::zip_iterator<IteratorTuple>;
|
|
221
|
-
|
|
222
|
-
IteratorTuple iter_tuple = thrust::make_tuple(XfrmIterator(first, predicate), counting_iterator<Size>(0));
|
|
223
|
-
|
|
224
|
-
ZipIterator begin = thrust::make_zip_iterator(iter_tuple);
|
|
225
|
-
ZipIterator end = begin + num_items;
|
|
120
|
+
const auto begin = ::cuda::make_zip_iterator(
|
|
121
|
+
::cuda::make_transform_iterator(try_unwrap_contiguous_iterator(first), predicate),
|
|
122
|
+
::cuda::counting_iterator<Size>(0));
|
|
123
|
+
const auto end = begin + num_items;
|
|
226
124
|
|
|
227
|
-
for (
|
|
125
|
+
for (auto interval_begin = begin; interval_begin < end; interval_begin += interval_size)
|
|
228
126
|
{
|
|
229
|
-
|
|
127
|
+
auto interval_end = interval_begin + interval_size;
|
|
230
128
|
if (end < interval_end)
|
|
231
129
|
{
|
|
232
130
|
interval_end = end;
|
|
233
131
|
} // end if
|
|
234
132
|
|
|
235
|
-
result_type result = reduce(
|
|
133
|
+
const result_type result = reduce(
|
|
236
134
|
policy, interval_begin, interval_end, result_type(false, interval_end - begin), __find_if::functor<result_type>());
|
|
237
135
|
|
|
238
136
|
// see if we found something
|
|
@@ -73,12 +73,14 @@ struct transform_pair_of_input_iterators_t
|
|
|
73
73
|
using value_type = ValueType;
|
|
74
74
|
using pointer = void;
|
|
75
75
|
using reference = value_type;
|
|
76
|
-
using iterator_category = std::random_access_iterator_tag;
|
|
76
|
+
using iterator_category = ::cuda::std::random_access_iterator_tag;
|
|
77
77
|
|
|
78
78
|
InputIt1 input1;
|
|
79
79
|
InputIt2 input2;
|
|
80
80
|
mutable BinaryOp op;
|
|
81
81
|
|
|
82
|
+
transform_pair_of_input_iterators_t() = default;
|
|
83
|
+
|
|
82
84
|
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE
|
|
83
85
|
transform_pair_of_input_iterators_t(InputIt1 input1_, InputIt2 input2_, BinaryOp op_)
|
|
84
86
|
: input1(input1_)
|
|
@@ -107,7 +109,7 @@ struct transform_pair_of_input_iterators_t
|
|
|
107
109
|
}
|
|
108
110
|
|
|
109
111
|
/// Prefix increment
|
|
110
|
-
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_t operator++()
|
|
112
|
+
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_t& operator++()
|
|
111
113
|
{
|
|
112
114
|
++input1;
|
|
113
115
|
++input2;
|
|
@@ -177,6 +179,10 @@ struct transform_pair_of_input_iterators_t
|
|
|
177
179
|
return (input1 != rhs.input1) || (input2 != rhs.input2);
|
|
178
180
|
}
|
|
179
181
|
|
|
182
|
+
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE bool operator<(const self_t& rhs) const
|
|
183
|
+
{
|
|
184
|
+
return input1 < rhs.input1;
|
|
185
|
+
}
|
|
180
186
|
}; // struct transform_pair_of_input_iterators_t
|
|
181
187
|
} // namespace detail
|
|
182
188
|
|
|
@@ -79,7 +79,7 @@ namespace detail
|
|
|
79
79
|
template <typename Iterator>
|
|
80
80
|
inline constexpr bool is_libcxx_wrap_iter_v = false;
|
|
81
81
|
|
|
82
|
-
#if
|
|
82
|
+
#if _CCCL_HOST_STD_LIB(LIBCXX)
|
|
83
83
|
template <typename Iterator>
|
|
84
84
|
inline constexpr bool is_libcxx_wrap_iter_v<
|
|
85
85
|
# if _LIBCPP_VERSION < 14000
|
|
@@ -88,23 +88,23 @@ inline constexpr bool is_libcxx_wrap_iter_v<
|
|
|
88
88
|
std::__wrap_iter<Iterator>
|
|
89
89
|
# endif
|
|
90
90
|
> = true;
|
|
91
|
-
#endif
|
|
91
|
+
#endif // _CCCL_HOST_STD_LIB(LIBCXX)
|
|
92
92
|
|
|
93
93
|
template <typename Iterator>
|
|
94
94
|
inline constexpr bool is_libstdcxx_normal_iterator_v = false;
|
|
95
95
|
|
|
96
|
-
#if
|
|
96
|
+
#if _CCCL_HOST_STD_LIB(LIBSTDCXX)
|
|
97
97
|
template <typename Iterator, typename Container>
|
|
98
98
|
inline constexpr bool is_libstdcxx_normal_iterator_v<::__gnu_cxx::__normal_iterator<Iterator, Container>> = true;
|
|
99
|
-
#endif
|
|
99
|
+
#endif // _CCCL_HOST_STD_LIB(LIBSTDCXX)
|
|
100
100
|
|
|
101
|
-
#if
|
|
101
|
+
#if _CCCL_HOST_STD_LIB(STL)
|
|
102
102
|
template <typename Iterator>
|
|
103
103
|
inline constexpr bool is_msvc_contiguous_iterator_v = ::cuda::std::is_pointer_v<::std::_Unwrapped_t<Iterator>>;
|
|
104
|
-
#else
|
|
104
|
+
#else // ^^^ _CCCL_HOST_STD_LIB(STL) ^^^ / vvv !_CCCL_HOST_STD_LIB(STL) vvv
|
|
105
105
|
template <typename Iterator>
|
|
106
106
|
inline constexpr bool is_msvc_contiguous_iterator_v = false;
|
|
107
|
-
#endif
|
|
107
|
+
#endif // ^^^ !_CCCL_HOST_STD_LIB(STL) ^^^
|
|
108
108
|
|
|
109
109
|
template <typename Iterator>
|
|
110
110
|
inline constexpr bool is_contiguous_iterator_impl_v =
|
|
@@ -1,77 +1,24 @@
|
|
|
1
|
-
# Copyright (c)
|
|
1
|
+
# Copyright (c) 2025, NVIDIA CORPORATION.
|
|
2
2
|
#
|
|
3
|
-
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
4
14
|
|
|
5
|
-
|
|
6
|
-
DoubleBuffer,
|
|
7
|
-
SortOrder,
|
|
8
|
-
binary_transform,
|
|
9
|
-
exclusive_scan,
|
|
10
|
-
histogram_even,
|
|
11
|
-
inclusive_scan,
|
|
12
|
-
make_binary_transform,
|
|
13
|
-
make_exclusive_scan,
|
|
14
|
-
make_histogram_even,
|
|
15
|
-
make_inclusive_scan,
|
|
16
|
-
make_merge_sort,
|
|
17
|
-
make_radix_sort,
|
|
18
|
-
make_reduce_into,
|
|
19
|
-
make_segmented_reduce,
|
|
20
|
-
make_three_way_partition,
|
|
21
|
-
make_unary_transform,
|
|
22
|
-
make_unique_by_key,
|
|
23
|
-
merge_sort,
|
|
24
|
-
radix_sort,
|
|
25
|
-
reduce_into,
|
|
26
|
-
segmented_reduce,
|
|
27
|
-
three_way_partition,
|
|
28
|
-
unary_transform,
|
|
29
|
-
unique_by_key,
|
|
30
|
-
)
|
|
31
|
-
from .iterators import (
|
|
32
|
-
CacheModifiedInputIterator,
|
|
33
|
-
ConstantIterator,
|
|
34
|
-
CountingIterator,
|
|
35
|
-
ReverseIterator,
|
|
36
|
-
TransformIterator,
|
|
37
|
-
TransformOutputIterator,
|
|
38
|
-
ZipIterator,
|
|
39
|
-
)
|
|
40
|
-
from .op import OpKind
|
|
41
|
-
from .struct import gpu_struct
|
|
15
|
+
# alias for backwards compatibility
|
|
42
16
|
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
"
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
"histogram_even",
|
|
52
|
-
"inclusive_scan",
|
|
53
|
-
"make_binary_transform",
|
|
54
|
-
"make_exclusive_scan",
|
|
55
|
-
"make_histogram_even",
|
|
56
|
-
"make_inclusive_scan",
|
|
57
|
-
"make_merge_sort",
|
|
58
|
-
"make_radix_sort",
|
|
59
|
-
"make_reduce_into",
|
|
60
|
-
"make_segmented_reduce",
|
|
61
|
-
"make_three_way_partition",
|
|
62
|
-
"make_unary_transform",
|
|
63
|
-
"make_unique_by_key",
|
|
64
|
-
"merge_sort",
|
|
65
|
-
"OpKind",
|
|
66
|
-
"radix_sort",
|
|
67
|
-
"reduce_into",
|
|
68
|
-
"ReverseIterator",
|
|
69
|
-
"segmented_reduce",
|
|
70
|
-
"SortOrder",
|
|
71
|
-
"TransformIterator",
|
|
72
|
-
"three_way_partition",
|
|
73
|
-
"TransformOutputIterator",
|
|
74
|
-
"unary_transform",
|
|
75
|
-
"unique_by_key",
|
|
76
|
-
"ZipIterator",
|
|
77
|
-
]
|
|
17
|
+
from warnings import warn
|
|
18
|
+
|
|
19
|
+
from cuda.compute import * # noqa: F403
|
|
20
|
+
|
|
21
|
+
warn(
|
|
22
|
+
"The module cuda.cccl.parallel.experimental is deprecated. Use cuda.compute instead.",
|
|
23
|
+
FutureWarning,
|
|
24
|
+
)
|
cuda/compute/__init__.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
|
|
2
|
+
#
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
4
|
+
|
|
5
|
+
from .algorithms import (
|
|
6
|
+
DoubleBuffer,
|
|
7
|
+
SortOrder,
|
|
8
|
+
binary_transform,
|
|
9
|
+
exclusive_scan,
|
|
10
|
+
histogram_even,
|
|
11
|
+
inclusive_scan,
|
|
12
|
+
make_binary_transform,
|
|
13
|
+
make_exclusive_scan,
|
|
14
|
+
make_histogram_even,
|
|
15
|
+
make_inclusive_scan,
|
|
16
|
+
make_merge_sort,
|
|
17
|
+
make_radix_sort,
|
|
18
|
+
make_reduce_into,
|
|
19
|
+
make_segmented_reduce,
|
|
20
|
+
make_three_way_partition,
|
|
21
|
+
make_unary_transform,
|
|
22
|
+
make_unique_by_key,
|
|
23
|
+
merge_sort,
|
|
24
|
+
radix_sort,
|
|
25
|
+
reduce_into,
|
|
26
|
+
segmented_reduce,
|
|
27
|
+
three_way_partition,
|
|
28
|
+
unary_transform,
|
|
29
|
+
unique_by_key,
|
|
30
|
+
)
|
|
31
|
+
from .iterators import (
|
|
32
|
+
CacheModifiedInputIterator,
|
|
33
|
+
ConstantIterator,
|
|
34
|
+
CountingIterator,
|
|
35
|
+
PermutationIterator,
|
|
36
|
+
ReverseIterator,
|
|
37
|
+
TransformIterator,
|
|
38
|
+
TransformOutputIterator,
|
|
39
|
+
ZipIterator,
|
|
40
|
+
)
|
|
41
|
+
from .op import OpKind
|
|
42
|
+
from .struct import gpu_struct
|
|
43
|
+
|
|
44
|
+
__all__ = [
|
|
45
|
+
"binary_transform",
|
|
46
|
+
"CacheModifiedInputIterator",
|
|
47
|
+
"ConstantIterator",
|
|
48
|
+
"CountingIterator",
|
|
49
|
+
"DoubleBuffer",
|
|
50
|
+
"exclusive_scan",
|
|
51
|
+
"gpu_struct",
|
|
52
|
+
"histogram_even",
|
|
53
|
+
"inclusive_scan",
|
|
54
|
+
"make_binary_transform",
|
|
55
|
+
"make_exclusive_scan",
|
|
56
|
+
"make_histogram_even",
|
|
57
|
+
"make_inclusive_scan",
|
|
58
|
+
"make_merge_sort",
|
|
59
|
+
"make_radix_sort",
|
|
60
|
+
"make_reduce_into",
|
|
61
|
+
"make_segmented_reduce",
|
|
62
|
+
"make_three_way_partition",
|
|
63
|
+
"make_unary_transform",
|
|
64
|
+
"make_unique_by_key",
|
|
65
|
+
"merge_sort",
|
|
66
|
+
"OpKind",
|
|
67
|
+
"PermutationIterator",
|
|
68
|
+
"radix_sort",
|
|
69
|
+
"reduce_into",
|
|
70
|
+
"ReverseIterator",
|
|
71
|
+
"segmented_reduce",
|
|
72
|
+
"SortOrder",
|
|
73
|
+
"TransformIterator",
|
|
74
|
+
"TransformOutputIterator",
|
|
75
|
+
"three_way_partition",
|
|
76
|
+
"unary_transform",
|
|
77
|
+
"unique_by_key",
|
|
78
|
+
"ZipIterator",
|
|
79
|
+
]
|
|
@@ -57,6 +57,12 @@ class SortOrder(IntEnum):
|
|
|
57
57
|
ASCENDING = ...
|
|
58
58
|
DESCENDING = ...
|
|
59
59
|
|
|
60
|
+
class InitKind(IntEnum):
|
|
61
|
+
_value_: int
|
|
62
|
+
NO_INIT = ...
|
|
63
|
+
FUTURE_VALUE_INIT = ...
|
|
64
|
+
VALUE_INIT = ...
|
|
65
|
+
|
|
60
66
|
class Op:
|
|
61
67
|
def __init__(
|
|
62
68
|
self,
|
|
@@ -133,6 +139,8 @@ class Iterator:
|
|
|
133
139
|
def state(self, value) -> None: ...
|
|
134
140
|
@property
|
|
135
141
|
def type(self) -> IteratorKind: ...
|
|
142
|
+
@property
|
|
143
|
+
def value_type(self) -> TypeInfo: ...
|
|
136
144
|
def as_bytes(self) -> bytes: ...
|
|
137
145
|
def is_kind_pointer(self) -> bool: ...
|
|
138
146
|
def is_kind_iterator(self) -> bool: ...
|
|
@@ -197,8 +205,9 @@ class DeviceScanBuildResult:
|
|
|
197
205
|
d_in: Iterator,
|
|
198
206
|
d_out: Iterator,
|
|
199
207
|
binary_op: Op,
|
|
200
|
-
|
|
208
|
+
init_type: TypeInfo,
|
|
201
209
|
force_inclusive: bool,
|
|
210
|
+
init_kind: InitKind,
|
|
202
211
|
info: CommonData,
|
|
203
212
|
): ...
|
|
204
213
|
def compute_inclusive(
|
|
@@ -223,6 +232,39 @@ class DeviceScanBuildResult:
|
|
|
223
232
|
h_init: Value,
|
|
224
233
|
stream,
|
|
225
234
|
) -> int: ...
|
|
235
|
+
def compute_inclusive_future_value(
|
|
236
|
+
self,
|
|
237
|
+
temp_storage_ptr: int | None,
|
|
238
|
+
temp_storage_nbytes: int,
|
|
239
|
+
d_in: Iterator,
|
|
240
|
+
d_out: Iterator,
|
|
241
|
+
num_items: int,
|
|
242
|
+
binary_op: Op,
|
|
243
|
+
h_init: Iterator,
|
|
244
|
+
stream,
|
|
245
|
+
) -> int: ...
|
|
246
|
+
def compute_exclusive_future_value(
|
|
247
|
+
self,
|
|
248
|
+
temp_storage_ptr: int | None,
|
|
249
|
+
temp_storage_nbytes: int,
|
|
250
|
+
d_in: Iterator,
|
|
251
|
+
d_out: Iterator,
|
|
252
|
+
num_items: int,
|
|
253
|
+
binary_op: Op,
|
|
254
|
+
h_init: Iterator,
|
|
255
|
+
stream,
|
|
256
|
+
) -> int: ...
|
|
257
|
+
def compute_inclusive_no_init(
|
|
258
|
+
self,
|
|
259
|
+
temp_storage_ptr: int | None,
|
|
260
|
+
temp_storage_nbytes: int,
|
|
261
|
+
d_in: Iterator,
|
|
262
|
+
d_out: Iterator,
|
|
263
|
+
num_items: int,
|
|
264
|
+
binary_op: Op,
|
|
265
|
+
h_init: None,
|
|
266
|
+
stream,
|
|
267
|
+
) -> int: ...
|
|
226
268
|
|
|
227
269
|
# ---------------------
|
|
228
270
|
# DeviceSegmentedReduce
|