cuda-cccl 0.3.0__cp313-cp313-manylinux_2_24_aarch64.whl → 0.3.2__cp313-cp313-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/cooperative/__init__.py +7 -1
- cuda/cccl/cooperative/experimental/__init__.py +21 -5
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
- cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
- cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
- cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
- cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +4 -2
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
- cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
- cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
- cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
- cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
- cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +170 -260
- cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +6 -7
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +12 -29
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +2 -7
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
- cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
- cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
- cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
- cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
- cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
- cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
- cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
- cuda/cccl/headers/include/cuda/__cccl_config +1 -0
- cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
- cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
- cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
- cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
- cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +247 -323
- cuda/cccl/headers/include/cuda/__device/attributes.h +174 -123
- cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
- cuda/cccl/headers/include/cuda/__device/device_ref.h +27 -49
- cuda/cccl/headers/include/cuda/__device/physical_device.h +100 -96
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
- cuda/cccl/headers/include/cuda/__event/event.h +27 -26
- cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
- cuda/cccl/headers/include/cuda/__event/timed_event.h +10 -7
- cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
- cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
- cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
- cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
- cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +21 -137
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
- cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
- cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
- cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
- cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
- cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
- cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
- cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +9 -7
- cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -16
- cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
- cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
- cuda/cccl/headers/include/cuda/algorithm +1 -1
- cuda/cccl/headers/include/cuda/cmath +1 -0
- cuda/cccl/headers/include/cuda/devices +13 -0
- cuda/cccl/headers/include/cuda/iterator +1 -0
- cuda/cccl/headers/include/cuda/memory +1 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
- cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
- cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
- cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
- cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
- cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
- cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
- cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
- cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
- cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
- cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
- cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
- cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
- cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
- cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
- cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
- cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
- cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
- cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
- cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
- cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
- cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
- cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
- cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
- cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
- cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
- cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
- cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
- cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
- cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
- cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
- cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
- cuda/cccl/headers/include/cuda/std/bitset +1 -1
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
- cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
- cuda/cccl/headers/include/cuda/std/numbers +5 -0
- cuda/cccl/headers/include/cuda/std/string_view +155 -13
- cuda/cccl/headers/include/cuda/std/version +1 -4
- cuda/cccl/headers/include/cuda/stream_ref +5 -0
- cuda/cccl/headers/include/cuda/utility +1 -0
- cuda/cccl/headers/include/nv/target +7 -2
- cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
- cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
- cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
- cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
- cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
- cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
- cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
- cuda/cccl/headers/include/thrust/device_delete.h +18 -3
- cuda/cccl/headers/include/thrust/device_free.h +16 -3
- cuda/cccl/headers/include/thrust/device_new.h +29 -8
- cuda/cccl/headers/include/thrust/host_vector.h +1 -1
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
- cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
- cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
- cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
- cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
- cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
- cuda/cccl/parallel/experimental/__init__.py +21 -74
- cuda/compute/__init__.py +79 -0
- cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +43 -1
- cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +157 -8
- cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
- cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +112 -40
- cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +36 -15
- cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
- cuda/compute/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +2 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +36 -8
- cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +206 -1
- cuda/{cccl/parallel/experimental → compute}/numba_utils.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/typing.py +2 -0
- cuda/coop/__init__.py +8 -0
- cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
- cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
- cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
- cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
- cuda/coop/warp/__init__.py +9 -0
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +275 -276
- cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
- cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
- cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
- cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
- cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
- cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
- cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
- cuda/cccl/parallel/experimental/.gitignore +0 -4
- cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -165,7 +165,7 @@ def make_three_way_partition(
|
|
|
165
165
|
Example:
|
|
166
166
|
Below, ``make_three_way_partition`` is used to create a three-way partition object that can be reused.
|
|
167
167
|
|
|
168
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
168
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/three_way_partition/three_way_partition_object.py
|
|
169
169
|
:language: python
|
|
170
170
|
:start-after: # example-begin
|
|
171
171
|
|
|
@@ -214,7 +214,7 @@ def three_way_partition(
|
|
|
214
214
|
Example:
|
|
215
215
|
Below, ``three_way_partition`` is used to partition a sequence of integers into three parts.
|
|
216
216
|
|
|
217
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
217
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/three_way_partition/three_way_partition_basic.py
|
|
218
218
|
:language: python
|
|
219
219
|
:start-after: # example-begin
|
|
220
220
|
|
|
@@ -11,7 +11,7 @@ from .._caching import CachableFunction, cache_with_key
|
|
|
11
11
|
from .._cccl_interop import set_cccl_iterator_state
|
|
12
12
|
from .._utils import protocols
|
|
13
13
|
from ..iterators._iterators import IteratorBase
|
|
14
|
-
from ..numba_utils import get_inferred_return_type
|
|
14
|
+
from ..numba_utils import get_inferred_return_type, signature_from_annotations
|
|
15
15
|
from ..op import OpKind
|
|
16
16
|
from ..typing import DeviceArrayLike
|
|
17
17
|
|
|
@@ -32,16 +32,20 @@ class _UnaryTransform:
|
|
|
32
32
|
):
|
|
33
33
|
self.d_in_cccl = cccl.to_cccl_input_iter(d_in)
|
|
34
34
|
self.d_out_cccl = cccl.to_cccl_output_iter(d_out)
|
|
35
|
-
in_value_type = cccl.get_value_type(d_in)
|
|
36
|
-
out_value_type = cccl.get_value_type(d_out)
|
|
37
35
|
|
|
38
36
|
# For well-known operations, we don't need a signature
|
|
39
37
|
if isinstance(op, OpKind):
|
|
40
38
|
self.op_wrapper = cccl.to_cccl_op(op, None)
|
|
41
39
|
else:
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
40
|
+
try:
|
|
41
|
+
sig = signature_from_annotations(op)
|
|
42
|
+
except ValueError:
|
|
43
|
+
in_value_type = cccl.get_value_type(d_in)
|
|
44
|
+
out_value_type = cccl.get_value_type(d_out)
|
|
45
|
+
if not out_value_type.is_internal:
|
|
46
|
+
out_value_type = get_inferred_return_type(op, (in_value_type,))
|
|
47
|
+
sig = out_value_type(in_value_type)
|
|
48
|
+
|
|
45
49
|
self.op_wrapper = cccl.to_cccl_op(op, sig=sig)
|
|
46
50
|
self.build_result = cccl.call_build(
|
|
47
51
|
_bindings.DeviceUnaryTransform,
|
|
@@ -97,11 +101,14 @@ class _BinaryTransform:
|
|
|
97
101
|
if isinstance(op, OpKind):
|
|
98
102
|
self.op_wrapper = cccl.to_cccl_op(op, None)
|
|
99
103
|
else:
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
104
|
+
try:
|
|
105
|
+
sig = signature_from_annotations(op)
|
|
106
|
+
except ValueError:
|
|
107
|
+
if not out_value_type.is_internal:
|
|
108
|
+
out_value_type = get_inferred_return_type(
|
|
109
|
+
op, (in1_value_type, in2_value_type)
|
|
110
|
+
)
|
|
111
|
+
sig = out_value_type(in1_value_type, in2_value_type)
|
|
105
112
|
self.op_wrapper = cccl.to_cccl_op(op, sig=sig)
|
|
106
113
|
self.build_result = cccl.call_build(
|
|
107
114
|
_bindings.DeviceBinaryTransform,
|
|
@@ -196,7 +203,7 @@ def make_unary_transform(
|
|
|
196
203
|
storage allocation. For simpler usage, consider using :func:`unary_transform`.
|
|
197
204
|
|
|
198
205
|
Example:
|
|
199
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
206
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/transform/unary_transform_object.py
|
|
200
207
|
:language: python
|
|
201
208
|
:start-after: # example-begin
|
|
202
209
|
|
|
@@ -227,7 +234,7 @@ def make_binary_transform(
|
|
|
227
234
|
storage allocation. For simpler usage, consider using :func:`binary_transform`.
|
|
228
235
|
|
|
229
236
|
Example:
|
|
230
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
237
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/transform/binary_transform_object.py
|
|
231
238
|
:language: python
|
|
232
239
|
:start-after: # example-begin
|
|
233
240
|
|
|
@@ -259,7 +266,14 @@ def unary_transform(
|
|
|
259
266
|
Example:
|
|
260
267
|
Below, ``unary_transform`` is used to apply a transformation to each element of the input.
|
|
261
268
|
|
|
262
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
269
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/transform/unary_transform_basic.py
|
|
270
|
+
:language: python
|
|
271
|
+
:start-after: # example-begin
|
|
272
|
+
|
|
273
|
+
When working with custom struct types, you need to provide type annotations
|
|
274
|
+
to help with type inference. See the binary transform struct example for reference:
|
|
275
|
+
|
|
276
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/transform/binary_transform_struct.py
|
|
263
277
|
:language: python
|
|
264
278
|
:start-after: # example-begin
|
|
265
279
|
|
|
@@ -291,7 +305,14 @@ def binary_transform(
|
|
|
291
305
|
Example:
|
|
292
306
|
Below, ``binary_transform`` is used to apply a transformation to pairs of elements from two input sequences.
|
|
293
307
|
|
|
294
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
308
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/transform/binary_transform_basic.py
|
|
309
|
+
:language: python
|
|
310
|
+
:start-after: # example-begin
|
|
311
|
+
|
|
312
|
+
When working with custom struct types, you need to provide type annotations
|
|
313
|
+
to help with type inference. See the following example:
|
|
314
|
+
|
|
315
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/transform/binary_transform_struct.py
|
|
295
316
|
:language: python
|
|
296
317
|
:start-after: # example-begin
|
|
297
318
|
|
|
@@ -171,7 +171,7 @@ def make_unique_by_key(
|
|
|
171
171
|
Example:
|
|
172
172
|
Below, ``make_unique_by_key`` is used to create a unique by key object that can be reused.
|
|
173
173
|
|
|
174
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
174
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/unique/unique_by_key_object.py
|
|
175
175
|
:language: python
|
|
176
176
|
:start-after: # example-begin
|
|
177
177
|
|
|
@@ -211,7 +211,7 @@ def unique_by_key(
|
|
|
211
211
|
Example:
|
|
212
212
|
Below, ``unique_by_key`` is used to populate the arrays of output keys and items with the first key and its corresponding item from each sequence of equal keys. It also outputs the number of items selected.
|
|
213
213
|
|
|
214
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
214
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/unique/unique_by_key_basic.py
|
|
215
215
|
:language: python
|
|
216
216
|
:start-after: # example-begin
|
|
217
217
|
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -2,6 +2,7 @@ from ._factories import (
|
|
|
2
2
|
CacheModifiedInputIterator,
|
|
3
3
|
ConstantIterator,
|
|
4
4
|
CountingIterator,
|
|
5
|
+
PermutationIterator,
|
|
5
6
|
ReverseIterator,
|
|
6
7
|
TransformIterator,
|
|
7
8
|
TransformOutputIterator,
|
|
@@ -12,6 +13,7 @@ __all__ = [
|
|
|
12
13
|
"CacheModifiedInputIterator",
|
|
13
14
|
"ConstantIterator",
|
|
14
15
|
"CountingIterator",
|
|
16
|
+
"PermutationIterator",
|
|
15
17
|
"ReverseIterator",
|
|
16
18
|
"TransformIterator",
|
|
17
19
|
"TransformOutputIterator",
|
|
@@ -10,6 +10,7 @@ from ._iterators import (
|
|
|
10
10
|
CountingIterator as _CountingIterator,
|
|
11
11
|
)
|
|
12
12
|
from ._iterators import (
|
|
13
|
+
make_permutation_iterator,
|
|
13
14
|
make_reverse_iterator,
|
|
14
15
|
make_transform_iterator,
|
|
15
16
|
)
|
|
@@ -26,7 +27,7 @@ def CacheModifiedInputIterator(device_array, modifier):
|
|
|
26
27
|
Example:
|
|
27
28
|
The code snippet below demonstrates the usage of a ``CacheModifiedInputIterator``:
|
|
28
29
|
|
|
29
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
30
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/cache_modified_iterator_basic.py
|
|
30
31
|
:language: python
|
|
31
32
|
:start-after: # example-begin
|
|
32
33
|
|
|
@@ -55,7 +56,7 @@ def ConstantIterator(value):
|
|
|
55
56
|
The code snippet below demonstrates the usage of a ``ConstantIterator``
|
|
56
57
|
representing a sequence of constant values:
|
|
57
58
|
|
|
58
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
59
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/constant_iterator_basic.py
|
|
59
60
|
:language: python
|
|
60
61
|
:start-after: # example-begin
|
|
61
62
|
|
|
@@ -78,7 +79,7 @@ def CountingIterator(offset):
|
|
|
78
79
|
The code snippet below demonstrates the usage of a ``CountingIterator``
|
|
79
80
|
representing the sequence ``[10, 11, 12]``:
|
|
80
81
|
|
|
81
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
82
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/counting_iterator_basic.py
|
|
82
83
|
:language: python
|
|
83
84
|
:start-after: # example-begin
|
|
84
85
|
|
|
@@ -100,13 +101,13 @@ def ReverseIterator(sequence):
|
|
|
100
101
|
Examples:
|
|
101
102
|
The code snippet below demonstrates the usage of a ``ReverseIterator`` as an input iterator:
|
|
102
103
|
|
|
103
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
104
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/reverse_input_iterator.py
|
|
104
105
|
:language: python
|
|
105
106
|
:start-after: # example-begin
|
|
106
107
|
|
|
107
108
|
The code snippet below demonstrates the usage of a ``ReverseIterator`` as an output iterator:
|
|
108
109
|
|
|
109
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
110
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/reverse_output_iterator.py
|
|
110
111
|
:language: python
|
|
111
112
|
:start-after: # example-begin
|
|
112
113
|
|
|
@@ -129,7 +130,7 @@ def TransformIterator(it, op):
|
|
|
129
130
|
The code snippet below demonstrates the usage of a ``TransformIterator`` composed with a ``CountingIterator``
|
|
130
131
|
to transform the input before performing a reduction.
|
|
131
132
|
|
|
132
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
133
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/transform_iterator_basic.py
|
|
133
134
|
:language: python
|
|
134
135
|
:start-after: # example-begin
|
|
135
136
|
Args:
|
|
@@ -151,7 +152,7 @@ def TransformOutputIterator(it, op):
|
|
|
151
152
|
The code snippet below demonstrates the usage of a ``TransformOutputIterator`` to transform the output
|
|
152
153
|
of a reduction before writing to an output array.
|
|
153
154
|
|
|
154
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
155
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/transform_output_iterator.py
|
|
155
156
|
:language: python
|
|
156
157
|
:start-after: # example-begin
|
|
157
158
|
|
|
@@ -165,6 +166,33 @@ def TransformOutputIterator(it, op):
|
|
|
165
166
|
return make_transform_iterator(it, op, "output")
|
|
166
167
|
|
|
167
168
|
|
|
169
|
+
def PermutationIterator(values, indices):
|
|
170
|
+
"""Returns an Iterator that accesses values through an index mapping.
|
|
171
|
+
|
|
172
|
+
Similar to https://nvidia.github.io/cccl/thrust/api/classthrust_1_1permutation__iterator.html
|
|
173
|
+
|
|
174
|
+
The permutation iterator accesses elements from the values collection using indices
|
|
175
|
+
from the indices collection, effectively computing values[indices[i]] at position i.
|
|
176
|
+
This is useful for gather/scatter operations and indirect array access patterns.
|
|
177
|
+
|
|
178
|
+
Example:
|
|
179
|
+
The code snippet below demonstrates the usage of a ``PermutationIterator``
|
|
180
|
+
to access values in a permuted order:
|
|
181
|
+
|
|
182
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/permutation_iterator_basic.py
|
|
183
|
+
:language: python
|
|
184
|
+
:start-after: # example-begin
|
|
185
|
+
|
|
186
|
+
Args:
|
|
187
|
+
values: The values array or iterator to be permuted
|
|
188
|
+
indices: An iterator or device array providing the indices for permutation
|
|
189
|
+
|
|
190
|
+
Returns:
|
|
191
|
+
A ``PermutationIterator`` object that yields values[indices[i]] at position i
|
|
192
|
+
"""
|
|
193
|
+
return make_permutation_iterator(values, indices)
|
|
194
|
+
|
|
195
|
+
|
|
168
196
|
def ZipIterator(*iterators):
|
|
169
197
|
"""Returns an Iterator representing a zipped sequence of values from N iterators.
|
|
170
198
|
|
|
@@ -178,7 +206,7 @@ def ZipIterator(*iterators):
|
|
|
178
206
|
The code snippet below demonstrates the usage of a ``ZipIterator``
|
|
179
207
|
combining two device arrays:
|
|
180
208
|
|
|
181
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
209
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/iterator/zip_iterator_elementwise.py
|
|
182
210
|
:language: python
|
|
183
211
|
:start-after: # example-begin
|
|
184
212
|
|
|
@@ -207,7 +207,15 @@ def pointer_add_intrinsic(context, ptr, offset):
|
|
|
207
207
|
def codegen(context, builder, sig, args):
|
|
208
208
|
ptr, index = args
|
|
209
209
|
base = builder.ptrtoint(ptr, ir.IntType(_DEVICE_POINTER_BITWIDTH))
|
|
210
|
-
|
|
210
|
+
sizeof = sizeof_pointee(context, ptr)
|
|
211
|
+
# Cast index to match sizeof type if needed
|
|
212
|
+
if index.type != sizeof.type:
|
|
213
|
+
index = (
|
|
214
|
+
builder.sext(index, sizeof.type)
|
|
215
|
+
if index.type.width < sizeof.type.width
|
|
216
|
+
else builder.trunc(index, sizeof.type)
|
|
217
|
+
)
|
|
218
|
+
offset = builder.mul(index, sizeof)
|
|
211
219
|
result = builder.add(base, offset)
|
|
212
220
|
return builder.inttoptr(result, ptr.type)
|
|
213
221
|
|
|
@@ -610,3 +618,200 @@ def _get_last_element_ptr(device_array) -> int:
|
|
|
610
618
|
|
|
611
619
|
ptr = get_data_pointer(device_array)
|
|
612
620
|
return ptr + offset_to_last_element
|
|
621
|
+
|
|
622
|
+
|
|
623
|
+
class PermutationIteratorKind(IteratorKind):
|
|
624
|
+
pass
|
|
625
|
+
|
|
626
|
+
|
|
627
|
+
def make_permutation_iterator(values, indices):
|
|
628
|
+
"""
|
|
629
|
+
Create a PermutationIterator that accesses values through an index mapping.
|
|
630
|
+
|
|
631
|
+
The permutation iterator accesses elements from `values` using indices from `indices`,
|
|
632
|
+
effectively computing values[indices[i]] at position i.
|
|
633
|
+
|
|
634
|
+
Args:
|
|
635
|
+
values: The values array or iterator to permute
|
|
636
|
+
indices: The indices array or iterator specifying the permutation
|
|
637
|
+
|
|
638
|
+
Returns:
|
|
639
|
+
PermutationIterator: Iterator that yields permuted values
|
|
640
|
+
"""
|
|
641
|
+
# Convert arrays to iterators if needed
|
|
642
|
+
if hasattr(values, "__cuda_array_interface__"):
|
|
643
|
+
values = pointer(values, numba.from_dtype(get_dtype(values)))
|
|
644
|
+
elif not isinstance(values, IteratorBase):
|
|
645
|
+
raise TypeError("values must be a device array or iterator")
|
|
646
|
+
|
|
647
|
+
if hasattr(indices, "__cuda_array_interface__"):
|
|
648
|
+
indices = pointer(indices, numba.from_dtype(get_dtype(indices)))
|
|
649
|
+
elif not isinstance(indices, IteratorBase):
|
|
650
|
+
raise TypeError("indices must be an iterator or device array")
|
|
651
|
+
|
|
652
|
+
# JIT compile value advance/dereference methods
|
|
653
|
+
value_dtype = values.value_type
|
|
654
|
+
values_state_type = values.state_type
|
|
655
|
+
index_type = indices.value_type
|
|
656
|
+
value_advance = cuda.jit(values.advance, device=True)
|
|
657
|
+
value_input_dereference = cuda.jit(values.input_dereference, device=True)
|
|
658
|
+
|
|
659
|
+
try:
|
|
660
|
+
output_deref = values.output_dereference
|
|
661
|
+
if output_deref is not None:
|
|
662
|
+
value_output_dereference = cuda.jit(output_deref, device=True)
|
|
663
|
+
values_is_output_iterator = True
|
|
664
|
+
else:
|
|
665
|
+
values_is_output_iterator = False
|
|
666
|
+
except AttributeError:
|
|
667
|
+
values_is_output_iterator = False
|
|
668
|
+
|
|
669
|
+
# JIT compile index advance/dereference methods
|
|
670
|
+
index_advance = cuda.jit(indices.advance, device=True)
|
|
671
|
+
index_input_dereference = cuda.jit(indices.input_dereference, device=True)
|
|
672
|
+
|
|
673
|
+
# The cvalue and state for PermutationIterator are
|
|
674
|
+
# structs composed of the cvalues and states of the
|
|
675
|
+
# value and index iterators.
|
|
676
|
+
from ..struct import gpu_struct_from_numba_types
|
|
677
|
+
|
|
678
|
+
class PermutationCValueStruct(ctypes.Structure):
|
|
679
|
+
_fields_ = [
|
|
680
|
+
("value_state", values.cvalue.__class__),
|
|
681
|
+
("index_state", indices.cvalue.__class__),
|
|
682
|
+
]
|
|
683
|
+
|
|
684
|
+
PermutationState = gpu_struct_from_numba_types(
|
|
685
|
+
"PermutationState",
|
|
686
|
+
("value_state", "index_state"),
|
|
687
|
+
(values_state_type, indices.state_type),
|
|
688
|
+
)
|
|
689
|
+
|
|
690
|
+
cvalue = PermutationCValueStruct(values.cvalue, indices.cvalue)
|
|
691
|
+
state_type = PermutationState._numba_type
|
|
692
|
+
value_type = value_dtype
|
|
693
|
+
|
|
694
|
+
# Define intrinsics for accessing struct fields
|
|
695
|
+
@intrinsic
|
|
696
|
+
def get_value_state_field_ptr(context, struct_ptr_type):
|
|
697
|
+
def codegen(context, builder, sig, args):
|
|
698
|
+
struct_ptr = args[0]
|
|
699
|
+
# Use GEP to get pointer to field at index 0 (value_state)
|
|
700
|
+
field_ptr = builder.gep(
|
|
701
|
+
struct_ptr,
|
|
702
|
+
[ir.Constant(ir.IntType(32), 0), ir.Constant(ir.IntType(32), 0)],
|
|
703
|
+
)
|
|
704
|
+
return field_ptr
|
|
705
|
+
|
|
706
|
+
from numba.core.datamodel.registry import default_manager
|
|
707
|
+
|
|
708
|
+
struct_model = default_manager.lookup(struct_ptr_type.dtype)
|
|
709
|
+
field_type = struct_model._members[0]
|
|
710
|
+
return types.CPointer(field_type)(struct_ptr_type), codegen
|
|
711
|
+
|
|
712
|
+
@intrinsic
|
|
713
|
+
def get_index_state_field_ptr(context, struct_ptr_type):
|
|
714
|
+
def codegen(context, builder, sig, args):
|
|
715
|
+
struct_ptr = args[0]
|
|
716
|
+
# Use GEP to get pointer to field at index 1 (index_state)
|
|
717
|
+
field_ptr = builder.gep(
|
|
718
|
+
struct_ptr,
|
|
719
|
+
[ir.Constant(ir.IntType(32), 0), ir.Constant(ir.IntType(32), 1)],
|
|
720
|
+
)
|
|
721
|
+
return field_ptr
|
|
722
|
+
|
|
723
|
+
from numba.core.datamodel.registry import default_manager
|
|
724
|
+
|
|
725
|
+
struct_model = default_manager.lookup(struct_ptr_type.dtype)
|
|
726
|
+
field_type = struct_model._members[1]
|
|
727
|
+
return types.CPointer(field_type)(struct_ptr_type), codegen
|
|
728
|
+
|
|
729
|
+
# Create intrinsic for allocating temporary storage for index
|
|
730
|
+
@intrinsic
|
|
731
|
+
def alloca_temp_for_index_type(context):
|
|
732
|
+
def codegen(context, builder, sig, args):
|
|
733
|
+
temp_value_type = context.get_value_type(index_type)
|
|
734
|
+
temp_ptr = builder.alloca(temp_value_type)
|
|
735
|
+
return temp_ptr
|
|
736
|
+
|
|
737
|
+
return types.CPointer(index_type)(), codegen
|
|
738
|
+
|
|
739
|
+
# Create intrinsic for allocating temporary storage for value state
|
|
740
|
+
@intrinsic
|
|
741
|
+
def alloca_temp_for_value_state(context):
|
|
742
|
+
def codegen(context, builder, sig, args):
|
|
743
|
+
temp_state_type = context.get_value_type(values_state_type)
|
|
744
|
+
temp_ptr = builder.alloca(temp_state_type)
|
|
745
|
+
return temp_ptr
|
|
746
|
+
|
|
747
|
+
return types.CPointer(values_state_type)(), codegen
|
|
748
|
+
|
|
749
|
+
class PermutationIterator(IteratorBase):
|
|
750
|
+
iterator_kind_type = PermutationIteratorKind
|
|
751
|
+
|
|
752
|
+
def __init__(self, values_it, indices_it):
|
|
753
|
+
self._values = values_it
|
|
754
|
+
self._indices = indices_it
|
|
755
|
+
super().__init__(
|
|
756
|
+
cvalue=cvalue,
|
|
757
|
+
state_type=state_type,
|
|
758
|
+
value_type=value_type,
|
|
759
|
+
)
|
|
760
|
+
self._kind = self.__class__.iterator_kind_type(
|
|
761
|
+
(value_type, values_it.kind, indices_it.kind), state_type
|
|
762
|
+
)
|
|
763
|
+
|
|
764
|
+
@property
|
|
765
|
+
def advance(self):
|
|
766
|
+
return PermutationIterator._advance
|
|
767
|
+
|
|
768
|
+
@property
|
|
769
|
+
def input_dereference(self):
|
|
770
|
+
return PermutationIterator._input_dereference
|
|
771
|
+
|
|
772
|
+
@property
|
|
773
|
+
def output_dereference(self):
|
|
774
|
+
if not values_is_output_iterator:
|
|
775
|
+
raise AttributeError(
|
|
776
|
+
"PermutationIterator cannot be used as output iterator "
|
|
777
|
+
"when values iterator does not support output"
|
|
778
|
+
)
|
|
779
|
+
return PermutationIterator._output_dereference
|
|
780
|
+
|
|
781
|
+
@staticmethod
|
|
782
|
+
def _advance(state, distance):
|
|
783
|
+
# advance the index iterator
|
|
784
|
+
index_state_ptr = get_index_state_field_ptr(state)
|
|
785
|
+
index_advance(index_state_ptr, distance)
|
|
786
|
+
|
|
787
|
+
@staticmethod
|
|
788
|
+
def _input_dereference(state, result):
|
|
789
|
+
# dereference index to get the index value
|
|
790
|
+
index_state_ptr = get_index_state_field_ptr(state)
|
|
791
|
+
temp_index = alloca_temp_for_index_type()
|
|
792
|
+
index_input_dereference(index_state_ptr, temp_index)
|
|
793
|
+
|
|
794
|
+
# copy the value state (which always points to position 0)
|
|
795
|
+
# and advance it by the index value
|
|
796
|
+
value_state_ptr = get_value_state_field_ptr(state)
|
|
797
|
+
temp_value_state = alloca_temp_for_value_state()
|
|
798
|
+
temp_value_state[0] = value_state_ptr[0]
|
|
799
|
+
value_advance(temp_value_state, temp_index[0])
|
|
800
|
+
value_input_dereference(temp_value_state, result)
|
|
801
|
+
|
|
802
|
+
@staticmethod
|
|
803
|
+
def _output_dereference(state, x):
|
|
804
|
+
# dereference index to get the index value
|
|
805
|
+
index_state_ptr = get_index_state_field_ptr(state)
|
|
806
|
+
temp_index = alloca_temp_for_index_type()
|
|
807
|
+
index_input_dereference(index_state_ptr, temp_index)
|
|
808
|
+
|
|
809
|
+
# copy the value state (which always points to position 0)
|
|
810
|
+
# and advance it by the index value
|
|
811
|
+
value_state_ptr = get_value_state_field_ptr(state)
|
|
812
|
+
temp_value_state = alloca_temp_for_value_state()
|
|
813
|
+
temp_value_state[0] = value_state_ptr[0]
|
|
814
|
+
value_advance(temp_value_state, temp_index[0])
|
|
815
|
+
value_output_dereference(temp_value_state, x)
|
|
816
|
+
|
|
817
|
+
return PermutationIterator(values, indices)
|
|
@@ -39,10 +39,10 @@ def signature_from_annotations(func) -> numba.core.typing.Signature:
|
|
|
39
39
|
argspec = inspect.getfullargspec(func)
|
|
40
40
|
num_args = len(argspec.args)
|
|
41
41
|
try:
|
|
42
|
-
|
|
42
|
+
ret_ann = argspec.annotations["return"]
|
|
43
43
|
except KeyError:
|
|
44
44
|
raise ValueError("Function has incomplete annotations: missing return type")
|
|
45
|
-
|
|
45
|
+
retty = to_numba_type(ret_ann)
|
|
46
46
|
if num_args != len(argspec.annotations) - 1: # -1 for the return type
|
|
47
47
|
raise ValueError("One or more arguments are missing type annotations")
|
|
48
48
|
argtys = tuple(
|
|
@@ -207,7 +207,7 @@ def gpu_struct(this: type) -> Type[GpuStruct]:
|
|
|
207
207
|
to a dataclass). The type of each field must be a subclass of
|
|
208
208
|
`np.number`, like `np.int32` or `np.float64`.
|
|
209
209
|
|
|
210
|
-
Arrays of GPUStruct objects can be used as inputs to cuda.
|
|
210
|
+
Arrays of GPUStruct objects can be used as inputs to cuda.compute
|
|
211
211
|
algorithms.
|
|
212
212
|
|
|
213
213
|
Example:
|
|
@@ -216,7 +216,7 @@ def gpu_struct(this: type) -> Type[GpuStruct]:
|
|
|
216
216
|
a reduction on an input array of floating point values to compute its
|
|
217
217
|
the smallest and the largest absolute values:
|
|
218
218
|
|
|
219
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
219
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/compute/examples/reduction/minmax_reduction.py
|
|
220
220
|
:language: python
|
|
221
221
|
:start-after: # example-begin
|
|
222
222
|
|
|
@@ -7,9 +7,11 @@ from typing import Any
|
|
|
7
7
|
|
|
8
8
|
from typing_extensions import (
|
|
9
9
|
Protocol,
|
|
10
|
+
runtime_checkable,
|
|
10
11
|
) # TODO: typing_extensions required for Python 3.7 docs env
|
|
11
12
|
|
|
12
13
|
|
|
14
|
+
@runtime_checkable
|
|
13
15
|
class DeviceArrayLike(Protocol):
|
|
14
16
|
"""
|
|
15
17
|
Objects representing a device array, having a `.__cuda_array_interface__`
|
cuda/coop/__init__.py
ADDED
|
@@ -5,8 +5,9 @@
|
|
|
5
5
|
import functools
|
|
6
6
|
|
|
7
7
|
from cuda.bindings import nvrtc
|
|
8
|
-
|
|
9
|
-
from
|
|
8
|
+
|
|
9
|
+
from ._caching import disk_cache
|
|
10
|
+
from ._common import check_in, version
|
|
10
11
|
|
|
11
12
|
|
|
12
13
|
def CHECK_NVRTC(err, prog):
|
|
@@ -3,8 +3,8 @@
|
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
4
4
|
|
|
5
5
|
"""
|
|
6
|
-
cuda.
|
|
7
|
-
|
|
6
|
+
cuda.coop._scan_op
|
|
7
|
+
==================
|
|
8
8
|
|
|
9
9
|
This module implements the ``ScanOp`` class and related functions.
|
|
10
10
|
"""
|
|
@@ -14,7 +14,7 @@ from enum import Enum
|
|
|
14
14
|
|
|
15
15
|
import numpy as np
|
|
16
16
|
|
|
17
|
-
from
|
|
17
|
+
from ._typing import (
|
|
18
18
|
ScanOpType,
|
|
19
19
|
)
|
|
20
20
|
|
|
@@ -17,8 +17,8 @@ from numba.core.typing import signature
|
|
|
17
17
|
from numba.cuda import LTOIR
|
|
18
18
|
from numba.cuda.cudadrv import driver as cuda_driver
|
|
19
19
|
|
|
20
|
-
import
|
|
21
|
-
from
|
|
20
|
+
from . import _nvrtc as nvrtc
|
|
21
|
+
from ._common import find_unsigned
|
|
22
22
|
|
|
23
23
|
NUMBA_TYPES_TO_CPP = {
|
|
24
24
|
types.boolean: "bool",
|
|
@@ -9,7 +9,7 @@ if TYPE_CHECKING:
|
|
|
9
9
|
import numba
|
|
10
10
|
import numpy as np
|
|
11
11
|
|
|
12
|
-
from
|
|
12
|
+
from ._common import dim3
|
|
13
13
|
|
|
14
14
|
# Type alias for dimension parameters that can be passed to CUDA functions.
|
|
15
15
|
DimType = Union["dim3", int, Tuple[int, int], Tuple[int, int, int]]
|
|
@@ -2,18 +2,18 @@
|
|
|
2
2
|
#
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
4
4
|
|
|
5
|
-
from
|
|
5
|
+
from ._block_exchange import (
|
|
6
6
|
BlockExchangeType,
|
|
7
7
|
exchange,
|
|
8
8
|
)
|
|
9
|
-
from
|
|
10
|
-
from
|
|
11
|
-
from
|
|
9
|
+
from ._block_load_store import load, store
|
|
10
|
+
from ._block_merge_sort import merge_sort_keys
|
|
11
|
+
from ._block_radix_sort import (
|
|
12
12
|
radix_sort_keys,
|
|
13
13
|
radix_sort_keys_descending,
|
|
14
14
|
)
|
|
15
|
-
from
|
|
16
|
-
from
|
|
15
|
+
from ._block_reduce import reduce, sum
|
|
16
|
+
from ._block_scan import (
|
|
17
17
|
exclusive_scan,
|
|
18
18
|
exclusive_sum,
|
|
19
19
|
inclusive_scan,
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
4
4
|
|
|
5
5
|
"""
|
|
6
|
-
cuda.
|
|
6
|
+
cuda.coop.block_exchange
|
|
7
7
|
====================================
|
|
8
8
|
|
|
9
9
|
This module provides a set of :ref:`collective <collective-primitives>` methods
|
|
@@ -105,13 +105,13 @@ def exchange(
|
|
|
105
105
|
perform. Currently, only :py:attr:`StripedToBlocked` is supported.
|
|
106
106
|
|
|
107
107
|
:param dtype: Supplies the data type of the input and output arrays.
|
|
108
|
-
:type dtype: :py:class:`cuda.
|
|
108
|
+
:type dtype: :py:class:`cuda.coop._typing.DtypeType`
|
|
109
109
|
|
|
110
110
|
:param threads_per_block: Supplies the number of threads in the block,
|
|
111
111
|
either as an integer for a 1D block or a tuple of two or three integers
|
|
112
112
|
for a 2D or 3D block, respectively.
|
|
113
113
|
:type threads_per_block:
|
|
114
|
-
:py:class:`cuda.
|
|
114
|
+
:py:class:`cuda.coop._typing.DimType`
|
|
115
115
|
|
|
116
116
|
:param items_per_thread: Supplies the number of items partitioned onto each
|
|
117
117
|
thread.
|
|
@@ -137,7 +137,7 @@ def exchange(
|
|
|
137
137
|
:raises ValueError: If ``items_per_thread`` is greater than 1 and
|
|
138
138
|
``methods`` is not *None* (i.e. a user-defined type is being used).
|
|
139
139
|
|
|
140
|
-
:returns: An :py:class:`cuda.
|
|
140
|
+
:returns: An :py:class:`cuda.coop._types.Invocable`
|
|
141
141
|
object representing the specialized kernel that call be called from
|
|
142
142
|
a Numba JIT'd CUDA kernel.
|
|
143
143
|
|