cuda-cccl 0.3.0__cp311-cp311-manylinux_2_24_aarch64.whl → 0.3.2__cp311-cp311-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/cooperative/__init__.py +7 -1
- cuda/cccl/cooperative/experimental/__init__.py +21 -5
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
- cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
- cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
- cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
- cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +4 -2
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
- cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
- cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
- cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
- cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
- cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +170 -260
- cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +6 -7
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +12 -29
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +2 -7
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
- cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
- cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
- cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
- cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
- cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
- cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
- cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
- cuda/cccl/headers/include/cuda/__cccl_config +1 -0
- cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
- cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
- cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
- cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
- cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +247 -323
- cuda/cccl/headers/include/cuda/__device/attributes.h +174 -123
- cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
- cuda/cccl/headers/include/cuda/__device/device_ref.h +27 -49
- cuda/cccl/headers/include/cuda/__device/physical_device.h +100 -96
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
- cuda/cccl/headers/include/cuda/__event/event.h +27 -26
- cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
- cuda/cccl/headers/include/cuda/__event/timed_event.h +10 -7
- cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
- cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
- cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
- cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
- cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +21 -137
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
- cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
- cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
- cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
- cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
- cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
- cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
- cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +9 -7
- cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -16
- cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
- cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
- cuda/cccl/headers/include/cuda/algorithm +1 -1
- cuda/cccl/headers/include/cuda/cmath +1 -0
- cuda/cccl/headers/include/cuda/devices +13 -0
- cuda/cccl/headers/include/cuda/iterator +1 -0
- cuda/cccl/headers/include/cuda/memory +1 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
- cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
- cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
- cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
- cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
- cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
- cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
- cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
- cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
- cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
- cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
- cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
- cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
- cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
- cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
- cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
- cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
- cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
- cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
- cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
- cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
- cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
- cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
- cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
- cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
- cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
- cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
- cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
- cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
- cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
- cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
- cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
- cuda/cccl/headers/include/cuda/std/bitset +1 -1
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
- cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
- cuda/cccl/headers/include/cuda/std/numbers +5 -0
- cuda/cccl/headers/include/cuda/std/string_view +155 -13
- cuda/cccl/headers/include/cuda/std/version +1 -4
- cuda/cccl/headers/include/cuda/stream_ref +5 -0
- cuda/cccl/headers/include/cuda/utility +1 -0
- cuda/cccl/headers/include/nv/target +7 -2
- cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
- cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
- cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
- cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
- cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
- cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
- cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
- cuda/cccl/headers/include/thrust/device_delete.h +18 -3
- cuda/cccl/headers/include/thrust/device_free.h +16 -3
- cuda/cccl/headers/include/thrust/device_new.h +29 -8
- cuda/cccl/headers/include/thrust/host_vector.h +1 -1
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
- cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
- cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
- cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
- cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
- cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
- cuda/cccl/parallel/experimental/__init__.py +21 -74
- cuda/compute/__init__.py +79 -0
- cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +43 -1
- cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +157 -8
- cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
- cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +112 -40
- cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +36 -15
- cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
- cuda/compute/cu12/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/cu13/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +2 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +36 -8
- cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +206 -1
- cuda/{cccl/parallel/experimental → compute}/numba_utils.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/typing.py +2 -0
- cuda/coop/__init__.py +8 -0
- cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
- cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
- cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
- cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
- cuda/coop/warp/__init__.py +9 -0
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +275 -276
- cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
- cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
- cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
- cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
- cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
- cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
- cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
- cuda/cccl/parallel/experimental/.gitignore +0 -4
- cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-311-aarch64-linux-gnu.so +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -5,12 +5,12 @@
|
|
|
5
5
|
|
|
6
6
|
import numba
|
|
7
7
|
|
|
8
|
-
from
|
|
8
|
+
from .._common import (
|
|
9
9
|
make_binary_tempfile,
|
|
10
10
|
normalize_dim_param,
|
|
11
11
|
normalize_dtype_param,
|
|
12
12
|
)
|
|
13
|
-
from
|
|
13
|
+
from .._types import (
|
|
14
14
|
Algorithm,
|
|
15
15
|
Dependency,
|
|
16
16
|
DependentArray,
|
|
@@ -70,13 +70,13 @@ def load(dtype, threads_per_block, items_per_thread=1, algorithm="direct"):
|
|
|
70
70
|
The code snippet below illustrates a striped load and store of 128 integer items by 32 threads, with
|
|
71
71
|
each thread handling 4 integers.
|
|
72
72
|
|
|
73
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
73
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_load_store_api.py
|
|
74
74
|
:language: python
|
|
75
75
|
:dedent:
|
|
76
76
|
:start-after: example-begin imports
|
|
77
77
|
:end-before: example-end imports
|
|
78
78
|
|
|
79
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
79
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_load_store_api.py
|
|
80
80
|
:language: python
|
|
81
81
|
:dedent:
|
|
82
82
|
:start-after: example-begin load_store
|
|
@@ -158,13 +158,13 @@ def store(dtype, threads_per_block, items_per_thread=1, algorithm="direct"):
|
|
|
158
158
|
The code snippet below illustrates a striped load and store of 128 integer items by 32 threads, with
|
|
159
159
|
each thread handling 4 integers.
|
|
160
160
|
|
|
161
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
161
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_load_store_api.py
|
|
162
162
|
:language: python
|
|
163
163
|
:dedent:
|
|
164
164
|
:start-after: example-begin imports
|
|
165
165
|
:end-before: example-end imports
|
|
166
166
|
|
|
167
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
167
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_load_store_api.py
|
|
168
168
|
:language: python
|
|
169
169
|
:dedent:
|
|
170
170
|
:start-after: example-begin load_store
|
|
@@ -6,12 +6,12 @@ from typing import TYPE_CHECKING, Callable, Literal, Union
|
|
|
6
6
|
|
|
7
7
|
import numba
|
|
8
8
|
|
|
9
|
-
from
|
|
9
|
+
from .._common import (
|
|
10
10
|
make_binary_tempfile,
|
|
11
11
|
normalize_dim_param,
|
|
12
12
|
normalize_dtype_param,
|
|
13
13
|
)
|
|
14
|
-
from
|
|
14
|
+
from .._types import (
|
|
15
15
|
Algorithm,
|
|
16
16
|
Constant,
|
|
17
17
|
Dependency,
|
|
@@ -41,7 +41,7 @@ def merge_sort_keys(
|
|
|
41
41
|
are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
|
|
42
42
|
where each thread owns 4 consecutive keys. We start by importing necessary modules:
|
|
43
43
|
|
|
44
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
44
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_merge_sort_api.py
|
|
45
45
|
:language: python
|
|
46
46
|
:dedent:
|
|
47
47
|
:start-after: example-begin imports
|
|
@@ -49,7 +49,7 @@ def merge_sort_keys(
|
|
|
49
49
|
|
|
50
50
|
Below is the code snippet that demonstrates the usage of the ``merge_sort_keys`` API:
|
|
51
51
|
|
|
52
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
52
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_merge_sort_api.py
|
|
53
53
|
:language: python
|
|
54
54
|
:dedent:
|
|
55
55
|
:start-after: example-begin merge-sort
|
|
@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Tuple, Union
|
|
|
6
6
|
|
|
7
7
|
import numba
|
|
8
8
|
|
|
9
|
-
from
|
|
9
|
+
from .._common import (
|
|
10
10
|
CUB_BLOCK_SCAN_ALGOS,
|
|
11
11
|
CudaSharedMemConfig,
|
|
12
12
|
dim3,
|
|
@@ -14,7 +14,7 @@ from cuda.cccl.cooperative.experimental._common import (
|
|
|
14
14
|
normalize_dim_param,
|
|
15
15
|
normalize_dtype_param,
|
|
16
16
|
)
|
|
17
|
-
from
|
|
17
|
+
from .._types import (
|
|
18
18
|
Algorithm,
|
|
19
19
|
Dependency,
|
|
20
20
|
DependentArray,
|
|
@@ -140,7 +140,7 @@ def radix_sort_keys(dtype, threads_per_block, items_per_thread):
|
|
|
140
140
|
are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
|
|
141
141
|
where each thread owns 4 consecutive keys. We start by importing necessary modules:
|
|
142
142
|
|
|
143
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
143
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_radix_sort_api.py
|
|
144
144
|
:language: python
|
|
145
145
|
:dedent:
|
|
146
146
|
:start-after: example-begin imports
|
|
@@ -148,7 +148,7 @@ def radix_sort_keys(dtype, threads_per_block, items_per_thread):
|
|
|
148
148
|
|
|
149
149
|
Below is the code snippet that demonstrates the usage of the ``radix_sort_keys`` API:
|
|
150
150
|
|
|
151
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
151
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_radix_sort_api.py
|
|
152
152
|
:language: python
|
|
153
153
|
:dedent:
|
|
154
154
|
:start-after: example-begin radix-sort
|
|
@@ -181,7 +181,7 @@ def radix_sort_keys_descending(dtype, threads_per_block, items_per_thread):
|
|
|
181
181
|
are partitioned in a :ref:`blocked arrangement <flexible-data-arrangement>` across 128 threads
|
|
182
182
|
where each thread owns 4 consecutive keys. We start by importing necessary modules:
|
|
183
183
|
|
|
184
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
184
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_radix_sort_api.py
|
|
185
185
|
:language: python
|
|
186
186
|
:dedent:
|
|
187
187
|
:start-after: example-begin imports
|
|
@@ -189,7 +189,7 @@ def radix_sort_keys_descending(dtype, threads_per_block, items_per_thread):
|
|
|
189
189
|
|
|
190
190
|
Below is the code snippet that demonstrates the usage of the ``radix_sort_keys`` API:
|
|
191
191
|
|
|
192
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
192
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_radix_sort_api.py
|
|
193
193
|
:language: python
|
|
194
194
|
:dedent:
|
|
195
195
|
:start-after: example-begin radix-sort-descending
|
|
@@ -6,13 +6,13 @@ from typing import TYPE_CHECKING, Callable, Literal, Tuple, Union
|
|
|
6
6
|
|
|
7
7
|
import numba
|
|
8
8
|
|
|
9
|
-
from
|
|
9
|
+
from .._common import (
|
|
10
10
|
CUB_BLOCK_REDUCE_ALGOS,
|
|
11
11
|
make_binary_tempfile,
|
|
12
12
|
normalize_dim_param,
|
|
13
13
|
normalize_dtype_param,
|
|
14
14
|
)
|
|
15
|
-
from
|
|
15
|
+
from .._types import (
|
|
16
16
|
Algorithm,
|
|
17
17
|
Dependency,
|
|
18
18
|
DependentArray,
|
|
@@ -208,13 +208,13 @@ def reduce(
|
|
|
208
208
|
The code snippet below illustrates a max reduction of 128 integer items that are
|
|
209
209
|
partitioned across 128 threads.
|
|
210
210
|
|
|
211
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
211
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_reduce_api.py
|
|
212
212
|
:language: python
|
|
213
213
|
:dedent:
|
|
214
214
|
:start-after: example-begin imports
|
|
215
215
|
:end-before: example-end imports
|
|
216
216
|
|
|
217
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
217
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_reduce_api.py
|
|
218
218
|
:language: python
|
|
219
219
|
:dedent:
|
|
220
220
|
:start-after: example-begin reduce
|
|
@@ -269,13 +269,13 @@ def sum(
|
|
|
269
269
|
The code snippet below illustrates a sum of 128 integer items that are partitioned
|
|
270
270
|
across 128 threads.
|
|
271
271
|
|
|
272
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
272
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_reduce_api.py
|
|
273
273
|
:language: python
|
|
274
274
|
:dedent:
|
|
275
275
|
:start-after: example-begin imports
|
|
276
276
|
:end-before: example-end imports
|
|
277
277
|
|
|
278
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
278
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_reduce_api.py
|
|
279
279
|
:language: python
|
|
280
280
|
:dedent:
|
|
281
281
|
:start-after: example-begin sum
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
4
4
|
|
|
5
5
|
"""
|
|
6
|
-
cuda.
|
|
6
|
+
cuda.coop.block_scan
|
|
7
7
|
===========================
|
|
8
8
|
|
|
9
9
|
This module provides a set of :ref:`collective <collective-primitives>`
|
|
@@ -73,16 +73,16 @@ from typing import Any, Callable, Literal
|
|
|
73
73
|
|
|
74
74
|
import numba
|
|
75
75
|
|
|
76
|
-
from
|
|
76
|
+
from .._common import (
|
|
77
77
|
CUB_BLOCK_SCAN_ALGOS,
|
|
78
78
|
make_binary_tempfile,
|
|
79
79
|
normalize_dim_param,
|
|
80
80
|
normalize_dtype_param,
|
|
81
81
|
)
|
|
82
|
-
from
|
|
82
|
+
from .._scan_op import (
|
|
83
83
|
ScanOp,
|
|
84
84
|
)
|
|
85
|
-
from
|
|
85
|
+
from .._types import (
|
|
86
86
|
Algorithm,
|
|
87
87
|
Dependency,
|
|
88
88
|
DependentArray,
|
|
@@ -94,7 +94,7 @@ from cuda.cccl.cooperative.experimental._types import (
|
|
|
94
94
|
TemplateParameter,
|
|
95
95
|
numba_type_to_wrapper,
|
|
96
96
|
)
|
|
97
|
-
from
|
|
97
|
+
from .._typing import (
|
|
98
98
|
DimType,
|
|
99
99
|
DtypeType,
|
|
100
100
|
ScanOpType,
|
|
@@ -669,7 +669,7 @@ def exclusive_sum(
|
|
|
669
669
|
:ref:`blocked arrangement <flexible-data-arrangement>` across 128
|
|
670
670
|
threads where each thread owns 4 consecutive items.
|
|
671
671
|
|
|
672
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
672
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_scan_api.py
|
|
673
673
|
:language: python
|
|
674
674
|
:dedent:
|
|
675
675
|
:start-after: example-begin imports
|
|
@@ -678,7 +678,7 @@ def exclusive_sum(
|
|
|
678
678
|
Below is the code snippet that demonstrates the usage of the
|
|
679
679
|
``exclusive_sum`` API:
|
|
680
680
|
|
|
681
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
681
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_block_scan_api.py
|
|
682
682
|
:language: python
|
|
683
683
|
:dedent:
|
|
684
684
|
:start-after: example-begin exclusive-sum
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
|
|
2
|
+
#
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
4
|
+
|
|
5
|
+
from ._warp_merge_sort import merge_sort_keys
|
|
6
|
+
from ._warp_reduce import reduce, sum
|
|
7
|
+
from ._warp_scan import exclusive_sum
|
|
8
|
+
|
|
9
|
+
__all__ = ["exclusive_sum", "reduce", "sum", "merge_sort_keys"]
|
|
@@ -4,8 +4,8 @@
|
|
|
4
4
|
|
|
5
5
|
import numba
|
|
6
6
|
|
|
7
|
-
from
|
|
8
|
-
from
|
|
7
|
+
from .._common import make_binary_tempfile
|
|
8
|
+
from .._types import (
|
|
9
9
|
Algorithm,
|
|
10
10
|
Constant,
|
|
11
11
|
Dependency,
|
|
@@ -30,7 +30,7 @@ def merge_sort_keys(
|
|
|
30
30
|
|
|
31
31
|
Below is the code snippet that demonstrates the usage of the ``merge_sort_keys`` API:
|
|
32
32
|
|
|
33
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
33
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_warp_merge_sort_api.py
|
|
34
34
|
:language: python
|
|
35
35
|
:dedent:
|
|
36
36
|
:start-after: example-begin merge-sort
|
|
@@ -4,8 +4,8 @@
|
|
|
4
4
|
|
|
5
5
|
import numba
|
|
6
6
|
|
|
7
|
-
from
|
|
8
|
-
from
|
|
7
|
+
from .._common import make_binary_tempfile
|
|
8
|
+
from .._types import (
|
|
9
9
|
Algorithm,
|
|
10
10
|
Dependency,
|
|
11
11
|
DependentPythonOperator,
|
|
@@ -28,7 +28,7 @@ def reduce(dtype, binary_op, threads_in_warp=32, methods=None):
|
|
|
28
28
|
The code snippet below illustrates a max reduction of 32 integer items that
|
|
29
29
|
are partitioned across a warp of threads.
|
|
30
30
|
|
|
31
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
31
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_warp_reduce_api.py
|
|
32
32
|
:language: python
|
|
33
33
|
:dedent:
|
|
34
34
|
:start-after: example-begin imports
|
|
@@ -36,7 +36,7 @@ def reduce(dtype, binary_op, threads_in_warp=32, methods=None):
|
|
|
36
36
|
|
|
37
37
|
Below is the code snippet that demonstrates the usage of the ``reduce`` API:
|
|
38
38
|
|
|
39
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
39
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_warp_reduce_api.py
|
|
40
40
|
:language: python
|
|
41
41
|
:dedent:
|
|
42
42
|
:start-after: example-begin reduce
|
|
@@ -100,7 +100,7 @@ def sum(dtype, threads_in_warp=32):
|
|
|
100
100
|
The code snippet below illustrates a reduction of 32 integer items that
|
|
101
101
|
are partitioned across a warp of threads.
|
|
102
102
|
|
|
103
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
103
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_warp_reduce_api.py
|
|
104
104
|
:language: python
|
|
105
105
|
:dedent:
|
|
106
106
|
:start-after: example-begin imports
|
|
@@ -108,7 +108,7 @@ def sum(dtype, threads_in_warp=32):
|
|
|
108
108
|
|
|
109
109
|
Below is the code snippet that demonstrates the usage of the ``reduce`` API:
|
|
110
110
|
|
|
111
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
111
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_warp_reduce_api.py
|
|
112
112
|
:language: python
|
|
113
113
|
:dedent:
|
|
114
114
|
:start-after: example-begin sum
|
|
@@ -5,8 +5,8 @@
|
|
|
5
5
|
|
|
6
6
|
import numba
|
|
7
7
|
|
|
8
|
-
from
|
|
9
|
-
from
|
|
8
|
+
from .._common import make_binary_tempfile
|
|
9
|
+
from .._types import (
|
|
10
10
|
Algorithm,
|
|
11
11
|
Dependency,
|
|
12
12
|
DependentReference,
|
|
@@ -23,7 +23,7 @@ def exclusive_sum(dtype, threads_in_warp=32):
|
|
|
23
23
|
Example:
|
|
24
24
|
The code snippet below illustrates an exclusive prefix sum of 32 integer items:
|
|
25
25
|
|
|
26
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
26
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_warp_scan_api.py
|
|
27
27
|
:language: python
|
|
28
28
|
:dedent:
|
|
29
29
|
:start-after: example-begin imports
|
|
@@ -31,7 +31,7 @@ def exclusive_sum(dtype, threads_in_warp=32):
|
|
|
31
31
|
|
|
32
32
|
Below is the code snippet that demonstrates the usage of the ``exclusive_sum`` API:
|
|
33
33
|
|
|
34
|
-
.. literalinclude:: ../../python/cuda_cccl/tests/
|
|
34
|
+
.. literalinclude:: ../../python/cuda_cccl/tests/coop/test_warp_scan_api.py
|
|
35
35
|
:language: python
|
|
36
36
|
:dedent:
|
|
37
37
|
:start-after: example-begin exclusive-sum
|