cuda-cccl 0.3.0__cp313-cp313-manylinux_2_24_aarch64.whl → 0.3.2__cp313-cp313-manylinux_2_24_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cuda-cccl might be problematic. Click here for more details.
- cuda/cccl/cooperative/__init__.py +7 -1
- cuda/cccl/cooperative/experimental/__init__.py +21 -5
- cuda/cccl/headers/include/cub/agent/agent_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_for.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_histogram.cuh +354 -572
- cuda/cccl/headers/include/cub/agent/agent_merge.cuh +23 -21
- cuda/cccl/headers/include/cub/agent/agent_merge_sort.cuh +21 -3
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_downsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_onesweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_radix_sort_upsweep.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_rle.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_scan.cuh +5 -1
- cuda/cccl/headers/include/cub/agent/agent_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_segmented_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_select_if.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_sub_warp_merge_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/agent/agent_unique_by_key.cuh +22 -5
- cuda/cccl/headers/include/cub/block/block_adjacent_difference.cuh +6 -8
- cuda/cccl/headers/include/cub/block/block_discontinuity.cuh +24 -14
- cuda/cccl/headers/include/cub/block/block_exchange.cuh +5 -0
- cuda/cccl/headers/include/cub/block/block_histogram.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_load.cuh +4 -0
- cuda/cccl/headers/include/cub/block/block_radix_rank.cuh +4 -2
- cuda/cccl/headers/include/cub/block/block_radix_sort.cuh +4 -2
- cuda/cccl/headers/include/cub/block/block_reduce.cuh +1 -0
- cuda/cccl/headers/include/cub/block/block_scan.cuh +12 -2
- cuda/cccl/headers/include/cub/block/block_store.cuh +3 -2
- cuda/cccl/headers/include/cub/detail/device_memory_resource.cuh +1 -0
- cuda/cccl/headers/include/cub/detail/mdspan_utils.cuh +34 -30
- cuda/cccl/headers/include/cub/detail/ptx-json-parser.h +1 -1
- cuda/cccl/headers/include/cub/device/device_for.cuh +118 -40
- cuda/cccl/headers/include/cub/device/device_reduce.cuh +6 -7
- cuda/cccl/headers/include/cub/device/device_segmented_reduce.cuh +170 -260
- cuda/cccl/headers/include/cub/device/device_transform.cuh +122 -91
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh +6 -7
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_radix_sort.cuh +2 -11
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh +12 -29
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh +2 -7
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_segmented_sort.cuh +2 -3
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh +4 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh +0 -1
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh +3 -5
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh +13 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/for_each.cuh +72 -37
- cuda/cccl/headers/include/cub/device/dispatch/kernels/reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/segmented_reduce.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/kernels/transform.cuh +22 -27
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_adjacent_difference.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_batch_memcpy.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_for.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_histogram.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_merge_sort.cuh +8 -0
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_radix_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_run_length_encode.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_segmented_sort.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_three_way_partition.cuh +2 -5
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_transform.cuh +61 -70
- cuda/cccl/headers/include/cub/device/dispatch/tuning/tuning_unique_by_key.cuh +10 -0
- cuda/cccl/headers/include/cub/thread/thread_reduce.cuh +24 -17
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_shfl.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_reduce_smem.cuh +3 -2
- cuda/cccl/headers/include/cub/warp/specializations/warp_scan_shfl.cuh +2 -2
- cuda/cccl/headers/include/cub/warp/warp_load.cuh +6 -6
- cuda/cccl/headers/include/cub/warp/warp_reduce.cuh +7 -2
- cuda/cccl/headers/include/cub/warp/warp_scan.cuh +7 -3
- cuda/cccl/headers/include/cub/warp/warp_store.cuh +1 -0
- cuda/cccl/headers/include/cuda/__algorithm/common.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/copy.h +1 -1
- cuda/cccl/headers/include/cuda/__algorithm/fill.h +1 -1
- cuda/cccl/headers/include/cuda/__barrier/barrier_block_scope.h +19 -0
- cuda/cccl/headers/include/cuda/__cccl_config +1 -0
- cuda/cccl/headers/include/cuda/__cmath/fast_modulo_division.h +3 -74
- cuda/cccl/headers/include/cuda/__cmath/mul_hi.h +146 -0
- cuda/cccl/headers/include/cuda/__complex/get_real_imag.h +0 -4
- cuda/cccl/headers/include/cuda/__device/all_devices.h +46 -143
- cuda/cccl/headers/include/cuda/__device/arch_id.h +176 -0
- cuda/cccl/headers/include/cuda/__device/arch_traits.h +247 -323
- cuda/cccl/headers/include/cuda/__device/attributes.h +174 -123
- cuda/cccl/headers/include/cuda/__device/compute_capability.h +171 -0
- cuda/cccl/headers/include/cuda/__device/device_ref.h +27 -49
- cuda/cccl/headers/include/cuda/__device/physical_device.h +100 -96
- cuda/cccl/headers/include/cuda/__driver/driver_api.h +105 -3
- cuda/cccl/headers/include/cuda/__event/event.h +27 -26
- cuda/cccl/headers/include/cuda/__event/event_ref.h +5 -5
- cuda/cccl/headers/include/cuda/__event/timed_event.h +10 -7
- cuda/cccl/headers/include/cuda/__fwd/devices.h +44 -0
- cuda/cccl/headers/include/cuda/__fwd/zip_iterator.h +9 -0
- cuda/cccl/headers/include/cuda/__iterator/constant_iterator.h +46 -31
- cuda/cccl/headers/include/cuda/__iterator/strided_iterator.h +79 -47
- cuda/cccl/headers/include/cuda/__iterator/tabulate_output_iterator.h +59 -36
- cuda/cccl/headers/include/cuda/__iterator/transform_input_output_iterator.h +79 -49
- cuda/cccl/headers/include/cuda/__iterator/transform_iterator.h +74 -48
- cuda/cccl/headers/include/cuda/__iterator/transform_output_iterator.h +80 -55
- cuda/cccl/headers/include/cuda/__iterator/zip_common.h +148 -0
- cuda/cccl/headers/include/cuda/__iterator/zip_iterator.h +21 -137
- cuda/cccl/headers/include/cuda/__iterator/zip_transform_iterator.h +592 -0
- cuda/cccl/headers/include/cuda/__mdspan/host_device_accessor.h +127 -60
- cuda/cccl/headers/include/cuda/__mdspan/host_device_mdspan.h +178 -3
- cuda/cccl/headers/include/cuda/__mdspan/restrict_accessor.h +38 -8
- cuda/cccl/headers/include/cuda/__mdspan/restrict_mdspan.h +67 -1
- cuda/cccl/headers/include/cuda/__memory/ptr_in_range.h +93 -0
- cuda/cccl/headers/include/cuda/__memory_resource/get_memory_resource.h +4 -4
- cuda/cccl/headers/include/cuda/__memory_resource/properties.h +44 -0
- cuda/cccl/headers/include/cuda/__memory_resource/resource.h +1 -1
- cuda/cccl/headers/include/cuda/__memory_resource/resource_ref.h +4 -6
- cuda/cccl/headers/include/cuda/__nvtx/nvtx3.h +2 -1
- cuda/cccl/headers/include/cuda/__runtime/ensure_current_context.h +9 -7
- cuda/cccl/headers/include/cuda/__stream/stream.h +8 -8
- cuda/cccl/headers/include/cuda/__stream/stream_ref.h +18 -16
- cuda/cccl/headers/include/cuda/__utility/basic_any.h +1 -1
- cuda/cccl/headers/include/cuda/__utility/in_range.h +65 -0
- cuda/cccl/headers/include/cuda/algorithm +1 -1
- cuda/cccl/headers/include/cuda/cmath +1 -0
- cuda/cccl/headers/include/cuda/devices +13 -0
- cuda/cccl/headers/include/cuda/iterator +1 -0
- cuda/cccl/headers/include/cuda/memory +1 -0
- cuda/cccl/headers/include/cuda/std/__algorithm/equal_range.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/find.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/includes.h +2 -4
- cuda/cccl/headers/include/cuda/std/__algorithm/lower_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/make_projected.h +7 -15
- cuda/cccl/headers/include/cuda/std/__algorithm/min_element.h +1 -1
- cuda/cccl/headers/include/cuda/std/__algorithm/minmax_element.h +1 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/partial_sort_copy.h +2 -2
- cuda/cccl/headers/include/cuda/std/__algorithm/upper_bound.h +1 -1
- cuda/cccl/headers/include/cuda/std/__bit/countl.h +8 -1
- cuda/cccl/headers/include/cuda/std/__bit/countr.h +2 -2
- cuda/cccl/headers/include/cuda/std/__bit/reference.h +11 -11
- cuda/cccl/headers/include/cuda/std/__cccl/algorithm_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/builtin.h +46 -49
- cuda/cccl/headers/include/cuda/std/__cccl/execution_space.h +6 -0
- cuda/cccl/headers/include/cuda/std/__cccl/host_std_lib.h +52 -0
- cuda/cccl/headers/include/cuda/std/__cccl/memory_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__cccl/numeric_wrapper.h +36 -0
- cuda/cccl/headers/include/cuda/std/__chrono/duration.h +16 -16
- cuda/cccl/headers/include/cuda/std/__chrono/steady_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__chrono/system_clock.h +5 -5
- cuda/cccl/headers/include/cuda/std/__cmath/isnan.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/complex.h +3 -2
- cuda/cccl/headers/include/cuda/std/__complex/literals.h +14 -34
- cuda/cccl/headers/include/cuda/std/__complex/nvbf16.h +2 -1
- cuda/cccl/headers/include/cuda/std/__complex/nvfp16.h +4 -3
- cuda/cccl/headers/include/cuda/std/__concepts/invocable.h +2 -2
- cuda/cccl/headers/include/cuda/std/__cstdlib/malloc.h +3 -2
- cuda/cccl/headers/include/cuda/std/__floating_point/fp.h +1 -1
- cuda/cccl/headers/include/cuda/std/__functional/bind.h +10 -13
- cuda/cccl/headers/include/cuda/std/__functional/function.h +5 -8
- cuda/cccl/headers/include/cuda/std/__functional/invoke.h +71 -335
- cuda/cccl/headers/include/cuda/std/__functional/mem_fn.h +1 -2
- cuda/cccl/headers/include/cuda/std/__functional/reference_wrapper.h +3 -3
- cuda/cccl/headers/include/cuda/std/__functional/weak_result_type.h +0 -6
- cuda/cccl/headers/include/cuda/std/__fwd/allocator.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/char_traits.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/complex.h +13 -4
- cuda/cccl/headers/include/cuda/std/__fwd/mdspan.h +23 -0
- cuda/cccl/headers/include/cuda/std/__fwd/pair.h +13 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string.h +22 -0
- cuda/cccl/headers/include/cuda/std/__fwd/string_view.h +14 -0
- cuda/cccl/headers/include/cuda/std/__internal/features.h +0 -5
- cuda/cccl/headers/include/cuda/std/__internal/namespaces.h +21 -0
- cuda/cccl/headers/include/cuda/std/__iterator/iterator_traits.h +5 -5
- cuda/cccl/headers/include/cuda/std/__mdspan/extents.h +7 -1
- cuda/cccl/headers/include/cuda/std/__mdspan/mdspan.h +53 -39
- cuda/cccl/headers/include/cuda/std/__memory/allocator.h +3 -3
- cuda/cccl/headers/include/cuda/std/__memory/construct_at.h +1 -3
- cuda/cccl/headers/include/cuda/std/__optional/optional_base.h +1 -0
- cuda/cccl/headers/include/cuda/std/__ranges/compressed_movable_box.h +892 -0
- cuda/cccl/headers/include/cuda/std/__ranges/movable_box.h +2 -2
- cuda/cccl/headers/include/cuda/std/__tuple_dir/make_tuple_types.h +23 -1
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like.h +4 -0
- cuda/cccl/headers/include/cuda/std/__tuple_dir/tuple_like_ext.h +4 -0
- cuda/cccl/headers/include/cuda/std/__type_traits/is_primary_template.h +7 -5
- cuda/cccl/headers/include/cuda/std/__type_traits/result_of.h +1 -1
- cuda/cccl/headers/include/cuda/std/__utility/pair.h +0 -5
- cuda/cccl/headers/include/cuda/std/bitset +1 -1
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/__config +15 -12
- cuda/cccl/headers/include/cuda/std/detail/libcxx/include/variant +11 -9
- cuda/cccl/headers/include/cuda/std/inplace_vector +4 -4
- cuda/cccl/headers/include/cuda/std/numbers +5 -0
- cuda/cccl/headers/include/cuda/std/string_view +155 -13
- cuda/cccl/headers/include/cuda/std/version +1 -4
- cuda/cccl/headers/include/cuda/stream_ref +5 -0
- cuda/cccl/headers/include/cuda/utility +1 -0
- cuda/cccl/headers/include/nv/target +7 -2
- cuda/cccl/headers/include/thrust/allocate_unique.h +1 -1
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.h +309 -33
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.h +151 -4
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.h +60 -3
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.h +45 -3
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.h +31 -6
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.h +29 -16
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.h +41 -4
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.h +42 -4
- cuda/cccl/headers/include/thrust/detail/complex/ccosh.h +3 -3
- cuda/cccl/headers/include/thrust/detail/integer_math.h +3 -20
- cuda/cccl/headers/include/thrust/detail/internal_functional.h +1 -1
- cuda/cccl/headers/include/thrust/detail/memory_algorithms.h +1 -1
- cuda/cccl/headers/include/thrust/detail/temporary_array.h +1 -1
- cuda/cccl/headers/include/thrust/detail/type_traits.h +1 -1
- cuda/cccl/headers/include/thrust/device_delete.h +18 -3
- cuda/cccl/headers/include/thrust/device_free.h +16 -3
- cuda/cccl/headers/include/thrust/device_new.h +29 -8
- cuda/cccl/headers/include/thrust/host_vector.h +1 -1
- cuda/cccl/headers/include/thrust/iterator/iterator_traits.h +11 -0
- cuda/cccl/headers/include/thrust/iterator/tabulate_output_iterator.h +5 -2
- cuda/cccl/headers/include/thrust/mr/disjoint_pool.h +1 -1
- cuda/cccl/headers/include/thrust/mr/pool.h +1 -1
- cuda/cccl/headers/include/thrust/system/cuda/detail/copy.h +33 -0
- cuda/cccl/headers/include/thrust/system/cuda/detail/find.h +13 -115
- cuda/cccl/headers/include/thrust/system/cuda/detail/mismatch.h +8 -2
- cuda/cccl/headers/include/thrust/type_traits/is_contiguous_iterator.h +7 -7
- cuda/cccl/parallel/experimental/__init__.py +21 -74
- cuda/compute/__init__.py +79 -0
- cuda/{cccl/parallel/experimental → compute}/_bindings.pyi +43 -1
- cuda/{cccl/parallel/experimental → compute}/_bindings_impl.pyx +157 -8
- cuda/{cccl/parallel/experimental → compute}/algorithms/_histogram.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_merge_sort.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_radix_sort.py +3 -3
- cuda/{cccl/parallel/experimental → compute}/algorithms/_reduce.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_scan.py +112 -40
- cuda/{cccl/parallel/experimental → compute}/algorithms/_segmented_reduce.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_three_way_partition.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/algorithms/_transform.py +36 -15
- cuda/{cccl/parallel/experimental → compute}/algorithms/_unique_by_key.py +2 -2
- cuda/compute/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu12/cccl/libcccl.c.parallel.so +0 -0
- cuda/compute/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/cu13/cccl/libcccl.c.parallel.so +0 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/__init__.py +2 -0
- cuda/{cccl/parallel/experimental → compute}/iterators/_factories.py +36 -8
- cuda/{cccl/parallel/experimental → compute}/iterators/_iterators.py +206 -1
- cuda/{cccl/parallel/experimental → compute}/numba_utils.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/struct.py +2 -2
- cuda/{cccl/parallel/experimental → compute}/typing.py +2 -0
- cuda/coop/__init__.py +8 -0
- cuda/{cccl/cooperative/experimental → coop}/_nvrtc.py +3 -2
- cuda/{cccl/cooperative/experimental → coop}/_scan_op.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/_types.py +2 -2
- cuda/{cccl/cooperative/experimental → coop}/_typing.py +1 -1
- cuda/{cccl/cooperative/experimental → coop}/block/__init__.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_exchange.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_load_store.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_merge_sort.py +4 -4
- cuda/{cccl/cooperative/experimental → coop}/block/_block_radix_sort.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/block/_block_scan.py +7 -7
- cuda/coop/warp/__init__.py +9 -0
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_merge_sort.py +3 -3
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_reduce.py +6 -6
- cuda/{cccl/cooperative/experimental → coop}/warp/_warp_scan.py +4 -4
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/METADATA +1 -1
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/RECORD +275 -276
- cuda/cccl/cooperative/experimental/warp/__init__.py +0 -9
- cuda/cccl/headers/include/cub/device/dispatch/dispatch_advance_iterators.cuh +0 -111
- cuda/cccl/headers/include/thrust/detail/algorithm_wrapper.h +0 -37
- cuda/cccl/headers/include/thrust/detail/allocator/allocator_traits.inl +0 -371
- cuda/cccl/headers/include/thrust/detail/allocator/copy_construct_range.inl +0 -242
- cuda/cccl/headers/include/thrust/detail/allocator/destroy_range.inl +0 -137
- cuda/cccl/headers/include/thrust/detail/allocator/fill_construct_range.inl +0 -99
- cuda/cccl/headers/include/thrust/detail/allocator/malloc_allocator.inl +0 -68
- cuda/cccl/headers/include/thrust/detail/allocator/tagged_allocator.inl +0 -86
- cuda/cccl/headers/include/thrust/detail/allocator/temporary_allocator.inl +0 -79
- cuda/cccl/headers/include/thrust/detail/allocator/value_initialize_range.inl +0 -98
- cuda/cccl/headers/include/thrust/detail/device_delete.inl +0 -52
- cuda/cccl/headers/include/thrust/detail/device_free.inl +0 -47
- cuda/cccl/headers/include/thrust/detail/device_new.inl +0 -61
- cuda/cccl/headers/include/thrust/detail/memory_wrapper.h +0 -40
- cuda/cccl/headers/include/thrust/detail/numeric_wrapper.h +0 -37
- cuda/cccl/parallel/experimental/.gitignore +0 -4
- cuda/cccl/parallel/experimental/cu12/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- cuda/cccl/parallel/experimental/cu13/_bindings_impl.cpython-313-aarch64-linux-gnu.so +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_bindings.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_caching.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_cccl_interop.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/protocols.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/_utils/temp_storage_buffer.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/algorithms/__init__.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/cccl/.gitkeep +0 -0
- /cuda/{cccl/parallel/experimental → compute}/iterators/_zip_iterator.py +0 -0
- /cuda/{cccl/parallel/experimental → compute}/op.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_caching.py +0 -0
- /cuda/{cccl/cooperative/experimental → coop}/_common.py +0 -0
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/WHEEL +0 -0
- {cuda_cccl-0.3.0.dist-info → cuda_cccl-0.3.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -216,11 +216,10 @@ _CCCL_HOST_API inline void __deviceGetName(char* __name_out, int __len, int __or
|
|
|
216
216
|
return __result;
|
|
217
217
|
}
|
|
218
218
|
|
|
219
|
-
_CCCL_HOST_API inline
|
|
219
|
+
[[nodiscard]] _CCCL_HOST_API inline ::cudaError_t __primaryCtxReleaseNoThrow(::CUdevice __dev)
|
|
220
220
|
{
|
|
221
221
|
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuDevicePrimaryCtxRelease);
|
|
222
|
-
|
|
223
|
-
::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to release context for a device", __dev);
|
|
222
|
+
return static_cast<::cudaError_t>(__driver_fn(__dev));
|
|
224
223
|
}
|
|
225
224
|
|
|
226
225
|
[[nodiscard]] _CCCL_HOST_API inline bool __isPrimaryCtxActive(::CUdevice __dev)
|
|
@@ -325,6 +324,109 @@ _CCCL_HOST_API void __memsetAsync(void* __dst, _Tp __value, size_t __count, ::CU
|
|
|
325
324
|
}
|
|
326
325
|
}
|
|
327
326
|
|
|
327
|
+
_CCCL_HOST_API inline ::cudaError_t __mempoolCreateNoThrow(::CUmemoryPool* __pool, ::CUmemPoolProps* __props)
|
|
328
|
+
{
|
|
329
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemPoolCreate);
|
|
330
|
+
return static_cast<::cudaError_t>(__driver_fn(__pool, __props));
|
|
331
|
+
}
|
|
332
|
+
|
|
333
|
+
_CCCL_HOST_API inline void __mempoolSetAttribute(::CUmemoryPool __pool, ::CUmemPool_attribute __attr, void* __value)
|
|
334
|
+
{
|
|
335
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemPoolSetAttribute);
|
|
336
|
+
::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to set attribute for a memory pool", __pool, __attr, __value);
|
|
337
|
+
}
|
|
338
|
+
|
|
339
|
+
_CCCL_HOST_API inline size_t __mempoolGetAttribute(::CUmemoryPool __pool, ::CUmemPool_attribute __attr)
|
|
340
|
+
{
|
|
341
|
+
size_t __value = 0;
|
|
342
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemPoolGetAttribute);
|
|
343
|
+
::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to get attribute for a memory pool", __pool, __attr, &__value);
|
|
344
|
+
return __value;
|
|
345
|
+
}
|
|
346
|
+
|
|
347
|
+
_CCCL_HOST_API inline void __mempoolDestroy(::CUmemoryPool __pool)
|
|
348
|
+
{
|
|
349
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemPoolDestroy);
|
|
350
|
+
::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to destroy a memory pool", __pool);
|
|
351
|
+
}
|
|
352
|
+
|
|
353
|
+
_CCCL_HOST_API inline ::CUdeviceptr
|
|
354
|
+
__mallocFromPoolAsync(::cuda::std::size_t __bytes, ::CUmemoryPool __pool, ::CUstream __stream)
|
|
355
|
+
{
|
|
356
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemAllocFromPoolAsync);
|
|
357
|
+
::CUdeviceptr __result = 0;
|
|
358
|
+
::cuda::__driver::__call_driver_fn(
|
|
359
|
+
__driver_fn, "Failed to allocate memory from a memory pool", &__result, __bytes, __pool, __stream);
|
|
360
|
+
return __result;
|
|
361
|
+
}
|
|
362
|
+
|
|
363
|
+
_CCCL_HOST_API inline void __mempoolTrimTo(::CUmemoryPool __pool, ::cuda::std::size_t __min_bytes_to_keep)
|
|
364
|
+
{
|
|
365
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemPoolTrimTo);
|
|
366
|
+
::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to trim a memory pool", __pool, __min_bytes_to_keep);
|
|
367
|
+
}
|
|
368
|
+
|
|
369
|
+
_CCCL_HOST_API inline ::cudaError_t __freeAsyncNoThrow(::CUdeviceptr __dptr, ::CUstream __stream)
|
|
370
|
+
{
|
|
371
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemFreeAsync);
|
|
372
|
+
return static_cast<::cudaError_t>(__driver_fn(__dptr, __stream));
|
|
373
|
+
}
|
|
374
|
+
|
|
375
|
+
_CCCL_HOST_API inline void __mempoolSetAccess(::CUmemoryPool __pool, ::CUmemAccessDesc* __descs, ::size_t __count)
|
|
376
|
+
{
|
|
377
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemPoolSetAccess);
|
|
378
|
+
::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to set access of a memory pool", __pool, __descs, __count);
|
|
379
|
+
}
|
|
380
|
+
|
|
381
|
+
_CCCL_HOST_API inline ::CUmemAccess_flags __mempoolGetAccess(::CUmemoryPool __pool, ::CUmemLocation* __location)
|
|
382
|
+
{
|
|
383
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemPoolGetAccess);
|
|
384
|
+
::CUmemAccess_flags __flags;
|
|
385
|
+
::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to get access of a memory pool", &__flags, __pool, __location);
|
|
386
|
+
return __flags;
|
|
387
|
+
}
|
|
388
|
+
|
|
389
|
+
# if _CCCL_CTK_AT_LEAST(13, 0)
|
|
390
|
+
_CCCL_HOST_API inline ::CUmemoryPool
|
|
391
|
+
__getDefaultMemPool(CUmemLocation __location, CUmemAllocationType_enum __allocation_type)
|
|
392
|
+
{
|
|
393
|
+
static auto __driver_fn =
|
|
394
|
+
_CCCLRT_GET_DRIVER_FUNCTION_VERSIONED(cuMemGetDefaultMemPool, cuMemGetDefaultMemPool, 13, 0);
|
|
395
|
+
::CUmemoryPool __result = nullptr;
|
|
396
|
+
::cuda::__driver::__call_driver_fn(
|
|
397
|
+
__driver_fn, "Failed to get default memory pool", &__result, &__location, __allocation_type);
|
|
398
|
+
return __result;
|
|
399
|
+
}
|
|
400
|
+
# endif // _CCCL_CTK_AT_LEAST(13, 0)
|
|
401
|
+
|
|
402
|
+
_CCCL_HOST_API inline ::CUdeviceptr __mallocManaged(::cuda::std::size_t __bytes, unsigned int __flags)
|
|
403
|
+
{
|
|
404
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemAllocManaged);
|
|
405
|
+
::CUdeviceptr __result = 0;
|
|
406
|
+
::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to allocate managed memory", &__result, __bytes, __flags);
|
|
407
|
+
return __result;
|
|
408
|
+
}
|
|
409
|
+
|
|
410
|
+
_CCCL_HOST_API inline void* __mallocHost(::cuda::std::size_t __bytes)
|
|
411
|
+
{
|
|
412
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemAllocHost);
|
|
413
|
+
void* __result = nullptr;
|
|
414
|
+
::cuda::__driver::__call_driver_fn(__driver_fn, "Failed to allocate host memory", &__result, __bytes);
|
|
415
|
+
return __result;
|
|
416
|
+
}
|
|
417
|
+
|
|
418
|
+
_CCCL_HOST_API inline ::cudaError_t __freeNoThrow(::CUdeviceptr __dptr)
|
|
419
|
+
{
|
|
420
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemFree);
|
|
421
|
+
return static_cast<::cudaError_t>(__driver_fn(__dptr));
|
|
422
|
+
}
|
|
423
|
+
|
|
424
|
+
_CCCL_HOST_API inline ::cudaError_t __freeHostNoThrow(void* __dptr)
|
|
425
|
+
{
|
|
426
|
+
static auto __driver_fn = _CCCLRT_GET_DRIVER_FUNCTION(cuMemFreeHost);
|
|
427
|
+
return static_cast<::cudaError_t>(__driver_fn(__dptr));
|
|
428
|
+
}
|
|
429
|
+
|
|
328
430
|
// Unified Addressing
|
|
329
431
|
|
|
330
432
|
// TODO: we don't want to have these functions here, refactoring expected
|
|
@@ -23,12 +23,13 @@
|
|
|
23
23
|
|
|
24
24
|
#if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
|
|
25
25
|
|
|
26
|
+
# include <cuda/__device/device_ref.h>
|
|
26
27
|
# include <cuda/__driver/driver_api.h>
|
|
27
28
|
# include <cuda/__event/event_ref.h>
|
|
28
29
|
# include <cuda/__runtime/ensure_current_context.h>
|
|
29
30
|
# include <cuda/__utility/no_init.h>
|
|
31
|
+
# include <cuda/std/__utility/to_underlying.h>
|
|
30
32
|
# include <cuda/std/cstddef>
|
|
31
|
-
# include <cuda/std/utility>
|
|
32
33
|
|
|
33
34
|
# include <cuda/std/__cccl/prologue.h>
|
|
34
35
|
|
|
@@ -36,38 +37,43 @@ _CCCL_BEGIN_NAMESPACE_CUDA
|
|
|
36
37
|
|
|
37
38
|
class timed_event;
|
|
38
39
|
|
|
40
|
+
//! @brief Flags to use when creating the event.
|
|
41
|
+
enum class event_flags : unsigned
|
|
42
|
+
{
|
|
43
|
+
none = cudaEventDefault,
|
|
44
|
+
blocking_sync = cudaEventBlockingSync,
|
|
45
|
+
interprocess = cudaEventInterprocess,
|
|
46
|
+
};
|
|
47
|
+
|
|
48
|
+
[[nodiscard]] _CCCL_HOST_API constexpr event_flags operator|(event_flags __lhs, event_flags __rhs) noexcept
|
|
49
|
+
{
|
|
50
|
+
return static_cast<event_flags>(::cuda::std::to_underlying(__lhs) | ::cuda::std::to_underlying(__rhs));
|
|
51
|
+
}
|
|
52
|
+
|
|
39
53
|
//! @brief An owning wrapper for an untimed `cudaEvent_t`.
|
|
40
54
|
class event : public event_ref
|
|
41
55
|
{
|
|
42
56
|
friend class timed_event;
|
|
43
57
|
|
|
44
58
|
public:
|
|
45
|
-
//! @brief Flags to use when creating the event.
|
|
46
|
-
enum class flags : unsigned
|
|
47
|
-
{
|
|
48
|
-
none = cudaEventDefault,
|
|
49
|
-
blocking_sync = cudaEventBlockingSync,
|
|
50
|
-
interprocess = cudaEventInterprocess,
|
|
51
|
-
};
|
|
52
|
-
|
|
53
59
|
//! @brief Construct a new `event` object with timing disabled, and record
|
|
54
60
|
//! the event in the specified stream.
|
|
55
61
|
//!
|
|
56
62
|
//! @throws cuda_error if the event creation fails.
|
|
57
|
-
explicit event(stream_ref __stream,
|
|
63
|
+
_CCCL_HOST_API explicit event(stream_ref __stream, event_flags __flags = event_flags::none);
|
|
58
64
|
|
|
59
65
|
//! @brief Construct a new `event` object with timing disabled. The event can only be recorded on streams from the
|
|
60
66
|
//! specified device.
|
|
61
67
|
//!
|
|
62
68
|
//! @throws cuda_error if the event creation fails.
|
|
63
|
-
explicit event(device_ref __device,
|
|
64
|
-
: event(__device,
|
|
69
|
+
_CCCL_HOST_API explicit event(device_ref __device, event_flags __flags = event_flags::none)
|
|
70
|
+
: event(__device, ::cuda::std::to_underlying(__flags) | cudaEventDisableTiming)
|
|
65
71
|
{}
|
|
66
72
|
|
|
67
73
|
//! @brief Construct a new `event` object into the moved-from state.
|
|
68
74
|
//!
|
|
69
75
|
//! @post `get()` returns `cudaEvent_t()`.
|
|
70
|
-
explicit constexpr event(no_init_t) noexcept
|
|
76
|
+
_CCCL_HOST_API explicit constexpr event(no_init_t) noexcept
|
|
71
77
|
: event_ref(::cudaEvent_t{})
|
|
72
78
|
{}
|
|
73
79
|
|
|
@@ -76,7 +82,7 @@ public:
|
|
|
76
82
|
//! @param __other
|
|
77
83
|
//!
|
|
78
84
|
//! @post `__other` is in a moved-from state.
|
|
79
|
-
constexpr event(event&& __other) noexcept
|
|
85
|
+
_CCCL_HOST_API constexpr event(event&& __other) noexcept
|
|
80
86
|
: event_ref(::cuda::std::exchange(__other.__event_, {}))
|
|
81
87
|
{}
|
|
82
88
|
|
|
@@ -86,7 +92,7 @@ public:
|
|
|
86
92
|
//! @brief Destroy the `event` object
|
|
87
93
|
//!
|
|
88
94
|
//! @note If the event fails to be destroyed, the error is silently ignored.
|
|
89
|
-
~event()
|
|
95
|
+
_CCCL_HOST_API ~event()
|
|
90
96
|
{
|
|
91
97
|
if (__event_ != nullptr)
|
|
92
98
|
{
|
|
@@ -101,7 +107,7 @@ public:
|
|
|
101
107
|
//! @param __other
|
|
102
108
|
//!
|
|
103
109
|
//! @post `__other` is in a moved-from state.
|
|
104
|
-
event& operator=(event&& __other) noexcept
|
|
110
|
+
_CCCL_HOST_API event& operator=(event&& __other) noexcept
|
|
105
111
|
{
|
|
106
112
|
event __tmp(::cuda::std::move(__other));
|
|
107
113
|
::cuda::std::swap(__event_, __tmp.__event_);
|
|
@@ -118,7 +124,7 @@ public:
|
|
|
118
124
|
//! @return event The constructed `event` object
|
|
119
125
|
//!
|
|
120
126
|
//! @note The constructed `event` object takes ownership of the native handle.
|
|
121
|
-
[[nodiscard]] static event from_native_handle(::cudaEvent_t __evnt) noexcept
|
|
127
|
+
[[nodiscard]] static _CCCL_HOST_API event from_native_handle(::cudaEvent_t __evnt) noexcept
|
|
122
128
|
{
|
|
123
129
|
return event(__evnt);
|
|
124
130
|
}
|
|
@@ -134,26 +140,21 @@ public:
|
|
|
134
140
|
//! @return cudaEvent_t The native handle being held by the `event` object.
|
|
135
141
|
//!
|
|
136
142
|
//! @post The event object is in a moved-from state.
|
|
137
|
-
[[nodiscard]] constexpr ::cudaEvent_t release() noexcept
|
|
143
|
+
[[nodiscard]] _CCCL_HOST_API constexpr ::cudaEvent_t release() noexcept
|
|
138
144
|
{
|
|
139
145
|
return ::cuda::std::exchange(__event_, {});
|
|
140
146
|
}
|
|
141
147
|
|
|
142
|
-
[[nodiscard]] friend constexpr flags operator|(flags __lhs, flags __rhs) noexcept
|
|
143
|
-
{
|
|
144
|
-
return static_cast<flags>(static_cast<unsigned>(__lhs) | static_cast<unsigned>(__rhs));
|
|
145
|
-
}
|
|
146
|
-
|
|
147
148
|
private:
|
|
148
149
|
// Use `event::from_native_handle(e)` to construct an owning `event`
|
|
149
150
|
// object from a `cudaEvent_t` handle.
|
|
150
|
-
explicit constexpr event(::cudaEvent_t __evnt) noexcept
|
|
151
|
+
_CCCL_HOST_API explicit constexpr event(::cudaEvent_t __evnt) noexcept
|
|
151
152
|
: event_ref(__evnt)
|
|
152
153
|
{}
|
|
153
154
|
|
|
154
|
-
explicit event(stream_ref __stream, unsigned __flags);
|
|
155
|
+
_CCCL_HOST_API explicit event(stream_ref __stream, unsigned __flags);
|
|
155
156
|
|
|
156
|
-
explicit event(device_ref __device, unsigned __flags)
|
|
157
|
+
_CCCL_HOST_API explicit event(device_ref __device, unsigned __flags)
|
|
157
158
|
: event_ref(::cudaEvent_t{})
|
|
158
159
|
{
|
|
159
160
|
[[maybe_unused]] __ensure_current_context __ctx_setter(__device);
|
|
@@ -56,7 +56,7 @@ public:
|
|
|
56
56
|
//!
|
|
57
57
|
//! @note: It is the caller's responsibility to ensure the `event_ref` does not
|
|
58
58
|
//! outlive the event denoted by the `cudaEvent_t` handle.
|
|
59
|
-
constexpr event_ref(::cudaEvent_t __evnt) noexcept
|
|
59
|
+
_CCCL_HOST_API constexpr event_ref(::cudaEvent_t __evnt) noexcept
|
|
60
60
|
: __event_(__evnt)
|
|
61
61
|
{}
|
|
62
62
|
|
|
@@ -108,7 +108,7 @@ public:
|
|
|
108
108
|
//! @brief Retrieve the native `cudaEvent_t` handle.
|
|
109
109
|
//!
|
|
110
110
|
//! @return cudaEvent_t The native handle being held by the event_ref object.
|
|
111
|
-
[[nodiscard]] constexpr ::cudaEvent_t get() const noexcept
|
|
111
|
+
[[nodiscard]] _CCCL_HOST_API constexpr ::cudaEvent_t get() const noexcept
|
|
112
112
|
{
|
|
113
113
|
return __event_;
|
|
114
114
|
}
|
|
@@ -116,7 +116,7 @@ public:
|
|
|
116
116
|
//! @brief Checks if the `event_ref` is valid
|
|
117
117
|
//!
|
|
118
118
|
//! @return true if the `event_ref` is valid, false otherwise.
|
|
119
|
-
[[nodiscard]] explicit constexpr operator bool() const noexcept
|
|
119
|
+
[[nodiscard]] _CCCL_HOST_API explicit constexpr operator bool() const noexcept
|
|
120
120
|
{
|
|
121
121
|
return __event_ != nullptr;
|
|
122
122
|
}
|
|
@@ -129,7 +129,7 @@ public:
|
|
|
129
129
|
//! @param __lhs The first `event_ref` to compare
|
|
130
130
|
//! @param __rhs The second `event_ref` to compare
|
|
131
131
|
//! @return true if `lhs` and `rhs` refer to the same `cudaEvent_t` object.
|
|
132
|
-
[[nodiscard]] friend constexpr bool operator==(event_ref __lhs, event_ref __rhs) noexcept
|
|
132
|
+
[[nodiscard]] friend _CCCL_HOST_API constexpr bool operator==(event_ref __lhs, event_ref __rhs) noexcept
|
|
133
133
|
{
|
|
134
134
|
return __lhs.__event_ == __rhs.__event_;
|
|
135
135
|
}
|
|
@@ -142,7 +142,7 @@ public:
|
|
|
142
142
|
//! @param __lhs The first `event_ref` to compare
|
|
143
143
|
//! @param __rhs The second `event_ref` to compare
|
|
144
144
|
//! @return true if `lhs` and `rhs` refer to different `cudaEvent_t` objects.
|
|
145
|
-
[[nodiscard]] friend constexpr bool operator!=(event_ref __lhs, event_ref __rhs) noexcept
|
|
145
|
+
[[nodiscard]] friend _CCCL_HOST_API constexpr bool operator!=(event_ref __lhs, event_ref __rhs) noexcept
|
|
146
146
|
{
|
|
147
147
|
return __lhs.__event_ != __rhs.__event_;
|
|
148
148
|
}
|
|
@@ -26,10 +26,12 @@
|
|
|
26
26
|
|
|
27
27
|
#if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
|
|
28
28
|
|
|
29
|
+
# include <cuda/__device/device_ref.h>
|
|
29
30
|
# include <cuda/__driver/driver_api.h>
|
|
30
31
|
# include <cuda/__event/event.h>
|
|
31
32
|
# include <cuda/__utility/no_init.h>
|
|
32
33
|
# include <cuda/std/__chrono/duration.h>
|
|
34
|
+
# include <cuda/std/__utility/to_underlying.h>
|
|
33
35
|
# include <cuda/std/cstddef>
|
|
34
36
|
|
|
35
37
|
# include <cuda/std/__cccl/prologue.h>
|
|
@@ -44,20 +46,20 @@ public:
|
|
|
44
46
|
//! and record the event on the specified stream.
|
|
45
47
|
//!
|
|
46
48
|
//! @throws cuda_error if the event creation fails.
|
|
47
|
-
explicit timed_event(stream_ref __stream,
|
|
49
|
+
_CCCL_HOST_API explicit timed_event(stream_ref __stream, event_flags __flags = event_flags::none);
|
|
48
50
|
|
|
49
51
|
//! @brief Construct a new `timed_event` object with the specified flags. The event can only be recorded on streams
|
|
50
52
|
//! from the specified device.
|
|
51
53
|
//!
|
|
52
54
|
//! @throws cuda_error if the event creation fails.
|
|
53
|
-
explicit timed_event(device_ref __device,
|
|
54
|
-
: event(__device,
|
|
55
|
+
_CCCL_HOST_API explicit timed_event(device_ref __device, event_flags __flags = event_flags::none)
|
|
56
|
+
: event(__device, ::cuda::std::to_underlying(__flags))
|
|
55
57
|
{}
|
|
56
58
|
|
|
57
59
|
//! @brief Construct a new `timed_event` object into the moved-from state.
|
|
58
60
|
//!
|
|
59
61
|
//! @post `get()` returns `cudaEvent_t()`.
|
|
60
|
-
explicit constexpr timed_event(no_init_t) noexcept
|
|
62
|
+
_CCCL_HOST_API explicit constexpr timed_event(no_init_t) noexcept
|
|
61
63
|
: event(no_init)
|
|
62
64
|
{}
|
|
63
65
|
|
|
@@ -73,7 +75,7 @@ public:
|
|
|
73
75
|
//! @return timed_event The constructed `timed_event` object
|
|
74
76
|
//!
|
|
75
77
|
//! @note The constructed `timed_event` object takes ownership of the native handle.
|
|
76
|
-
[[nodiscard]] static timed_event from_native_handle(::cudaEvent_t __evnt) noexcept
|
|
78
|
+
[[nodiscard]] static _CCCL_HOST_API timed_event from_native_handle(::cudaEvent_t __evnt) noexcept
|
|
77
79
|
{
|
|
78
80
|
return timed_event(__evnt);
|
|
79
81
|
}
|
|
@@ -94,7 +96,8 @@ public:
|
|
|
94
96
|
//! @return cuda::std::chrono::nanoseconds The elapsed time in nanoseconds.
|
|
95
97
|
//!
|
|
96
98
|
//! @note The elapsed time has a resolution of approximately 0.5 microseconds.
|
|
97
|
-
[[nodiscard]] friend ::cuda::std::chrono::nanoseconds
|
|
99
|
+
[[nodiscard]] friend _CCCL_HOST_API ::cuda::std::chrono::nanoseconds
|
|
100
|
+
operator-(const timed_event& __end, const timed_event& __start)
|
|
98
101
|
{
|
|
99
102
|
const auto __ms = ::cuda::__driver::__eventElapsedTime(__start.get(), __end.get());
|
|
100
103
|
return ::cuda::std::chrono::nanoseconds(static_cast<::cuda::std::chrono::nanoseconds::rep>(__ms * 1'000'000.0));
|
|
@@ -103,7 +106,7 @@ public:
|
|
|
103
106
|
private:
|
|
104
107
|
// Use `timed_event::from_native_handle(e)` to construct an owning `timed_event`
|
|
105
108
|
// object from a `cudaEvent_t` handle.
|
|
106
|
-
explicit constexpr timed_event(::cudaEvent_t __evnt) noexcept
|
|
109
|
+
_CCCL_HOST_API explicit constexpr timed_event(::cudaEvent_t __evnt) noexcept
|
|
107
110
|
: event(__evnt)
|
|
108
111
|
{}
|
|
109
112
|
};
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
//===----------------------------------------------------------------------===//
|
|
2
|
+
//
|
|
3
|
+
// Part of libcu++, the C++ Standard Library for your entire system,
|
|
4
|
+
// under the Apache License v2.0 with LLVM Exceptions.
|
|
5
|
+
// See https://llvm.org/LICENSE.txt for license information.
|
|
6
|
+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
7
|
+
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
|
|
8
|
+
//
|
|
9
|
+
//===----------------------------------------------------------------------===//
|
|
10
|
+
|
|
11
|
+
#ifndef _CUDA___FWD_DEVICES_H
|
|
12
|
+
#define _CUDA___FWD_DEVICES_H
|
|
13
|
+
|
|
14
|
+
#include <cuda/std/detail/__config>
|
|
15
|
+
|
|
16
|
+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
|
|
17
|
+
# pragma GCC system_header
|
|
18
|
+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
|
|
19
|
+
# pragma clang system_header
|
|
20
|
+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
|
|
21
|
+
# pragma system_header
|
|
22
|
+
#endif // no system header
|
|
23
|
+
|
|
24
|
+
#include <cuda/std/__fwd/span.h>
|
|
25
|
+
|
|
26
|
+
#include <cuda/std/__cccl/prologue.h>
|
|
27
|
+
|
|
28
|
+
_CCCL_BEGIN_NAMESPACE_CUDA
|
|
29
|
+
|
|
30
|
+
class __physical_device;
|
|
31
|
+
class device_ref;
|
|
32
|
+
template <::cudaDeviceAttr _Attr>
|
|
33
|
+
struct __dev_attr;
|
|
34
|
+
struct arch_traits_t;
|
|
35
|
+
class compute_capability;
|
|
36
|
+
enum class arch_id : int;
|
|
37
|
+
|
|
38
|
+
inline constexpr int __arch_specific_id_multiplier = 100000;
|
|
39
|
+
|
|
40
|
+
_CCCL_END_NAMESPACE_CUDA
|
|
41
|
+
|
|
42
|
+
#include <cuda/std/__cccl/epilogue.h>
|
|
43
|
+
|
|
44
|
+
#endif // _CUDA___FWD_DEVICES_H
|
|
@@ -42,6 +42,15 @@ inline constexpr bool __is_zip_function = false;
|
|
|
42
42
|
template <class _Fn>
|
|
43
43
|
inline constexpr bool __is_zip_function<zip_function<_Fn>> = true;
|
|
44
44
|
|
|
45
|
+
template <class _Fn, class... _Iterators>
|
|
46
|
+
class zip_transform_iterator;
|
|
47
|
+
|
|
48
|
+
template <class>
|
|
49
|
+
inline constexpr bool __is_zip_transform_iterator = false;
|
|
50
|
+
|
|
51
|
+
template <class _Fn, class... _Iterators>
|
|
52
|
+
inline constexpr bool __is_zip_transform_iterator<zip_transform_iterator<_Fn, _Iterators...>> = true;
|
|
53
|
+
|
|
45
54
|
_CCCL_END_NAMESPACE_CUDA
|
|
46
55
|
|
|
47
56
|
#include <cuda/std/__cccl/epilogue.h>
|
|
@@ -23,6 +23,7 @@
|
|
|
23
23
|
|
|
24
24
|
#include <cuda/std/__iterator/concepts.h>
|
|
25
25
|
#include <cuda/std/__iterator/iterator_traits.h>
|
|
26
|
+
#include <cuda/std/__ranges/compressed_movable_box.h>
|
|
26
27
|
#include <cuda/std/__ranges/movable_box.h>
|
|
27
28
|
#include <cuda/std/__type_traits/is_nothrow_copy_constructible.h>
|
|
28
29
|
#include <cuda/std/__type_traits/is_nothrow_move_constructible.h>
|
|
@@ -64,8 +65,28 @@ class constant_iterator
|
|
|
64
65
|
private:
|
|
65
66
|
static_assert(::cuda::std::__integer_like<_Index>, "The index type of cuda::constant_iterator must be integer-like!");
|
|
66
67
|
|
|
67
|
-
|
|
68
|
-
_Index
|
|
68
|
+
// Not a base because then the friend operators would be ambiguous
|
|
69
|
+
::cuda::std::__compressed_movable_box<_Index, _Tp> __store_;
|
|
70
|
+
|
|
71
|
+
[[nodiscard]] _CCCL_API constexpr _Index& __index() noexcept
|
|
72
|
+
{
|
|
73
|
+
return __store_.template __get<0>();
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
[[nodiscard]] _CCCL_API constexpr const _Index& __index() const noexcept
|
|
77
|
+
{
|
|
78
|
+
return __store_.template __get<0>();
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
[[nodiscard]] _CCCL_API constexpr _Tp& __value() noexcept
|
|
82
|
+
{
|
|
83
|
+
return __store_.template __get<1>();
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
[[nodiscard]] _CCCL_API constexpr const _Tp& __value() const noexcept
|
|
87
|
+
{
|
|
88
|
+
return __store_.template __get<1>();
|
|
89
|
+
}
|
|
69
90
|
|
|
70
91
|
public:
|
|
71
92
|
using iterator_concept = ::cuda::std::random_access_iterator_tag;
|
|
@@ -78,22 +99,17 @@ public:
|
|
|
78
99
|
using reference = _Tp;
|
|
79
100
|
using pointer = void;
|
|
80
101
|
|
|
81
|
-
#if _CCCL_HAS_CONCEPTS()
|
|
82
|
-
_CCCL_HIDE_FROM_ABI constant_iterator()
|
|
83
|
-
requires ::cuda::std::default_initializable<_Tp>
|
|
84
|
-
= default;
|
|
85
|
-
#else // ^^^ _CCCL_HAS_CONCEPTS() ^^^ / vvv !_CCCL_HAS_CONCEPTS() vvv
|
|
86
102
|
_CCCL_TEMPLATE(class _Tp2 = _Tp)
|
|
87
103
|
_CCCL_REQUIRES(::cuda::std::default_initializable<_Tp2>)
|
|
88
|
-
_CCCL_API constexpr constant_iterator() noexcept(::cuda::std::is_nothrow_default_constructible_v<_Tp2>)
|
|
89
|
-
|
|
104
|
+
_CCCL_API constexpr constant_iterator() noexcept(::cuda::std::is_nothrow_default_constructible_v<_Tp2>)
|
|
105
|
+
: __store_()
|
|
106
|
+
{}
|
|
90
107
|
|
|
91
108
|
//! @brief Creates a @c constant_iterator from a value. The index is set to zero
|
|
92
109
|
//! @param __value The value to store in the @c constant_iterator
|
|
93
110
|
_CCCL_EXEC_CHECK_DISABLE
|
|
94
111
|
_CCCL_API constexpr constant_iterator(_Tp __value) noexcept(::cuda::std::is_nothrow_move_constructible_v<_Tp>)
|
|
95
|
-
:
|
|
96
|
-
, __index_()
|
|
112
|
+
: __store_(0, ::cuda::std::move(__value))
|
|
97
113
|
{}
|
|
98
114
|
|
|
99
115
|
//! @brief Creates a @c constant_iterator from a value and an index
|
|
@@ -104,32 +120,31 @@ public:
|
|
|
104
120
|
_CCCL_REQUIRES(::cuda::std::__integer_like<_Index2>)
|
|
105
121
|
_CCCL_API constexpr explicit constant_iterator(_Tp __value, _Index2 __index) noexcept(
|
|
106
122
|
::cuda::std::is_nothrow_move_constructible_v<_Tp>)
|
|
107
|
-
:
|
|
108
|
-
, __index_(static_cast<_Index>(__index))
|
|
123
|
+
: __store_(static_cast<_Index>(__index), ::cuda::std::move(__value))
|
|
109
124
|
{}
|
|
110
125
|
|
|
111
126
|
//! @brief Returns the current index
|
|
112
127
|
[[nodiscard]] _CCCL_API constexpr difference_type index() const noexcept
|
|
113
128
|
{
|
|
114
|
-
return static_cast<difference_type>(
|
|
129
|
+
return static_cast<difference_type>(__index());
|
|
115
130
|
}
|
|
116
131
|
|
|
117
132
|
//! @brief Returns a const reference to the stored value
|
|
118
133
|
[[nodiscard]] _CCCL_API constexpr const _Tp& operator*() const noexcept
|
|
119
134
|
{
|
|
120
|
-
return
|
|
135
|
+
return __value();
|
|
121
136
|
}
|
|
122
137
|
|
|
123
138
|
//! @brief Returns a const reference to the stored value
|
|
124
139
|
[[nodiscard]] _CCCL_API constexpr const _Tp& operator[](difference_type) const noexcept
|
|
125
140
|
{
|
|
126
|
-
return
|
|
141
|
+
return __value();
|
|
127
142
|
}
|
|
128
143
|
|
|
129
144
|
//! @brief Increments the stored index
|
|
130
145
|
_CCCL_API constexpr constant_iterator& operator++() noexcept
|
|
131
146
|
{
|
|
132
|
-
++
|
|
147
|
+
++__index();
|
|
133
148
|
return *this;
|
|
134
149
|
}
|
|
135
150
|
|
|
@@ -147,9 +162,9 @@ public:
|
|
|
147
162
|
{
|
|
148
163
|
if constexpr (::cuda::std::is_signed_v<_Index>)
|
|
149
164
|
{
|
|
150
|
-
_CCCL_ASSERT(
|
|
165
|
+
_CCCL_ASSERT(__index() > 0, "The index must be greater than or equal to 0");
|
|
151
166
|
}
|
|
152
|
-
--
|
|
167
|
+
--__index();
|
|
153
168
|
return *this;
|
|
154
169
|
}
|
|
155
170
|
|
|
@@ -168,9 +183,9 @@ public:
|
|
|
168
183
|
{
|
|
169
184
|
if constexpr (::cuda::std::is_signed_v<_Index>)
|
|
170
185
|
{
|
|
171
|
-
_CCCL_ASSERT(
|
|
186
|
+
_CCCL_ASSERT(__index() + __n >= 0, "The index must be greater than or equal to 0");
|
|
172
187
|
}
|
|
173
|
-
|
|
188
|
+
__index() += static_cast<_Index>(__n);
|
|
174
189
|
return *this;
|
|
175
190
|
}
|
|
176
191
|
|
|
@@ -200,9 +215,9 @@ public:
|
|
|
200
215
|
{
|
|
201
216
|
if constexpr (::cuda::std::is_signed_v<_Index>)
|
|
202
217
|
{
|
|
203
|
-
_CCCL_ASSERT(
|
|
218
|
+
_CCCL_ASSERT(__index() - __n >= 0, "The index must be greater than or equal to 0");
|
|
204
219
|
}
|
|
205
|
-
|
|
220
|
+
__index() -= static_cast<_Index>(__n);
|
|
206
221
|
return *this;
|
|
207
222
|
}
|
|
208
223
|
|
|
@@ -220,14 +235,14 @@ public:
|
|
|
220
235
|
[[nodiscard]] _CCCL_API friend constexpr difference_type
|
|
221
236
|
operator-(const constant_iterator& __lhs, const constant_iterator& __rhs) noexcept
|
|
222
237
|
{
|
|
223
|
-
return static_cast<difference_type>(__lhs.
|
|
238
|
+
return static_cast<difference_type>(__lhs.__index()) - static_cast<difference_type>(__rhs.__index());
|
|
224
239
|
}
|
|
225
240
|
|
|
226
241
|
//! @brief Compares two @c constant_iterator for equality by comparing the index in the sequence
|
|
227
242
|
[[nodiscard]] _CCCL_API friend constexpr bool
|
|
228
243
|
operator==(const constant_iterator& __lhs, const constant_iterator& __rhs) noexcept
|
|
229
244
|
{
|
|
230
|
-
return __lhs.
|
|
245
|
+
return __lhs.__index() == __rhs.__index();
|
|
231
246
|
}
|
|
232
247
|
|
|
233
248
|
#if _CCCL_STD_VER <= 2017
|
|
@@ -235,7 +250,7 @@ public:
|
|
|
235
250
|
[[nodiscard]] _CCCL_API friend constexpr bool
|
|
236
251
|
operator!=(const constant_iterator& __lhs, const constant_iterator& __rhs) noexcept
|
|
237
252
|
{
|
|
238
|
-
return __lhs.
|
|
253
|
+
return __lhs.__index() != __rhs.__index();
|
|
239
254
|
}
|
|
240
255
|
#endif // _CCCL_STD_VER <= 2017
|
|
241
256
|
|
|
@@ -244,32 +259,32 @@ public:
|
|
|
244
259
|
[[nodiscard]] _CCCL_API friend constexpr auto
|
|
245
260
|
operator<=>(const constant_iterator& __lhs, const constant_iterator& __rhs) noexcept
|
|
246
261
|
{
|
|
247
|
-
return __lhs.
|
|
262
|
+
return __lhs.__index() <=> __rhs.__index();
|
|
248
263
|
}
|
|
249
264
|
#else // ^^^ _LIBCUDACXX_HAS_SPACESHIP_OPERATOR() ^^^ / vvv !_LIBCUDACXX_HAS_SPACESHIP_OPERATOR() vvv
|
|
250
265
|
//! @brief Compares two @c constant_iterator for less than by comparing the index in the sequence
|
|
251
266
|
[[nodiscard]] _CCCL_API friend constexpr bool
|
|
252
267
|
operator<(const constant_iterator& __lhs, const constant_iterator& __rhs) noexcept
|
|
253
268
|
{
|
|
254
|
-
return __lhs.
|
|
269
|
+
return __lhs.__index() < __rhs.__index();
|
|
255
270
|
}
|
|
256
271
|
//! @brief Compares two @c constant_iterator for less equal by comparing the index in the sequence
|
|
257
272
|
[[nodiscard]] _CCCL_API friend constexpr bool
|
|
258
273
|
operator<=(const constant_iterator& __lhs, const constant_iterator& __rhs) noexcept
|
|
259
274
|
{
|
|
260
|
-
return __lhs.
|
|
275
|
+
return __lhs.__index() <= __rhs.__index();
|
|
261
276
|
}
|
|
262
277
|
//! @brief Compares two @c constant_iterator for greater than by comparing the index in the sequence
|
|
263
278
|
[[nodiscard]] _CCCL_API friend constexpr bool
|
|
264
279
|
operator>(const constant_iterator& __lhs, const constant_iterator& __rhs) noexcept
|
|
265
280
|
{
|
|
266
|
-
return __lhs.
|
|
281
|
+
return __lhs.__index() > __rhs.__index();
|
|
267
282
|
}
|
|
268
283
|
//! @brief Compares two @c constant_iterator for greater equal by comparing the index in the sequence
|
|
269
284
|
[[nodiscard]] _CCCL_API friend constexpr bool
|
|
270
285
|
operator>=(const constant_iterator& __lhs, const constant_iterator& __rhs) noexcept
|
|
271
286
|
{
|
|
272
|
-
return __lhs.
|
|
287
|
+
return __lhs.__index() >= __rhs.__index();
|
|
273
288
|
}
|
|
274
289
|
#endif // !_LIBCUDACXX_HAS_SPACESHIP_OPERATOR()
|
|
275
290
|
};
|