PyPI - cuda-cccl - Versions diffs - 0.3.1__cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.2__cp312-cp312-manylinux_2_24_aarch64.whl - Mend

cuda-cccl 0.3.1__cp312-cp312-manylinux_2_24_aarch64.whl → 0.3.2__cp312-cp312-manylinux_2_24_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of cuda-cccl might be problematic; see the package's page on Mend for more details.

Files changed (185)

cuda/cccl/headers/include/cub/device/device_transform.cuh CHANGED Viewed

@@ -18,6 +18,8 @@
 #include <cub/util_namespace.cuh>
 #include <cuda/__functional/address_stability.h>
+#include <cuda/__stream/get_stream.h>
+#include <cuda/std/__execution/env.h>
 #include <cuda/std/tuple>
 CUB_NAMESPACE_BEGIN
@@ -49,13 +51,20 @@ CUB_NAMESPACE_BEGIN
 struct DeviceTransform
 {
 private:
-  template <typename... RandomAccessIteratorsIn, typename RandomAccessIteratorOut, typename NumItemsT, typename TransformOp>
+  template <typename... RandomAccessIteratorsIn,
+            typename RandomAccessIteratorOut,
+            typename NumItemsT,
+            typename Predicate,
+            typename TransformOp,
+            typename StableAddress = cuda::std::false_type>
   CUB_RUNTIME_FUNCTION static cudaError_t TransformInternal(
     ::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
     RandomAccessIteratorOut output,
     NumItemsT num_items,
+    Predicate predicate,
     TransformOp transform_op,
-    cudaStream_t stream = nullptr)
+    cudaStream_t stream,
+    StableAddress = {})
   {
     using choose_offset_t = detail::choose_signed_offset<NumItemsT>;
     using offset_t        = typename choose_offset_t::type;
@@ -66,18 +75,28 @@ private:
       return error;
     }
-    return detail::transform::dispatch_t<
-      detail::transform::requires_stable_address::no,
-      offset_t,
-      ::cuda::std::tuple<RandomAccessIteratorsIn...>,
-      RandomAccessIteratorOut,
-      detail::transform::always_true_predicate,
-      TransformOp>::dispatch(::cuda::std::move(inputs),
-                             ::cuda::std::move(output),
-                             num_items,
-                             detail::transform::always_true_predicate{},
-                             ::cuda::std::move(transform_op),
-                             stream);
+    return detail::transform::dispatch_t < StableAddress::value
+           ? detail::transform::requires_stable_address::yes
+           : detail::transform::requires_stable_address::no,
+           offset_t, ::cuda::std::tuple<RandomAccessIteratorsIn...>, RandomAccessIteratorOut, Predicate,
+           TransformOp > ::dispatch(
+             ::cuda::std::move(inputs),
+             ::cuda::std::move(output),
+             num_items,
+             ::cuda::std::move(predicate),
+             ::cuda::std::move(transform_op),
+             stream);
+  }
+  template <typename Env>
+  CUB_RUNTIME_FUNCTION static auto get_stream(Env env) -> cudaStream_t
+  {
+    return ::cuda::std::execution::__query_or(env, ::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}}).get();
+  }
+  CUB_RUNTIME_FUNCTION static auto get_stream(cudaStream_t stream) -> cudaStream_t
+  {
+    return stream;
   }
 public:
@@ -108,18 +127,28 @@ public:
   //! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value
   //! types must be convertible to the parameters of the function object's call operator. The return type of the call
   //! operator must be assignable to the dereferenced output iterator.
-  //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
-  template <typename... RandomAccessIteratorsIn, typename RandomAccessIteratorOut, typename NumItemsT, typename TransformOp>
+  //! @param env Execution environment, or cudaStream_t. Default is ``cuda::std::execution::env{}``, which will run on
+  //! stream\ :sub:`0`
+  template <typename... RandomAccessIteratorsIn,
+            typename RandomAccessIteratorOut,
+            typename NumItemsT,
+            typename TransformOp,
+            typename Env = ::cuda::std::execution::env<>>
   CUB_RUNTIME_FUNCTION static cudaError_t Transform(
     ::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
     RandomAccessIteratorOut output,
     NumItemsT num_items,
     TransformOp transform_op,
-    cudaStream_t stream = nullptr)
+    Env env = {})
   {
     _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTransform::Transform");
     return TransformInternal(
-      ::cuda::std::move(inputs), ::cuda::std::move(output), num_items, ::cuda::std::move(transform_op), stream);
+      ::cuda::std::move(inputs),
+      ::cuda::std::move(output),
+      num_items,
+      detail::transform::always_true_predicate{},
+      ::cuda::std::move(transform_op),
+      get_stream(env));
   }
 #ifndef _CCCL_DOXYGEN_INVOKED // Do not document
@@ -160,21 +189,26 @@ public:
   //! @param transform_op A unary function object. The input iterator's value type must be convertible to the parameter
   //! of the function object's call operator. The return type of the call operator must be assignable to the
   //! dereferenced output iterator.
-  //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
-  template <typename RandomAccessIteratorIn, typename RandomAccessIteratorOut, typename NumItemsT, typename TransformOp>
+  //! @param env Execution environment, or cudaStream_t. Default is ``cuda::std::execution::env{}``, which will run on
+  //! stream\ :sub:`0`
+  template <typename RandomAccessIteratorIn,
+            typename RandomAccessIteratorOut,
+            typename NumItemsT,
+            typename TransformOp,
+            typename Env = ::cuda::std::execution::env<>>
   CUB_RUNTIME_FUNCTION static cudaError_t Transform(
     RandomAccessIteratorIn input,
     RandomAccessIteratorOut output,
     NumItemsT num_items,
     TransformOp transform_op,
-    cudaStream_t stream = nullptr)
+    Env env = {})
   {
     return Transform(
       ::cuda::std::make_tuple(::cuda::std::move(input)),
       ::cuda::std::move(output),
       num_items,
       ::cuda::std::move(transform_op),
-      stream);
+      ::cuda::std::move(env));
   }
 #ifndef _CCCL_DOXYGEN_INVOKED // Do not document
@@ -215,10 +249,14 @@ public:
   //! @param num_items The number of elements to write to the output sequence.
   //! @param generator A nullary function object. The return type of the call operator must be assignable to the
   //! dereferenced output iterator.
-  //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
-  template <typename RandomAccessIteratorOut, typename NumItemsT, typename Generator>
+  //! @param env Execution environment, or cudaStream_t. Default is ``cuda::std::execution::env{}``, which will run on
+  //! stream\ :sub:`0`
+  template <typename RandomAccessIteratorOut,
+            typename NumItemsT,
+            typename Generator,
+            typename Env = ::cuda::std::execution::env<>>
   CUB_RUNTIME_FUNCTION static cudaError_t
-  Generate(RandomAccessIteratorOut output, NumItemsT num_items, Generator generator, cudaStream_t stream = nullptr)
+  Generate(RandomAccessIteratorOut output, NumItemsT num_items, Generator generator, Env env = {})
   {
     static_assert(::cuda::std::is_invocable_v<Generator>, "The passed generator must be a nullary function object");
     static_assert(
@@ -228,7 +266,12 @@ public:
     _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTransform::Generate");
     return TransformInternal(
-      ::cuda::std::make_tuple(), ::cuda::std::move(output), num_items, ::cuda::std::move(generator), stream);
+      ::cuda::std::make_tuple(),
+      ::cuda::std::move(output),
+      num_items,
+      detail::transform::always_true_predicate{},
+      ::cuda::std::move(generator),
+      get_stream(env));
   }
 #ifndef _CCCL_DOXYGEN_INVOKED // Do not document
@@ -262,10 +305,14 @@ public:
   //! @param output An iterator to the output sequence where num_items results are written to.
   //! @param num_items The number of elements to write to the output sequence.
   //! @param value The value to write. Must be assignable to the dereferenced output iterator.
-  //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
-  template <typename RandomAccessIteratorOut, typename NumItemsT, typename Value>
+  //! @param env Execution environment, or cudaStream_t. Default is ``cuda::std::execution::env{}``, which will run on
+  //! stream\ :sub:`0`
+  template <typename RandomAccessIteratorOut,
+            typename NumItemsT,
+            typename Value,
+            typename Env = ::cuda::std::execution::env<>>
   CUB_RUNTIME_FUNCTION static cudaError_t
-  Fill(RandomAccessIteratorOut output, NumItemsT num_items, Value value, cudaStream_t stream = nullptr)
+  Fill(RandomAccessIteratorOut output, NumItemsT num_items, Value value, Env env = {})
   {
     static_assert(::cuda::std::is_assignable_v<detail::it_reference_t<RandomAccessIteratorOut>, Value>,
                   "The passed value must be assignable to the dereferenced output iterator");
@@ -275,8 +322,9 @@ public:
       ::cuda::std::make_tuple(),
       ::cuda::std::move(output),
       num_items,
+      detail::transform::always_true_predicate{},
       detail::__return_constant<Value>{::cuda::std::move(value)},
-      stream);
+      get_stream(env));
   }
 #ifndef _CCCL_DOXYGEN_INVOKED // Do not document
@@ -296,8 +344,7 @@ public:
       return cudaSuccess;
     }
-    return Generate(
-      ::cuda::std::move(output), num_items, detail::__return_constant<Value>{::cuda::std::move(value)}, stream);
+    return Fill(::cuda::std::move(output), num_items, ::cuda::std::move(value), stream);
   }
 #endif // _CCCL_DOXYGEN_INVOKED
@@ -333,43 +380,30 @@ public:
   //! types must be convertible to the parameters of the function object's call operator. The return type of the call
   //! operator must be assignable to the dereferenced output iterator. Will only be invoked if \p predicate returns
   //! true.
-  //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
+  //! @param env Execution environment, or cudaStream_t. Default is ``cuda::std::execution::env{}``, which will run on
+  //! stream\ :sub:`0`
   template <typename... RandomAccessIteratorsIn,
             typename RandomAccessIteratorOut,
             typename NumItemsT,
             typename Predicate,
-            typename TransformOp>
+            typename TransformOp,
+            typename Env = ::cuda::std::execution::env<>>
   CUB_RUNTIME_FUNCTION static cudaError_t TransformIf(
     ::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
     RandomAccessIteratorOut output,
     NumItemsT num_items,
     Predicate predicate,
     TransformOp transform_op,
-    cudaStream_t stream = nullptr)
+    Env env = {})
   {
     _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTransform::TransformIf");
-    using choose_offset_t = detail::choose_signed_offset<NumItemsT>;
-    using offset_t        = typename choose_offset_t::type;
-    // Check if the number of items exceeds the range covered by the selected signed offset type
-    if (const cudaError_t error = choose_offset_t::is_exceeding_offset_type(num_items); error != cudaSuccess)
-    {
-      return error;
-    }
-    return detail::transform::dispatch_t<
-      detail::transform::requires_stable_address::no,
-      offset_t,
-      ::cuda::std::tuple<RandomAccessIteratorsIn...>,
-      RandomAccessIteratorOut,
-      Predicate,
-      TransformOp>::dispatch(::cuda::std::move(inputs),
-                             ::cuda::std::move(output),
-                             num_items,
-                             ::cuda::std::move(predicate),
-                             ::cuda::std::move(transform_op),
-                             stream);
+    return TransformInternal(
+      ::cuda::std::move(inputs),
+      ::cuda::std::move(output),
+      num_items,
+      ::cuda::std::move(predicate),
+      ::cuda::std::move(transform_op),
+      get_stream(env));
   }
 #ifndef _CCCL_DOXYGEN_INVOKED // Do not document
@@ -435,19 +469,21 @@ public:
   //! @param transform_op A unary function object. The input iterator's value type must be convertible to the
   //! parameter of the function object's call operator. The return type of the call operator must be assignable to the
   //! dereferenced output iterator. Will only be invoked if \p predicate returns true.
-  //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
+  //! @param env Execution environment, or cudaStream_t. Default is ``cuda::std::execution::env{}``, which will run on
+  //! stream\ :sub:`0`
   template <typename RandomAccessIteratorIn,
             typename RandomAccessIteratorOut,
             typename NumItemsT,
             typename Predicate,
-            typename TransformOp>
+            typename TransformOp,
+            typename Env = ::cuda::std::execution::env<>>
   CUB_RUNTIME_FUNCTION static cudaError_t TransformIf(
     RandomAccessIteratorIn input,
     RandomAccessIteratorOut output,
     NumItemsT num_items,
     Predicate predicate,
     TransformOp transform_op,
-    cudaStream_t stream = nullptr)
+    Env env = {})
   {
     return TransformIf(
       ::cuda::std::make_tuple(::cuda::std::move(input)),
@@ -455,7 +491,7 @@ public:
       num_items,
       ::cuda::std::move(predicate),
       ::cuda::std::move(transform_op),
-      stream);
+      get_stream(env));
   }
 #ifndef _CCCL_DOXYGEN_INVOKED // Do not document
@@ -518,39 +554,29 @@ public:
   //! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value
   //! types must be convertible to the parameters of the function object's call operator. The return type of the call
   //! operator must be assignable to the dereferenced output iterator.
-  //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
-  template <typename... RandomAccessIteratorsIn, typename RandomAccessIteratorOut, typename NumItemsT, typename TransformOp>
+  //! @param env Execution environment, or cudaStream_t. Default is ``cuda::std::execution::env{}``, which will run on
+  //! stream\ :sub:`0`
+  template <typename... RandomAccessIteratorsIn,
+            typename RandomAccessIteratorOut,
+            typename NumItemsT,
+            typename TransformOp,
+            typename Env = ::cuda::std::execution::env<>>
   CUB_RUNTIME_FUNCTION static cudaError_t TransformStableArgumentAddresses(
     ::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
     RandomAccessIteratorOut output,
     NumItemsT num_items,
     TransformOp transform_op,
-    cudaStream_t stream = nullptr)
+    Env env = {})
   {
     _CCCL_NVTX_RANGE_SCOPE("cub::DeviceTransform::TransformStableArgumentAddresses");
-    using choose_offset_t = detail::choose_signed_offset<NumItemsT>;
-    using offset_t        = typename choose_offset_t::type;
-    // Check if the number of items exceeds the range covered by the selected signed offset type
-    cudaError_t error = choose_offset_t::is_exceeding_offset_type(num_items);
-    if (error)
-    {
-      return error;
-    }
-    return detail::transform::dispatch_t<
-      detail::transform::requires_stable_address::yes,
-      offset_t,
-      ::cuda::std::tuple<RandomAccessIteratorsIn...>,
-      RandomAccessIteratorOut,
-      detail::transform::always_true_predicate,
-      TransformOp>::dispatch(::cuda::std::move(inputs),
-                             ::cuda::std::move(output),
-                             num_items,
-                             detail::transform::always_true_predicate{},
-                             ::cuda::std::move(transform_op),
-                             stream);
+    return TransformInternal(
+      ::cuda::std::move(inputs),
+      ::cuda::std::move(output),
+      num_items,
+      detail::transform::always_true_predicate{},
+      ::cuda::std::move(transform_op),
+      get_stream(env),
+      ::cuda::std::true_type{});
   }
 #ifndef _CCCL_DOXYGEN_INVOKED // Do not document
@@ -590,21 +616,26 @@ public:
   //! @param transform_op An n-ary function object, where n is the number of input sequences. The input iterators' value
   //! types must be convertible to the parameters of the function object's call operator. The return type of the call
   //! operator must be assignable to the dereferenced output iterator.
-  //! @param stream **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
-  template <typename RandomAccessIteratorIn, typename RandomAccessIteratorOut, typename NumItemsT, typename TransformOp>
+  //! @param env Execution environment, or cudaStream_t. Default is ``cuda::std::execution::env{}``, which will run on
+  //! stream\ :sub:`0`
+  template <typename RandomAccessIteratorIn,
+            typename RandomAccessIteratorOut,
+            typename NumItemsT,
+            typename TransformOp,
+            typename Env = ::cuda::std::execution::env<>>
   CUB_RUNTIME_FUNCTION static cudaError_t TransformStableArgumentAddresses(
     RandomAccessIteratorIn input,
     RandomAccessIteratorOut output,
     NumItemsT num_items,
     TransformOp transform_op,
-    cudaStream_t stream = nullptr)
+    Env env = {})
   {
     return TransformStableArgumentAddresses(
       ::cuda::std::make_tuple(::cuda::std::move(input)),
       ::cuda::std::move(output),
       num_items,
       ::cuda::std::move(transform_op),
-      stream);
+      get_stream(env));
   }
 #ifndef _CCCL_DOXYGEN_INVOKED // Do not document

cuda/cccl/headers/include/cub/device/dispatch/dispatch_merge.cuh CHANGED Viewed

@@ -122,9 +122,8 @@ __launch_bounds__(
 {
   // the merge agent loads keys into a local array of KeyIt1::value_type, on which the comparisons are performed
   using key_t = it_value_t<KeyIt1>;
-  static_assert(::cuda::std::__invocable<CompareOp, key_t, key_t>::value,
-                "Comparison operator cannot compare two keys");
-  static_assert(::cuda::std::is_convertible_v<typename ::cuda::std::__invoke_of<CompareOp, key_t, key_t>::type, bool>,
+  static_assert(::cuda::std::is_invocable_v<CompareOp, key_t, key_t>, "Comparison operator cannot compare two keys");
+  static_assert(::cuda::std::is_convertible_v<::cuda::std::invoke_result_t<CompareOp, key_t, key_t>, bool>,
                 "Comparison operator must be convertible to bool");
   using MergeAgent = typename choose_merge_agent<

cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce.cuh CHANGED Viewed

@@ -790,7 +790,7 @@ struct DispatchSegmentedReduce
    *   Function type of cub::DeviceSegmentedReduceKernel
    *
    * @param[in] segmented_reduce_kernel
-   *   Kernel function pointer to parameterization of
+   *   Kernel function pointer to instantiation of
    *   cub::DeviceSegmentedReduceKernel
    */
   template <typename ActivePolicyT, typename DeviceSegmentedReduceKernelT>
@@ -809,7 +809,8 @@ struct DispatchSegmentedReduce
         return cudaSuccess;
       }
-      // Init kernel configuration
+      // Init kernel configuration (computes kernel occupancy)
+      // maybe only used inside CUB_DEBUG_LOG code sections
       [[maybe_unused]] detail::KernelConfig segmented_reduce_config;
       error =
         CubDebug(segmented_reduce_config.Init(segmented_reduce_kernel, policy.SegmentedReduce(), launcher_factory));
@@ -839,7 +840,7 @@ struct DispatchSegmentedReduce
                 segmented_reduce_config.sm_occupancy);
 #endif // CUB_DEBUG_LOG
-        // Invoke DeviceReduceKernel
+        // Invoke DeviceSegmentedReduceKernel
         launcher_factory(
           static_cast<::cuda::std::uint32_t>(num_current_segments), policy.SegmentedReduce().BlockThreads(), 0, stream)
           .doit(segmented_reduce_kernel, d_in, d_out, d_begin_offsets, d_end_offsets, reduction_op, init);

cuda/cccl/headers/include/cub/device/dispatch/dispatch_reduce_deterministic.cuh CHANGED Viewed

@@ -77,7 +77,7 @@ namespace rfa
 {
 template <typename Invocable, typename InputT>
-using transformed_input_t = ::cuda::std::decay_t<typename ::cuda::std::__invoke_of<Invocable, InputT>::type>;
+using transformed_input_t = ::cuda::std::decay_t<::cuda::std::invoke_result_t<Invocable, InputT>>;
 template <typename InitT, typename InputIteratorT, typename TransformOpT>
 using accum_t =

cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce.cuh CHANGED Viewed

@@ -18,8 +18,8 @@
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/iterator_adaptor.h>
-#include <thrust/iterator/tabulate_output_iterator.h>
+#include <cuda/__iterator/tabulate_output_iterator.h>
 #include <cuda/std/__functional/identity.h>
 #include <cuda/std/__utility/swap.h>
 #include <cuda/std/limits>
@@ -217,8 +217,7 @@ struct dispatch_streaming_arg_reduce_t
     // The output iterator that implements the logic to accumulate per-partition result to a global aggregate and,
     // eventually, write to the user-provided output iterators
-    using accumulating_transform_out_it_t =
-      THRUST_NS_QUALIFIER::tabulate_output_iterator<accumulating_transform_output_op_t>;
+    using accumulating_transform_out_it_t = ::cuda::tabulate_output_iterator<accumulating_transform_output_op_t>;
     // Empty problem initialization type
     using empty_problem_init_t = empty_problem_init_t<per_partition_accum_t>;
@@ -270,7 +269,7 @@ struct dispatch_streaming_arg_reduce_t
       nullptr,
       allocation_sizes[0],
       d_indexed_offset_in,
-      THRUST_NS_QUALIFIER::make_tabulate_output_iterator(accumulating_out_op),
+      ::cuda::make_tabulate_output_iterator(accumulating_out_op),
       static_cast<PerPartitionOffsetT>(largest_partition_size),
       reduce_op,
       initial_value,
@@ -315,7 +314,7 @@ struct dispatch_streaming_arg_reduce_t
         d_temp_storage,
         temp_storage_bytes,
         d_indexed_offset_in,
-        THRUST_NS_QUALIFIER::make_tabulate_output_iterator(accumulating_out_op),
+        ::cuda::make_tabulate_output_iterator(accumulating_out_op),
         static_cast<PerPartitionOffsetT>(current_num_items),
         reduce_op,
         initial_value,

cuda/cccl/headers/include/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh CHANGED Viewed

@@ -23,7 +23,6 @@
 #include <cub/util_type.cuh>
 #include <thrust/iterator/offset_iterator.h>
-#include <thrust/iterator/tabulate_output_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
 #include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

cuda/cccl/headers/include/cub/device/dispatch/dispatch_topk.cuh CHANGED Viewed

@@ -387,15 +387,13 @@ struct DispatchTopK
         return error;
       }
-      _CubLog("Invoking topk_kernel<<<{%d,%d,%d}, %d, 0, "
+      _CubLog("Invoking topk_kernel<<<%d, %d, 0, "
               "%lld>>>(), %d items per thread, %d SM occupancy\n",
-              topk_grid_size.x,
-              topk_grid_size.y,
-              topk_grid_size.z,
+              topk_grid_size,
               block_threads,
               (long long) stream,
               items_per_thread,
-              topk_blocks_per_sm);
+              main_kernel_blocks_per_sm);
     }
 #endif // CUB_DEBUG_LOG

cuda/cccl/headers/include/cub/device/dispatch/dispatch_transform.cuh CHANGED Viewed

@@ -109,8 +109,9 @@ struct TransformKernelSource<Offset,
     return detail::transform::make_aligned_base_ptr_kernel_arg(it, align);
   }
+private:
   template <typename T>
-  CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static auto IsPointerAligned(T it, [[maybe_unused]] int alignment)
+  CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static auto is_pointer_aligned(T it, [[maybe_unused]] int alignment)
   {
     if constexpr (THRUST_NS_QUALIFIER::is_contiguous_iterator_v<decltype(it)>)
     {
@@ -121,6 +122,14 @@ struct TransformKernelSource<Offset,
       return true; // fancy iterators are aligned, since the vectorized kernel chooses a different code path
     }
   }
+public:
+  CUB_RUNTIME_FUNCTION constexpr static bool
+  CanVectorize(int vec_size, const RandomAccessIteratorOut& out, const RandomAccessIteratorsIn&... in)
+  {
+    return is_pointer_aligned(out, sizeof(it_value_t<RandomAccessIteratorOut>) * vec_size)
+        && (is_pointer_aligned(in, sizeof(it_value_t<RandomAccessIteratorsIn>) * vec_size) && ...);
+  }
 };
 enum class requires_stable_address
@@ -384,7 +393,7 @@ struct dispatch_t<StableAddress,
   }
   CUB_DEFINE_SFINAE_GETTER(items_per_thread_no_input, prefetch, ItemsPerThreadNoInput)
-  CUB_DEFINE_SFINAE_GETTER(load_store_word_size, vectorized, LoadStoreWordSize)
+  CUB_DEFINE_SFINAE_GETTER(vec_size, vectorized, VecSize)
   CUB_DEFINE_SFINAE_GETTER(items_per_thread_vectorized, vectorized, ItemsPerThreadVectorized)
 #undef CUB_DEFINE_SFINAE_GETTER
@@ -441,9 +450,8 @@ struct dispatch_t<StableAddress,
     // the policy already handles the compile-time checks if we can vectorize. Do the remaining alignment check here
     if CUB_DETAIL_CONSTEXPR_ISH (Algorithm::vectorized == wrapped_policy.Algorithm())
     {
-      const int alignment = load_store_word_size(wrapped_policy.AlgorithmPolicy());
-      can_vectorize       = (kernel_source.IsPointerAligned(::cuda::std::get<Is>(in), alignment) && ...)
-                   && kernel_source.IsPointerAligned(out, alignment);
+      const int vs  = vec_size(wrapped_policy.AlgorithmPolicy());
+      can_vectorize = kernel_source.CanVectorize(vs, out, ::cuda::std::get<Is>(in)...);
     }
     int ipt        = 0;