whispercpp 1.3.1 → 1.3.2
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/.gitignore +4 -3
- data/README.md +92 -31
- data/Rakefile +26 -7
- data/ext/.gitignore +5 -7
- data/ext/dependencies.rb +61 -0
- data/ext/extconf.rb +21 -198
- data/ext/options.rb +221 -0
- data/ext/ruby_whisper.c +159 -0
- data/ext/ruby_whisper.h +17 -2
- data/ext/ruby_whisper_context.c +641 -0
- data/ext/ruby_whisper_error.c +52 -0
- data/ext/ruby_whisper_model.c +232 -0
- data/ext/ruby_whisper_params.c +1301 -0
- data/ext/ruby_whisper_segment.c +143 -0
- data/ext/ruby_whisper_transcribe.cpp +87 -0
- data/ext/ruby_whisper_vad_params.c +288 -0
- data/ext/sources/.dockerignore +3 -0
- data/ext/sources/.github/workflows/bindings-ruby.yml +21 -0
- data/ext/sources/CMakeGraphVizOptions.cmake +8 -0
- data/ext/sources/CMakeLists.txt +251 -0
- data/ext/sources/bindings/javascript/CMakeLists.txt +41 -0
- data/ext/sources/bindings/javascript/emscripten.cpp +93 -0
- data/ext/sources/bindings/javascript/libwhisper.worker.js +1 -0
- data/ext/sources/bindings/javascript/package-tmpl.json +26 -0
- data/ext/sources/bindings/javascript/package.json +26 -0
- data/ext/sources/bindings/javascript/whisper.js +19 -0
- data/ext/sources/build-xcframework.sh +547 -0
- data/ext/sources/ci/run.sh +336 -0
- data/ext/sources/close-issue.yml +28 -0
- data/ext/sources/cmake/DefaultTargetOptions.cmake +16 -0
- data/ext/sources/cmake/FindFFmpeg.cmake +163 -0
- data/ext/sources/cmake/build-info.cmake +60 -0
- data/ext/sources/cmake/git-vars.cmake +22 -0
- data/ext/sources/cmake/whisper-config.cmake.in +65 -0
- data/ext/sources/cmake/whisper.pc.in +10 -0
- data/ext/sources/examples/CMakeLists.txt +124 -0
- data/ext/sources/examples/addon.node/CMakeLists.txt +31 -0
- data/ext/sources/examples/addon.node/__test__/whisper.spec.js +37 -0
- data/ext/sources/examples/addon.node/addon.cpp +438 -0
- data/ext/sources/examples/addon.node/index.js +54 -0
- data/ext/sources/examples/addon.node/package.json +16 -0
- data/ext/sources/examples/bench/CMakeLists.txt +8 -0
- data/ext/sources/examples/bench/bench.cpp +175 -0
- data/ext/sources/examples/bench.wasm/CMakeLists.txt +49 -0
- data/ext/sources/examples/bench.wasm/emscripten.cpp +87 -0
- data/ext/sources/examples/bench.wasm/index-tmpl.html +284 -0
- data/ext/sources/examples/cli/CMakeLists.txt +8 -0
- data/ext/sources/examples/cli/cli.cpp +1294 -0
- data/ext/sources/examples/coi-serviceworker.js +146 -0
- data/ext/sources/examples/command/CMakeLists.txt +10 -0
- data/ext/sources/examples/command/command.cpp +776 -0
- data/ext/sources/examples/command/commands.txt +9 -0
- data/ext/sources/examples/command.wasm/CMakeLists.txt +50 -0
- data/ext/sources/examples/command.wasm/emscripten.cpp +327 -0
- data/ext/sources/examples/command.wasm/index-tmpl.html +414 -0
- data/ext/sources/examples/common-ggml.cpp +238 -0
- data/ext/sources/examples/common-ggml.h +18 -0
- data/ext/sources/examples/common-sdl.cpp +227 -0
- data/ext/sources/examples/common-sdl.h +49 -0
- data/ext/sources/examples/common-whisper.cpp +168 -0
- data/ext/sources/examples/common-whisper.h +24 -0
- data/ext/sources/examples/common.cpp +675 -0
- data/ext/sources/examples/common.h +322 -0
- data/ext/sources/examples/deprecation-warning/CMakeLists.txt +6 -0
- data/ext/sources/examples/deprecation-warning/deprecation-warning.cpp +38 -0
- data/ext/sources/examples/ffmpeg-transcode.cpp +368 -0
- data/ext/sources/examples/generate-karaoke.sh +57 -0
- data/ext/sources/examples/grammar-parser.cpp +423 -0
- data/ext/sources/examples/grammar-parser.h +29 -0
- data/ext/sources/examples/helpers.js +191 -0
- data/ext/sources/examples/json.hpp +24596 -0
- data/ext/sources/examples/livestream.sh +112 -0
- data/ext/sources/examples/lsp/CMakeLists.txt +9 -0
- data/ext/sources/examples/lsp/lsp.cpp +467 -0
- data/ext/sources/examples/lsp/whisper.vim +362 -0
- data/ext/sources/examples/miniaudio.h +93468 -0
- data/ext/sources/examples/python/test_whisper_processor.py +7 -0
- data/ext/sources/examples/python/whisper_processor.py +54 -0
- data/ext/sources/examples/quantize/CMakeLists.txt +6 -0
- data/ext/sources/examples/quantize/quantize.cpp +223 -0
- data/ext/sources/examples/server/CMakeLists.txt +12 -0
- data/ext/sources/examples/server/bench.js +29 -0
- data/ext/sources/examples/server/httplib.h +10497 -0
- data/ext/sources/examples/server/server.cpp +1091 -0
- data/ext/sources/examples/server.py +115 -0
- data/ext/sources/examples/stb_vorbis.c +5584 -0
- data/ext/sources/examples/stream/CMakeLists.txt +10 -0
- data/ext/sources/examples/stream/stream.cpp +429 -0
- data/ext/sources/examples/stream.wasm/CMakeLists.txt +49 -0
- data/ext/sources/examples/stream.wasm/emscripten.cpp +216 -0
- data/ext/sources/examples/stream.wasm/index-tmpl.html +414 -0
- data/ext/sources/examples/sycl/CMakeLists.txt +9 -0
- data/ext/sources/examples/sycl/build.sh +22 -0
- data/ext/sources/examples/sycl/ls-sycl-device.cpp +11 -0
- data/ext/sources/examples/sycl/run-whisper.sh +17 -0
- data/ext/sources/examples/talk-llama/CMakeLists.txt +40 -0
- data/ext/sources/examples/talk-llama/eleven-labs.py +80 -0
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +388 -0
- data/ext/sources/examples/talk-llama/llama-adapter.h +76 -0
- data/ext/sources/examples/talk-llama/llama-arch.cpp +1746 -0
- data/ext/sources/examples/talk-llama/llama-arch.h +437 -0
- data/ext/sources/examples/talk-llama/llama-batch.cpp +374 -0
- data/ext/sources/examples/talk-llama/llama-batch.h +89 -0
- data/ext/sources/examples/talk-llama/llama-chat.cpp +663 -0
- data/ext/sources/examples/talk-llama/llama-chat.h +58 -0
- data/ext/sources/examples/talk-llama/llama-context.cpp +2676 -0
- data/ext/sources/examples/talk-llama/llama-context.h +276 -0
- data/ext/sources/examples/talk-llama/llama-cparams.cpp +5 -0
- data/ext/sources/examples/talk-llama/llama-cparams.h +41 -0
- data/ext/sources/examples/talk-llama/llama-grammar.cpp +1229 -0
- data/ext/sources/examples/talk-llama/llama-grammar.h +173 -0
- data/ext/sources/examples/talk-llama/llama-graph.cpp +1618 -0
- data/ext/sources/examples/talk-llama/llama-graph.h +640 -0
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +95 -0
- data/ext/sources/examples/talk-llama/llama-hparams.h +190 -0
- data/ext/sources/examples/talk-llama/llama-impl.cpp +167 -0
- data/ext/sources/examples/talk-llama/llama-impl.h +61 -0
- data/ext/sources/examples/talk-llama/llama-io.cpp +15 -0
- data/ext/sources/examples/talk-llama/llama-io.h +35 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +2739 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +502 -0
- data/ext/sources/examples/talk-llama/llama-kv-cells.h +379 -0
- data/ext/sources/examples/talk-llama/llama-memory.cpp +1 -0
- data/ext/sources/examples/talk-llama/llama-memory.h +32 -0
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +600 -0
- data/ext/sources/examples/talk-llama/llama-mmap.h +68 -0
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +1138 -0
- data/ext/sources/examples/talk-llama/llama-model-loader.h +169 -0
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +281 -0
- data/ext/sources/examples/talk-llama/llama-model-saver.h +37 -0
- data/ext/sources/examples/talk-llama/llama-model.cpp +13814 -0
- data/ext/sources/examples/talk-llama/llama-model.h +425 -0
- data/ext/sources/examples/talk-llama/llama-quant.cpp +966 -0
- data/ext/sources/examples/talk-llama/llama-quant.h +1 -0
- data/ext/sources/examples/talk-llama/llama-sampling.cpp +2575 -0
- data/ext/sources/examples/talk-llama/llama-sampling.h +32 -0
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +3340 -0
- data/ext/sources/examples/talk-llama/llama-vocab.h +131 -0
- data/ext/sources/examples/talk-llama/llama.cpp +354 -0
- data/ext/sources/examples/talk-llama/llama.h +1377 -0
- data/ext/sources/examples/talk-llama/prompts/talk-alpaca.txt +23 -0
- data/ext/sources/examples/talk-llama/speak +40 -0
- data/ext/sources/examples/talk-llama/speak.bat +1 -0
- data/ext/sources/examples/talk-llama/speak.ps1 +14 -0
- data/ext/sources/examples/talk-llama/talk-llama.cpp +808 -0
- data/ext/sources/examples/talk-llama/unicode-data.cpp +7034 -0
- data/ext/sources/examples/talk-llama/unicode-data.h +20 -0
- data/ext/sources/examples/talk-llama/unicode.cpp +849 -0
- data/ext/sources/examples/talk-llama/unicode.h +66 -0
- data/ext/sources/examples/vad-speech-segments/CMakeLists.txt +8 -0
- data/ext/sources/examples/vad-speech-segments/speech.cpp +143 -0
- data/ext/sources/examples/wchess/CMakeLists.txt +10 -0
- data/ext/sources/examples/wchess/libwchess/CMakeLists.txt +19 -0
- data/ext/sources/examples/wchess/libwchess/Chessboard.cpp +803 -0
- data/ext/sources/examples/wchess/libwchess/Chessboard.h +33 -0
- data/ext/sources/examples/wchess/libwchess/WChess.cpp +193 -0
- data/ext/sources/examples/wchess/libwchess/WChess.h +63 -0
- data/ext/sources/examples/wchess/libwchess/test-chessboard.cpp +117 -0
- data/ext/sources/examples/wchess/wchess.cmd/CMakeLists.txt +8 -0
- data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +249 -0
- data/ext/sources/examples/whisper.wasm/CMakeLists.txt +50 -0
- data/ext/sources/examples/whisper.wasm/emscripten.cpp +118 -0
- data/ext/sources/examples/whisper.wasm/index-tmpl.html +658 -0
- data/ext/sources/ggml/CMakeLists.txt +390 -0
- data/ext/sources/ggml/cmake/BuildTypes.cmake +54 -0
- data/ext/sources/ggml/cmake/GitVars.cmake +22 -0
- data/ext/sources/ggml/cmake/common.cmake +26 -0
- data/ext/sources/ggml/cmake/ggml-config.cmake.in +152 -0
- data/ext/{ggml → sources/ggml}/include/ggml-alloc.h +1 -1
- data/ext/{ggml → sources/ggml}/include/ggml-backend.h +9 -7
- data/ext/{ggml → sources/ggml}/include/ggml-cpp.h +2 -1
- data/ext/{ggml → sources/ggml}/include/ggml-cpu.h +9 -1
- data/ext/{ggml → sources/ggml}/include/ggml-metal.h +1 -1
- data/ext/{ggml → sources/ggml}/include/ggml-opt.h +49 -28
- data/ext/{ggml → sources/ggml}/include/ggml-rpc.h +6 -1
- data/ext/{ggml → sources/ggml}/include/ggml-vulkan.h +0 -2
- data/ext/{ggml → sources/ggml}/include/ggml.h +182 -265
- data/ext/sources/ggml/include/gguf.h +202 -0
- data/ext/sources/ggml/src/CMakeLists.txt +346 -0
- data/ext/{ggml → sources/ggml}/src/ggml-alloc.c +34 -29
- data/ext/sources/ggml/src/ggml-amx/CMakeLists.txt +107 -0
- data/ext/{ggml → sources/ggml}/src/ggml-backend-impl.h +1 -2
- data/ext/{ggml → sources/ggml}/src/ggml-backend-reg.cpp +87 -53
- data/ext/{ggml → sources/ggml}/src/ggml-backend.cpp +26 -14
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +87 -0
- data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +74 -0
- data/ext/sources/ggml/src/ggml-cann/Doxyfile +2579 -0
- data/ext/{ggml → sources/ggml}/src/ggml-cann/acl_tensor.cpp +10 -4
- data/ext/{ggml → sources/ggml}/src/ggml-cann/acl_tensor.h +5 -5
- data/ext/{ggml → sources/ggml}/src/ggml-cann/aclnn_ops.cpp +1272 -1506
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +1125 -0
- data/ext/{ggml → sources/ggml}/src/ggml-cann/common.h +135 -1
- data/ext/{ggml → sources/ggml}/src/ggml-cann/ggml-cann.cpp +564 -146
- data/ext/sources/ggml/src/ggml-cann/kernels/CMakeLists.txt +30 -0
- data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/dup.cpp +3 -5
- data/ext/{ggml → sources/ggml}/src/ggml-common.h +12 -8
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +504 -0
- data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/amx.cpp +2 -1
- data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
- data/ext/sources/ggml/src/ggml-cpu/binary-ops.h +16 -0
- data/ext/sources/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +100 -0
- data/ext/sources/ggml/src/ggml-cpu/common.h +72 -0
- data/ext/{ggml → sources/ggml}/src/ggml-cpu/cpu-feats-x86.cpp +5 -1
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +6431 -0
- data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-impl.h +163 -41
- data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-quants.c +4029 -1117
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +3510 -0
- data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu.cpp +67 -18
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +337 -0
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.h +95 -0
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +482 -0
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3544 -0
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +14 -0
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +8903 -0
- data/ext/sources/ggml/src/ggml-cpu/ops.h +110 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +892 -0
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.h +28 -0
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +252 -0
- data/ext/sources/ggml/src/ggml-cpu/vec.h +818 -0
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +184 -0
- data/ext/sources/ggml/src/ggml-cuda/acc.cu +61 -0
- data/ext/sources/ggml/src/ggml-cuda/acc.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/arange.cu +34 -0
- data/ext/sources/ggml/src/ggml-cuda/arange.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/argmax.cu +91 -0
- data/ext/sources/ggml/src/ggml-cuda/argmax.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/argsort.cu +104 -0
- data/ext/sources/ggml/src/ggml-cuda/argsort.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +363 -0
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cuh +9 -0
- data/ext/sources/ggml/src/ggml-cuda/clamp.cu +45 -0
- data/ext/sources/ggml/src/ggml-cuda/clamp.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +828 -0
- data/ext/sources/ggml/src/ggml-cuda/concat.cu +221 -0
- data/ext/sources/ggml/src/ggml-cuda/concat.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cu +89 -0
- data/ext/sources/ggml/src/ggml-cuda/conv-transpose-1d.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +730 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cuh +26 -0
- data/ext/sources/ggml/src/ggml-cuda/count-equal.cu +64 -0
- data/ext/sources/ggml/src/ggml-cuda/count-equal.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/cp-async.cuh +57 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +705 -0
- data/ext/sources/ggml/src/ggml-cuda/cpy.cuh +11 -0
- data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cu +189 -0
- data/ext/sources/ggml/src/ggml-cuda/cross-entropy-loss.cuh +7 -0
- data/ext/sources/ggml/src/ggml-cuda/dequantize.cuh +103 -0
- data/ext/sources/ggml/src/ggml-cuda/diagmask.cu +40 -0
- data/ext/sources/ggml/src/ggml-cuda/diagmask.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +881 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +1471 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cu +357 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f16.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cu +365 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile-f32.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f16.cuh +482 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec-f32.cuh +472 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +634 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +346 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/getrows.cu +275 -0
- data/ext/sources/ggml/src/ggml-cuda/getrows.cuh +15 -0
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +3505 -0
- data/ext/sources/ggml/src/ggml-cuda/gla.cu +93 -0
- data/ext/sources/ggml/src/ggml-cuda/gla.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/im2col.cu +103 -0
- data/ext/sources/ggml/src/ggml-cuda/im2col.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +396 -0
- data/ext/sources/ggml/src/ggml-cuda/mmq.cu +324 -0
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +3217 -0
- data/ext/sources/ggml/src/ggml-cuda/mmv.cu +336 -0
- data/ext/sources/ggml/src/ggml-cuda/mmv.cuh +12 -0
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +595 -0
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +12 -0
- data/ext/sources/ggml/src/ggml-cuda/norm.cu +458 -0
- data/ext/sources/ggml/src/ggml-cuda/norm.cuh +11 -0
- data/ext/sources/ggml/src/ggml-cuda/opt-step-adamw.cu +78 -0
- data/ext/sources/ggml/src/ggml-cuda/opt-step-adamw.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/out-prod.cu +68 -0
- data/ext/sources/ggml/src/ggml-cuda/out-prod.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/pad.cu +49 -0
- data/ext/sources/ggml/src/ggml-cuda/pad.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/pool2d.cu +94 -0
- data/ext/sources/ggml/src/ggml-cuda/pool2d.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +190 -0
- data/ext/sources/ggml/src/ggml-cuda/quantize.cuh +27 -0
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +456 -0
- data/ext/sources/ggml/src/ggml-cuda/rope.cuh +7 -0
- data/ext/sources/ggml/src/ggml-cuda/scale.cu +31 -0
- data/ext/sources/ggml/src/ggml-cuda/scale.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/softmax.cu +283 -0
- data/ext/sources/ggml/src/ggml-cuda/softmax.cuh +7 -0
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +148 -0
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +153 -0
- data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/sum.cu +45 -0
- data/ext/sources/ggml/src/ggml-cuda/sum.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +39 -0
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_1.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_2.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_1.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_2.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_2.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_64-ncols2_1.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_1.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_2.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu +10 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +78 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq1_s.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_s.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xs.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq2_xxs.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_s.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq3_xxs.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_nl.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-iq4_xs.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/tsembd.cu +47 -0
- data/ext/sources/ggml/src/ggml-cuda/tsembd.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +289 -0
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +59 -0
- data/ext/sources/ggml/src/ggml-cuda/upscale.cu +51 -0
- data/ext/sources/ggml/src/ggml-cuda/upscale.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +1135 -0
- data/ext/{ggml → sources/ggml}/src/ggml-cuda/vendors/cuda.h +1 -0
- data/ext/{ggml → sources/ggml}/src/ggml-cuda/vendors/hip.h +57 -0
- data/ext/{ggml → sources/ggml}/src/ggml-cuda/vendors/musa.h +7 -1
- data/ext/sources/ggml/src/ggml-cuda/wkv.cu +199 -0
- data/ext/sources/ggml/src/ggml-cuda/wkv.cuh +7 -0
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +131 -0
- data/ext/{ggml → sources/ggml}/src/ggml-impl.h +64 -19
- data/ext/sources/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/common.comp +112 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +58 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +25 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +52 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +52 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +52 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +52 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +30 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +22 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +17 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +31 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +31 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +38 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +39 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +44 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +52 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +69 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +51 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +33 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +35 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +140 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +106 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +73 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +52 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +28 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +84 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +21 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +53 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +52 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +52 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +52 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +52 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +19 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +23 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +22 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +72 -0
- data/ext/sources/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +71 -0
- data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +120 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +622 -0
- data/ext/{ggml → sources/ggml}/src/ggml-metal/ggml-metal.m +2178 -1064
- data/ext/{ggml → sources/ggml}/src/ggml-metal/ggml-metal.metal +1575 -1218
- data/ext/sources/ggml/src/ggml-musa/CMakeLists.txt +113 -0
- data/ext/sources/ggml/src/ggml-musa/mudnn.cu +112 -0
- data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +12 -0
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +96 -0
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +5124 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/add.cl +83 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/clamp.cl +20 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +184 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +118 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/diag_mask_inf.cl +58 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/embed_kernel.py +26 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gelu.cl +62 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle.cl +268 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general.cl +274 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/get_rows.cl +163 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f16.cl +57 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/im2col_f32.cl +57 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul.cl +79 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mat_Ab_Bi_8x4.cl +139 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f16.cl +118 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32.cl +118 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_1row.cl +94 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl +84 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_f32_f32.cl +118 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +192 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +307 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +265 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +272 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +254 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k.cl +190 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/norm.cl +81 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/relu.cl +16 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/rms_norm.cl +96 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/rope.cl +721 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +16 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/silu.cl +30 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +87 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +87 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f16.cl +86 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/softmax_f32.cl +86 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +84 -0
- data/ext/{ggml → sources/ggml}/src/ggml-opt.cpp +373 -190
- data/ext/{ggml → sources/ggml}/src/ggml-quants.c +114 -120
- data/ext/sources/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
- data/ext/{ggml → sources/ggml}/src/ggml-rpc/ggml-rpc.cpp +480 -73
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +189 -0
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +37 -0
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +345 -0
- data/ext/sources/ggml/src/ggml-sycl/binbcast.hpp +39 -0
- data/ext/{ggml → sources/ggml}/src/ggml-sycl/common.cpp +20 -32
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +589 -0
- data/ext/{ggml → sources/ggml}/src/ggml-sycl/concat.cpp +32 -33
- data/ext/sources/ggml/src/ggml-sycl/concat.hpp +20 -0
- data/ext/{ggml → sources/ggml}/src/ggml-sycl/conv.cpp +4 -2
- data/ext/sources/ggml/src/ggml-sycl/conv.hpp +20 -0
- data/ext/{ggml → sources/ggml}/src/ggml-sycl/convert.cpp +104 -28
- data/ext/sources/ggml/src/ggml-sycl/convert.hpp +34 -0
- data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +700 -0
- data/ext/sources/ggml/src/ggml-sycl/cpy.hpp +11 -0
- data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +791 -0
- data/ext/{ggml → sources/ggml}/src/ggml-sycl/dmmv.cpp +156 -17
- data/ext/sources/ggml/src/ggml-sycl/dmmv.hpp +27 -0
- data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +2957 -0
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +1511 -0
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +75 -0
- data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +99 -0
- data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +309 -0
- data/ext/sources/ggml/src/ggml-sycl/getrows.hpp +20 -0
- data/ext/{ggml → sources/ggml}/src/ggml-sycl/ggml-sycl.cpp +1004 -1240
- data/ext/sources/ggml/src/ggml-sycl/gla.cpp +106 -0
- data/ext/sources/ggml/src/ggml-sycl/gla.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +136 -0
- data/ext/sources/ggml/src/ggml-sycl/im2col.hpp +21 -0
- data/ext/{ggml → sources/ggml}/src/ggml-sycl/mmq.cpp +0 -1
- data/ext/sources/ggml/src/ggml-sycl/mmq.hpp +33 -0
- data/ext/{ggml → sources/ggml}/src/ggml-sycl/mmvq.cpp +261 -166
- data/ext/sources/ggml/src/ggml-sycl/mmvq.hpp +27 -0
- data/ext/{ggml → sources/ggml}/src/ggml-sycl/norm.cpp +204 -81
- data/ext/sources/ggml/src/ggml-sycl/norm.hpp +26 -0
- data/ext/{ggml → sources/ggml}/src/ggml-sycl/outprod.cpp +8 -17
- data/ext/sources/ggml/src/ggml-sycl/outprod.hpp +10 -0
- data/ext/sources/ggml/src/ggml-sycl/presets.hpp +74 -0
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +83 -0
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +361 -0
- data/ext/sources/ggml/src/ggml-sycl/rope.hpp +20 -0
- data/ext/{ggml → sources/ggml}/src/ggml-sycl/softmax.cpp +35 -25
- data/ext/sources/ggml/src/ggml-sycl/softmax.hpp +20 -0
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
- data/ext/{ggml → sources/ggml}/src/ggml-sycl/tsembd.cpp +3 -3
- data/ext/sources/ggml/src/ggml-sycl/tsembd.hpp +20 -0
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +1215 -0
- data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +293 -0
- data/ext/sources/ggml/src/ggml-sycl/wkv.hpp +10 -0
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +196 -0
- data/ext/sources/ggml/src/ggml-vulkan/cmake/host-toolchain.cmake.in +15 -0
- data/ext/{ggml → sources/ggml}/src/ggml-vulkan/ggml-vulkan.cpp +3130 -1087
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +39 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +51 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +69 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +17 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +41 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +49 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +105 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +23 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +51 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +242 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +17 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +31 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +20 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +462 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +699 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_head.comp +13 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +42 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +35 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +44 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +43 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +48 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +39 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +49 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +32 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +34 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +34 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +42 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +30 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +32 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +68 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +34 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +35 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +70 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +33 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +31 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +34 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +337 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +162 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +360 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +267 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +59 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +25 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +23 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +64 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_head.comp +9 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp +76 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +33 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +41 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +66 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +100 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +41 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_split_k_reduce.comp +48 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +169 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +118 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +82 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +79 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +90 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +87 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +87 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +90 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +88 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +118 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +154 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +130 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +132 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +136 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +167 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +130 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +868 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +441 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +442 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +99 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +44 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +42 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +28 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +74 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +77 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +26 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +37 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +52 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +55 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +58 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +60 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +43 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +43 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +47 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +24 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +20 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +22 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +26 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +17 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +173 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +50 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +17 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +37 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +20 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_bfloat16_support.comp +7 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat2_support.comp +7 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat_support.comp +7 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/test_integer_dot_support.comp +7 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +41 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +1373 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +36 -0
- data/ext/{ggml → sources/ggml}/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +193 -35
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/wkv6.comp +87 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/wkv7.comp +91 -0
- data/ext/{ggml → sources/ggml}/src/ggml.c +676 -1820
- data/ext/sources/ggml/src/gguf.cpp +1330 -0
- data/ext/{include → sources/include}/whisper.h +68 -2
- data/ext/sources/src/CMakeLists.txt +143 -0
- data/ext/{src → sources/src}/coreml/whisper-decoder-impl.h +27 -15
- data/ext/{src → sources/src}/coreml/whisper-decoder-impl.m +35 -10
- data/ext/{src → sources/src}/coreml/whisper-encoder-impl.h +21 -9
- data/ext/{src → sources/src}/coreml/whisper-encoder-impl.m +28 -3
- data/ext/sources/src/coreml/whisper-encoder.mm +73 -0
- data/ext/sources/src/whisper-arch.h +197 -0
- data/ext/{src → sources/src}/whisper.cpp +1905 -374
- data/ext/sources/tests/CMakeLists.txt +105 -0
- data/ext/sources/tests/earnings21/eval.mk +58 -0
- data/ext/sources/tests/earnings21/eval.py +68 -0
- data/ext/sources/tests/earnings21/normalizers/__init__.py +2 -0
- data/ext/sources/tests/earnings21/normalizers/basic.py +80 -0
- data/ext/sources/tests/earnings21/normalizers/english.json +1741 -0
- data/ext/sources/tests/earnings21/normalizers/english.py +550 -0
- data/ext/sources/tests/earnings21/requirements.txt +6 -0
- data/ext/sources/tests/en-0-ref.txt +1 -0
- data/ext/sources/tests/en-1-ref.txt +1 -0
- data/ext/sources/tests/en-2-ref.txt +1 -0
- data/ext/sources/tests/es-0-ref.txt +1 -0
- data/ext/sources/tests/librispeech/eval.mk +39 -0
- data/ext/sources/tests/librispeech/eval.py +47 -0
- data/ext/sources/tests/librispeech/normalizers/__init__.py +2 -0
- data/ext/sources/tests/librispeech/normalizers/basic.py +80 -0
- data/ext/sources/tests/librispeech/normalizers/english.json +1741 -0
- data/ext/sources/tests/librispeech/normalizers/english.py +550 -0
- data/ext/sources/tests/librispeech/requirements.txt +6 -0
- data/ext/sources/tests/run-tests.sh +130 -0
- data/ext/sources/tests/test-c.c +3 -0
- data/ext/sources/tests/test-vad-full.cpp +54 -0
- data/ext/sources/tests/test-vad.cpp +83 -0
- data/ext/sources/tests/test-whisper.js +58 -0
- data/extsources.rb +33 -5
- data/lib/whisper/model/uri.rb +149 -128
- data/sig/whisper.rbs +480 -0
- data/tests/helper.rb +28 -0
- data/tests/test_callback.rb +45 -3
- data/tests/test_error.rb +2 -2
- data/tests/test_model.rb +38 -0
- data/tests/test_package.rb +18 -3
- data/tests/test_params.rb +145 -8
- data/tests/test_segment.rb +10 -19
- data/tests/test_vad.rb +19 -0
- data/tests/test_vad_params.rb +103 -0
- data/tests/test_whisper.rb +37 -37
- data/whispercpp.gemspec +5 -4
- metadata +766 -111
- data/ext/cpu.mk +0 -9
- data/ext/examples/dr_wav.h +0 -8815
- data/ext/ggml/src/ggml-cann/aclnn_ops.h +0 -592
- data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -4262
- data/ext/ggml/src/ggml-cpu/ggml-cpu.c +0 -14123
- data/ext/ggml/src/ggml-cpu/llamafile/sgemm.cpp +0 -1884
- data/ext/ggml/src/ggml-cpu/llamafile/sgemm.h +0 -14
- data/ext/ggml/src/ggml-metal/ggml-metal-impl.h +0 -288
- data/ext/ggml/src/ggml-sycl/element_wise.cpp +0 -1030
- data/ext/ggml/src/ggml-sycl/im2col.cpp +0 -126
- data/ext/ggml/src/ggml-sycl/rope.cpp +0 -276
- data/ext/ggml/src/ggml-sycl/wkv6.cpp +0 -141
- data/ext/metal-embed.mk +0 -17
- data/ext/metal.mk +0 -6
- data/ext/ruby_whisper.cpp +0 -1909
- data/ext/scripts/get-flags.mk +0 -38
- data/lib/whisper.rb +0 -2
- /data/ext/{ggml → sources/ggml}/include/ggml-blas.h +0 -0
- /data/ext/{ggml → sources/ggml}/include/ggml-cann.h +0 -0
- /data/ext/{ggml → sources/ggml}/include/ggml-cuda.h +0 -0
- /data/ext/{ggml → sources/ggml}/include/ggml-kompute.h +0 -0
- /data/ext/{ggml → sources/ggml}/include/ggml-opencl.h +0 -0
- /data/ext/{ggml → sources/ggml}/include/ggml-sycl.h +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-amx/common.h +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-amx/ggml-amx.cpp +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-amx/mmq.cpp +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-amx/mmq.h +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-blas/ggml-blas.cpp +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/ascendc_kernels.h +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/get_row_f16.cpp +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/get_row_f32.cpp +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/amx.h +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/common.h +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/mmq.cpp +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-cpu/amx/mmq.h +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-aarch64.h +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-hbm.cpp +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-hbm.h +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-quants.h +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-traits.cpp +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-cpu/ggml-cpu-traits.h +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-kompute/ggml-kompute.cpp +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-quants.h +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-threading.cpp +0 -0
- /data/ext/{ggml → sources/ggml}/src/ggml-threading.h +0 -0
- /data/ext/{src → sources/src}/coreml/whisper-encoder.h +0 -0
- /data/ext/{src → sources/src}/openvino/whisper-openvino-encoder.cpp +0 -0
- /data/ext/{src → sources/src}/openvino/whisper-openvino-encoder.h +0 -0
@@ -0,0 +1,283 @@
+#include "common.cuh"
+#include "ggml.h"
+#include "softmax.cuh"
+#include <cstdint>
+
+template <typename T>
+static __device__ __forceinline__ float t2f32(T val) {
+    return (float) val;
+}
+
+template <>
+__device__ float __forceinline__ t2f32<half>(half val) {
+    return __half2float(val);
+}
+
+// When ncols_template == 0 the bounds for the loops in this function are not known and can't be unrolled.
+// As we want to keep pragma unroll for all other cases we supress the clang transformation warning here.
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wpass-failed"
+#endif // __clang__
+template <bool use_shared, int ncols_template, int block_size_template, typename T>
+static __global__ void soft_max_f32(
+        const float * x, const T * mask, float * dst, const int ncols_par, const int nrows_y,
+        const float scale, const float max_bias, const float m0, const float m1, uint32_t n_head_log2) {
+    const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
+
+    const int tid = threadIdx.x;
+    const int rowx = blockIdx.x;
+    const int rowy = rowx % nrows_y; // broadcast the mask in the row dimension
+
+    x += int64_t(rowx)*ncols;
+    mask += int64_t(rowy)*ncols * (mask != nullptr);
+    dst += int64_t(rowx)*ncols;
+
+    const int block_size = block_size_template == 0 ? blockDim.x : block_size_template;
+
+    const int warp_id = threadIdx.x / WARP_SIZE;
+    const int lane_id = threadIdx.x % WARP_SIZE;
+
+    const float slope = get_alibi_slope(max_bias, rowx/nrows_y, n_head_log2, m0, m1);
+
+    extern __shared__ float data_soft_max_f32[];
+    float * buf_iw = data_soft_max_f32; // shared memory buffer for inter-warp communication
+    // shared memory buffer to cache values between iterations:
+    float * vals = use_shared ? buf_iw + WARP_SIZE : dst;
+
+    float max_val = -INFINITY;
+
+#pragma unroll
+    for (int col0 = 0; col0 < ncols; col0 += block_size) {
+        const int col = col0 + tid;
+
+        if (ncols_template == 0 && col >= ncols) {
+            break;
+        }
+
+        const float val = x[col]*scale + (mask ? slope*t2f32(mask[col]) : 0.0f);
+
+        vals[col] = val;
+        max_val = max(max_val, val);
+    }
+
+    // find the max value in the block
+    max_val = warp_reduce_max(max_val);
+    if (block_size > WARP_SIZE) {
+        if (warp_id == 0) {
+            buf_iw[lane_id] = -INFINITY;
+        }
+        __syncthreads();
+
+        if (lane_id == 0) {
+            buf_iw[warp_id] = max_val;
+        }
+        __syncthreads();
+
+        max_val = buf_iw[lane_id];
+        max_val = warp_reduce_max(max_val);
+    }
+
+    float tmp = 0.0f; // partial sum
+
+#pragma unroll
+    for (int col0 = 0; col0 < ncols; col0 += block_size) {
+        const int col = col0 + tid;
+
+        if (ncols_template == 0 && col >= ncols) {
+            break;
+        }
+
+        const float val = expf(vals[col] - max_val);
+        tmp += val;
+        vals[col] = val;
+    }
+
+    // find the sum of exps in the block
+    tmp = warp_reduce_sum(tmp);
+    if (block_size > WARP_SIZE) {
+        __syncthreads();
+        if (warp_id == 0) {
+            buf_iw[lane_id] = 0.0f;
+        }
+        __syncthreads();
+
+        if (lane_id == 0) {
+            buf_iw[warp_id] = tmp;
+        }
+        __syncthreads();
+
+        tmp = buf_iw[lane_id];
+        tmp = warp_reduce_sum(tmp);
+    }
+
+    const float inv_sum = 1.0f / tmp;
+
+#pragma unroll
+    for (int col0 = 0; col0 < ncols; col0 += block_size) {
+        const int col = col0 + tid;
+
+        if (ncols_template == 0 && col >= ncols) {
+            return;
+        }
+
+        dst[col] = vals[col] * inv_sum;
+    }
+}
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif // __clang__
+
+static __global__ void soft_max_back_f32(
+        const float * grad, const float * dstf, float * dst, const int ncols, const float scale) {
+    const int tid = threadIdx.x;
+    const int rowx = blockIdx.x;
+
+    grad += int64_t(rowx)*ncols;
+    dstf += int64_t(rowx)*ncols;
+    dst += int64_t(rowx)*ncols;
+
+    float dgf_dot = 0.0f; // dot product of dst from forward pass and gradients
+
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
+        dgf_dot += dstf[col]*grad[col];
+    }
+
+    dgf_dot = warp_reduce_sum(dgf_dot);
+
+    for (int col = tid; col < ncols; col += WARP_SIZE) {
+        dst[col] = scale * (grad[col] - dgf_dot) * dstf[col];
+    }
+}
+
+template<typename T>
+static void soft_max_f32_cuda(const float * x, const T * mask, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, const float max_bias, cudaStream_t stream) {
+    int nth = WARP_SIZE;
+    while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
+    const dim3 block_dims(nth, 1, 1);
+    const dim3 block_nums(nrows_x, 1, 1);
+    const size_t nbytes_shared = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE)*sizeof(float);
+    static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
+
+    const uint32_t n_head = nrows_x/nrows_y;
+    const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
+
+    const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
+    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
+
+    // FIXME: this limit could be raised by ~2-4x on Ampere or newer
+    if (nbytes_shared < ggml_cuda_info().devices[ggml_cuda_get_device()].smpb) {
+        switch (ncols_x) {
+            case 32:
+                soft_max_f32<true, 32, 32><<<block_nums, block_dims, nbytes_shared, stream>>>
+                    (x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
+                break;
+            case 64:
+                soft_max_f32<true, 64, 64><<<block_nums, block_dims, nbytes_shared, stream>>>
+                    (x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
+                break;
+            case 128:
+                soft_max_f32<true, 128, 128><<<block_nums, block_dims, nbytes_shared, stream>>>
+                    (x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
+                break;
+            case 256:
+                soft_max_f32<true, 256, 256><<<block_nums, block_dims, nbytes_shared, stream>>>
+                    (x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
+                break;
+            case 512:
+                soft_max_f32<true, 512, 512><<<block_nums, block_dims, nbytes_shared, stream>>>
+                    (x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
+                break;
+            case 1024:
+                soft_max_f32<true, 1024, 1024><<<block_nums, block_dims, nbytes_shared, stream>>>
+                    (x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
+                break;
+            case 2048:
+                soft_max_f32<true, 2048, 1024><<<block_nums, block_dims, nbytes_shared, stream>>>
+                    (x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
+                break;
+            case 4096:
+                soft_max_f32<true, 4096, 1024><<<block_nums, block_dims, nbytes_shared, stream>>>
+                    (x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
+                break;
+            default:
+                soft_max_f32<true, 0, 0><<<block_nums, block_dims, nbytes_shared, stream>>>
+                    (x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
+                break;
+        }
+    } else {
+        const size_t nbytes_shared_low = WARP_SIZE*sizeof(float);
+        soft_max_f32<false, 0, 0><<<block_nums, block_dims, nbytes_shared_low, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
+    }
+}
+
+static void soft_max_back_f32_cuda(
+        const float * grad, const float * dstf, float * dst,
+        const int ncols, const int nrows, const float scale, cudaStream_t stream) {
+    const dim3 block_dims(WARP_SIZE, 1, 1);
+    const dim3 block_nums(nrows, 1, 1);
+
+    soft_max_back_f32<<<block_nums, block_dims, 0, stream>>>(grad, dstf, dst, ncols, scale);
+}
+
+void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    const float * src0_d = (const float *) src0->data;
+    const void * src1_d = src1 ? (const void *) src1->data : nullptr;
+    float * dst_d = (float *) dst->data;
+
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t nrows_x = ggml_nrows(src0);
+    const int64_t nrows_y = src0->ne[1];
+
+    float scale = 1.0f;
+    float max_bias = 0.0f;
+
+    memcpy(&scale, (const float *) dst->op_params + 0, sizeof(float));
+    memcpy(&max_bias, (const float *) dst->op_params + 1, sizeof(float));
+
+    const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
+
+    if (use_f16) {
+        soft_max_f32_cuda(src0_d, (const half *) src1_d, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream);
+    } else {
+        soft_max_f32_cuda(src0_d, (const float *) src1_d, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream);
+    }
+}
+
+void ggml_cuda_op_soft_max_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0]; // grad
+    const ggml_tensor * src1 = dst->src[1]; // forward pass output
+
+    const float * src0_d = (const float *) src0->data;
+    const float * src1_d = (const float *) src1->data;
+    float * dst_d = (float *) dst->data;
+
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const int64_t ncols = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    float scale = 1.0f;
+    float max_bias = 0.0f;
+
+    memcpy(&scale, (const float *) dst->op_params + 0, sizeof(float));
+    memcpy(&max_bias, (const float *) dst->op_params + 1, sizeof(float));
+
+    GGML_ASSERT(max_bias == 0.0f);
+
+    soft_max_back_f32_cuda(src0_d, src1_d, dst_d, ncols, nrows, scale, stream);
+}
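For readers skimming the new kernel above: per row, soft_max_f32 computes a numerically stable softmax of x[col]*scale plus an optional ALiBi-weighted mask term; the use_shared variant only changes where the intermediate values are cached, not the result. The sketch below is not part of the package; it is a minimal host-side C++ reference of that per-row math, with a plain `slope` argument and the helper name `soft_max_row_ref` standing in for what get_alibi_slope() and the kernel do on the device.

// Hypothetical host-side reference; not part of whispercpp or ggml.
#include <algorithm>
#include <cmath>
#include <vector>

// One row of the scaled, optionally masked softmax computed by soft_max_f32.
std::vector<float> soft_max_row_ref(const std::vector<float> & x,
                                    const std::vector<float> & mask, // empty => no mask
                                    float scale, float slope) {
    const size_t n = x.size();
    std::vector<float> y(n);

    // 1) scale + mask, tracking the row maximum for numerical stability
    float max_val = -INFINITY;
    for (size_t c = 0; c < n; ++c) {
        y[c] = x[c]*scale + (mask.empty() ? 0.0f : slope*mask[c]);
        max_val = std::max(max_val, y[c]);
    }
    // 2) exponentiate and accumulate the normalizer
    float sum = 0.0f;
    for (size_t c = 0; c < n; ++c) {
        y[c] = std::exp(y[c] - max_val);
        sum += y[c];
    }
    // 3) normalize
    for (size_t c = 0; c < n; ++c) {
        y[c] /= sum;
    }
    return y;
}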
@@ -0,0 +1,148 @@
+#include "ssm-conv.cuh"
+
+template <size_t split_d_inner, size_t d_conv>
+static __global__ void ssm_conv_f32(const float * __restrict__ src0, const float * __restrict__ src1,
+                                    const int src0_nb0, const int src0_nb1, const int src0_nb2, const int src1_nb1,
+                                    float * __restrict__ dst, const int dst_nb0, const int dst_nb1, const int dst_nb2,
+                                    const int64_t n_t) {
+    GGML_UNUSED(src0_nb0);
+    const int tid = threadIdx.x;
+    const int bidx = blockIdx.x;
+    const int bidy = blockIdx.y;
+
+    const float * x_block = (const float *) ((const char *) src0 + bidx * src0_nb2 + bidy * split_d_inner * src0_nb1);
+    const float * w_block = (const float *) ((const char *) src1 + bidy * split_d_inner * src1_nb1);
+    float * y_block = (float *) ((char *) dst + bidx * dst_nb2 + bidy * split_d_inner * dst_nb0);
+
+    const int stride_x = src0_nb1 / sizeof(float);
+    const int stride_w = src1_nb1 / sizeof(float);
+    const int stride_y = dst_nb1 / sizeof(float);
+
+    float x[d_conv] = { 0.0f };
+    float w[d_conv] = { 0.0f };
+
+#pragma unroll
+    for (size_t j = 0; j < d_conv; j++) {
+        w[j] = w_block[tid * stride_w + j];
+    }
+
+    for (int64_t i = 0; i < n_t; i++) {
+        float sumf = 0.0f;
+
+        if (i == 0) {
+            for (size_t j = 0; j < d_conv; j++) {
+                x[j] = x_block[tid * stride_x + j];
+            }
+        } else {
+            x[(i - 1) % d_conv] = x_block[tid * stride_x + i + d_conv - 1];
+        }
+
+#pragma unroll
+        for (size_t j = 0; j < d_conv; j++) {
+            sumf += x[(i + j) % d_conv] * w[j];
+        }
+        y_block[i * stride_y + tid] = sumf;
+    }
+}
+
+template <size_t split_d_inner, size_t d_conv, int64_t split_n_t>
+static __global__ void ssm_conv_long_token_f32(const float * __restrict__ src0, const float * __restrict__ src1,
+                                               const int src0_nb0, const int src0_nb1, const int src0_nb2,
+                                               const int src1_nb1, float * __restrict__ dst, const int dst_nb0,
+                                               const int dst_nb1, const int dst_nb2, const int64_t n_t) {
+    const int tid = threadIdx.x;
+    const int bidx = blockIdx.x;
+    const int bidy = blockIdx.y;
+    const int bidz = blockIdx.z;
+
+    const float * x_block = (const float *) ((const char *) src0 + bidx * src0_nb2 + bidy * split_d_inner * src0_nb1 +
+                                             bidz * split_n_t * src0_nb0);
+    const float * w_block = (const float *) ((const char *) src1 + bidy * split_d_inner * src1_nb1);
+    float * y_block =
+        (float *) ((char *) dst + bidx * dst_nb2 + bidz * split_n_t * dst_nb1 + bidy * split_d_inner * dst_nb0);
+
+    const int stride_x = src0_nb1 / sizeof(float);
+    const int stride_w = src1_nb1 / sizeof(float);
+    const int stride_y = dst_nb1 / sizeof(float);
+
+    float x[d_conv] = { 0.0f };
+    float w[d_conv] = { 0.0f };
+
+#pragma unroll
+    for (size_t j = 0; j < d_conv; j++) {
+        w[j] = w_block[tid * stride_w + j];
+    }
+
+#pragma unroll
+    for (int64_t i = 0; i < split_n_t; i++) {
+        if (bidz * split_n_t + i < n_t) {
+            float sumf = 0.0f;
+
+            if (i == 0) {
+                for (size_t j = 0; j < d_conv; j++) {
+                    x[j] = x_block[tid * stride_x + j];
+                }
+            } else {
+                x[(i - 1) % d_conv] = x_block[tid * stride_x + i + d_conv - 1];
+            }
+
+#pragma unroll
+            for (size_t j = 0; j < d_conv; j++) {
+                sumf += x[(i + j) % d_conv] * w[j];
+            }
+            y_block[i * stride_y + tid] = sumf;
+        }
+    }
+}
+
+static void ssm_conv_f32_cuda(const float * src0, const float * src1, const int src0_nb0, const int src0_nb1,
+                              const int src0_nb2, const int src1_nb1, float * dst, const int dst_nb0, const int dst_nb1,
+                              const int dst_nb2, const int64_t nc, const int64_t nr, const int64_t n_t,
+                              const int64_t n_s, cudaStream_t stream) {
+    const int threads = 128;
+    GGML_ASSERT(nr % threads == 0);
+
+    if (n_t <= 32) {
+        const dim3 blocks(n_s, (nr + threads - 1) / threads, 1);
+        if (nc == 4) {
+            ssm_conv_f32<threads, 4><<<blocks, threads, 0, stream>>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1,
+                                                                     dst, dst_nb0, dst_nb1, dst_nb2, n_t);
+        } else {
+            GGML_ABORT("Only support kernel size = 4 now.");
+        }
+    } else {
+        if (nc == 4) {
+            const int64_t split_n_t = 32;
+            dim3 blocks(n_s, (nr + threads - 1) / threads, (n_t + split_n_t - 1) / split_n_t);
+            ssm_conv_long_token_f32<threads, 4, split_n_t><<<blocks, threads, 0, stream>>>(
+                src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t);
+        } else {
+            GGML_ABORT("Only support kernel size = 4 right now.");
+        }
+    }
+}
+
+void ggml_cuda_op_ssm_conv(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const struct ggml_tensor * src0 = dst->src[0]; // conv_x
+    const struct ggml_tensor * src1 = dst->src[1]; // conv1d.weight
+
+    const int64_t nc = src1->ne[0]; // d_conv
+    const int64_t nr = src0->ne[1]; // d_inner
+    const int64_t n_t = dst->ne[1]; // tokens per sequence
+    const int64_t n_s = dst->ne[2]; // number of sequences in the batch
+
+    GGML_ASSERT(dst->ne[0] == nr);
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+    GGML_ASSERT(src1->nb[0] == sizeof(float));
+    GGML_ASSERT(src0->nb[1] == src0->ne[0] * sizeof(float));
+
+    const float * src0_d = (const float *) src0->data;
+    const float * src1_d = (const float *) src1->data;
+    float * dst_d = (float *) dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    ssm_conv_f32_cuda(src0_d, src1_d, src0->nb[0], src0->nb[1], src0->nb[2], src1->nb[1], dst_d, dst->nb[0], dst->nb[1],
+                      dst->nb[2], nc, nr, n_t, n_s, stream);
+}
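The two ssm_conv kernels above keep a d_conv-wide rolling window per channel in registers: for token i the output is the dot product of the filter taps with the window x[i .. i + d_conv - 1] of the padded input, whose leading d_conv - 1 values are the carried-over conv state; the long-token variant only adds a third grid dimension that processes the tokens in chunks of split_n_t = 32. Below is a minimal single-channel host reference, not part of the package (the helper name and vector layout are illustrative only).

// Hypothetical single-channel reference; not part of whispercpp or ggml.
#include <cstdint>
#include <vector>

// x: n_t + d_conv - 1 values for one channel (conv-state prefix + new tokens)
// w: the d_conv filter taps for the same channel
std::vector<float> ssm_conv_channel_ref(const std::vector<float> & x,
                                        const std::vector<float> & w,
                                        int64_t n_t) {
    const int64_t d_conv = (int64_t) w.size();
    std::vector<float> y(n_t, 0.0f);
    for (int64_t i = 0; i < n_t; ++i) {
        float sumf = 0.0f;
        for (int64_t j = 0; j < d_conv; ++j) {
            sumf += x[i + j] * w[j]; // window [i, i + d_conv) of the padded input
        }
        y[i] = sumf;
    }
    return y;
}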
@@ -0,0 +1,153 @@
+#include "ssm-scan.cuh"
+
+template <size_t splitD, size_t N>
+__global__ void __launch_bounds__(splitD, 2)
+    ssm_scan_f32(const float * __restrict__ src0, const float * __restrict__ src1, const float * __restrict__ src2,
+                 const float * __restrict__ src3, const float * __restrict__ src4, const float * __restrict__ src5,
+                 const int src0_nb1, const int src0_nb2, const int src1_nb0, const int src1_nb1, const int src1_nb2,
+                 const int src1_nb3, const int src2_nb0, const int src2_nb1, const int src2_nb2, const int src3_nb1,
+                 const int src4_nb1, const int src4_nb2, const int src5_nb1, const int src5_nb2,
+                 float * __restrict__ dst, const int64_t L) {
+    GGML_UNUSED(src1_nb0);
+    GGML_UNUSED(src2_nb0);
+    const int bidx = blockIdx.x; // split along B
+    const int bidy = blockIdx.y; // split along D
+    const int tid = threadIdx.x;
+    const int wid = tid / 32;
+    const int wtid = tid % 32;
+
+    extern __shared__ float smem[];
+    const int stride_sA = N + 1;
+    const int stride_ss0 = N + 1;
+    float * smem_A = smem;
+    float * smem_s0 = smem_A + splitD * stride_sA;
+
+    const float * s0_block = (const float *) ((const char *) src0 + bidx * src0_nb2 + bidy * splitD * src0_nb1);
+    const float * x_block = (const float *) ((const char *) src1 + (bidx * src1_nb2) + bidy * splitD * sizeof(float));
+    const float * dt_block = (const float *) ((const char *) src2 + (bidx * src2_nb2) + bidy * splitD * sizeof(float));
+    const float * A_block = (const float *) ((const char *) src3 + bidy * splitD * src3_nb1);
+    const float * B_block = (const float *) ((const char *) src4 + (bidx * src4_nb2));
+    const float * C_block = (const float *) ((const char *) src5 + (bidx * src5_nb2));
+    float * y_block = (float *) ((char *) dst + (bidx * src1_nb2) + bidy * splitD * sizeof(float));
+    float * s_block = (float *) ((char *) dst + src1_nb3 + bidx * src0_nb2 + bidy * splitD * src0_nb1);
+
+    const int stride_s0 = src0_nb1 / sizeof(float);
+    const int stride_x = src1_nb1 / sizeof(float);
+    const int stride_dt = src2_nb1 / sizeof(float);
+    const int stride_A = src3_nb1 / sizeof(float);
+    const int stride_B = src4_nb1 / sizeof(float);
+    const int stride_C = src5_nb1 / sizeof(float);
+    const int stride_s = stride_s0;
+    const int stride_y = stride_x;
+
+    // can N not be 16? for example 32?
+    if (N == 16) {
+#pragma unroll
+        for (size_t i = 0; i < splitD / 4; i += 2) {
+            float value = A_block[(wid * warpSize + i) * stride_A + wtid];
+            // todo: bank conflict
+            // I am always confused with how to use the swizzling method to solve
+            // bank conflit. Hoping somebody can tell me.
+            smem_A[(wid * warpSize + i) * stride_sA + wtid + ((wtid / 16) > 0 ? 1 : 0)] = value;
+        }
+#pragma unroll
+        for (size_t i = 0; i < splitD / 4; i += 2) {
+            float value = s0_block[(wid * warpSize + i) * stride_s0 + wtid];
+            smem_s0[(wid * warpSize + i) * stride_ss0 + wtid + ((wtid / 16) > 0 ? 1 : 0)] = value;
+        }
+    }
+
+    __syncthreads();
+
+    for (int64_t i = 0; i < L; i++) {
+        float dt_soft_plus = dt_block[i * stride_dt + tid];
+        if (dt_soft_plus <= 20.0f) {
+            dt_soft_plus = log1pf(exp(dt_soft_plus));
+        }
+        float x_dt = x_block[i * stride_x + tid] * dt_soft_plus;
+        float sumf = 0.0f;
+#pragma unroll
+        for (size_t j = 0; j < N; j++) {
+            float state = (smem_s0[tid * stride_ss0 + j] * expf(dt_soft_plus * smem_A[tid * stride_sA + j])) +
+                          (B_block[i * stride_B + j] * x_dt);
+            sumf += state * C_block[i * stride_C + j];
+            if (i == L - 1) {
+                s_block[tid * stride_s + j] = state;
+            } else {
+                smem_s0[tid * stride_ss0 + j] = state;
+            }
+        }
+        __syncthreads();
+        y_block[i * stride_y + tid] = sumf;
+    }
+}
+
+static void ssm_scan_f32_cuda(const float * src0, const float * src1, const float * src2, const float * src3,
+                              const float * src4, const float * src5, const int src0_nb1, const int src0_nb2,
+                              const int src1_nb0, const int src1_nb1, const int src1_nb2, const int src1_nb3,
+                              const int src2_nb0, const int src2_nb1, const int src2_nb2, const int src3_nb1,
+                              const int src4_nb1, const int src4_nb2, const int src5_nb1, const int src5_nb2,
+                              float * dst, const int64_t N, const int64_t D, const int64_t L, const int64_t B,
+                              cudaStream_t stream) {
+    const int threads = 128;
+    // todo: consider D cannot be divided,does this situation exist?
+    GGML_ASSERT(D % threads == 0);
+    const dim3 blocks(B, (D + threads - 1) / threads, 1);
+    const int smem_size = (threads * (N + 1) * 2) * sizeof(float);
+    if (N == 16) {
+        ssm_scan_f32<128, 16><<<blocks, threads, smem_size, stream>>>(
+            src0, src1, src2, src3, src4, src5, src0_nb1, src0_nb2, src1_nb0, src1_nb1, src1_nb2, src1_nb3, src2_nb0,
+            src2_nb1, src2_nb2, src3_nb1, src4_nb1, src4_nb2, src5_nb1, src5_nb2, dst, L);
+    } else {
+        GGML_ABORT("doesn't support N!=16.");
+    }
+}
+
+void ggml_cuda_op_ssm_scan(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const struct ggml_tensor * src0 = dst->src[0]; // s
+    const struct ggml_tensor * src1 = dst->src[1]; // x
+    const struct ggml_tensor * src2 = dst->src[2]; // dt
+    const struct ggml_tensor * src3 = dst->src[3]; // A
+    const struct ggml_tensor * src4 = dst->src[4]; // B
+    const struct ggml_tensor * src5 = dst->src[5]; // C
+
+    // const int64_t d_state = src0->ne[0];
+    // const int64_t d_inner = src0->ne[1];
+    // const int64_t l = src1->ne[1];
+    // const int64_t b = src0->ne[2];
+
+    const int64_t nc = src0->ne[0]; // d_state
+    const int64_t nr = src0->ne[1]; // d_inner
+    const int64_t n_t = src1->ne[1]; // number of tokens per sequence
+    const int64_t n_s = src0->ne[2]; // number of sequences in the batch
+
+    GGML_ASSERT(ggml_nelements(src1) + ggml_nelements(src0) == ggml_nelements(dst));
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+    GGML_ASSERT(src1->nb[0] == sizeof(float));
+    GGML_ASSERT(src2->nb[0] == sizeof(float));
+    GGML_ASSERT(src3->nb[0] == sizeof(float));
+    GGML_ASSERT(src4->nb[0] == sizeof(float));
+    GGML_ASSERT(src5->nb[0] == sizeof(float));
+    // required for the dot product between s and C
+    GGML_ASSERT(src0->nb[1] == src0->ne[0] * sizeof(float));
+    // required for per-sequence offsets for states
+    GGML_ASSERT(src0->nb[2] == src0->ne[0] * src0->ne[1] * sizeof(float));
+    // required to get correct offset for state destination (i.e. src1->nb[3])
+    GGML_ASSERT(src1->nb[3] == src1->ne[0] * src1->ne[1] * src1->ne[2] * sizeof(float));
+
+    const float * src0_d = (const float *) src0->data;
+    const float * src1_d = (const float *) src1->data;
+    const float * src2_d = (const float *) src2->data;
+    const float * src3_d = (const float *) src3->data;
+    const float * src4_d = (const float *) src4->data;
+    const float * src5_d = (const float *) src5->data;
+    float * dst_d = (float *) dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    ssm_scan_f32_cuda(src0_d, src1_d, src2_d, src3_d, src4_d, src5_d, src0->nb[1], src0->nb[2], src1->nb[0],
+                      src1->nb[1], src1->nb[2], src1->nb[3], src2->nb[0], src2->nb[1], src2->nb[2], src3->nb[1],
+                      src4->nb[1], src4->nb[2], src5->nb[1], src5->nb[2], dst_d, nc, nr, n_t, n_s, stream);
+}
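ssm_scan_f32 above walks the tokens of a sequence and, per channel, applies the selective-state-space recurrence: softplus the step size dt, decay each of the N state entries by exp(dt*A), inject B*x*dt, and read the output out through C, writing the final state after the last token. The following is a minimal host-side reference for one channel, not part of the package (the helper name and the flat row-major layout for B and C are illustrative).

// Hypothetical single-channel reference; not part of whispercpp or ggml.
#include <cmath>
#include <cstdint>
#include <vector>

// state: N entries (s0 on input, final state on output)
// A: N entries; B, C: L*N entries, row-major over tokens; x, dt: L entries
std::vector<float> ssm_scan_channel_ref(std::vector<float> & state,
                                        const std::vector<float> & A,
                                        const std::vector<float> & B,
                                        const std::vector<float> & C,
                                        const std::vector<float> & x,
                                        const std::vector<float> & dt,
                                        int64_t L, int64_t N) {
    std::vector<float> y(L, 0.0f);
    for (int64_t i = 0; i < L; ++i) {
        // softplus(dt); skipped above ~20 where it is numerically the identity
        const float dt_sp = dt[i] <= 20.0f ? std::log1p(std::exp(dt[i])) : dt[i];
        const float x_dt  = x[i] * dt_sp;
        float sumf = 0.0f;
        for (int64_t j = 0; j < N; ++j) {
            state[j] = state[j] * std::exp(dt_sp * A[j]) + B[i*N + j] * x_dt;
            sumf    += state[j] * C[i*N + j];
        }
        y[i] = sumf;
    }
    return y;
}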
@@ -0,0 +1,45 @@
+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
+#define USE_CUB
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
+
+#ifdef USE_CUB
+#include <cub/cub.cuh>
+using namespace cub;
+#endif // USE_CUB
+
+#include "sumrows.cuh"
+#include "sum.cuh"
+
+#include <cstdint>
+
+void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream) {
+#ifdef USE_CUB
+    size_t tmp_size = 0;
+    DeviceReduce::Sum(nullptr, tmp_size, x, dst, ne, stream);
+    ggml_cuda_pool_alloc<uint8_t> tmp_alloc(pool, tmp_size);
+    DeviceReduce::Sum(tmp_alloc.ptr, tmp_size, x, dst, ne, stream);
+#else
+    // Use (inefficient) sum_rows implementation as a fallback.
+    // For AMD there is rocPRIM which could be used as a drop-in replacement via hipcub but this would require C++11 -> C++14.
+    sum_rows_f32_cuda(x, dst, ne, 1, stream);
+    GGML_UNUSED(pool);
+#endif // USE_CUB
+}
+
+void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguously_allocated(src0));
+
+    const float * src0_d = (const float *) src0->data;
+    float * dst_d = (float *) dst->data;
+
+    const int64_t ne = ggml_nelements(src0);
+
+    ggml_cuda_pool & pool = ctx.pool();
+    cudaStream_t stream = ctx.stream();
+
+    sum_f32_cuda(pool, src0_d, dst_d, ne, stream);
+}
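sum_f32_cuda above uses the standard two-call CUB reduction idiom: the first DeviceReduce::Sum call only reports how much temporary device storage is required, the second performs the reduction, with the package routing the scratch buffer through ggml's pool allocator. The standalone sketch below shows the same idiom with plain CUDA stream-ordered allocation; it is not part of the package and assumes a CUB-enabled CUDA toolkit (11.2+ for cudaMallocAsync), with `device_sum_f32` as a made-up helper name.

// Hypothetical standalone sketch; not how the package allocates its scratch buffer.
#include <cub/cub.cuh>
#include <cuda_runtime.h>

static void device_sum_f32(const float * d_in, float * d_out, int n, cudaStream_t stream) {
    size_t tmp_bytes = 0;
    // First call: nullptr storage => only compute the required scratch size.
    cub::DeviceReduce::Sum(nullptr, tmp_bytes, d_in, d_out, n, stream);

    void * d_tmp = nullptr;
    cudaMallocAsync(&d_tmp, tmp_bytes, stream);

    // Second call: the actual reduction into d_out.
    cub::DeviceReduce::Sum(d_tmp, tmp_bytes, d_in, d_out, n, stream);
    cudaFreeAsync(d_tmp, stream);
}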