RubyGems - whispercpp - Versions diffs - 1.3.5 → 1.3.6 - Mend

whispercpp 1.3.5 → 1.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (610) hide show

data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu CHANGED Viewed

@@ -5,6 +5,13 @@
 #include <cmath>
 #include <initializer_list>
+// Kernel config struct - passed by value to CUDA kernel
+struct topk_moe_config {
+    bool use_sigmoid;
+    bool with_norm;
+    bool delayed_softmax;
+};
 // Warp-local softmax used for both the pre-top-k logits and the post-top-k delayed path.
 template <int experts_per_thread, bool use_limit>
 __device__ void softmax_warp_inplace(float (&vals)[experts_per_thread], const int limit, const int lane) {
@@ -50,6 +57,16 @@ __device__ void softmax_warp_inplace(float (&vals)[experts_per_thread], const in
     }
 }
+template <int experts_per_thread, bool use_limit>
+__device__ void sigmoid_warp_inplace(float (&vals)[experts_per_thread], const int limit, const int lane) {
+#pragma unroll
+    for (int i = 0; i < experts_per_thread; i++) {
+        const int  idx    = lane + i * WARP_SIZE;
+        const bool active = !use_limit || (idx < limit);
+        vals[i]           = active ? 1.f / (1.f + expf(-vals[i])) : -INFINITY;
+    }
+}
 /*
     This kernel does the following:
     1. optionally softmax over the logits per token [n_experts, n_tokens]
@@ -59,13 +76,16 @@ __device__ void softmax_warp_inplace(float (&vals)[experts_per_thread], const in
     It is intended as fusion of softmax->top-k->get_rows pipeline for MoE models
 */
-template <int n_experts, bool with_norm, bool delayed_softmax = false>
-__launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float * logits,
-                                                                  float *       weights,
-                                                                  int32_t *     ids,
-                                                                  const int     n_rows,
-                                                                  const int     n_expert_used,
-                                                                  const float   clamp_val) {
+template <int n_experts, bool has_bias>
+__launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *         logits,
+                                                                  float *               weights,
+                                                                  int32_t *             ids,
+                                                                  float *               bias,
+                                                                  const int             n_rows,
+                                                                  const int             n_expert_used,
+                                                                  const float           clamp_val,
+                                                                  const float           scale_val,
+                                                                  const topk_moe_config config) {
     const int row = blockIdx.x * blockDim.y + threadIdx.y;
     if (row >= n_rows) {
         return;
@@ -79,14 +99,53 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *
     float wt[experts_per_thread];
+    // Initialize all slots to -INFINITY
+#pragma unroll
+    for (int i = 0; i < experts_per_thread; i++) {
+        wt[i] = -INFINITY;
+    }
 #pragma unroll
     for (int i = 0; i < n_experts; i += WARP_SIZE) {
         const int expert  = i + threadIdx.x;
         wt[i / WARP_SIZE] = (n_experts % WARP_SIZE == 0 || expert < n_experts) ? logits[expert] : -INFINITY;
     }
-    if constexpr (!delayed_softmax) {
-        softmax_warp_inplace<experts_per_thread, false>(wt, n_experts, threadIdx.x);
+    if (!config.delayed_softmax) {
+        if (config.use_sigmoid) {
+           sigmoid_warp_inplace<experts_per_thread, false>(wt, n_experts, threadIdx.x);
+        } else {
+           softmax_warp_inplace<experts_per_thread, false>(wt, n_experts, threadIdx.x);
+        }
+    }
+    // Sanitize NaN to -FLT_MAX so the iterative argmax produces unique expert IDs.
+    // NaN comparisons always return false, which would cause the same expert to be
+    // selected repeatedly. -FLT_MAX compares normally and is still excluded by the
+    // -INFINITY sentinel used after each selection round.
+    // More relevant for the cuBLAS path. See https://github.com/ggml-org/llama.cpp/issues/19659
+#pragma unroll
+    for (int i = 0; i < experts_per_thread; i++) {
+        if (__isnanf(wt[i])) {
+            wt[i] = -FLT_MAX;
+        }
+    }
+    // selection_wt is only needed when bias is present (selection uses wt + bias)
+    // when no bias, we use wt directly for both selection and weight values
+    float selection_wt[has_bias ? experts_per_thread : 1];
+    if constexpr (has_bias) {
+#pragma unroll
+        for (int i = 0; i < experts_per_thread; i++) {
+            selection_wt[i] = -INFINITY;
+        }
+#pragma unroll
+        for (int i = 0; i < n_experts; i += WARP_SIZE) {
+            const int expert = i + threadIdx.x;
+            selection_wt[i / WARP_SIZE] =
+                (n_experts % WARP_SIZE == 0 || expert < n_experts) ? wt[i / WARP_SIZE] + bias[expert] : -INFINITY;
+        }
     }
     //at this point, each thread holds either a portion of the softmax distribution
@@ -106,22 +165,56 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *
         float max_val    = wt[0];
         int   max_expert = threadIdx.x;
+        if constexpr (has_bias) {
+            float max_val_s = selection_wt[0];
 #pragma unroll
-        for (int i = 1; i < experts_per_thread; i++) {
-            const int expert = threadIdx.x + i * WARP_SIZE;
-            if ((n_experts % WARP_SIZE == 0 || expert < n_experts) && wt[i] > max_val) {
-                max_val    = wt[i];
-                max_expert = expert;
+            for (int i = 1; i < experts_per_thread; i++) {
+                const int expert = threadIdx.x + i * WARP_SIZE;
+                if ((n_experts % WARP_SIZE == 0 || expert < n_experts) && selection_wt[i] > max_val_s) {
+                    max_val    = wt[i];
+                    max_val_s  = selection_wt[i];
+                    max_expert = expert;
+                }
             }
-        }
 #pragma unroll
-        for (int mask = WARP_SIZE / 2; mask > 0; mask /= 2) {
-            const float val    = __shfl_xor_sync(0xFFFFFFFF, max_val, mask, WARP_SIZE);
-            const int   expert = __shfl_xor_sync(0xFFFFFFFF, max_expert, mask, WARP_SIZE);
-            if (val > max_val || (val == max_val && expert < max_expert)) {
-                max_val    = val;
-                max_expert = expert;
+            for (int mask = WARP_SIZE / 2; mask > 0; mask /= 2) {
+                const float val    = __shfl_xor_sync(0xFFFFFFFF, max_val, mask, WARP_SIZE);
+                const float val_s  = __shfl_xor_sync(0xFFFFFFFF, max_val_s, mask, WARP_SIZE);
+                const int   expert = __shfl_xor_sync(0xFFFFFFFF, max_expert, mask, WARP_SIZE);
+                if (val_s > max_val_s || (val_s == max_val_s && expert < max_expert)) {
+                    max_val    = val;
+                    max_val_s  = val_s;
+                    max_expert = expert;
+                }
+            }
+            if ((max_expert & (WARP_SIZE - 1)) == threadIdx.x) {
+                selection_wt[max_expert / WARP_SIZE] = -INFINITY;
+            }
+        } else {
+#pragma unroll
+            for (int i = 1; i < experts_per_thread; i++) {
+                const int expert = threadIdx.x + i * WARP_SIZE;
+                if ((n_experts % WARP_SIZE == 0 || expert < n_experts) && wt[i] > max_val) {
+                    max_val    = wt[i];
+                    max_expert = expert;
+                }
+            }
+#pragma unroll
+            for (int mask = WARP_SIZE / 2; mask > 0; mask /= 2) {
+                const float val    = __shfl_xor_sync(0xFFFFFFFF, max_val, mask, WARP_SIZE);
+                const int   expert = __shfl_xor_sync(0xFFFFFFFF, max_expert, mask, WARP_SIZE);
+                if (val > max_val || (val == max_val && expert < max_expert)) {
+                    max_val    = val;
+                    max_expert = expert;
+                }
+            }
+            if ((max_expert & (WARP_SIZE - 1)) == threadIdx.x) {
+                wt[max_expert / WARP_SIZE] = -INFINITY;
             }
         }
@@ -130,16 +223,14 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *
         }
         if ((max_expert & (WARP_SIZE - 1)) == threadIdx.x) {
-            wt[max_expert / WARP_SIZE] = -INFINITY;
             ids[k] = max_expert;
-            if constexpr (with_norm) {
+            if (config.with_norm) {
                 wt_sum += max_val;
             }
         }
     }
-    if constexpr (with_norm) {
+    if (config.with_norm) {
         wt_sum              = warp_reduce_sum(wt_sum);
         wt_sum              = max(wt_sum, clamp_val);
         const float inv_sum = 1.0f / wt_sum;
@@ -149,7 +240,7 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *
         }
     }
-    if constexpr (delayed_softmax) {
+    if (config.delayed_softmax) {
         softmax_warp_inplace<experts_per_thread, true>(output_weights, n_expert_used, threadIdx.x);
     }
@@ -157,25 +248,25 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *
     for (int i = 0; i < experts_per_thread; i++) {
         const int idx = i * WARP_SIZE + threadIdx.x;
         if (idx < n_expert_used) {
-            weights[idx] = output_weights[i];
+            weights[idx] = output_weights[i] * scale_val;
         }
     }
-    if (!with_norm) {
-        GGML_UNUSED(clamp_val);
-    }
 }
-template <bool with_norm, bool delayed_softmax = false>
+template<bool has_bias>
 static void launch_topk_moe_cuda(ggml_backend_cuda_context & ctx,
                                  const float *               logits,
                                  float *                     weights,
                                  int32_t *                   ids,
+                                 float *                     bias,
                                  const int                   n_rows,
                                  const int                   n_expert,
                                  const int                   n_expert_used,
-                                 const float                 clamp_val) {
-    static_assert(!(with_norm && delayed_softmax), "delayed softmax is not supported with weight normalization");
+                                 const float                 clamp_val,
+                                 const float                 scale_val,
+                                 const topk_moe_config       config) {
+    GGML_ASSERT(!(config.with_norm && config.delayed_softmax) &&
+                "delayed softmax is not supported with weight normalization");
     const int    rows_per_block = 4;
     dim3         grid_dims((n_rows + rows_per_block - 1) / rows_per_block, 1, 1);
     dim3         block_dims(WARP_SIZE, rows_per_block, 1);
@@ -183,44 +274,48 @@ static void launch_topk_moe_cuda(ggml_backend_cuda_context & ctx,
     switch (n_expert) {
         case 1:
-            topk_moe_cuda<1, with_norm, delayed_softmax>
-                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
+            topk_moe_cuda<1, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
+                                                                   clamp_val, scale_val, config);
             break;
         case 2:
-            topk_moe_cuda<2, with_norm, delayed_softmax>
-                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
+            topk_moe_cuda<2, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
+                                                                   clamp_val, scale_val, config);
             break;
         case 4:
-            topk_moe_cuda<4, with_norm, delayed_softmax>
-                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
+            topk_moe_cuda<4, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
+                                                                   clamp_val, scale_val, config);
             break;
         case 8:
-            topk_moe_cuda<8, with_norm, delayed_softmax>
-                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
+            topk_moe_cuda<8, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
+                                                                   clamp_val, scale_val, config);
             break;
         case 16:
-            topk_moe_cuda<16, with_norm, delayed_softmax>
-                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
+            topk_moe_cuda<16, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
+                                                                    clamp_val, scale_val, config);
             break;
         case 32:
-            topk_moe_cuda<32, with_norm, delayed_softmax>
-                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
+            topk_moe_cuda<32, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
+                                                                    clamp_val, scale_val, config);
             break;
         case 64:
-            topk_moe_cuda<64, with_norm, delayed_softmax>
-                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
+            topk_moe_cuda<64, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
+                                                                    clamp_val, scale_val, config);
             break;
         case 128:
-            topk_moe_cuda<128, with_norm, delayed_softmax>
-                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
+            topk_moe_cuda<128, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
+                                                                     clamp_val, scale_val, config);
             break;
         case 256:
-            topk_moe_cuda<256, with_norm, delayed_softmax>
-                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
+            topk_moe_cuda<256, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
+                                                                     clamp_val, scale_val, config);
             break;
         case 512:
-            topk_moe_cuda<512, with_norm, delayed_softmax>
-                <<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
+            topk_moe_cuda<512, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
+                                                                     clamp_val, scale_val, config);
+            break;
+        case 576:
+            topk_moe_cuda<576, has_bias><<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, bias, n_rows, n_expert_used,
+                                                                     clamp_val, scale_val, config);
             break;
         default:
             GGML_ASSERT(false && "fatal error");
@@ -228,13 +323,14 @@ static void launch_topk_moe_cuda(ggml_backend_cuda_context & ctx,
     }
 }
-void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
-                           const ggml_tensor *         logits,
-                           ggml_tensor *               weights,
-                           ggml_tensor *               ids,
-                           const bool                  with_norm,
-                           const bool                  delayed_softmax,
-                           ggml_tensor *               clamp) {
+void ggml_cuda_op_topk_moe(ggml_backend_cuda_context &     ctx,
+                           const ggml_tensor *             logits,
+                           ggml_tensor *                   weights,
+                           ggml_tensor *                   ids,
+                           const ggml_tensor *             clamp,
+                           const ggml_tensor *             scale,
+                           const ggml_tensor *             bias,
+                           const ggml_cuda_topk_moe_args & args) {
     GGML_ASSERT(logits->type == GGML_TYPE_F32);
     GGML_ASSERT(weights->type == GGML_TYPE_F32);
     GGML_ASSERT(ids->type == GGML_TYPE_I32);
@@ -245,107 +341,75 @@ void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
     const float * logits_d  = (const float *) logits->data;
     float *       weights_d = (float *) weights->data;
     int32_t *     ids_d     = (int32_t *) ids->data;
+    float *       bias_d    = bias ? (float *) bias->data : nullptr;
+    float scale_val = scale ? ggml_get_op_params_f32(scale, 0) : 1.0f;
     GGML_ASSERT(ids->nb[1] / ggml_type_size(ids->type) == (size_t) n_experts);
     const int n_expert_used = weights->ne[1];
+    const bool with_norm = clamp != nullptr;
     float clamp_val = -INFINITY;
-    if (with_norm) {
-        if (clamp) {
-            clamp_val = ggml_get_op_params_f32(clamp, 0);
-        }
-        launch_topk_moe_cuda<true>(ctx, logits_d, weights_d, ids_d, n_rows, n_experts, n_expert_used, clamp_val);
+    if (clamp) {
+        clamp_val = ggml_get_op_params_f32(clamp, 0);
+    }
+    topk_moe_config config;
+    config.use_sigmoid     = args.sigmoid;
+    config.with_norm       = with_norm;
+    config.delayed_softmax = args.delayed_softmax;
+    if (bias) {
+        launch_topk_moe_cuda<true>(ctx, logits_d, weights_d, ids_d, bias_d, n_rows, n_experts, n_expert_used, clamp_val,
+                             scale_val, config);
     } else {
-        GGML_ASSERT(clamp == nullptr);
-        if (delayed_softmax) {
-            launch_topk_moe_cuda<false, true>(ctx, logits_d, weights_d, ids_d, n_rows, n_experts, n_expert_used,
-                                              clamp_val);
-        } else {
-            launch_topk_moe_cuda<false, false>(ctx, logits_d, weights_d, ids_d, n_rows, n_experts, n_expert_used,
-                                               clamp_val);
-        }
+        launch_topk_moe_cuda<false>(ctx, logits_d, weights_d, ids_d, bias_d, n_rows, n_experts, n_expert_used, clamp_val,
+                             scale_val, config);
     }
 }
-bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax,
+bool ggml_cuda_should_use_topk_moe(const ggml_tensor * gating_op,
                                    const ggml_tensor * weights,
-                                   const ggml_tensor * get_rows,
-                                   const ggml_tensor * argsort,
-                                   const ggml_tensor * clamp,
-                                   int n_expert) {
-    ggml_tensor * probs = get_rows->src[0];
-    if (probs->op != GGML_OP_RESHAPE) {
+                                   const ggml_tensor * logits,
+                                   const ggml_tensor * ids) {
+    const int n_expert = ids->nb[1] / ids->nb[0];
+    if (((n_expert & (n_expert - 1)) != 0 || n_expert > 512) && n_expert != 576) {
         return false;
     }
-    probs = probs->src[0];
-    ggml_tensor * selection_probs = argsort->src[0];
-    if (probs != selection_probs) {
+    if (!ggml_is_contiguous(weights) || !ggml_is_contiguous(logits)) {
         return false;
     }
-    float scale    = 1.0f;
-    float max_bias = 0.0f;
-    memcpy(&scale, (const float *) softmax->op_params + 0, sizeof(float));
-    memcpy(&max_bias, (const float *) softmax->op_params + 1, sizeof(float));
-    if (!ggml_is_contiguous(softmax->src[0]) || !ggml_is_contiguous(weights)) {
-        return false;
-    }
+    if (gating_op->op == GGML_OP_SOFT_MAX) {
+        const ggml_tensor * softmax  = gating_op;
+        float               scale    = 1.0f;
+        float               max_bias = 0.0f;
-    if (scale != 1.0f || max_bias != 0.0f) {
-        return false;
-    }
+        memcpy(&scale, (const float *) softmax->op_params + 0, sizeof(float));
+        memcpy(&max_bias, (const float *) softmax->op_params + 1, sizeof(float));
-    // don't fuse when masks or sinks are present
-    if (softmax->src[1] || softmax->src[2]) {
-        return false;
-    }
+        if (!ggml_is_contiguous(softmax->src[0])) {
+            return false;
+        }
-    // n_expert must be a power of 2
-    if ((n_expert & (n_expert - 1)) != 0 || n_expert > 512) {
-        return false;
-    }
+        if (scale != 1.0f || max_bias != 0.0f) {
+            return false;
+        }
-    if (clamp) {
-        if (clamp->op != GGML_OP_CLAMP) {
+        // don't fuse when masks or sinks are present
+        if (softmax->src[1] || softmax->src[2]) {
             return false;
         }
-        float max_val = ggml_get_op_params_f32(clamp, 1);
+    } else if (gating_op->op == GGML_OP_UNARY) {
+        ggml_unary_op op = ggml_get_unary_op(gating_op);
-        if (max_val != INFINITY) {
+        if (op != GGML_UNARY_OP_SIGMOID) {
             return false;
         }
     }
     return true;
 }
-std::initializer_list<enum ggml_op> ggml_cuda_topk_moe_ops(bool norm, bool delayed_softmax) {
-    static std::initializer_list<enum ggml_op> norm_ops = { GGML_OP_SOFT_MAX, GGML_OP_RESHAPE,  GGML_OP_ARGSORT,
-                                                            GGML_OP_VIEW,     GGML_OP_GET_ROWS, GGML_OP_RESHAPE,
-                                                            GGML_OP_SUM_ROWS, GGML_OP_CLAMP,    GGML_OP_DIV,
-                                                            GGML_OP_RESHAPE };
-    static std::initializer_list<enum ggml_op> no_norm_ops = { GGML_OP_SOFT_MAX, GGML_OP_RESHAPE, GGML_OP_ARGSORT,
-                                                               GGML_OP_VIEW, GGML_OP_GET_ROWS };
-    static std::initializer_list<enum ggml_op> delayed_softmax_ops = { GGML_OP_ARGSORT,  GGML_OP_VIEW,
-                                                                       GGML_OP_GET_ROWS, GGML_OP_RESHAPE,
-                                                                       GGML_OP_SOFT_MAX, GGML_OP_RESHAPE };
-    GGML_ASSERT(!norm || !delayed_softmax);
-    if (delayed_softmax) {
-        return delayed_softmax_ops;
-    }
-    if (norm) {
-        return norm_ops;
-    }
-    return no_norm_ops;
-}

data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh CHANGED Viewed

@@ -3,19 +3,25 @@
 #include <initializer_list>
-void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
-                           const ggml_tensor *         logits,
-                           ggml_tensor *               weights,
-                           ggml_tensor *               ids,
-                           const bool                  with_norm,
-                           const bool                  delayed_softmax = false,
-                           ggml_tensor *               weight_clamp    = nullptr);
+struct ggml_cuda_topk_moe_args {
+    bool sigmoid{};
+    bool softmax{};
+    bool delayed_softmax{};
+    bool prob_bias{};
+    bool norm{};
+    bool scale{};
+};
-bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax,
-                                   const ggml_tensor * weights,
-                                   const ggml_tensor * get_rows,
-                                   const ggml_tensor * argsort,
-                                   const ggml_tensor * clamp,
-                                   int n_expert);
+void ggml_cuda_op_topk_moe(ggml_backend_cuda_context &     ctx,
+                           const ggml_tensor *             logits,
+                           ggml_tensor *                   weights,
+                           ggml_tensor *                   ids,
+                           const ggml_tensor *             clamp,
+                           const ggml_tensor *             scale,
+                           const ggml_tensor *             bias,
+                           const ggml_cuda_topk_moe_args & args);
-std::initializer_list<enum ggml_op> ggml_cuda_topk_moe_ops(bool with_norm, bool delayed_softmax = false);
+bool ggml_cuda_should_use_topk_moe(const ggml_tensor * gating_op,
+                                   const ggml_tensor * weights,
+                                   const ggml_tensor * logits,
+                                   const ggml_tensor * ids);

data/ext/sources/ggml/src/ggml-cuda/unary.cu CHANGED Viewed

@@ -560,3 +560,58 @@ void ggml_cuda_op_leaky_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
         leaky_relu_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), negative_slope, stream);
     }
 }
+/* fused unary + mul */
+template <float (*op)(float)>
+static void ggml_cuda_op_unary_mul_impl(ggml_backend_cuda_context & ctx, ggml_tensor * unary_node, ggml_tensor * mul_node) {
+    // unary_node: UNARY op applied to unary_node->src[0]
+    // mul_node:   MUL(a, b) where one of a/b is unary_node
+    // Output goes to mul_node->data
+    const ggml_tensor * unary_src = unary_node->src[0];  // input to the unary op
+    const ggml_tensor * other_src = (mul_node->src[0] == unary_node) ? mul_node->src[1] : mul_node->src[0];
+    GGML_ASSERT(ggml_is_contiguous_1(unary_src));
+    GGML_ASSERT(unary_src->nb[0] == ggml_element_size(unary_src));
+    GGML_ASSERT(ggml_is_contiguous_1(other_src));
+    GGML_ASSERT(other_src->nb[0] == ggml_element_size(other_src));
+    GGML_ASSERT(ggml_are_same_shape(unary_src, other_src));
+    GGML_ASSERT(unary_src->type == GGML_TYPE_F32 || unary_src->type == GGML_TYPE_F16);
+    GGML_ASSERT(unary_src->type == other_src->type);
+    GGML_ASSERT(unary_src->type == mul_node->type);
+    cudaStream_t stream = ctx.stream();
+    const int64_t k  = ggml_nelements(mul_node);
+    const int64_t nc = unary_src->ne[0];
+    const int64_t unary_stride = unary_src->nb[1];
+    const int64_t other_stride = other_src->nb[1];
+    if (unary_src->type == GGML_TYPE_F16) {
+        unary_gated_cuda<op>((const half *) unary_src->data, (const half *) other_src->data,
+                             (half *) mul_node->data, k, nc,
+                             unary_stride / sizeof(half), other_stride / sizeof(half), stream);
+    } else {
+        unary_gated_cuda<op>((const float *) unary_src->data, (const float *) other_src->data,
+                             (float *) mul_node->data, k, nc,
+                             unary_stride / sizeof(float), other_stride / sizeof(float), stream);
+    }
+}
+void ggml_cuda_op_unary_mul(ggml_backend_cuda_context & ctx, ggml_tensor * unary_node, ggml_tensor * mul_node) {
+    switch (ggml_get_unary_op(unary_node)) {
+        case GGML_UNARY_OP_SILU:
+            ggml_cuda_op_unary_mul_impl<op_silu>(ctx, unary_node, mul_node);
+            break;
+        case GGML_UNARY_OP_SIGMOID:
+            ggml_cuda_op_unary_mul_impl<op_sigmoid>(ctx, unary_node, mul_node);
+            break;
+        case GGML_UNARY_OP_SOFTPLUS:
+            ggml_cuda_op_unary_mul_impl<op_softplus>(ctx, unary_node, mul_node);
+            break;
+        default:
+            GGML_ABORT("Unsupported unary op for fused unary+mul");
+    }
+}

data/ext/sources/ggml/src/ggml-cuda/unary.cuh CHANGED Viewed

@@ -89,6 +89,8 @@ void ggml_cuda_op_geglu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst
 void ggml_cuda_op_xielu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_unary_mul(ggml_backend_cuda_context & ctx, ggml_tensor * unary_node, ggml_tensor * mul_node);
 __device__ __forceinline__ float ggml_cuda_op_silu_single(float x) {
     return x / (1.0f + expf(-x));
 }