cui-llama.rn 1.1.0 → 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cpp/ggml.c CHANGED
@@ -7095,7 +7095,8 @@ struct lm_ggml_tensor * lm_ggml_flash_attn_ext(
  struct lm_ggml_tensor * v,
  struct lm_ggml_tensor * mask,
  float scale,
- float max_bias) {
+ float max_bias,
+ float logit_softcap) {
  LM_GGML_ASSERT(lm_ggml_can_mul_mat(k, q));
  // TODO: check if vT can be multiplied by (k*qT)

@@ -7122,7 +7123,7 @@ struct lm_ggml_tensor * lm_ggml_flash_attn_ext(
  int64_t ne[4] = { q->ne[0], q->ne[2], q->ne[1], q->ne[3] };
  struct lm_ggml_tensor * result = lm_ggml_new_tensor(ctx, LM_GGML_TYPE_F32, 4, ne);

- float params[] = { scale, max_bias };
+ float params[] = { scale, max_bias, logit_softcap };
  lm_ggml_set_op_params(result, params, sizeof(params));

  result->op = LM_GGML_OP_FLASH_ATTN_EXT;
@@ -7142,7 +7143,7 @@ void lm_ggml_flash_attn_ext_set_prec(

  const int32_t prec_i32 = (int32_t) prec;

- lm_ggml_set_op_params_i32(a, 2, prec_i32); // scale is on first pos, max_bias on second
+ lm_ggml_set_op_params_i32(a, 3, prec_i32); // scale is on first pos, max_bias on second
  }

  // lm_ggml_flash_attn_back
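Note on the hunks above: lm_ggml_flash_attn_ext now takes a logit_softcap value and stores it as a third float in the op's parameter block, so the precision flag written by lm_ggml_flash_attn_ext_set_prec moves from slot 2 to slot 3. A minimal reader-side sketch of the resulting op_params layout, assuming the same memcpy convention used in the compute code (the helper name is invented for illustration; <string.h> and the package's ggml.h are assumed to be included):

    // Sketch only: slot indices as implied by the diff.
    static void read_flash_attn_ext_params(struct lm_ggml_tensor * dst,
                                           float * scale, float * max_bias,
                                           float * logit_softcap, int32_t * prec) {
        memcpy(scale,         (float *) dst->op_params + 0, sizeof(float));
        memcpy(max_bias,      (float *) dst->op_params + 1, sizeof(float));
        memcpy(logit_softcap, (float *) dst->op_params + 2, sizeof(float));
        *prec = dst->op_params[3]; // precision now lives in the fourth slot
    }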
@@ -7229,43 +7230,34 @@ struct lm_ggml_tensor * lm_ggml_flash_attn_back(

  struct lm_ggml_tensor * lm_ggml_ssm_conv(
  struct lm_ggml_context * ctx,
- struct lm_ggml_tensor * s,
- struct lm_ggml_tensor * x,
- struct lm_ggml_tensor * c,
- struct lm_ggml_tensor * sq) {
- LM_GGML_ASSERT(lm_ggml_is_3d(s));
- LM_GGML_ASSERT(lm_ggml_is_matrix(x));
+ struct lm_ggml_tensor * sx,
+ struct lm_ggml_tensor * c) {
+ LM_GGML_ASSERT(lm_ggml_is_3d(sx));
  LM_GGML_ASSERT(lm_ggml_is_matrix(c));
- LM_GGML_ASSERT(lm_ggml_is_matrix(sq));
- LM_GGML_ASSERT(sq->type == LM_GGML_TYPE_I32);

- const int64_t d_conv = c->ne[0];
- const int64_t d_inner = c->ne[1];
- const int64_t n_tokens = x->ne[1];
- const int64_t n_kv = s->ne[2];
+ const int64_t d_conv = c->ne[0];
+ const int64_t d_inner = c->ne[1];
+ const int64_t n_t = sx->ne[0] - d_conv + 1; // tokens per sequence
+ const int64_t n_s = sx->ne[2];

- LM_GGML_ASSERT( s->ne[0] == d_conv - 1);
- LM_GGML_ASSERT( s->ne[1] == d_inner);
- LM_GGML_ASSERT( x->ne[0] == d_inner);
- LM_GGML_ASSERT(sq->ne[0] == n_kv);
- LM_GGML_ASSERT(sq->ne[1] == n_tokens);
+ // TODO: maybe support other strides than 1?
+ LM_GGML_ASSERT(sx->ne[0] == d_conv - 1 + n_t);
+ LM_GGML_ASSERT(sx->ne[1] == d_inner);
+ LM_GGML_ASSERT(n_t >= 0);

  bool is_node = false;

- if (s->grad || x->grad || c->grad || sq->grad) {
+ if (sx->grad || c->grad) {
  LM_GGML_ABORT("fatal error"); // TODO: implement
  is_node = true;
  }

- // 2-in-1 concatenated x and conv_states, {d_inner, n_tokens} with {d_conv, d_inner, n_kv}
- struct lm_ggml_tensor * result = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_F32, (d_inner*n_tokens) + (d_conv*d_inner*n_kv));
+ struct lm_ggml_tensor * result = lm_ggml_new_tensor_3d(ctx, LM_GGML_TYPE_F32, d_inner, n_t, n_s);

  result->op = LM_GGML_OP_SSM_CONV;
  result->grad = is_node ? lm_ggml_dup_tensor(ctx, result) : NULL;
- result->src[0] = s;
- result->src[1] = x;
- result->src[2] = c;
- result->src[3] = sq;
+ result->src[0] = sx;
+ result->src[1] = c;

  return result;
  }
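For orientation: the rewritten lm_ggml_ssm_conv takes a single sx tensor of shape {d_conv - 1 + n_t, d_inner, n_s} (the previous d_conv - 1 columns of conv state concatenated with the n_t new tokens of each of the n_s sequences) plus the conv weight c of shape {d_conv, d_inner}, and returns a {d_inner, n_t, n_s} result. A hedged shape-check sketch with an invented helper name, assuming the package's ggml.h is included:

    // Illustrative only: builds an SSM_CONV node and asserts the output shape
    // implied by the new signature.
    struct lm_ggml_tensor * ssm_conv_example(struct lm_ggml_context * ctx,
                                             struct lm_ggml_tensor * sx,  // {d_conv - 1 + n_t, d_inner, n_s}
                                             struct lm_ggml_tensor * c) { // {d_conv, d_inner}
        struct lm_ggml_tensor * out = lm_ggml_ssm_conv(ctx, sx, c);
        LM_GGML_ASSERT(out->ne[0] == c->ne[1]);                 // d_inner
        LM_GGML_ASSERT(out->ne[1] == sx->ne[0] - c->ne[0] + 1); // n_t
        LM_GGML_ASSERT(out->ne[2] == sx->ne[2]);                // n_s
        return out;
    }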
@@ -7279,39 +7271,42 @@ struct lm_ggml_tensor * lm_ggml_ssm_scan(
  struct lm_ggml_tensor * dt,
  struct lm_ggml_tensor * A,
  struct lm_ggml_tensor * B,
- struct lm_ggml_tensor * C,
- struct lm_ggml_tensor * sq) {
+ struct lm_ggml_tensor * C) {
  LM_GGML_ASSERT(lm_ggml_is_contiguous(s));
  LM_GGML_ASSERT(lm_ggml_is_contiguous(x));
  LM_GGML_ASSERT(lm_ggml_is_contiguous(dt));
  LM_GGML_ASSERT(lm_ggml_is_contiguous(A));
- LM_GGML_ASSERT(sq->type == LM_GGML_TYPE_I32);
+ LM_GGML_ASSERT(lm_ggml_is_matrix(A));
+ LM_GGML_ASSERT(lm_ggml_is_3d(B));
+ LM_GGML_ASSERT(lm_ggml_is_3d(s));
  LM_GGML_ASSERT(B->nb[0] == lm_ggml_type_size(B->type));
  LM_GGML_ASSERT(C->nb[0] == lm_ggml_type_size(C->type));
  LM_GGML_ASSERT(lm_ggml_are_same_shape(x, dt));
+ LM_GGML_ASSERT(lm_ggml_are_same_shape(B, C));

  {
- const int64_t d_state = s->ne[0];
- const int64_t d_inner = s->ne[1];
- const int64_t n_tokens = x->ne[1];
+ const int64_t d_state = s->ne[0];
+ const int64_t d_inner = s->ne[1];
+ const int64_t n_seq_tokens = x->ne[1];
+ const int64_t n_seqs = x->ne[2];

+ LM_GGML_ASSERT(s->ne[2] == n_seqs);
  LM_GGML_ASSERT(x->ne[0] == d_inner);
  LM_GGML_ASSERT(A->ne[0] == d_state);
  LM_GGML_ASSERT(A->ne[1] == d_inner);
  LM_GGML_ASSERT(B->ne[0] == d_state);
- LM_GGML_ASSERT(B->ne[1] == n_tokens);
- LM_GGML_ASSERT(C->ne[0] == d_state);
- LM_GGML_ASSERT(C->ne[1] == n_tokens);
+ LM_GGML_ASSERT(B->ne[1] == n_seq_tokens);
+ LM_GGML_ASSERT(B->ne[2] == n_seqs);
  }

  bool is_node = false;

- if (s->grad || x->grad || dt->grad || A->grad || B->grad || C->grad || sq->grad) {
+ if (s->grad || x->grad || dt->grad || A->grad || B->grad || C->grad) {
  LM_GGML_ABORT("fatal error"); // TODO: implement
  is_node = true;
  }

- // 2-in-1 concatenated y and ssm_states, {d_inner, n_tokens} with {d_state, d_inner, n_kv}
+ // concatenated y + ssm_states
  struct lm_ggml_tensor * result = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_F32, lm_ggml_nelements(x) + lm_ggml_nelements(s));

  result->op = LM_GGML_OP_SSM_SCAN;
@@ -7322,7 +7317,6 @@ struct lm_ggml_tensor * lm_ggml_ssm_scan(
  result->src[3] = A;
  result->src[4] = B;
  result->src[5] = C;
- result->src[6] = sq;

  return result;
  }
@@ -10995,11 +10989,6 @@ static void lm_ggml_compute_forward_concat_f32(

  LM_GGML_TENSOR_BINARY_OP_LOCALS

- // TODO: support for transposed / permuted tensors
- LM_GGML_ASSERT(nb0 == sizeof(float));
- LM_GGML_ASSERT(nb00 == sizeof(float));
- LM_GGML_ASSERT(nb10 == sizeof(float));
-
  const int32_t dim = lm_ggml_get_op_params_i32(dst, 0);

  LM_GGML_ASSERT(dim >= 0 && dim < 4);
@@ -15283,11 +15272,17 @@ static void lm_ggml_compute_forward_flash_attn_ext_f16(
  const int ir0 = dr*ith;
  const int ir1 = MIN(ir0 + dr, nr);

- float scale = 1.0f;
- float max_bias = 0.0f;
+ float scale = 1.0f;
+ float max_bias = 0.0f;
+ float logit_softcap = 0.0f;

- memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
- memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
+ memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
+ memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
+ memcpy(&logit_softcap, (float *) dst->op_params + 2, sizeof(float));
+
+ if (logit_softcap != 0) {
+ scale /= logit_softcap;
+ }

  const uint32_t n_head = neq2;
  const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
@@ -15351,7 +15346,13 @@ static void lm_ggml_compute_forward_flash_attn_ext_f16(
  const char * k_data = (const char *) k->data + ( ic*nbk1 + ik2*nbk2 + ik3*nbk3);
  kq_vec_dot(D, &s, 0, k_data, 0, Q_q, 0, 1);

- s = s*scale + mv; // scale KQ value and apply mask
+ s = s*scale; // scale KQ value
+
+ if (logit_softcap != 0.0f) {
+ s = logit_softcap*tanhf(s);
+ }
+
+ s += mv; // apply mask

  const float Mold = M;

@@ -15360,7 +15361,7 @@ static void lm_ggml_compute_forward_flash_attn_ext_f16(

  const char * v_data = ((const char *) v->data + (ic*nbv1 + iv2*nbv2 + iv3*nbv3));

- if (v->type== LM_GGML_TYPE_F16) {
+ if (v->type == LM_GGML_TYPE_F16) {
  if (s > M) {
  // s is new maximum, ms < 1.0f, vs == expf(s - s) == 1.0f
  M = s;
@@ -15427,7 +15428,7 @@ static void lm_ggml_compute_forward_flash_attn_ext(
  const struct lm_ggml_tensor * v,
  const struct lm_ggml_tensor * mask,
  struct lm_ggml_tensor * dst) {
- switch (dst->op_params[2]) {
+ switch (dst->op_params[3]) {
  case LM_GGML_PREC_DEFAULT:
  case LM_GGML_PREC_F32:
  {
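The compute-side hunks fold the scaling into the cap: when logit_softcap is non-zero the kernel divides scale by it up front and later applies s = logit_softcap*tanhf(s), so the value that reaches the mask and softmax is logit_softcap * tanh(kq * scale / logit_softcap), i.e. scaled attention scores are smoothly limited to the open interval (-logit_softcap, +logit_softcap). A standalone scalar sketch of that transformation (reference only, not the kernel):

    #include <math.h>

    // Mirrors the order of operations in the diff: pre-divide the scale,
    // re-expand with tanh, then add the mask bias.
    static float softcapped_score(float kq, float scale, float logit_softcap, float mask_bias) {
        float s = kq * (logit_softcap != 0.0f ? scale / logit_softcap : scale);
        if (logit_softcap != 0.0f) {
            s = logit_softcap * tanhf(s);
        }
        return s + mask_bias;
    }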
@@ -15782,27 +15783,22 @@ static void lm_ggml_compute_forward_flash_attn_back(
  static void lm_ggml_compute_forward_ssm_conv_f32(
  const struct lm_ggml_compute_params * params,
  struct lm_ggml_tensor * dst) {
- const struct lm_ggml_tensor * src0 = dst->src[0]; // conv_state
- const struct lm_ggml_tensor * src1 = dst->src[1]; // x
- const struct lm_ggml_tensor * src2 = dst->src[2]; // conv1d.weight
- const struct lm_ggml_tensor * src3 = dst->src[3]; // state_seq
+ const struct lm_ggml_tensor * src0 = dst->src[0]; // conv_x
+ const struct lm_ggml_tensor * src1 = dst->src[1]; // conv1d.weight

  const int ith = params->ith;
  const int nth = params->nth;

- const int nc = src2->ne[0]; // d_conv
- const int nr = src0->ne[1]; // d_inner
- const int n_t = src1->ne[1]; // n_tokens
- const int n_kv = src0->ne[2]; // max number of sequences in the batch
+ const int nc = src1->ne[0]; // d_conv
+ const int ncs = src0->ne[0]; // d_conv - 1 + n_t
+ const int nr = src0->ne[1]; // d_inner
+ const int n_t = dst->ne[1]; // tokens per sequence
+ const int n_s = dst->ne[2]; // number of sequences in the batch

- LM_GGML_ASSERT((nr*n_t) + (nc*nr*n_kv) == lm_ggml_nelements(dst));
+ LM_GGML_ASSERT( dst->ne[0] == nr);
  LM_GGML_ASSERT(src0->nb[0] == sizeof(float));
  LM_GGML_ASSERT(src1->nb[0] == sizeof(float));
- LM_GGML_ASSERT(src2->nb[0] == sizeof(float));
- LM_GGML_ASSERT(src3->nb[0] == sizeof(int32_t));
  LM_GGML_ASSERT(src0->nb[1] == src0->ne[0]*sizeof(float));
- // for use with the destination state offset between sequences
- LM_GGML_ASSERT(src2->nb[2] == src2->ne[1]*src2->ne[0]*sizeof(float));

  // rows per thread
  const int dr = (nr + nth - 1)/nth;
@@ -15812,74 +15808,27 @@ static void lm_ggml_compute_forward_ssm_conv_f32(
  const int ir1 = MIN(ir0 + dr, nr);
  const int ir = ir1 - ir0;

- if (n_kv > 1) {
- // multiple sequences means it's hard to know when it's the first time a state is read,
- // so copy them all over to the destination, just to be sure.
- for (int i3 = 0; i3 < n_kv; ++i3) {
- float * s0 = (float *) ((char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]));
- float * s = (float *) ((char *) dst->data + ir0*(src2->nb[1]) + i3*(src2->nb[2]) + nr*n_t*sizeof(float));
- // can't use memcpy because of d_conv vs d_conv - 1
- for (int i1 = 0; i1 < ir; ++i1) {
- for (int i0 = 0; i0 < nc - 1; ++i0) {
- // copy s0 to last (d_conv - 1) columns of s
- s[1 + i0 + i1*nc] = s0[i0 + i1*(nc - 1)];
- }
- }
- }
- }
-
- for (int i2 = 0; i2 < n_t; ++i2) {
- int32_t * sq = (int32_t *) ((char *) src3->data + i2*(src3->nb[1])); // {n_kv, n_tokens}
- float * x = (float *) ((char *) dst->data + ir0*sizeof(float) + i2*(nr*sizeof(float))); // {d_inner, n_tokens}
- float * s = (float *) ((char *) dst->data + ir0*(src2->nb[1]) + sq[0]*(src2->nb[2]) + nr*n_t*sizeof(float)); // {d_conv, d_inner, n_kv}
- float * s0; // {d_conv - 1, d_inner, n_kv}
- float * x0 = (float *) ((char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1])); // {d_inner, n_tokens}
- float * c = (float *) ((char *) src2->data + ir0*(src2->nb[1])); // {d_conv, d_inner}
- int ne0s0;
-
- LM_GGML_ASSERT(0 <= sq[0] && sq[0] < n_kv);
+ for (int i3 = 0; i3 < n_s; ++i3) {
+ for (int i2 = 0; i2 < n_t; ++i2) {
+ // {d_conv - 1 + n_t, d_inner, n_seqs}
+ // sliding window
+ const float * s = (const float *) ((const char *) src0->data + ir0*(src0->nb[1]) + i2*(src0->nb[0]) + i3*(src0->nb[2])); // {d_conv, d_inner, n_s}
+ const float * c = (const float *) ((const char *) src1->data + ir0*(src1->nb[1])); // {d_conv, d_inner}
+ float * x = (float *) ((char *) dst->data + ir0*(dst->nb[0]) + i2*(dst->nb[1]) + i3*(dst->nb[2])); // {d_inner, n_t, n_s}

- // avoid needing to copy the state for the first token
- if (i2 == 0) {
- s0 = (float *) ((char *) src0->data + ir0*(src0->nb[1]) + sq[0]*(src0->nb[2])); // {d_conv - 1, d_inner, n_kv}
- ne0s0 = src0->ne[0];
- } else {
- // the source is the last (d_conv - 1) columns of the destination
- s0 = s + 1;
- ne0s0 = nc;
- }
-
- // d_inner
- for (int i1 = 0; i1 < ir; ++i1) {
- // shift state left
- for (int i0 = 0; i0 < nc - 1; ++i0) {
- s[i0 + i1*nc] = s0[i0 + i1*ne0s0];
- }
- // insert x on the last column
- s[(nc - 1) + i1*nc] = x0[i1];
- }
-
- // handle copies when there are multiple output states
- for (int i3 = 1; i3 < n_kv; ++i3) {
- int32_t seq = sq[i3];
- if (0 <= seq && seq < n_kv) {
- float * s1 = s + (seq - sq[0])*nc*nr;
- memcpy(s1, s, nc*ir*sizeof(float));
- } else {
- // stop at negative or too big seq_ids
- break;
- }
- }
+ // TODO: transpose the output for smaller strides for big batches?
+ // d_inner
+ for (int i1 = 0; i1 < ir; ++i1) {
+ // rowwise dot product
+ // NOTE: not using lm_ggml_vec_dot_f32, because its sum is in double precision
+ float sumf = 0.0f;

- // it seems a little faster when this is separate from the state shift
- for (int i1 = 0; i1 < ir; ++i1) {
- // rowwise dot product
- float sumf = 0.0f;
- for (int i0 = 0; i0 < nc; ++i0) {
- int i = i0 + i1*nc;
- sumf += s[i] * c[i];
+ // d_conv
+ for (int i0 = 0; i0 < nc; ++i0) {
+ sumf += s[i0 + i1*ncs] * c[i0 + i1*nc];
+ }
+ x[i1] = sumf;
  }
- x[i1] = sumf;
  }
  }
  }
@@ -15910,15 +15859,14 @@ static void lm_ggml_compute_forward_ssm_scan_f32(
  const struct lm_ggml_tensor * src3 = dst->src[3]; // A
  const struct lm_ggml_tensor * src4 = dst->src[4]; // B
  const struct lm_ggml_tensor * src5 = dst->src[5]; // C
- const struct lm_ggml_tensor * src6 = dst->src[6]; // sq

  const int ith = params->ith;
  const int nth = params->nth;

- const int64_t nc = src0->ne[0]; // d_state
- const int64_t nr = src0->ne[1]; // d_inner
- const int64_t n_t = src1->ne[1]; // number of tokens in the batch
- const int64_t n_kv = src0->ne[2]; // max number of sequences in the batch
+ const int64_t nc = src0->ne[0]; // d_state
+ const int64_t nr = src0->ne[1]; // d_inner
+ const int64_t n_t = src1->ne[1]; // number of tokens per sequence
+ const int64_t n_s = src0->ne[2]; // number of sequences in the batch

  LM_GGML_ASSERT(lm_ggml_nelements(src1) + lm_ggml_nelements(src0) == lm_ggml_nelements(dst));
  LM_GGML_ASSERT(src0->nb[0] == sizeof(float));
@@ -15927,12 +15875,12 @@ static void lm_ggml_compute_forward_ssm_scan_f32(
  LM_GGML_ASSERT(src3->nb[0] == sizeof(float));
  LM_GGML_ASSERT(src4->nb[0] == sizeof(float));
  LM_GGML_ASSERT(src5->nb[0] == sizeof(float));
- // required for the dot product between s and C, and when copying the states
+ // required for the dot product between s and C
  LM_GGML_ASSERT(src0->nb[1] == src0->ne[0]*sizeof(float));
  // required for per-sequence offsets for states
  LM_GGML_ASSERT(src0->nb[2] == src0->ne[0]*src0->ne[1]*sizeof(float));
- // required to get correct offset for state destination (i.e. src1->nb[2])
- LM_GGML_ASSERT(src1->nb[2] == src1->ne[0]*src1->ne[1]*sizeof(float));
+ // required to get correct offset for state destination (i.e. src1->nb[3])
+ LM_GGML_ASSERT(src1->nb[3] == src1->ne[0]*src1->ne[1]*src1->ne[2]*sizeof(float));

  // rows per thread
  const int dr = (nr + nth - 1)/nth;
@@ -15942,64 +15890,36 @@ static void lm_ggml_compute_forward_ssm_scan_f32(
  const int ir1 = MIN(ir0 + dr, nr);
  const int ir = ir1 - ir0;

- if (n_kv > 1) {
- // it's hard to know if the source states have already been copied
- // when there are multiple, so copy them already.
- for (int i3 = 0; i3 < n_kv; ++i3) {
- float * s0 = (float *) ((char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]));
- float * s = (float *) ((char *) dst->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]) + src1->nb[2]);
- memcpy(s, s0, nc*ir*sizeof(float));
- }
- }
-
- for (int i2 = 0; i2 < n_t; ++i2) {
- int32_t * sq = (int32_t *) ((char *) src6->data + i2*(src6->nb[1])); // {n_kv, n_tokens}
- float * y = (float *) ((char *) dst->data + ir0*(src1->nb[0]) + i2*(src1->nb[1])); // {d_inner, n_tokens}
- float * s = (float *) ((char *) dst->data + ir0*(src0->nb[1]) + sq[0]*(src0->nb[2]) + src1->nb[2]); // {d_state, d_inner, n_kv}
- float * s0;
- float * x = (float *) ((char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1])); // {d_inner, n_tokens}
- float * dt = (float *) ((char *) src2->data + ir0*(src2->nb[0]) + i2*(src2->nb[1])); // {d_inner, n_tokens}
- float * A = (float *) ((char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner}
- float * B = (float *) ((char *) src4->data + i2*(src4->nb[1])); // {d_state, n_tokens}
- float * C = (float *) ((char *) src5->data + i2*(src5->nb[1])); // {d_state, n_tokens}
-
- LM_GGML_ASSERT(0 <= sq[0] && sq[0] < n_kv);
-
- // avoid needing to copy the state for the first token
- if (i2 == 0) {
- s0 = (float *) ((char *) src0->data + ir0*(src0->nb[1]) + sq[0]*(src0->nb[2])); // {d_state, d_inner, n_kv}
- } else {
- // otherwise the source is the same as the destination
- s0 = s;
- }
-
- // d_inner
- for (int i1 = 0; i1 < ir; ++i1) {
- // ref: https://github.com/state-spaces/mamba/blob/34076d664838588a3c97727b263478ab9f621a07/mamba_ssm/ops/triton/selective_state_update.py#L78
- float dt_soft_plus = dt[i1] <= 20.0f ? log1pf(expf(dt[i1])) : dt[i1];
- float x_dt = x[i1] * dt_soft_plus;
- float sumf = 0.0f;
- // d_state
- for (int i0 = 0; i0 < nc; ++i0) {
- int i = i0 + i1*nc;
- // state = prev_state * dA + dB * x
- float state = (s0[i] * expf(dt_soft_plus * A[i])) + (B[i0] * x_dt);
- // y = rowwise_dotprod(state, C)
- sumf += state * C[i0];
- s[i] = state;
- }
- y[i1] = sumf;
- }
-
- // handle copies when there are multiple output states
- for (int i3 = 1; i3 < n_kv; ++i3) {
- int32_t seq = sq[i3];
- if (0 <= seq && seq < n_kv) {
- float * s1 = s + (seq - sq[0])*nc*nr;
- memcpy(s1, s, nc*ir*sizeof(float));
- } else {
- // stop at negative or too big seq_ids
- break;
+ for (int i3 = 0; i3 < n_s; ++i3) {
+ for (int i2 = 0; i2 < n_t; ++i2) {
+ const float * s0 = (const float *) ((const char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2])); // {d_state, d_inner, n_s}
+ const float * x = (const float *) ((const char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
+ const float * dt = (const float *) ((const char *) src2->data + ir0*(src2->nb[0]) + i2*(src2->nb[1]) + i3*(src2->nb[2])); // {d_inner, n_t, n_s}
+ const float * A = (const float *) ((const char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner}
+ const float * B = (const float *) ((const char *) src4->data + i2*(src4->nb[1]) + i3*(src4->nb[2])); // {d_state, n_t, n_s}
+ const float * C = (const float *) ((const char *) src5->data + i2*(src5->nb[1]) + i3*(src5->nb[2])); // {d_state, n_t, n_s}
+ float * y = ( float *) (( char *) dst->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
+ float * s = ( float *) (( char *) dst->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]) + src1->nb[3]); // {d_state, d_inner, n_s}
+
+ // use the output as the source for the next token-wise iterations
+ if (i2 > 0) { s0 = s; }
+
+ // d_inner
+ for (int i1 = 0; i1 < ir; ++i1) {
+ // ref: https://github.com/state-spaces/mamba/blob/34076d664838588a3c97727b263478ab9f621a07/mamba_ssm/ops/triton/selective_state_update.py#L78
+ float dt_soft_plus = dt[i1] <= 20.0f ? log1pf(expf(dt[i1])) : dt[i1];
+ float x_dt = x[i1] * dt_soft_plus;
+ float sumf = 0.0f;
+ // d_state
+ for (int i0 = 0; i0 < nc; ++i0) {
+ int i = i0 + i1*nc;
+ // state = prev_state * dA + dB * x
+ float state = (s0[i] * expf(dt_soft_plus * A[i])) + (B[i0] * x_dt);
+ // y = rowwise_dotprod(state, C)
+ sumf += state * C[i0];
+ s[i] = state;
+ }
+ y[i1] = sumf;
  }
  }
  }
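The rewritten scan kernel drops the sq sequence-id indirection and instead iterates sequences along the third dimension, reusing the state written into dst as the source for every token after the first. The per-element recurrence itself is unchanged: state = prev_state * exp(softplus(dt) * A) + B * (softplus(dt) * x), then y is the rowwise dot of state with C. A minimal single-token scalar sketch of that update (illustrative names, not the threaded ggml kernel):

    #include <math.h>

    // One selective-scan step for a single sequence and token.
    static void ssm_scan_step(int d_state, int d_inner,
                              float       * state, // [d_state * d_inner], updated in place
                              const float * x,     // [d_inner]
                              const float * dt,    // [d_inner]
                              const float * A,     // [d_state * d_inner]
                              const float * B,     // [d_state]
                              const float * C,     // [d_state]
                              float       * y) {   // [d_inner]
        for (int i1 = 0; i1 < d_inner; ++i1) {
            const float dt_soft_plus = dt[i1] <= 20.0f ? log1pf(expf(dt[i1])) : dt[i1];
            const float x_dt = x[i1] * dt_soft_plus;
            float sumf = 0.0f;
            for (int i0 = 0; i0 < d_state; ++i0) {
                const int i = i0 + i1*d_state;
                const float s = state[i] * expf(dt_soft_plus * A[i]) + B[i0] * x_dt;
                sumf    += s * C[i0];
                state[i] = s;
            }
            y[i1] = sumf;
        }
    }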
package/cpp/ggml.h CHANGED
@@ -1760,7 +1760,8 @@ extern "C" {
  struct lm_ggml_tensor * v,
  struct lm_ggml_tensor * mask,
  float scale,
- float max_bias);
+ float max_bias,
+ float logit_softcap);

  LM_GGML_API void lm_ggml_flash_attn_ext_set_prec(
  struct lm_ggml_tensor * a,
@@ -1777,10 +1778,8 @@ extern "C" {

  LM_GGML_API struct lm_ggml_tensor * lm_ggml_ssm_conv(
  struct lm_ggml_context * ctx,
- struct lm_ggml_tensor * s,
- struct lm_ggml_tensor * x,
- struct lm_ggml_tensor * c,
- struct lm_ggml_tensor * sq);
+ struct lm_ggml_tensor * sx,
+ struct lm_ggml_tensor * c);

  LM_GGML_API struct lm_ggml_tensor * lm_ggml_ssm_scan(
  struct lm_ggml_context * ctx,
@@ -1789,8 +1788,7 @@ extern "C" {
  struct lm_ggml_tensor * dt,
  struct lm_ggml_tensor * A,
  struct lm_ggml_tensor * B,
- struct lm_ggml_tensor * C,
- struct lm_ggml_tensor * sq);
+ struct lm_ggml_tensor * C);

  // partition into non-overlapping windows with padding if needed
  // example:
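Call sites have to pass the extra flash-attention argument explicitly; 0.0f disables the cap and preserves the old behaviour. A hedged call-site sketch (tensor handles and the leading q/k arguments are placeholders based on the usual ggml signature, which is not shown in full in this diff):

    // Updated flash attention call: a zero logit_softcap matches the previous
    // two-parameter behaviour.
    struct lm_ggml_tensor * attn = lm_ggml_flash_attn_ext(ctx, q, k, v, kq_mask,
                                                          kq_scale, /*max_bias=*/0.0f, /*logit_softcap=*/0.0f);

    // ssm_conv now takes only the concatenated state+token tensor and the conv weight:
    struct lm_ggml_tensor * conv = lm_ggml_ssm_conv(ctx, conv_sx, conv_weight);

    // ssm_scan loses its trailing sequence-id tensor:
    struct lm_ggml_tensor * scan = lm_ggml_ssm_scan(ctx, ssm_state, x, dt, A, B, C);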
package/cpp/llama-impl.h CHANGED
@@ -31,11 +31,17 @@ void llama_log_callback_default(lm_ggml_log_level level, const char * text, void

  static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
  if (search.empty()) {
- return; // Avoid infinite loop if 'search' is an empty string
+ return;
  }
+ std::string builder;
+ builder.reserve(s.length());
  size_t pos = 0;
- while ((pos = s.find(search, pos)) != std::string::npos) {
- s.replace(pos, search.length(), replace);
- pos += replace.length();
+ size_t last_pos = 0;
+ while ((pos = s.find(search, last_pos)) != std::string::npos) {
+ builder.append(s, last_pos, pos - last_pos);
+ builder.append(replace);
+ last_pos = pos + search.length();
  }
+ builder.append(s, last_pos, std::string::npos);
+ s = std::move(builder);
  }
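The rewritten replace_all assembles the result in a separate builder string instead of repeatedly calling std::string::replace on s, so the input is scanned once and no characters are shifted per match; the observable result is unchanged. A small usage sketch (illustrative values only):

    #include <cassert>
    #include <string>

    // Expected behaviour of replace_all as defined above.
    static void replace_all_example() {
        std::string s = "hello world";
        replace_all(s, "o", "0");
        assert(s == "hell0 w0rld");

        std::string t = "aaa";
        replace_all(t, "a", "aa"); // inserted text is not rescanned
        assert(t == "aaaaaa");
    }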
@@ -175,9 +175,10 @@ void llama_sample_xtc_impl(struct llama_sampling * smpl, llama_token_data_array
  if(xtc_threshold <= 0.0f || !candidates-> size) {
  return;
  }
- // TODO: xtc impl
+
  bool xtc_applied = false;
  const int64_t t_start_sample_us = lm_ggml_time_us();
+ llama_sample_softmax(nullptr, candidates);

  // unsorted iteration
  if (!candidates->sorted) {
@@ -199,7 +200,7 @@ void llama_sample_xtc_impl(struct llama_sampling * smpl, llama_token_data_array

  // sort top_tokens
  std::sort(top_tokens.begin(), top_tokens.end(), [](const llama_token_data & a, const llama_token_data & b) {
- return a.logit > b.logit;
+ return a.logit < b.logit;
  });

  // insert top_tokens with probability. Always insert lowest top_token
@@ -232,30 +233,31 @@ void llama_sample_xtc_impl(struct llama_sampling * smpl, llama_token_data_array
  size_t last_index = 0;

  for (; last_index < candidates -> size; ++last_index) {
- if(candidates -> data[last_index].logit < xtc_threshold) {
+ if(candidates -> data[last_index].p < xtc_threshold) {
  break;
  }
  }
- last_index--;
- // check if only 1 last index token or total less than min_keep
- if(last_index <= 1 || candidates-> size - last_index < min_keep) {
+
+ // check if only 1 token above threshold
+ if(last_index <= 1) {
  return;
  }
- // indexes to be skipped
- size_t safe_index = 0;
+ last_index--;
+ // items beyond safe index will be ignored
+ size_t safe_index = candidates -> size;
+
  // remove tokens until last threshold item
- candidates -> data;
  std::uniform_real_distribution<float> random_float(0.0 , 1.0);
  for (size_t i = 0; i < last_index; i++) {
  if(random_float(rng) < xtc_probability) {
- if(i != safe_index) {
- std::swap(candidates-> data[i], candidates->data[safe_index]);
+ std::swap(candidates-> data[i], candidates->data[safe_index - 1]);
+ safe_index--;
+ if (candidates-> sorted) {
+ candidates -> sorted = false;
  }
- safe_index++;
  }
  }
- candidates -> data = candidates -> data + safe_index;
- candidates -> size = candidates -> size - safe_index;
+ candidates -> size = safe_index;
  }

  if (smpl) {
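The XTC ("exclude top choices") fixes above make the sampler compare softmax probabilities rather than raw logits against xtc_threshold (hence the added llama_sample_softmax call), and remove qualifying tokens by swapping them to the tail and shrinking candidates->size instead of advancing the data pointer. The intended rule, in outline: each candidate whose probability is at or above the threshold is dropped with probability xtc_probability, except the last such candidate, so at least one top choice always survives. A rough sketch of that rule over a plain vector (illustrative types and names, not the llama.cpp API):

    #include <random>
    #include <vector>

    struct Candidate { int id; float p; };

    // Candidates are assumed sorted by probability, descending.
    static void xtc_outline(std::vector<Candidate> & cands,
                            float xtc_threshold, float xtc_probability, std::mt19937 & rng) {
        size_t last_index = 0;
        while (last_index < cands.size() && cands[last_index].p >= xtc_threshold) {
            ++last_index;                             // count tokens above the threshold
        }
        if (last_index <= 1) {
            return;                                   // zero or one top choice: nothing to do
        }
        --last_index;                                 // always keep the lowest qualifying token
        std::uniform_real_distribution<float> coin(0.0f, 1.0f);
        size_t keep = cands.size();
        for (size_t i = 0; i < last_index; ++i) {
            if (coin(rng) < xtc_probability) {
                std::swap(cands[i], cands[keep - 1]); // park the removed token past the kept range
                --keep;
            }
        }
        cands.resize(keep);                           // mirrors candidates->size = safe_index
    }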