cui-llama.rn 1.6.1 → 1.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/CMakeLists.txt +6 -0
- package/android/src/main/java/com/rnllama/LlamaContext.java +38 -5
- package/android/src/main/java/com/rnllama/RNLlama.java +139 -4
- package/android/src/main/jni.cpp +153 -14
- package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
- package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +24 -4
- package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +22 -2
- package/cpp/chat.cpp +128 -106
- package/cpp/chat.h +2 -0
- package/cpp/common.cpp +41 -76
- package/cpp/common.h +23 -19
- package/cpp/ggml-backend.cpp +9 -5
- package/cpp/ggml-backend.h +4 -4
- package/cpp/ggml-cpu/ggml-cpu-aarch64.cpp +0 -2
- package/cpp/ggml-cpu/ggml-cpu-quants.c +306 -6
- package/cpp/ggml-cpu/ggml-cpu.c +5 -13
- package/cpp/ggml-cpu/ggml-cpu.cpp +29 -16
- package/cpp/ggml-cpu/ops.cpp +107 -13
- package/cpp/ggml-cpu/vec.cpp +0 -6
- package/cpp/ggml-cpu/vec.h +16 -0
- package/cpp/ggml-llama-sim.metallib +0 -0
- package/cpp/ggml-llama.metallib +0 -0
- package/cpp/ggml-metal-impl.h +36 -11
- package/cpp/ggml-metal.m +321 -132
- package/cpp/ggml-opt.cpp +373 -190
- package/cpp/ggml-opt.h +49 -28
- package/cpp/ggml-quants.c +0 -6
- package/cpp/ggml.c +93 -38
- package/cpp/ggml.h +21 -7
- package/cpp/gguf.cpp +33 -33
- package/cpp/llama-adapter.cpp +6 -0
- package/cpp/llama-arch.cpp +3 -0
- package/cpp/llama-batch.cpp +3 -1
- package/cpp/llama-chat.cpp +8 -6
- package/cpp/llama-chat.h +1 -0
- package/cpp/llama-context.cpp +349 -135
- package/cpp/llama-context.h +30 -3
- package/cpp/llama-cparams.h +1 -0
- package/cpp/llama-graph.cpp +150 -234
- package/cpp/llama-graph.h +52 -7
- package/cpp/llama-hparams.cpp +17 -1
- package/cpp/llama-hparams.h +34 -5
- package/cpp/llama-kv-cache.cpp +662 -321
- package/cpp/llama-kv-cache.h +203 -93
- package/cpp/llama-memory.h +3 -2
- package/cpp/llama-model-loader.cpp +24 -15
- package/cpp/llama-model-saver.cpp +281 -0
- package/cpp/llama-model-saver.h +37 -0
- package/cpp/llama-model.cpp +536 -132
- package/cpp/llama-model.h +7 -1
- package/cpp/llama-sampling.cpp +18 -6
- package/cpp/llama-vocab.cpp +46 -8
- package/cpp/llama-vocab.h +6 -0
- package/cpp/llama.cpp +14 -0
- package/cpp/llama.h +72 -131
- package/cpp/minja/chat-template.hpp +9 -5
- package/cpp/minja/minja.hpp +69 -36
- package/cpp/rn-llama.cpp +611 -47
- package/cpp/rn-llama.h +33 -3
- package/cpp/sampling.cpp +57 -50
- package/cpp/tools/mtmd/clip-impl.h +462 -0
- package/cpp/tools/mtmd/clip.cpp +4024 -0
- package/cpp/tools/mtmd/clip.h +101 -0
- package/cpp/tools/mtmd/miniaudio.h +93468 -0
- package/cpp/tools/mtmd/mtmd-audio.cpp +855 -0
- package/cpp/tools/mtmd/mtmd-audio.h +62 -0
- package/cpp/tools/mtmd/mtmd-helper.cpp +297 -0
- package/cpp/tools/mtmd/mtmd.cpp +942 -0
- package/cpp/tools/mtmd/mtmd.h +362 -0
- package/cpp/tools/mtmd/stb_image.h +7988 -0
- package/ios/CMakeLists.txt +7 -0
- package/ios/RNLlama.mm +77 -3
- package/ios/RNLlamaContext.h +5 -1
- package/ios/RNLlamaContext.mm +105 -10
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +2 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +23 -19
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-backend.h +4 -4
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-opt.h +49 -28
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +21 -7
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +30 -3
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +52 -7
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +34 -5
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +203 -93
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +3 -2
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model-saver.h +37 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +7 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +6 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +72 -131
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +69 -36
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +33 -3
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Info.plist +0 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +2 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +23 -19
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend.h +4 -4
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-opt.h +49 -28
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +21 -7
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +30 -3
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +52 -7
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +34 -5
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +203 -93
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +3 -2
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model-saver.h +37 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +7 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +6 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +72 -131
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +69 -36
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +33 -3
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Info.plist +0 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/_CodeSignature/CodeResources +1 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +2 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +23 -19
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-backend.h +4 -4
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-opt.h +49 -28
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +21 -7
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +30 -3
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +52 -7
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +34 -5
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +203 -93
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +3 -2
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model-saver.h +37 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +7 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +6 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +72 -131
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +69 -36
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +33 -3
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Info.plist +0 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +2 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +23 -19
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend.h +4 -4
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-opt.h +49 -28
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +21 -7
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +30 -3
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +52 -7
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +34 -5
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +203 -93
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +3 -2
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model-saver.h +37 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +7 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +6 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +72 -131
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +69 -36
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +33 -3
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Info.plist +0 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/_CodeSignature/CodeResources +1 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
- package/jest/mock.js +33 -7
- package/lib/commonjs/NativeRNLlama.js.map +1 -1
- package/lib/commonjs/index.js +153 -21
- package/lib/commonjs/index.js.map +1 -1
- package/lib/module/NativeRNLlama.js.map +1 -1
- package/lib/module/index.js +152 -20
- package/lib/module/index.js.map +1 -1
- package/lib/typescript/NativeRNLlama.d.ts +50 -4
- package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
- package/lib/typescript/index.d.ts +72 -6
- package/lib/typescript/index.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/NativeRNLlama.ts +67 -4
- package/src/index.ts +212 -38
- package/lib/commonjs/chat.js +0 -37
- package/lib/commonjs/chat.js.map +0 -1
- package/lib/module/chat.js +0 -33
- package/lib/module/chat.js.map +0 -1
- package/lib/typescript/chat.d.ts +0 -10
- package/lib/typescript/chat.d.ts.map +0 -1
- package/src/chat.ts +0 -44
package/cpp/llama-context.h
CHANGED
@@ -7,6 +7,7 @@
 #include "llama-adapter.h"
 
 #include "ggml-cpp.h"
+#include "ggml-opt.h"
 
 #include <map>
 #include <vector>
@@ -133,6 +134,32 @@ struct llama_context {
     llama_perf_context_data perf_get_data() const;
     void perf_reset();
 
+    //
+    // training
+    //
+
+    void opt_init(struct llama_model * model, struct llama_opt_params lopt_params);
+
+    void opt_epoch(
+            lm_ggml_opt_dataset_t      dataset,
+            lm_ggml_opt_result_t       result_train,
+            lm_ggml_opt_result_t       result_eval,
+            int64_t                    idata_split,
+            lm_ggml_opt_epoch_callback callback_train,
+            lm_ggml_opt_epoch_callback callback_eval);
+
+    void opt_epoch_iter(
+            lm_ggml_opt_dataset_t            dataset,
+            lm_ggml_opt_result_t             result,
+            const std::vector<llama_token> & tokens,
+            const std::vector<llama_token> & labels_sparse,
+            llama_batch                    & batch,
+            lm_ggml_opt_epoch_callback       callback,
+            bool                             train,
+            int64_t                          idata_in_loop,
+            int64_t                          ndata_in_loop,
+            int64_t                          t_loop_start);
+
 private:
     //
     // output
@@ -187,9 +214,6 @@ private:
 
     std::unique_ptr<llama_memory_i> memory;
 
-    // TODO: remove
-    bool logits_all = false;
-
     // decode output (2-dimensional array: [n_outputs][n_vocab])
     size_t logits_size = 0; // capacity (of floats) for logits
     float * logits = nullptr;
@@ -215,6 +239,9 @@ private:
 
     lm_ggml_context_ptr ctx_compute;
 
+    // training
+    lm_ggml_opt_context_t opt_ctx = nullptr;
+
     lm_ggml_threadpool_t threadpool = nullptr;
     lm_ggml_threadpool_t threadpool_batch = nullptr;
 
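Note: the hunks above wire ggml's optimizer into llama_context and add training (fine-tuning) entry points. Below is a minimal sketch of how these new members could be driven for a single epoch; it is illustrative only and not part of this diff. The lm_ggml_opt_result_init/lm_ggml_opt_result_free calls are assumed to be the lm_-prefixed equivalents of ggml-opt.h's helpers, the zero-initialized llama_opt_params and the 90/10 train/eval split are placeholder choices, and the real parameter fields live in the updated llama.h (+72 -131 above).

    // Sketch only (not from this package): one training epoch via the new
    // llama_context members declared in the hunk above.
    static void run_one_epoch(llama_context & ctx,
                              llama_model   * model,
                              lm_ggml_opt_dataset_t dataset,
                              int64_t ndata) {
        llama_opt_params params = {};                  // real fields are defined in llama.h
        ctx.opt_init(model, params);

        // assumed lm_-prefixed equivalents of ggml_opt_result_init/free from ggml-opt.h
        lm_ggml_opt_result_t result_train = lm_ggml_opt_result_init();
        lm_ggml_opt_result_t result_eval  = lm_ggml_opt_result_init();

        const int64_t idata_split = ndata*9/10;        // placeholder: first 90% train, rest eval

        ctx.opt_epoch(dataset, result_train, result_eval, idata_split,
                      /*callback_train=*/nullptr, /*callback_eval=*/nullptr);

        lm_ggml_opt_result_free(result_train);
        lm_ggml_opt_result_free(result_eval);
    }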
package/cpp/llama-cparams.h
CHANGED
package/cpp/llama-graph.cpp
CHANGED
@@ -9,33 +9,6 @@
 #include <cmath>
 #include <cstring>
 
-static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) {
-    // TODO move to hparams if a T5 variant appears that uses a different value
-    const int64_t max_distance = 128;
-
-    if (bidirectional) {
-        n_buckets >>= 1;
-    }
-
-    const int64_t max_exact = n_buckets >> 1;
-
-    int32_t relative_position = x - y;
-    int32_t relative_bucket = 0;
-
-    if (bidirectional) {
-        relative_bucket += (relative_position > 0) * n_buckets;
-        relative_position = abs(relative_position);
-    } else {
-        relative_position = -std::min<int32_t>(relative_position, 0);
-    }
-
-    int32_t relative_position_if_large = floorf(max_exact + logf(1.0 * relative_position / max_exact) * (n_buckets - max_exact) / log(1.0 * max_distance / max_exact));
-    relative_position_if_large = std::min<int32_t>(relative_position_if_large, n_buckets - 1);
-    relative_bucket += (relative_position < max_exact ? relative_position : relative_position_if_large);
-
-    return relative_bucket;
-}
-
 void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
     if (ubatch->token) {
         const int64_t n_tokens = ubatch->n_tokens;
@@ -110,22 +83,7 @@ void llm_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) {
 
 void llm_graph_input_pos_bucket_kv::set_input(const llama_ubatch * ubatch) {
     if (pos_bucket) {
-
-
-        LM_GGML_ASSERT(lm_ggml_backend_buffer_is_host(pos_bucket->buffer));
-        LM_GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing
-
-        int32_t * data = (int32_t *) pos_bucket->data;
-
-        const int64_t n_kv = kv_self->n;
-
-        for (int h = 0; h < 1; ++h) {
-            for (int j = 0; j < n_tokens; ++j) {
-                for (int i = 0; i < n_kv; ++i) {
-                    data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(kv_self->cells[i].pos, ubatch->pos[j], hparams.n_rel_attn_bkts, false);
-                }
-            }
-        }
+        kv_self->set_input_pos_bucket(pos_bucket, ubatch);
     }
 }
 
@@ -403,99 +361,18 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
 }
 
 void llm_graph_input_attn_kv_unified::set_input(const llama_ubatch * ubatch) {
-    if (self_kq_mask
-
-
-
-    const int64_t n_seqs = ubatch->n_seqs;
-
-    float * data = nullptr;
-    float * data_swa = nullptr;
-
-    if (self_kq_mask) {
-        LM_GGML_ASSERT(lm_ggml_backend_buffer_is_host(self_kq_mask->buffer));
-        data = (float *) self_kq_mask->data;
-    }
-
-    if (self_kq_mask_swa) {
-        LM_GGML_ASSERT(lm_ggml_backend_buffer_is_host(self_kq_mask_swa->buffer));
-        data_swa = (float *) self_kq_mask_swa->data;
-    }
-
-    // Use only the previous KV cells of the correct sequence for each token of the ubatch.
-    // It's assumed that if a token in the batch has multiple sequences, they are equivalent.
-    // Example with a cache of 10 tokens, 2 tokens populated in cache and 3 tokens in batch:
-    //   Causal mask:
-    //      xxx-------
-    //      xxxx------
-    //      xxxxx-----
-    //   Non-causal mask:
-    //      xxxxx-----
-    //      xxxxx-----
-    //      xxxxx-----
-    // To visualize the mask, see https://github.com/ggml-org/llama.cpp/pull/12615
-    for (int h = 0; h < 1; ++h) {
-        for (int s = 0; s < n_seqs; ++s) {
-            const llama_seq_id seq_id = ubatch->seq_id[s][0];
-
-            for (int j = 0; j < n_seq_tokens; ++j) {
-                const llama_pos pos = ubatch->pos[s*n_seq_tokens + j];
-                for (int i = 0; i < n_kv; ++i) {
-                    float f;
-                    // mask the token if:
-                    if (!kv_self->cells[i].has_seq_id(seq_id) // not the correct sequence
-                        || (cparams.causal_attn && kv_self->cells[i].pos > pos) // for causal, mask future tokens
-                    ) {
-                        f = -INFINITY;
-                    } else {
-                        if (hparams.use_alibi) {
-                            f = -std::abs(kv_self->cells[i].pos - pos);
-                        } else {
-                            f = 0.0f;
-                        }
-                    }
-
-                    if (data) {
-                        data[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f;
-                    }
-
-                    // may need to cut off old tokens for sliding window
-                    // TODO @ngxson : we are currently re-using the swa logic to store the chunked mask, we should rename SWA to something more generic like "aux mask"
-                    if (data_swa) {
-                        if (hparams.n_attn_chunk) {
-                            llama_pos pos_chunk_start = (pos / hparams.n_attn_chunk) * hparams.n_attn_chunk;
-                            if (kv_self->cells[i].pos < pos_chunk_start || pos < pos_chunk_start) {
-                                f = -INFINITY;
-                            }
-                        } else {
-                            if (pos - kv_self->cells[i].pos >= (int32_t)hparams.n_swa) {
-                                f = -INFINITY;
-                            }
-                        }
-                        data_swa[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f;
-                    }
-                }
-            }
-        }
+    if (self_kq_mask) {
+        kv_self->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
+    }
+}
 
-
-
-
-
-                data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
-            }
-        }
-    }
+void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch * ubatch) {
+    if (self_kq_mask) {
+        kv_self->get_kv_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
+    }
 
-
-
-        for (int i = n_tokens; i < LM_GGML_PAD(n_tokens, LM_GGML_KQ_MASK_PAD); ++i) {
-            for (int j = 0; j < n_kv; ++j) {
-                data_swa[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
-            }
-        }
-    }
-}
+    if (self_kq_mask_swa) {
+        kv_self->get_kv_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
     }
 }
 
@@ -545,7 +422,6 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
     n_layer (hparams.n_layer),
     n_rot (hparams.n_rot),
     n_ctx (cparams.n_ctx),
-    n_ctx_per_seq (cparams.n_ctx / cparams.n_seq_max),
     n_head (hparams.n_head()),
     n_head_kv (hparams.n_head_kv()),
     n_embd_head_k (hparams.n_embd_head_k),
@@ -782,7 +658,7 @@ lm_ggml_tensor * llm_graph_context::build_ffn(
             } break;
     }
 
-    if (type_gate == LLM_FFN_PAR) {
+    if (gate && type_gate == LLM_FFN_PAR) {
         cur = lm_ggml_mul(ctx0, cur, tmp);
         cb(cur, "ffn_gate_par", il);
     }
@@ -971,6 +847,7 @@ lm_ggml_tensor * llm_graph_context::build_inp_embd(lm_ggml_tensor * tok_embd) co
     inp->tokens = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, ubatch.n_tokens);
     //cb(inp->tokens, "inp_tokens", -1);
     lm_ggml_set_input(inp->tokens);
+    res->t_tokens = inp->tokens;
 
     cur = lm_ggml_get_rows(ctx0, tok_embd, inp->tokens);
 
@@ -1152,7 +1029,7 @@ lm_ggml_tensor * llm_graph_context::build_inp_pos_bucket_dec() const {
 
     auto inp = std::make_unique<llm_graph_input_pos_bucket_kv>(hparams, kv_self);
 
-    const auto n_kv = kv_self->
+    const auto n_kv = kv_self->get_n();
 
     auto & cur = inp->pos_bucket;
 
@@ -1187,16 +1064,12 @@ lm_ggml_tensor * llm_graph_context::build_attn_mha(
         lm_ggml_tensor * kq_b,
         lm_ggml_tensor * kq_mask,
         lm_ggml_tensor * v_mla,
-        bool v_trans,
         float kq_scale) const {
-
-    //const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
+    const bool v_trans = v->nb[1] > v->nb[2];
 
-
-
-
-    //const auto & n_embd_head_k = hparams.n_embd_head_k;
-    //const auto & n_embd_head_v = hparams.n_embd_head_v;
+    q = lm_ggml_permute(ctx0, q, 0, 2, 1, 3);
+    k = lm_ggml_permute(ctx0, k, 0, 2, 1, 3);
+    v = lm_ggml_permute(ctx0, v, 0, 2, 1, 3);
 
     const auto n_tokens = q->ne[1];
     const auto n_head = q->ne[2];
@@ -1227,8 +1100,19 @@ lm_ggml_tensor * llm_graph_context::build_attn_mha(
         lm_ggml_flash_attn_ext_set_prec(cur, LM_GGML_PREC_F32);
 
         if (v_mla) {
+#if 0
+            // v_mla can be applied as a matrix-vector multiplication with broadcasting across dimension 3 == n_tokens.
+            // However, the code is optimized for dimensions 0 and 1 being large, so this is ineffient.
             cur = lm_ggml_reshape_4d(ctx0, cur, v_mla->ne[0], 1, n_head, n_tokens);
             cur = lm_ggml_mul_mat(ctx0, v_mla, cur);
+#else
+            // It's preferable to do the calculation as a matrix-matrix multiplication with n_tokens in dimension 1.
+            // The permutations are noops and only change how the tensor data is interpreted.
+            cur = lm_ggml_permute(ctx0, cur, 0, 2, 1, 3);
+            cur = lm_ggml_mul_mat(ctx0, v_mla, cur);
+            cur = lm_ggml_permute(ctx0, cur, 0, 2, 1, 3);
+            cur = lm_ggml_cont(ctx0, cur); // Needed because lm_ggml_reshape_2d expects contiguous inputs.
+#endif
         }
 
         cur = lm_ggml_reshape_2d(ctx0, cur, cur->ne[0]*n_head, n_tokens);
@@ -1324,17 +1208,11 @@ lm_ggml_tensor * llm_graph_context::build_attn(
 
     const auto & kq_mask = inp->get_kq_mask();
 
-    lm_ggml_tensor * q =
-
-
-    lm_ggml_tensor * k = lm_ggml_permute(ctx0, k_cur, 0, 2, 1, 3);
-    //cb(k, "k", il);
-
-    lm_ggml_tensor * v = lm_ggml_permute(ctx0, v_cur, 0, 2, 1, 3);
-    //cb(k, "v", il);
-
-    lm_ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, false, kq_scale);
+    lm_ggml_tensor * q = q_cur;
+    lm_ggml_tensor * k = k_cur;
+    lm_ggml_tensor * v = v_cur;
 
+    lm_ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale);
     cb(cur, "kqv_out", il);
 
     if (wo) {
@@ -1357,22 +1235,16 @@ llm_graph_input_attn_kv_unified * llm_graph_context::build_attn_inp_kv_unified()
 
     auto inp = std::make_unique<llm_graph_input_attn_kv_unified>(hparams, cparams, kv_self);
 
-
-
-    inp->self_kq_mask = lm_ggml_new_tensor_2d(ctx0, LM_GGML_TYPE_F32, n_kv, LM_GGML_PAD(n_tokens, LM_GGML_KQ_MASK_PAD));
-    //cb(inp->self_kq_mask, "KQ_mask", -1);
-    lm_ggml_set_input(inp->self_kq_mask);
-
-    inp->self_kq_mask_cnv = cparams.flash_attn ? lm_ggml_cast(ctx0, inp->self_kq_mask, LM_GGML_TYPE_F16) : inp->self_kq_mask;
+    {
+        LM_GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_unified_iswa for SWA");
 
-
-        LM_GGML_ASSERT(hparams.n_swa > 0);
+        const auto n_kv = kv_self->get_n();
 
-        inp->
-        //cb(inp->
-        lm_ggml_set_input(inp->
+        inp->self_kq_mask = lm_ggml_new_tensor_2d(ctx0, LM_GGML_TYPE_F32, n_kv, LM_GGML_PAD(n_tokens, LM_GGML_KQ_MASK_PAD));
+        //cb(inp->self_kq_mask, "KQ_mask", -1);
+        lm_ggml_set_input(inp->self_kq_mask);
 
-        inp->
+        inp->self_kq_mask_cnv = cparams.flash_attn ? lm_ggml_cast(ctx0, inp->self_kq_mask, LM_GGML_TYPE_F16) : inp->self_kq_mask;
     }
 
     return (llm_graph_input_attn_kv_unified *) res->add_input(std::move(inp));
@@ -1397,85 +1269,108 @@ lm_ggml_tensor * llm_graph_context::build_attn(
     lm_ggml_build_forward_expand(gf, v_cur);
 
     const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
-    const auto & n_ctx = cparams.n_ctx;
 
-
-
+    // store to KV cache
+    {
+        lm_ggml_build_forward_expand(gf, kv_self->cpy_k(ctx0, k_cur, il));
+        lm_ggml_build_forward_expand(gf, kv_self->cpy_v(ctx0, v_cur, il));
+    }
 
-    const auto
+    const auto & kq_mask = inp->get_kq_mask();
 
-
+    lm_ggml_tensor * q = q_cur;
+    lm_ggml_tensor * k = kv_self->get_k(ctx0, il);
+    lm_ggml_tensor * v = kv_self->get_v(ctx0, il);
 
-
-
-    const auto kv_head = kv_self->head;
+    lm_ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale);
+    cb(cur, "kqv_out", il);
 
-
+    if (wo) {
+        cur = build_lora_mm(wo, cur);
+    }
 
-
-
+    if (wo_b) {
+        cur = lm_ggml_add(ctx0, cur, wo_b);
+    }
 
-
-
+    return cur;
+}
 
-
+llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unified_iswa() const {
+    const llama_kv_cache_unified_iswa * kv_self = static_cast<const llama_kv_cache_unified_iswa *>(memory);
 
-
+    auto inp = std::make_unique<llm_graph_input_attn_kv_unified_iswa>(hparams, cparams, kv_self);
 
-
-
-    } else {
-        // note: the V cache is transposed when not using flash attention
-        v_cache_view = lm_ggml_view_2d(ctx0, kv_self->v_l[il], n_tokens, n_embd_v_gqa,
-                ( n_ctx)*lm_ggml_element_size(kv_self->v_l[il]),
-                (kv_head)*lm_ggml_element_size(kv_self->v_l[il]));
+    {
+        const auto n_kv = kv_self->get_kv_base()->get_n();
 
-
-
-
+        inp->self_kq_mask = lm_ggml_new_tensor_2d(ctx0, LM_GGML_TYPE_F32, n_kv, LM_GGML_PAD(n_tokens, LM_GGML_KQ_MASK_PAD));
+        //cb(inp->self_kq_mask, "KQ_mask", -1);
+        lm_ggml_set_input(inp->self_kq_mask);
 
-
+        inp->self_kq_mask_cnv = cparams.flash_attn ? lm_ggml_cast(ctx0, inp->self_kq_mask, LM_GGML_TYPE_F16) : inp->self_kq_mask;
     }
 
+    {
+        LM_GGML_ASSERT(hparams.swa_type != LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_unified for non-SWA");
+
+        const auto n_kv = kv_self->get_kv_swa()->get_n();
+
+        inp->self_kq_mask_swa = lm_ggml_new_tensor_2d(ctx0, LM_GGML_TYPE_F32, n_kv, LM_GGML_PAD(n_tokens, LM_GGML_KQ_MASK_PAD));
+        //cb(inp->self_kq_mask_swa, "KQ_mask_swa", -1);
+        lm_ggml_set_input(inp->self_kq_mask_swa);
+
+        inp->self_kq_mask_swa_cnv = cparams.flash_attn ? lm_ggml_cast(ctx0, inp->self_kq_mask_swa, LM_GGML_TYPE_F16) : inp->self_kq_mask_swa;
+    }
+
+    return (llm_graph_input_attn_kv_unified_iswa *) res->add_input(std::move(inp));
+}
+
+lm_ggml_tensor * llm_graph_context::build_attn(
+        llm_graph_input_attn_kv_unified_iswa * inp,
+        lm_ggml_cgraph * gf,
+        lm_ggml_tensor * wo,
+        lm_ggml_tensor * wo_b,
+        lm_ggml_tensor * q_cur,
+        lm_ggml_tensor * k_cur,
+        lm_ggml_tensor * v_cur,
+        lm_ggml_tensor * kq_b,
+        lm_ggml_tensor * v_mla,
+        float kq_scale,
+        int il) const {
+    // these nodes are added to the graph together so that they are not reordered
+    // by doing so, the number of splits in the graph is reduced
+    lm_ggml_build_forward_expand(gf, q_cur);
+    lm_ggml_build_forward_expand(gf, k_cur);
+    lm_ggml_build_forward_expand(gf, v_cur);
+
     const bool is_swa = hparams.is_swa(il);
 
+    const llama_kv_cache_unified_iswa * kv_self = static_cast<const llama_kv_cache_unified_iswa *>(memory);
+
+    const auto * kv = is_swa ? kv_self->get_kv_swa() : kv_self->get_kv_base();
+
+    // store to KV cache
+    {
+        lm_ggml_build_forward_expand(gf, kv->cpy_k(ctx0, k_cur, il));
+        lm_ggml_build_forward_expand(gf, kv->cpy_v(ctx0, v_cur, il));
+    }
+
     const auto & kq_mask = is_swa ? inp->get_kq_mask_swa() : inp->get_kq_mask();
 
-
+    lm_ggml_tensor * q = q_cur;
+    lm_ggml_tensor * k = kv->get_k(ctx0, il);
+    lm_ggml_tensor * v = kv->get_v(ctx0, il);
 
-
-
-    const auto & n_embd_head_k = hparams.n_embd_head_k;
-    const auto & n_embd_head_v = hparams.n_embd_head_v;
-
-    lm_ggml_tensor * q = lm_ggml_permute(ctx0, q_cur, 0, 2, 1, 3);
-    //cb(q, "q", il);
-
-    lm_ggml_tensor * k =
-        lm_ggml_view_3d(ctx0, kv_self->k_l[il],
-                n_embd_head_k, n_kv, n_head_kv,
-                lm_ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
-                lm_ggml_row_size(kv_self->k_l[il]->type, n_embd_head_k),
-                0);
-    //cb(k, "k", il);
-
-    lm_ggml_tensor * v = !v_trans ?
-        lm_ggml_view_3d(ctx0, kv_self->v_l[il],
-                n_embd_head_v, n_kv, n_head_kv,
-                lm_ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa),
-                lm_ggml_row_size(kv_self->v_l[il]->type, n_embd_head_v),
-                0) :
-        lm_ggml_view_3d(ctx0, kv_self->v_l[il],
-                n_kv, n_embd_head_v, n_head_kv,
-                lm_ggml_element_size(kv_self->v_l[il])*n_ctx,
-                lm_ggml_element_size(kv_self->v_l[il])*n_ctx*n_embd_head_v,
-                0);
-
-    lm_ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, v_trans, kq_scale);
+    lm_ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale);
     cb(cur, "kqv_out", il);
 
     if (wo) {
         cur = build_lora_mm(wo, cur);
+        if (arch == LLM_ARCH_GLM4) {
+            // GLM4 seems to have numerical issues with half-precision accumulators
+            lm_ggml_mul_mat_set_prec(cur, LM_GGML_PREC_F32);
+        }
     }
 
     if (wo_b) {
@@ -1522,17 +1417,11 @@ lm_ggml_tensor * llm_graph_context::build_attn(
 
     const auto & kq_mask = inp->get_kq_mask_cross();
 
-    lm_ggml_tensor * q =
-
-
-    lm_ggml_tensor * k = lm_ggml_permute(ctx0, k_cur, 0, 2, 1, 3);
-    //cb(k, "k", il);
-
-    lm_ggml_tensor * v = lm_ggml_permute(ctx0, v_cur, 0, 2, 1, 3);
-    //cb(k, "v", il);
-
-    lm_ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, false, kq_scale);
+    lm_ggml_tensor * q = q_cur;
+    lm_ggml_tensor * k = k_cur;
+    lm_ggml_tensor * v = v_cur;
 
+    lm_ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale);
     cb(cur, "kqv_out", il);
 
     if (wo) {
@@ -1700,3 +1589,30 @@ void llm_graph_context::build_pooling(
 
     lm_ggml_build_forward_expand(gf, cur);
 }
+
+int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) {
+    // TODO move to hparams if a T5 variant appears that uses a different value
+    const int64_t max_distance = 128;
+
+    if (bidirectional) {
+        n_buckets >>= 1;
+    }
+
+    const int64_t max_exact = n_buckets >> 1;
+
+    int32_t relative_position = x - y;
+    int32_t relative_bucket = 0;
+
+    if (bidirectional) {
+        relative_bucket += (relative_position > 0) * n_buckets;
+        relative_position = abs(relative_position);
+    } else {
+        relative_position = -std::min<int32_t>(relative_position, 0);
+    }
+
+    int32_t relative_position_if_large = floorf(max_exact + logf(1.0 * relative_position / max_exact) * (n_buckets - max_exact) / log(1.0 * max_distance / max_exact));
+    relative_position_if_large = std::min<int32_t>(relative_position_if_large, n_buckets - 1);
+    relative_bucket += (relative_position < max_exact ? relative_position : relative_position_if_large);
+
+    return relative_bucket;
+}