cui-llama.rn 1.2.6 → 1.3.3

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. package/README.md +3 -2
  2. package/android/src/main/CMakeLists.txt +26 -6
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +115 -27
  4. package/android/src/main/java/com/rnllama/RNLlama.java +40 -7
  5. package/android/src/main/jni.cpp +228 -40
  6. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +9 -4
  7. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +9 -4
  8. package/cpp/amx/amx.cpp +196 -0
  9. package/cpp/amx/amx.h +20 -0
  10. package/cpp/amx/common.h +101 -0
  11. package/cpp/amx/mmq.cpp +2524 -0
  12. package/cpp/amx/mmq.h +16 -0
  13. package/cpp/common.cpp +118 -251
  14. package/cpp/common.h +53 -30
  15. package/cpp/ggml-aarch64.c +46 -3395
  16. package/cpp/ggml-aarch64.h +0 -20
  17. package/cpp/ggml-alloc.c +6 -8
  18. package/cpp/ggml-backend-impl.h +33 -11
  19. package/cpp/ggml-backend-reg.cpp +423 -0
  20. package/cpp/ggml-backend.cpp +14 -676
  21. package/cpp/ggml-backend.h +46 -9
  22. package/cpp/ggml-common.h +6 -0
  23. package/cpp/ggml-cpu-aarch64.c +3823 -0
  24. package/cpp/ggml-cpu-aarch64.h +32 -0
  25. package/cpp/ggml-cpu-impl.h +14 -242
  26. package/cpp/ggml-cpu-quants.c +10835 -0
  27. package/cpp/ggml-cpu-quants.h +63 -0
  28. package/cpp/ggml-cpu.c +13971 -13720
  29. package/cpp/ggml-cpu.cpp +715 -0
  30. package/cpp/ggml-cpu.h +65 -63
  31. package/cpp/ggml-impl.h +285 -25
  32. package/cpp/ggml-metal.h +8 -8
  33. package/cpp/ggml-metal.m +1221 -728
  34. package/cpp/ggml-quants.c +189 -10681
  35. package/cpp/ggml-quants.h +78 -125
  36. package/cpp/ggml-threading.cpp +12 -0
  37. package/cpp/ggml-threading.h +12 -0
  38. package/cpp/ggml.c +688 -1460
  39. package/cpp/ggml.h +58 -244
  40. package/cpp/json-schema-to-grammar.cpp +1045 -1045
  41. package/cpp/json.hpp +24766 -24766
  42. package/cpp/llama-sampling.cpp +5 -2
  43. package/cpp/llama.cpp +409 -123
  44. package/cpp/llama.h +8 -4
  45. package/cpp/rn-llama.hpp +89 -25
  46. package/cpp/sampling.cpp +42 -3
  47. package/cpp/sampling.h +22 -1
  48. package/cpp/sgemm.cpp +608 -0
  49. package/cpp/speculative.cpp +270 -0
  50. package/cpp/speculative.h +28 -0
  51. package/cpp/unicode.cpp +11 -0
  52. package/ios/RNLlama.mm +43 -20
  53. package/ios/RNLlamaContext.h +9 -3
  54. package/ios/RNLlamaContext.mm +146 -33
  55. package/jest/mock.js +0 -1
  56. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  57. package/lib/commonjs/grammar.js +4 -2
  58. package/lib/commonjs/grammar.js.map +1 -1
  59. package/lib/commonjs/index.js +52 -15
  60. package/lib/commonjs/index.js.map +1 -1
  61. package/lib/module/NativeRNLlama.js.map +1 -1
  62. package/lib/module/grammar.js +2 -1
  63. package/lib/module/grammar.js.map +1 -1
  64. package/lib/module/index.js +51 -15
  65. package/lib/module/index.js.map +1 -1
  66. package/lib/typescript/NativeRNLlama.d.ts +122 -8
  67. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  68. package/lib/typescript/grammar.d.ts +5 -6
  69. package/lib/typescript/grammar.d.ts.map +1 -1
  70. package/lib/typescript/index.d.ts +15 -6
  71. package/lib/typescript/index.d.ts.map +1 -1
  72. package/package.json +2 -1
  73. package/src/NativeRNLlama.ts +135 -13
  74. package/src/grammar.ts +10 -8
  75. package/src/index.ts +104 -28
package/cpp/llama.cpp CHANGED
@@ -190,6 +190,7 @@ enum llm_arch {
  LLM_ARCH_COMMAND_R,
  LLM_ARCH_DBRX,
  LLM_ARCH_OLMO,
+ LLM_ARCH_OLMO2,
  LLM_ARCH_OLMOE,
  LLM_ARCH_OPENELM,
  LLM_ARCH_ARCTIC,
@@ -243,6 +244,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_COMMAND_R, "command-r" },
  { LLM_ARCH_DBRX, "dbrx" },
  { LLM_ARCH_OLMO, "olmo" },
+ { LLM_ARCH_OLMO2, "olmo2" },
  { LLM_ARCH_OLMOE, "olmoe" },
  { LLM_ARCH_OPENELM, "openelm" },
  { LLM_ARCH_ARCTIC, "arctic" },
@@ -1218,6 +1220,25 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_OLMO2,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+ { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
  {
  LLM_ARCH_OLMOE,
  {
@@ -2312,6 +2333,7 @@ enum e_model {
  MODEL_1B,
  MODEL_1_3B,
  MODEL_1_4B,
+ MODEL_1_5B,
  MODEL_1_6B,
  MODEL_2B,
  MODEL_2_8B,
@@ -2330,6 +2352,7 @@ enum e_model {
  MODEL_16B,
  MODEL_20B,
  MODEL_30B,
+ MODEL_32B,
  MODEL_34B,
  MODEL_35B,
  MODEL_40B,
@@ -2917,9 +2940,15 @@ struct llama_model {
  // for quantize-stats only
  std::vector<std::pair<std::string, struct lm_ggml_tensor *>> tensors_by_name;

- int64_t t_load_us = 0;
+ int64_t t_load_us = 0;
  int64_t t_start_us = 0;

+ // total number of parameters in the model
+ uint64_t n_elements = 0;
+
+ // total size of all the tensors in the model in bytes
+ size_t n_bytes = 0;
+
  // keep track of loaded lora adapters
  std::set<struct llama_lora_adapter *> lora_adapters;

@@ -3464,21 +3493,13 @@ static bool llama_kv_cache_init(
  const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
  const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();

- const llama_model::buft_list_t * buft_list;
+ lm_ggml_backend_buffer_type_t buft;
  if (offload) {
- buft_list = model.dev_layer.at(i).buft_list;
+ auto * dev = model.dev_layer.at(i).dev;
+ buft = lm_ggml_backend_dev_buffer_type(dev);
  } else {
- buft_list = &model.cpu_buft_list;
+ buft = lm_ggml_backend_cpu_buffer_type();
  }
- lm_ggml_backend_buffer_type_t buft = select_buft(*buft_list,
- [&](lm_ggml_context * ctx) {
- lm_ggml_tensor * k = lm_ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
- if (hparams.rope_type == LLAMA_ROPE_TYPE_NONE) {
- return k;
- }
- lm_ggml_tensor * p = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_I32, 1);
- return lm_ggml_rope(ctx, k, p, hparams.n_rot, hparams.rope_type);
- });
  lm_ggml_context * ctx = ctx_for_buft(buft);

  if (!ctx) {
@@ -3512,11 +3533,24 @@ static bool llama_kv_cache_init(
  return true;
  }

+ // a structure holds information about the slot found in llama_kv_cache_find_slot
+ struct llama_kv_cache_slot_info {
+ std::pair<uint32_t, uint32_t> boundaries; // slot boundaries [begin, end)
+ bool found = false; // the slot was found
+
+ explicit llama_kv_cache_slot_info(bool found_) : found{found_} {}
+ llama_kv_cache_slot_info(uint32_t begin, uint32_t end) : boundaries{begin, end}, found{true} {}
+
+ operator bool() const { return found; }
+ };
+ static const llama_kv_cache_slot_info llama_kv_cache_slot_info_failed{false};
+
  // find an empty slot of size "n_tokens" in the cache
  // updates the cache head
+ // returns a structure holding information about the slot found
  // Note: On success, it's important that cache.head points
  // to the first cell of the slot.
- static bool llama_kv_cache_find_slot(
+ static struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
  struct llama_kv_cache & cache,
  const struct llama_ubatch & batch) {
  const uint32_t n_tokens = batch.n_tokens;
@@ -3544,7 +3578,7 @@ static bool llama_kv_cache_find_slot(
  // too big seq_id
  // TODO: would it be possible to resize the cache instead?
  LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, cache.size);
- return false;
+ return llama_kv_cache_slot_info_failed;
  }
  if (j > 0) {
  llama_kv_cell & seq = cache.cells[seq_id];
@@ -3679,15 +3713,17 @@ static bool llama_kv_cache_find_slot(
  // allow getting the range of used cells, from head to head + n
  cache.head = min;
  cache.n = max - min + 1;
+ cache.used = std::count_if(cache.cells.begin(), cache.cells.end(),
+ [](const llama_kv_cell& cell){ return !cell.is_empty(); });

  // sanity check
- return cache.n >= n_seqs;
+ return llama_kv_cache_slot_info(cache.n >= n_seqs);
  }
  // otherwise, one cell per token.

  if (n_tokens > cache.size) {
  LLAMA_LOG_ERROR("%s: n_tokens=%d > cache.size=%d\n", __func__, n_tokens, cache.size);
- return false;
+ return llama_kv_cache_slot_info_failed;
  }

  uint32_t n_tested = 0;
@@ -3715,7 +3751,7 @@ static bool llama_kv_cache_find_slot(

  if (n_tested >= cache.size) {
  //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
- return false;
+ return llama_kv_cache_slot_info_failed;
  }
  }

@@ -3732,7 +3768,7 @@ static bool llama_kv_cache_find_slot(

  cache.used += n_tokens;

- return true;
+ return llama_kv_cache_slot_info(cache.head, cache.head + n_tokens);
  }

  // find how many cells are currently in use
@@ -4008,6 +4044,53 @@ static uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams)
  return cparams.flash_attn ? 256u : 32u;
  }

+ // saves the kv_cache state for future recovery.
+ // used to rollback llama_kv_cache_find_slot changes.
+ struct llama_kv_slot_restorer {
+ struct llama_kv_cache_state {
+ uint32_t head = 0;
+ uint32_t n = 0;
+ } old_state;
+
+ // for non-recurrent models only
+ // list of slots to restore
+ std::vector<std::pair<uint32_t, uint32_t>> slot_boundaries;
+
+ bool do_restore = false;
+
+ explicit llama_kv_slot_restorer(const struct llama_kv_cache & cache) {
+ old_state.head = cache.head;
+ old_state.n = cache.n;
+ }
+
+ // saves a slot information for future restoration
+ void save(const struct llama_kv_cache_slot_info & slot) {
+ if (slot) {
+ do_restore = true;
+ if (slot.boundaries.first != slot.boundaries.second) {
+ slot_boundaries.push_back(slot.boundaries);
+ }
+ }
+ }
+
+ // must be explicitly called to restore the kv_cache state
+ // and rollback changes from all llama_kv_cache_find_slot calls
+ void restore(struct llama_kv_cache & cache) {
+ if (do_restore) {
+ cache.head = old_state.head;
+ cache.n = old_state.n;
+
+ if (cache.recurrent) { // recurrent models like Mamba or RWKV can't have a state partially erased
+ llama_kv_cache_seq_rm(cache, -1, -1, -1);
+ } else {
+ for (auto & slot : slot_boundaries) {
+ llama_kv_cache_seq_rm(cache, -1, slot.first, slot.second);
+ }
+ }
+ }
+ }
+ };
+
  //
  // model loading and saving
  //
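Note on the hunk above: the restorer exists so that a failed graph compute can undo the cells claimed by llama_kv_cache_find_slot. A minimal sketch of the intended pattern (the same one the llama_decode_internal hunks further below adopt; kv_self, ubatch and compute_status are assumed to be in scope):

    llama_kv_slot_restorer restorer(kv_self);                     // snapshot cache.head / cache.n
    const auto slot = llama_kv_cache_find_slot(kv_self, ubatch);  // claims cells on success
    if (slot) {
        restorer.save(slot);                    // remember the claimed [begin, end) range
        // ... build and run the compute graph ...
        if (compute_status != LM_GGML_STATUS_SUCCESS) {
            restorer.restore(kv_self);          // roll the cache back to the snapshot
        }
    }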
@@ -4223,8 +4306,8 @@ struct llama_model_loader {
  int n_tensors = 0;
  int n_created = 0;

- int64_t n_elements = 0;
- size_t n_bytes = 0;
+ uint64_t n_elements = 0;
+ size_t n_bytes = 0;

  bool use_mmap = false;
  bool check_tensors;
@@ -4795,7 +4878,9 @@ struct llama_model_loader {
  mappings.reserve(files.size());
  mmaps_used.reserve(files.size());
  for (const auto & file : files) {
- std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, lm_ggml_is_numa()));
+ auto * reg = lm_ggml_backend_dev_backend_reg(lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU));
+ auto * is_numa_fn = (decltype(lm_ggml_is_numa) *) lm_ggml_backend_reg_get_proc_address(reg, "lm_ggml_backend_cpu_is_numa");
+ std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, is_numa_fn()));
  mmaps_used.emplace_back(mapping->size, 0);
  if (mlock_mmaps) {
  std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
@@ -5238,6 +5323,7 @@ static const char * llama_model_type_name(e_model type) {
  case MODEL_1B: return "1B";
  case MODEL_1_3B: return "1.3B";
  case MODEL_1_4B: return "1.4B";
+ case MODEL_1_5B: return "1.5B";
  case MODEL_1_6B: return "1.6B";
  case MODEL_2B: return "2B";
  case MODEL_2_8B: return "2.8B";
@@ -5256,6 +5342,7 @@ static const char * llama_model_type_name(e_model type) {
  case MODEL_16B: return "16B";
  case MODEL_20B: return "20B";
  case MODEL_30B: return "30B";
+ case MODEL_32B: return "32B";
  case MODEL_34B: return "34B";
  case MODEL_35B: return "35B";
  case MODEL_40B: return "40B";
@@ -5291,6 +5378,11 @@ static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
  }
  }

+ static void llm_load_stats(llama_model_loader & ml, llama_model & model) {
+ model.n_elements = ml.n_elements;
+ model.n_bytes = ml.n_bytes;
+ }
+
  static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
  model.arch = ml.get_arch();
  if (model.arch == LLM_ARCH_UNKNOWN) {
@@ -5609,8 +5701,12 @@ static void llm_load_hparams(
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  switch (hparams.n_layer) {
  case 24: model.type = hparams.n_embd == 1024 ? e_model::MODEL_0_5B : e_model::MODEL_1B; break;
+ case 28: model.type = hparams.n_embd == 1536 ? e_model::MODEL_1_5B : e_model::MODEL_7B; break;
  case 32: model.type = e_model::MODEL_7B; break;
+ case 36: model.type = e_model::MODEL_3B; break;
  case 40: model.type = hparams.n_head() == 20 ? e_model::MODEL_4B : e_model::MODEL_13B; break;
+ case 48: model.type = e_model::MODEL_14B; break;
+ case 64: model.type = e_model::MODEL_32B; break;
  case 80: model.type = e_model::MODEL_70B; break;
  default: model.type = e_model::MODEL_UNKNOWN;
  }
@@ -5820,6 +5916,17 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_OLMO2:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ switch (hparams.n_layer) {
+ case 16: model.type = e_model::MODEL_1B; break;
+ case 32: model.type = e_model::MODEL_7B; break;
+ case 40: model.type = e_model::MODEL_13B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_OLMOE:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -7022,7 +7129,7 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
  {LLM_TENSOR_TIME_MIX_LERP_R, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_ADD}},
  {LLM_TENSOR_TIME_MIX_LERP_G, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_ADD}},
  {LLM_TENSOR_TIME_MIX_DECAY, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_ADD}},
- {LLM_TENSOR_TIME_MIX_FIRST, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_RWKV_WKV}},
+ {LLM_TENSOR_TIME_MIX_FIRST, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_RWKV_WKV6}},
  {LLM_TENSOR_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL}},
  {LLM_TENSOR_ATTN_NORM_2, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL}},
  {LLM_TENSOR_ATTN_OUT_NORM, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL}},
@@ -7092,12 +7199,12 @@ static bool weight_buft_supported(const llama_hparams & hparams, lm_ggml_tensor
  } break;
  case LM_GGML_OP_ADD:
  {
- lm_ggml_tensor * a = lm_ggml_new_tensor_2d(ctx, LM_GGML_TYPE_F32, w->ne[0], 512);
+ lm_ggml_tensor * a = lm_ggml_new_tensor_4d(ctx, LM_GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
  op_tensor = lm_ggml_add(ctx, a, w);
  } break;
  case LM_GGML_OP_MUL:
  {
- lm_ggml_tensor * a = lm_ggml_new_tensor_2d(ctx, LM_GGML_TYPE_F32, w->ne[0], 512);
+ lm_ggml_tensor * a = lm_ggml_new_tensor_4d(ctx, LM_GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
  op_tensor = lm_ggml_mul(ctx, a, w);
  } break;
  case LM_GGML_OP_DIV:
@@ -7138,7 +7245,7 @@ static bool weight_buft_supported(const llama_hparams & hparams, lm_ggml_tensor
  lm_ggml_tensor * C = lm_ggml_new_tensor_3d(ctx, LM_GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
  op_tensor = lm_ggml_ssm_scan(ctx, s, x, dt, w, B, C);
  } break;
- case LM_GGML_OP_RWKV_WKV:
+ case LM_GGML_OP_RWKV_WKV6:
  {
  // FIXME
  const int64_t S = 123;
@@ -7151,7 +7258,7 @@ static bool weight_buft_supported(const llama_hparams & hparams, lm_ggml_tensor
  lm_ggml_tensor * tf = w;
  lm_ggml_tensor * td = lm_ggml_new_tensor_4d(ctx, LM_GGML_TYPE_F32, 1, S, H, n_tokens);
  lm_ggml_tensor * state = lm_ggml_new_tensor_4d(ctx, LM_GGML_TYPE_F32, S, n_seqs, S, H);
- op_tensor = lm_ggml_rwkv_wkv(ctx, k, v, r, tf, td, state);
+ op_tensor = lm_ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
  } break;
  default:
  LM_GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, lm_ggml_op_name(op), w->name);
@@ -7200,7 +7307,7 @@ static llama_model::buft_list_t make_cpu_buft_list(llama_model & model) {
  auto * cpu_dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
  auto * cpu_reg = lm_ggml_backend_dev_backend_reg(cpu_dev);
  auto lm_ggml_backend_dev_get_extra_bufts_fn = (lm_ggml_backend_dev_get_extra_bufts_t)
- lm_ggml_backend_reg_get_proc_address(cpu_reg, "lm_ggml_backend_cpu_get_extra_bufts");
+ lm_ggml_backend_reg_get_proc_address(cpu_reg, "lm_ggml_backend_dev_get_extra_bufts");
  if (lm_ggml_backend_dev_get_extra_bufts_fn) {
  lm_ggml_backend_buffer_type_t * extra_bufts = lm_ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
  while (extra_bufts && *extra_bufts) {
@@ -7467,7 +7574,7 @@ static bool llm_load_tensors(

  // avoid using a host buffer when using mmap
  auto * buft_dev = lm_ggml_backend_buft_get_device(buft);
- if (ml.use_mmap && buft == lm_ggml_backend_dev_host_buffer_type(buft_dev)) {
+ if (ml.use_mmap && buft_dev && buft == lm_ggml_backend_dev_host_buffer_type(buft_dev)) {
  auto * cpu_dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
  buft = lm_ggml_backend_dev_buffer_type(cpu_dev);
  }
@@ -8502,6 +8609,31 @@ static bool llm_load_tensors(
  layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  }
  } break;
+ case LLM_ARCH_OLMO2:
+ {
+ model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = model.layers[i];
+
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
+ layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+ layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+ }
+ } break;
  case LLM_ARCH_OLMOE:
  {
  model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -9074,6 +9206,10 @@ static bool llm_load_tensors(

  // check if it is possible to use buffer_from_host_ptr with this buffer type
  lm_ggml_backend_dev_t dev = lm_ggml_backend_buft_get_device(buft);
+ if (!dev) {
+ // FIXME: workaround for CPU backend buft having a NULL device
+ dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
+ }
  lm_ggml_backend_dev_props props;
  lm_ggml_backend_dev_get_props(dev, &props);
  bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
@@ -9145,7 +9281,7 @@ static bool llm_load_tensors(

  // print memory requirements per buffer type
  for (auto & buf : model.bufs) {
- LLAMA_LOG_INFO("%s: %10s model buffer size = %8.2f MiB\n", __func__, lm_ggml_backend_buffer_name(buf.get()), lm_ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
+ LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, lm_ggml_backend_buffer_name(buf.get()), lm_ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
  }

  // populate tensors_by_name
@@ -9198,6 +9334,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
  throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
  }

+ llm_load_stats(ml, model);
  llm_load_print_meta(ml, model);

  if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
@@ -10094,7 +10231,7 @@ static struct lm_ggml_tensor * llm_build_rwkv6_time_mix(
  v = lm_ggml_transpose(ctx, v);
  r = lm_ggml_transpose(ctx, r);

- struct lm_ggml_tensor * wkv_output = lm_ggml_rwkv_wkv(ctx, k, v, r, layer->time_mix_first, w, *wkv_state);
+ struct lm_ggml_tensor * wkv_output = lm_ggml_rwkv_wkv6(ctx, k, v, r, layer->time_mix_first, w, *wkv_state);
  cur = lm_ggml_view_1d(ctx, wkv_output, n_embd * n_tokens, 0);
  *wkv_state = lm_ggml_view_1d(ctx, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float));

@@ -14362,6 +14499,130 @@ struct llm_build_context {
  return gf;
  }

+ struct lm_ggml_cgraph * build_olmo2() {
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
+ int32_t n_tokens = this->n_tokens;
+
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ struct lm_ggml_tensor * cur;
+ struct lm_ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+ // inp_pos - contains the positions
+ struct lm_ggml_tensor * inp_pos = build_inp_pos();
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+ for (int il = 0; il < n_layer; ++il) {
+ struct lm_ggml_tensor * inpSA = inpL;
+
+ cur = inpL;
+
+ // self_attention
+ {
+ // compute Q and K and RoPE them
+ struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(Qcur, "Qcur_normed", il);
+
+ Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(Kcur, "Kcur_normed", il);
+
+ Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = lm_ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Qcur, "Qcur_rope", il);
+
+ Kcur = lm_ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Kcur, "Kcur_rope", il);
+
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+ model.layers[il].wo, NULL,
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ }
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.layers[il].attn_post_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_post_norm", il);
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
+ n_tokens = n_outputs;
+ cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = llm_build_ffn(ctx0, lctx, ffn_inp,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_out", il);
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.layers[il].ffn_post_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "ffn_post_norm", -1);
+
+ cur = lm_ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ lm_ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
+
  // based on the build_qwen2moe() function, changes:
  // * removed shared experts
  // * removed bias
@@ -16554,6 +16815,10 @@ static struct lm_ggml_cgraph * llama_build_graph(
  {
  result = llm.build_olmo();
  } break;
+ case LLM_ARCH_OLMO2:
+ {
+ result = llm.build_olmo2();
+ } break;
  case LLM_ARCH_OLMOE:
  {
  result = llm.build_olmoe();
@@ -17189,14 +17454,16 @@ static void llama_output_reorder(struct llama_context * ctx) {
  }
  }

- static void llama_graph_compute(
+ // returns the result of lm_ggml_backend_sched_graph_compute_async execution
+ static enum lm_ggml_status llama_graph_compute(
  llama_context & lctx,
  lm_ggml_cgraph * gf,
  int n_threads,
  lm_ggml_threadpool * threadpool) {
  if (lctx.backend_cpu != nullptr) {
- lm_ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool);
- lm_ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
+ auto * reg = lm_ggml_backend_dev_backend_reg(lm_ggml_backend_get_device(lctx.backend_cpu));
+ auto * set_threadpool_fn = (decltype(lm_ggml_backend_cpu_set_threadpool) *) lm_ggml_backend_reg_get_proc_address(reg, "lm_ggml_backend_cpu_set_threadpool");
+ set_threadpool_fn(lctx.backend_cpu, threadpool);
  }

  // set the number of threads for all the backends
@@ -17204,15 +17471,20 @@ static void llama_graph_compute(
  set_n_threads_fn.second(set_n_threads_fn.first, n_threads);
  }

- auto err = lm_ggml_backend_sched_graph_compute_async(lctx.sched.get(), gf);
- if (err != LM_GGML_STATUS_SUCCESS) {
- LLAMA_LOG_ERROR("%s: lm_ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, err);
+ auto status = lm_ggml_backend_sched_graph_compute_async(lctx.sched.get(), gf);
+ if (status != LM_GGML_STATUS_SUCCESS) {
+ LLAMA_LOG_ERROR("%s: lm_ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status);
  }

  // fprintf(stderr, "splits: %d\n", lm_ggml_backend_sched_get_n_splits(lctx.sched));
+
+ return status;
  }

  // decode a batch of tokens by evaluating the transformer
+ // in case of unsuccessful decoding (error or warning),
+ // the kv_cache state will be returned to its original state
+ // (for non-recurrent models) or cleaned (for recurrent models)
  //
  // - lctx: llama context
  // - batch: batch to evaluate
@@ -17262,6 +17534,7 @@ static int llama_decode_internal(
  lctx.n_queued_tokens += n_tokens_all;

  auto & kv_self = lctx.kv_self;
+ llama_kv_slot_restorer kv_slot_restorer(kv_self);

  const int64_t n_embd = hparams.n_embd;
  const int64_t n_vocab = hparams.n_vocab;
@@ -17346,9 +17619,11 @@ static int llama_decode_internal(
  kv_self.head = 0;
  }

- if (!llama_kv_cache_find_slot(kv_self, ubatch)) {
+ const auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+ if (!slot) {
  return 1;
  }
+ kv_slot_restorer.save(slot);

  if (!kv_self.recurrent) {
  // a heuristic, to avoid attending the full cache if it is not yet utilized
@@ -17395,7 +17670,19 @@ static int llama_decode_internal(

  llama_set_inputs(lctx, ubatch);

- llama_graph_compute(lctx, gf, n_threads, threadpool);
+ const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool);
+ if (compute_status != LM_GGML_STATUS_SUCCESS) {
+ kv_slot_restorer.restore(kv_self);
+ switch (compute_status) {
+ case LM_GGML_STATUS_ABORTED:
+ return 2;
+ case LM_GGML_STATUS_ALLOC_FAILED:
+ return -2;
+ case LM_GGML_STATUS_FAILED:
+ default:
+ return -3;
+ }
+ }

  // update the kv ring buffer
  {
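With these hunks, llama_decode (and llama_encode below) can distinguish an abort from a hard failure instead of silently continuing. A hedged caller-side sketch based on the codes above (0 = success, 1 = no KV-cache slot, 2 = aborted, -2 = allocation failed, -3 = backend failure); the recovery strategy itself is illustrative:

    const int ret = llama_decode(ctx, batch);
    if (ret == 1) {
        // no free KV-cache slot: free cache space or shrink the batch, then retry
    } else if (ret == 2) {
        // aborted via the abort callback; the KV cache was rolled back by the restorer
    } else if (ret < 0) {
        // -2 = allocation failed, -3 = graph compute failed: treat as fatal for this context
    }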
@@ -17632,7 +17919,18 @@ static int llama_encode_internal(

  llama_set_inputs(lctx, ubatch);

- llama_graph_compute(lctx, gf, n_threads, threadpool);
+ const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool);
+ switch (compute_status) {
+ case LM_GGML_STATUS_SUCCESS:
+ break;
+ case LM_GGML_STATUS_ABORTED:
+ return 2;
+ case LM_GGML_STATUS_ALLOC_FAILED:
+ return -2;
+ case LM_GGML_STATUS_FAILED:
+ default:
+ return -3;
+ }

  // extract embeddings
  if (embd) {
@@ -17932,13 +18230,13 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
  static void llama_kv_cache_update_internal(struct llama_context & lctx) {
  bool need_reserve = false;

- // apply K-shift if needed
- if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) {
- if (lctx.model.arch == LLM_ARCH_DEEPSEEK2) { // not supported due to MLA
- LM_GGML_ABORT("Deepseek2 does not support K-shift");
+ if (lctx.kv_self.has_shift) {
+ if (!llama_kv_cache_can_shift(&lctx)) {
+ LM_GGML_ABORT("The current context does not support K-shift");
  }

- {
+ // apply K-shift if needed
+ if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) {
  lm_ggml_backend_sched_reset(lctx.sched.get());

  lm_ggml_cgraph * gf = llama_build_graph_k_shift(lctx);
@@ -18511,6 +18809,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
  llama_model model;
  llm_load_arch(ml, model);
  llm_load_hparams(ml, model);
+ llm_load_stats(ml, model);

  struct quantize_state_internal qs(model, params);

@@ -19081,6 +19380,7 @@ void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
  //
  struct llama_model_params llama_model_default_params() {
  struct llama_model_params result = {
+ /*.devices =*/ nullptr,
  /*.n_gpu_layers =*/ 0,
  /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
  /*.main_gpu =*/ 0,
@@ -19198,7 +19498,11 @@ void llama_backend_init(void) {

  void llama_numa_init(enum lm_ggml_numa_strategy numa) {
  if (numa != LM_GGML_NUMA_STRATEGY_DISABLED) {
- lm_ggml_numa_init(numa);
+ auto * dev = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU);
+ LM_GGML_ASSERT(dev && "CPU backend is not loaded");
+ auto * reg = lm_ggml_backend_dev_backend_reg(dev);
+ auto * numa_init_fn = (decltype(lm_ggml_numa_init) *) lm_ggml_backend_reg_get_proc_address(reg, "lm_ggml_backend_cpu_numa_init");
+ numa_init_fn(numa);
  }
  }

@@ -19289,19 +19593,24 @@ struct llama_model * llama_load_model_from_file(
  }

  // create list of devices to use with this model
- // currently, we use all available devices
- // TODO: rework API to give user more control over device selection
- for (size_t i = 0; i < lm_ggml_backend_dev_count(); ++i) {
- lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_get(i);
- switch (lm_ggml_backend_dev_type(dev)) {
- case LM_GGML_BACKEND_DEVICE_TYPE_CPU:
- case LM_GGML_BACKEND_DEVICE_TYPE_ACCEL:
- // skip CPU backends since they are handled separately
- break;
+ if (params.devices) {
+ for (lm_ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
+ model->devices.push_back(*dev);
+ }
+ } else {
+ // use all available devices
+ for (size_t i = 0; i < lm_ggml_backend_dev_count(); ++i) {
+ lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_get(i);
+ switch (lm_ggml_backend_dev_type(dev)) {
+ case LM_GGML_BACKEND_DEVICE_TYPE_CPU:
+ case LM_GGML_BACKEND_DEVICE_TYPE_ACCEL:
+ // skip CPU backends since they are handled separately
+ break;

- case LM_GGML_BACKEND_DEVICE_TYPE_GPU:
- model->devices.push_back(dev);
- break;
+ case LM_GGML_BACKEND_DEVICE_TYPE_GPU:
+ model->devices.push_back(dev);
+ break;
+ }
  }
  }

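The new devices field in llama_model_params (added to the defaults in the hunk before this one) lets a caller pin a model to an explicit, null-terminated device list instead of using every registered device. A hedged sketch; the GPU lookup is an assumption and may return null when no GPU backend is compiled in:

    #include <vector>

    std::vector<lm_ggml_backend_dev_t> devs;
    if (auto * gpu = lm_ggml_backend_dev_by_type(LM_GGML_BACKEND_DEVICE_TYPE_GPU)) {
        devs.push_back(gpu);
    }
    devs.push_back(nullptr); // the list must be null-terminated

    llama_model_params mparams = llama_model_default_params();
    mparams.devices = devs.data();
    llama_model * model = llama_load_model_from_file("model.gguf", mparams);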
@@ -19472,9 +19781,6 @@ struct llama_context * llama_new_context_with_model(
  __func__, n_ctx_per_seq, hparams.n_ctx_train);
  }

- ctx->abort_callback = params.abort_callback;
- ctx->abort_callback_data = params.abort_callback_data;
-
  ctx->logits_all = params.logits_all;

  // build worst-case graph for encoder if a model contains encoder
@@ -19523,7 +19829,7 @@ struct llama_context * llama_new_context_with_model(
  }

  // add CPU backend
- ctx->backend_cpu = lm_ggml_backend_cpu_init();
+ ctx->backend_cpu = lm_ggml_backend_init_by_type(LM_GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
  if (ctx->backend_cpu == nullptr) {
  LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
  llama_free(ctx);
@@ -19543,6 +19849,8 @@ struct llama_context * llama_new_context_with_model(
  }
  }

+ llama_set_abort_callback(ctx, params.abort_callback, params.abort_callback_data);
+
  if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv)) {
  LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
  llama_free(ctx);
@@ -19588,7 +19896,8 @@ struct llama_context * llama_new_context_with_model(
  std::vector<lm_ggml_backend_t> backend_ptrs;
  for (auto & backend : ctx->backends) {
  auto * buft = lm_ggml_backend_get_default_buffer_type(backend.get());
- if (lm_ggml_backend_is_cpu(backend.get()) && !model->devices.empty()) {
+ auto backend_type = lm_ggml_backend_dev_type(lm_ggml_backend_get_device(backend.get()));
+ if (backend_type == LM_GGML_BACKEND_DEVICE_TYPE_CPU && !model->devices.empty()) {
  // use the host buffer of the first device CPU for faster transfer of the intermediate state
  auto * dev = model->devices[0];
  auto * host_buft = lm_ggml_backend_dev_host_buffer_type(dev);
@@ -19616,7 +19925,8 @@ struct llama_context * llama_new_context_with_model(
  // pipeline parallelism requires support for async compute and events in all devices
  if (pipeline_parallel) {
  for (auto & backend : ctx->backends) {
- if (lm_ggml_backend_is_cpu(backend.get())) {
+ auto dev_type = lm_ggml_backend_dev_type(lm_ggml_backend_get_device(backend.get()));
+ if (dev_type == LM_GGML_BACKEND_DEVICE_TYPE_CPU) {
  // ignore CPU backend
  continue;
  }
@@ -19790,6 +20100,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
  case LLM_ARCH_QWEN:
  case LLM_ARCH_QWEN2:
  case LLM_ARCH_QWEN2MOE:
+ case LLM_ARCH_OLMO2:
  case LLM_ARCH_OLMOE:
  case LLM_ARCH_PHI2:
  case LLM_ARCH_PHI3:
@@ -19863,19 +20174,11 @@ int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t bu
  }

  uint64_t llama_model_size(const struct llama_model * model) {
- uint64_t size = 0;
- for (const auto & it : model->tensors_by_name) {
- size += lm_ggml_nbytes(it.second);
- }
- return size;
+ return model->n_bytes;
  }

  uint64_t llama_model_n_params(const struct llama_model * model) {
- uint64_t nparams = 0;
- for (const auto & it : model->tensors_by_name) {
- nparams += lm_ggml_nelements(it.second);
- }
- return nparams;
+ return model->n_elements;
  }

  struct lm_ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
@@ -20189,6 +20492,10 @@ void llama_kv_cache_update(struct llama_context * ctx) {
  llama_kv_cache_update_internal(*ctx);
  }

+ bool llama_kv_cache_can_shift(struct llama_context * ctx) {
+ return !ctx->kv_self.recurrent && ctx->model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA
+ }
+
  // deprecated
  size_t llama_get_state_size(struct llama_context * ctx) {
  return llama_state_get_size(ctx);
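A usage note for the new capability check above: context shifting ("keep n_keep tokens, discard n_discard, slide the rest back") should now be guarded by llama_kv_cache_can_shift, since recurrent models and DeepSeek2 cannot K-shift. A hedged sketch; n_keep, n_past and n_discard are illustrative variables, not part of this diff:

    if (llama_kv_cache_can_shift(ctx)) {
        // drop the oldest n_discard tokens after the kept prefix, then slide the rest back
        llama_kv_cache_seq_rm (ctx, 0, n_keep, n_keep + n_discard);
        llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_past, -n_discard);
        n_past -= n_discard;
    } else {
        // fall back to clearing the cache and re-evaluating the prompt
        llama_kv_cache_clear(ctx);
    }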
@@ -21173,6 +21480,14 @@ int32_t llama_n_threads_batch(struct llama_context * ctx) {
  void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
  ctx->abort_callback = abort_callback;
  ctx->abort_callback_data = abort_callback_data;
+
+ for (auto & backend : ctx->backends) {
+ auto * reg = lm_ggml_backend_dev_backend_reg(lm_ggml_backend_get_device(backend.get()));
+ auto * set_abort_callback_fn = (lm_ggml_backend_set_abort_callback_t) lm_ggml_backend_reg_get_proc_address(reg, "lm_ggml_backend_set_abort_callback");
+ if (set_abort_callback_fn) {
+ set_abort_callback_fn(backend.get(), ctx->abort_callback, ctx->abort_callback_data);
+ }
+ }
  }

  void llama_set_embeddings(struct llama_context * ctx, bool embeddings) {
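Since llama_set_abort_callback now forwards the callback to every backend that exposes "lm_ggml_backend_set_abort_callback" (and is called during llama_new_context_with_model), a single flag can interrupt generation on any device. A hedged sketch; the atomic flag and lambda are illustrative:

    #include <atomic>

    static std::atomic<bool> g_cancel{false};

    // a capture-less lambda decays to the required bool (*)(void *) signature
    llama_set_abort_callback(ctx,
        [](void * /*data*/) -> bool { return g_cancel.load(); },
        /*abort_callback_data=*/nullptr);

    // from another thread: g_cancel = true;  a running llama_decode() then returns 2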
@@ -21810,8 +22125,11 @@ static int32_t llama_chat_apply_template_internal(
  // IBM Granite template
  for (const auto & message : chat) {
  std::string role(message->role);
- ss << "<|start_of_role|>" << role << "<|end_of_role|>"
- << message->content << "<|end_of_text|>\n";
+ ss << "<|start_of_role|>" << role << "<|end_of_role|>";
+ if (role == "assistant_tool_call") {
+ ss << "<|tool_call|>";
+ }
+ ss << message->content << "<|end_of_text|>\n";
  }
  if (add_ass) {
  ss << "<|start_of_role|>assistant<|end_of_role|>\n";
@@ -21911,33 +22229,23 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int
  }

  const char * llama_print_system_info(void) {
- lm_ggml_cpu_init(); // some ARM features are detected at runtime
-
  static std::string s;

- s = "";
- s += "AVX = " + std::to_string(lm_ggml_cpu_has_avx()) + " | ";
- s += "AVX_VNNI = " + std::to_string(lm_ggml_cpu_has_avx_vnni()) + " | ";
- s += "AVX2 = " + std::to_string(lm_ggml_cpu_has_avx2()) + " | ";
- s += "AVX512 = " + std::to_string(lm_ggml_cpu_has_avx512()) + " | ";
- s += "AVX512_VBMI = " + std::to_string(lm_ggml_cpu_has_avx512_vbmi()) + " | ";
- s += "AVX512_VNNI = " + std::to_string(lm_ggml_cpu_has_avx512_vnni()) + " | ";
- s += "AVX512_BF16 = " + std::to_string(lm_ggml_cpu_has_avx512_bf16()) + " | ";
- s += "AMX_INT8 = " + std::to_string(lm_ggml_cpu_has_amx_int8()) + " | ";
- s += "FMA = " + std::to_string(lm_ggml_cpu_has_fma()) + " | ";
- s += "NEON = " + std::to_string(lm_ggml_cpu_has_neon()) + " | ";
- s += "SVE = " + std::to_string(lm_ggml_cpu_has_sve()) + " | ";
- s += "ARM_FMA = " + std::to_string(lm_ggml_cpu_has_arm_fma()) + " | ";
- s += "F16C = " + std::to_string(lm_ggml_cpu_has_f16c()) + " | ";
- s += "FP16_VA = " + std::to_string(lm_ggml_cpu_has_fp16_va()) + " | ";
- s += "RISCV_VECT = " + std::to_string(lm_ggml_cpu_has_riscv_v()) + " | ";
- s += "WASM_SIMD = " + std::to_string(lm_ggml_cpu_has_wasm_simd()) + " | ";
- s += "BLAS = " + std::to_string(lm_ggml_cpu_has_blas()) + " | ";
- s += "SSE3 = " + std::to_string(lm_ggml_cpu_has_sse3()) + " | ";
- s += "SSSE3 = " + std::to_string(lm_ggml_cpu_has_ssse3()) + " | ";
- s += "VSX = " + std::to_string(lm_ggml_cpu_has_vsx()) + " | ";
- s += "MATMUL_INT8 = " + std::to_string(lm_ggml_cpu_has_matmul_int8()) + " | ";
- s += "LLAMAFILE = " + std::to_string(lm_ggml_cpu_has_llamafile()) + " | ";
+ for (size_t i = 0; i < lm_ggml_backend_reg_count(); i++) {
+ auto * reg = lm_ggml_backend_reg_get(i);
+ auto * get_features_fn = (lm_ggml_backend_get_features_t) lm_ggml_backend_reg_get_proc_address(reg, "lm_ggml_backend_get_features");
+ if (get_features_fn) {
+ lm_ggml_backend_feature * features = get_features_fn(reg);
+ s += lm_ggml_backend_reg_name(reg);
+ s += " : ";
+ for (; features->name; features++) {
+ s += features->name;
+ s += " = ";
+ s += features->value;
+ s += " | ";
+ }
+ }
+ }

  return s.c_str();
  }
@@ -21978,28 +22286,6 @@ void llama_perf_context_reset(struct llama_context * ctx) {
  ctx->t_p_eval_us = ctx->n_p_eval = 0;
  }

- void llama_perf_dump_yaml(FILE * stream, const llama_context * ctx) {
- fprintf(stream, "\n");
- fprintf(stream, "###########\n");
- fprintf(stream, "# Timings #\n");
- fprintf(stream, "###########\n");
- fprintf(stream, "\n");
-
- fprintf(stream, "mst_eval: %.2f # ms / token during generation\n",
- 1.0e-3 * ctx->t_eval_us / ctx->n_eval);
- fprintf(stream, "mst_p_eval: %.2f # ms / token during prompt processing\n",
- 1.0e-3 * ctx->t_p_eval_us / ctx->n_p_eval);
- fprintf(stream, "n_eval: %d # number of tokens generated (excluding the first one)\n", ctx->n_eval);
- fprintf(stream, "n_p_eval: %d # number of tokens processed in batches at the beginning\n", ctx->n_p_eval);
- fprintf(stream, "t_eval_us: %" PRId64 " # total microseconds spent generating tokens\n", ctx->t_eval_us);
- fprintf(stream, "t_load_us: %" PRId64 " # total microseconds spent loading the model\n", ctx->t_load_us);
- fprintf(stream, "t_p_eval_us: %" PRId64 " # total microseconds spent prompt processing\n", ctx->t_p_eval_us);
- fprintf(stream, "ts_eval: %.2f # tokens / second during generation\n",
- 1.0e6 * ctx->n_eval / ctx->t_eval_us);
- fprintf(stream, "ts_p_eval: %.2f # tokens / second during prompt processing\n",
- 1.0e6 * ctx->n_p_eval / ctx->t_p_eval_us);
- }
-
  // For internal test use
  const std::vector<std::pair<std::string, struct lm_ggml_tensor *>> & llama_internal_get_tensor_map(
  struct llama_context * ctx