npm - cui-llama.rn - Versions diffs - 1.6.0 → 1.6.1 - Mend

cui-llama.rn 1.6.0 → 1.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (195)

package/cpp/llama-model.h CHANGED

@@ -36,14 +36,17 @@ enum llm_type {
     LLM_TYPE_335M,
     LLM_TYPE_410M,
     LLM_TYPE_450M,
+    LLM_TYPE_475M,
     LLM_TYPE_770M,
     LLM_TYPE_780M,
     LLM_TYPE_0_5B,
+    LLM_TYPE_0_6B,
     LLM_TYPE_1B,
     LLM_TYPE_1_3B,
     LLM_TYPE_1_4B,
     LLM_TYPE_1_5B,
     LLM_TYPE_1_6B,
+    LLM_TYPE_1_7B,
     LLM_TYPE_1_8B,
     LLM_TYPE_2B,
     LLM_TYPE_2_8B,
@@ -62,6 +65,7 @@ enum llm_type {
     LLM_TYPE_15B,
     LLM_TYPE_16B,
     LLM_TYPE_20B,
+    LLM_TYPE_27B,
     LLM_TYPE_30B,
     LLM_TYPE_32B,
     LLM_TYPE_34B,
@@ -70,6 +74,7 @@ enum llm_type {
     LLM_TYPE_65B,
     LLM_TYPE_70B,
     LLM_TYPE_236B,
+    LLM_TYPE_290B,
     LLM_TYPE_314B,
     LLM_TYPE_671B,
     LLM_TYPE_SMALL,
@@ -84,10 +89,10 @@ enum llm_type {
     LLM_TYPE_16x3_8B,
     LLM_TYPE_10B_128x3_66B,
     LLM_TYPE_57B_A14B,
-    LLM_TYPE_27B,
-    LLM_TYPE_290B,
     LLM_TYPE_17B_16E, // llama4 Scout
     LLM_TYPE_17B_128E, // llama4 Maverick
+    LLM_TYPE_30B_A3B,
+    LLM_TYPE_235B_A22B,
 };
 struct llama_layer_posnet {
@@ -171,6 +176,8 @@ struct llama_layer {
     struct lm_ggml_tensor * wq_b      = nullptr;
     struct lm_ggml_tensor * wkv_a_mqa = nullptr;
     struct lm_ggml_tensor * wkv_b     = nullptr;
+    struct lm_ggml_tensor * wk_b      = nullptr;
+    struct lm_ggml_tensor * wv_b      = nullptr;
     struct lm_ggml_tensor * wq_cross  = nullptr;
     struct lm_ggml_tensor * wk_cross  = nullptr;
     struct lm_ggml_tensor * wv_cross  = nullptr;
@@ -388,8 +395,11 @@ struct llama_model {
     const struct lm_ggml_tensor * get_tensor(const char * name) const;
+    lm_ggml_tensor * get_rope_factors(uint32_t n_ctx_per_seq, int il) const;
+    // note: can mutate `cparams`
     // TODO: move this to new llm_arch_model_i interface
-    llama_memory_i * create_memory() const; // TODO: params
+    llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const;
     // TODO: move this to new llm_arch_model_i interface
     llm_graph_result_ptr build_graph(

package/cpp/llama-sampling.cpp CHANGED

@@ -232,7 +232,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
     // }
     if (k <= 0) {
-        k = cur_p->size;
+        return;
     }
     k = std::min(k, (int) cur_p->size);
@@ -298,6 +298,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
         }
         cur_p->sorted = true;
     }
     cur_p->size = k;
 }

package/cpp/llama-vocab.cpp CHANGED

@@ -1506,7 +1506,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     tokenizer_pre == "llama3"   ||
                     tokenizer_pre == "llama-v3" ||
                     tokenizer_pre == "llama-bpe"||
-                    tokenizer_pre == "falcon3") {
+                    tokenizer_pre == "falcon3"  ||
+                    tokenizer_pre == "pixtral") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
                 ignore_merges = true;
                 add_bos = true;
@@ -1572,6 +1573,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_PORO;
                 clean_spaces = false;
             } else if (
+                tokenizer_pre == "glm4" ||
                 tokenizer_pre == "chatglm-bpe") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
                 special_bos_id = LLAMA_TOKEN_NULL;
@@ -1840,6 +1842,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 if (false
                         || t.first == "<|fim_prefix|>"  // Qwen
                         || t.first == "<fim-prefix>"
+                        || t.first == "<fim_prefix>"    // Granite
                         || t.first == "<｜fim▁begin｜>" // DeepSeek
                         || t.first == "<PRE>"
                         || t.first == "▁<PRE>"          // CodeLlama
@@ -1858,6 +1861,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 if (false
                         || t.first == "<|fim_suffix|>" // Qwen
                         || t.first == "<fim-suffix>"
+                        || t.first == "<fim_suffix>"   // Granite
                         || t.first == "<｜fim▁hole｜>" // DeepSeek
                         || t.first == "<SUF>"
                         || t.first == "▁<SUF>"         // CodeLlama
@@ -1876,6 +1880,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 if (false
                         || t.first == "<|fim_middle|>" // Qwen
                         || t.first == "<fim-middle>"
+                        || t.first == "<fim_middle>"   // Granite
                         || t.first == "<｜fim▁end｜>"  // DeepSeek
                         || t.first == "<MID>"
                         || t.first == "▁<MID>"         // CodeLlama
@@ -1894,6 +1899,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 if (false
                         || t.first == "<|fim_pad|>" // Qwen
                         || t.first == "<fim-pad>"
+                        || t.first == "<fim_pad>"   // Granite
                         || t.first == "<PAD>"
                         ) {
                     special_fim_pad_id = t.second;
@@ -1912,6 +1918,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                         || t.first == "<|repo_name|>"
                         || t.first == "<fim-repo>"
                         || t.first == "<REPO>"
+                        || t.first == "<reponame>"    // Granite
                         ) {
                     special_fim_rep_id = t.second;
                     if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {

package/cpp/llama.h CHANGED

@@ -112,6 +112,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_TRILLION       = 31,
         LLAMA_VOCAB_PRE_TYPE_BAILINGMOE     = 32,
         LLAMA_VOCAB_PRE_TYPE_LLAMA4         = 33,
+        LLAMA_VOCAB_PRE_TYPE_PIXTRAL        = 34,
     };
     enum llama_rope_type {
@@ -368,17 +369,18 @@ extern "C" {
     // model quantization parameters
     typedef struct llama_model_quantize_params {
-        int32_t nthread;                     // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
-        enum llama_ftype ftype;              // quantize to this llama_ftype
-        enum lm_ggml_type output_tensor_type;   // output tensor type
-        enum lm_ggml_type token_embedding_type; // token embeddings tensor type
-        bool allow_requantize;               // allow quantizing non-f32/f16 tensors
-        bool quantize_output_tensor;         // quantize output.weight
-        bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
-        bool pure;                           // quantize all tensors to the default type
-        bool keep_split;                     // quantize to the same number of shards
-        void * imatrix;                      // pointer to importance matrix data
-        void * kv_overrides;                 // pointer to vector containing overrides
+        int32_t nthread;                      // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+        enum llama_ftype ftype;               // quantize to this llama_ftype
+        enum lm_ggml_type output_tensor_type;    // output tensor type
+        enum lm_ggml_type token_embedding_type;  // token embeddings tensor type
+        bool allow_requantize;                // allow quantizing non-f32/f16 tensors
+        bool quantize_output_tensor;          // quantize output.weight
+        bool only_copy;                       // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+        bool pure;                            // quantize all tensors to the default type
+        bool keep_split;                      // quantize to the same number of shards
+        void * imatrix;                       // pointer to importance matrix data
+        void * kv_overrides;                  // pointer to vector containing overrides
+        void * tensor_types;                  // pointer to vector containing tensor types
     } llama_model_quantize_params;
     typedef struct llama_logit_bias {
@@ -1231,6 +1233,7 @@ extern "C" {
         "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)");
     /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+    /// Setting k <= 0 makes this a noop
     LLAMA_API struct llama_sampler * llama_sampler_init_top_k      (int32_t k);
     /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751

package/cpp/rn-llama.cpp CHANGED

@@ -165,6 +165,7 @@ void llama_rn_context::rewind() {
     generated_text.reserve(params.n_ctx);
     generated_token_probs.clear();
     truncated = false;
+    context_full = false;
     stopped_eos = false;
     stopped_word = false;
     stopped_limit = false;
@@ -197,6 +198,9 @@ bool llama_rn_context::loadModel(common_params &params_)
     templates = common_chat_templates_init(model, params.chat_template);
     n_ctx = llama_n_ctx(ctx);
+    // Initialize context shift flag
+    LOG_INFO("ctx_shift: %s", params.ctx_shift ? "enabled" : "disabled");
     // We can uncomment for debugging or after this fix: https://github.com/ggerganov/llama.cpp/pull/11101
     // LOG_INFO("%s\n", common_params_get_system_info(params).c_str());
@@ -271,11 +275,11 @@ void llama_rn_context::truncatePrompt(std::vector<llama_token> &prompt_tokens) {
     new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_block_size, prompt_tokens.end());
-    LOG_VERBOSE("input truncated, n_ctx: %d, n_keep: %d, n_left: %d, new_tokens: %s, num_prompt_tokens: %d",
+    LOG_INFO("input truncated, n_ctx: %d, n_keep: %d, n_left: %d, old_size: %d, new_size: %d",
         n_ctx,
         params.n_keep,
         n_left,
-        tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend()).c_str(),
+        prompt_tokens.size(),
         new_tokens.size()
     );
@@ -304,18 +308,14 @@ void llama_rn_context::loadPrompt() {
     // if input prompt is too big, truncate like normal
     if (num_prompt_tokens >= (size_t) n_ctx)
     {
+        if (!params.ctx_shift) {
+            context_full = true;
+            return;
+        }
         truncatePrompt(prompt_tokens);
         num_prompt_tokens = prompt_tokens.size();
         LM_GGML_ASSERT(num_prompt_tokens < (size_t) n_ctx);
     }
-    // do context shifitng
-    if(!params.embedding){
-        purge_missing_tokens(ctx, embd, prompt_tokens, params.n_predict, params.n_ctx);
-    }
     // push the prompt into the sampling context (do not apply grammar)
     for (auto & token : prompt_tokens)
     {
@@ -358,6 +358,14 @@ completion_token_output llama_rn_context::nextToken()
     if (embd.size() >= (size_t)params.n_ctx)
     {
+        if (!params.ctx_shift) {
+            // If context shifting is disabled, stop generation
+            LOG_WARNING("context full, n_ctx: %d, tokens: %d", params.n_ctx, embd.size());
+            has_next_token = false;
+            context_full = true;
+            return result;
+        }
         // Shift context
         const int n_left    = n_past - params.n_keep - 1;
@@ -373,12 +381,9 @@ completion_token_output llama_rn_context::nextToken()
         embd.resize(embd.size() - n_discard);
         n_past -= n_discard;
+        truncated = true;
-        LOG_VERBOSE("input truncated, n_ctx: %d, n_keep: %d, n_left: %d, new_tokens: %s",
-            params.n_ctx,
-            params.n_keep,
-            n_left
-        );
+        LOG_VERBOSE("context shifted, new n_past: %d, new size: %d", n_past, embd.size());
     }
     bool tg = true;
@@ -712,162 +717,5 @@ void llama_rn_context::removeLoraAdapters() {
 std::vector<common_adapter_lora_info> llama_rn_context::getLoadedLoraAdapters() {
     return this->lora;
 }
-std::vector<int> llama_rn_context::longest_common_subseq(const std::vector<int> x, const std::vector<int> y){
-     int m = x.size(), n = y.size();
-     //int LCSuff[m+1][n+1];
-     std::vector<std::vector<int>> LCSuff(m+1, std::vector<int>(n+1));
-     for (int j = 0; j <= n; j++)
-         LCSuff[0][j] = 0;
-     for (int i = 0; i <= m; i++)
-         LCSuff[i][0] = 0;
-     for (int i = 1; i <= m; i++)
-     {
-         for (int j = 1; j <= n; j++)
-         {
-             if (x[i - 1] == y[j - 1])
-                 LCSuff[i][j] = LCSuff[i - 1][j - 1] + 1;
-             else
-                 LCSuff[i][j] = 0;
-         }
-     }
-     std::vector<int> longest;
-     for (int i = 1; i <= m; i++)
-     {
-         for (int j = 1; j <= n; j++)
-         {
-             if (LCSuff[i][j] > longest.size())
-             {
-                 auto off1 = ((i - LCSuff[i][j] + 1) - 1);
-                 auto off2 = off1 + LCSuff[i][j];
-                 longest.clear();
-                //  std::vector<int>().swap(longest);
-                 longest = std::vector<int>(x.begin() + off1, x.begin() + off2);
-                // x.substr((i - LCSuff[i][j] + 1) - 1, LCSuff[i][j]);
-             }
-         }
-     }
-     return longest;
- }
-bool llama_rn_context::arr_start_with(const std::vector<int> targetArray, const std::vector<int> searchSeq)
- {
-     int ss = searchSeq.size();
-     if(targetArray.size()<ss)
-     {
-         return false;
-     }
-     for(int i=0;i<ss;++i)
-     {
-         if(targetArray[i]!=searchSeq[i])
-         {
-             return false;
-         }
-     }
-     return true;
- }
-int llama_rn_context::arr_find_index_of(const std::vector<int> targetArray, const std::vector<int> searchSeq)
- {
-     int ss = searchSeq.size();
-     int tas = targetArray.size();
-     if(tas<ss)
-     {
-         return -1;
-     }
-     for(int i=0;i<tas;++i)
-     {
-         int srch = 0;
-         bool fail = false;
-         for(int srch=0;srch<ss;++srch)
-         {
-             if ((i + srch) >= tas || targetArray[i + srch] != searchSeq[srch])
-             {
-                 fail = true;
-                 break;
-             }
-         }
-         if(!fail)
-         {
-             return i;
-         }
-     }
-     return -1;
- }
-void llama_rn_context::purge_missing_tokens(llama_context * ctx, std::vector<int> &current_context_tokens, std::vector<int> &new_context_tokens, const int genamt, const int nctx)
-{
-    //scan from start old and new ctx, until first mismatch found, save as p0
-    //check remaining old and new ctx for longest common subseq, which needs to be at 256 tokens
-    //test: longest common subseq (LCQ) MUST start within 0 tokens from end of memory, otherwise purge fails
-    //if passed, save beginning of LCQ from old ctx as p1
-    //remove all tokens from old ctx between p0 and p1, updating both arrays and kv, then continue as normal
-    const int short_fall_threshold = 200 + (nctx/30); //dont trigger shifting if the distance between trimstart and currhead < this
-    const int stack_allowance = 60 + (nctx/50); //in case the end text is slightly modified, be forgiving
-    int trimstart = 0;
-    int new_tokens_len = new_context_tokens.size();
-    bool purge_needed = true;
-    for (int i = 0; i < current_context_tokens.size(); ++i)
-    {
-        if (current_context_tokens[i] == new_context_tokens[i])
-        {
-            trimstart += 1;
-        }
-        else
-        {
-            break;
-        }
-        if ((i + 2) >= new_tokens_len)
-        {
-            purge_needed = false;
-            break; //no surgery required
-        }
-    }
-    if(!purge_needed || new_tokens_len < 6 || current_context_tokens.size() < 6 || new_tokens_len - trimstart < short_fall_threshold)
-    {
-        LOG_INFO("Fall Threshold: %d out of %d\n", new_tokens_len - trimstart, short_fall_threshold);
-        return; //no purge is needed
-    }
-    //at least this many tokens need to match, otherwise don't bother trimming
-    const int lc_tok_threshold = std::max(std::min((new_tokens_len - trimstart) - (genamt+stack_allowance), (int)(nctx*0.45)), short_fall_threshold - stack_allowance);
-    auto curr_ctx_without_memory = std::vector<int>(current_context_tokens.begin() + trimstart, current_context_tokens.end());
-    auto new_ctx_without_memory = std::vector<int>(new_context_tokens.begin() + trimstart, new_context_tokens.end());
-    auto shared = longest_common_subseq(curr_ctx_without_memory, new_ctx_without_memory);
-    if (shared.size() > lc_tok_threshold && arr_start_with(new_ctx_without_memory, shared)) // enough tokens in common
-    {
-        int found = arr_find_index_of(current_context_tokens,shared);
-        if(found>=0 && found > trimstart)
-        {
-            //extract the unwanted tokens out from context and KV
-            int diff = found - trimstart;
-            llama_kv_self_seq_rm(ctx, 0, trimstart, trimstart + diff);
-            llama_kv_self_seq_add(ctx, 0, trimstart + diff, -1, -diff);
-            for (size_t i = trimstart + diff; i < current_context_tokens.size() - 1; i++)
-            {
-                current_context_tokens[i - diff] = current_context_tokens[i];
-            }
-            LOG_INFO("\n[Context Shifting: Erased %d tokens at position %d]", diff, trimstart + 1);
-            current_context_tokens.resize(current_context_tokens.size() - diff);
-        }
-    }
-}
 }

package/cpp/rn-llama.h CHANGED

@@ -16,7 +16,6 @@
 namespace rnllama {
 std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token);
 std::string tokens_to_str(llama_context *ctx, const std::vector<llama_token>::const_iterator begin, const std::vector<llama_token>::const_iterator end);
@@ -69,6 +68,7 @@ struct llama_rn_context {
     int n_ctx;
+    bool context_full = false;
     bool truncated = false;
     bool stopped_eos = false;
     bool stopped_word = false;
@@ -107,10 +107,6 @@ struct llama_rn_context {
     int applyLoraAdapters(std::vector<common_adapter_lora_info> lora);
     void removeLoraAdapters();
     std::vector<common_adapter_lora_info> getLoadedLoraAdapters();
-    std::vector<int> longest_common_subseq(const std::vector<int> x, const std::vector<int> y);
-    bool arr_start_with(const std::vector<int> targetArray, const std::vector<int> searchSeq);
-    int arr_find_index_of(const std::vector<int> targetArray, const std::vector<int> searchSeq);
-    void purge_missing_tokens(llama_context * ctx, std::vector<int> &current_context_tokens, std::vector<int> &new_context_tokens, const int genamt, const int nctx);
 };\
 // Logging macros

package/ios/CMakeLists.txt CHANGED

@@ -40,15 +40,18 @@ add_library(rnllama SHARED
     ${SOURCE_DIR}/ggml-alloc.c
     ${SOURCE_DIR}/ggml-backend.cpp
     ${SOURCE_DIR}/ggml-backend-reg.cpp
-    ${SOURCE_DIR}/ggml-cpu.c
-    ${SOURCE_DIR}/ggml-cpu.cpp
-    ${SOURCE_DIR}/ops.cpp
-    ${SOURCE_DIR}/unary-ops.cpp
-    ${SOURCE_DIR}/binary-ops.cpp
-    ${SOURCE_DIR}/vec.cpp
-    ${SOURCE_DIR}/ggml-cpu-aarch64.cpp
-    ${SOURCE_DIR}/ggml-cpu-quants.c
-    ${SOURCE_DIR}/ggml-cpu-traits.cpp
+    ${SOURCE_DIR}/ggml-cpu/amx/amx.cpp
+    ${SOURCE_DIR}/ggml-cpu/amx/mmq.cpp
+    ${SOURCE_DIR}/ggml-cpu/ggml-cpu.c
+    ${SOURCE_DIR}/ggml-cpu/ggml-cpu.cpp
+    ${SOURCE_DIR}/ggml-cpu/ggml-cpu-aarch64.cpp
+    ${SOURCE_DIR}/ggml-cpu/ggml-cpu-quants.c
+    ${SOURCE_DIR}/ggml-cpu/ggml-cpu-traits.cpp
+    ${SOURCE_DIR}/ggml-cpu/unary-ops.cpp
+    ${SOURCE_DIR}/ggml-cpu/binary-ops.cpp
+    ${SOURCE_DIR}/ggml-cpu/sgemm.cpp
+    ${SOURCE_DIR}/ggml-cpu/vec.cpp
+    ${SOURCE_DIR}/ggml-cpu/ops.cpp
     ${SOURCE_DIR}/ggml-metal.m
     ${SOURCE_DIR}/ggml-opt.cpp
     ${SOURCE_DIR}/ggml-threading.cpp
@@ -78,7 +81,6 @@ add_library(rnllama SHARED
     ${SOURCE_DIR}/sampling.cpp
     ${SOURCE_DIR}/unicode-data.cpp
     ${SOURCE_DIR}/unicode.cpp
-    ${SOURCE_DIR}/sgemm.cpp
     ${SOURCE_DIR}/common.cpp
     ${SOURCE_DIR}/chat.cpp
     ${SOURCE_DIR}/json-schema-to-grammar.cpp
@@ -92,6 +94,7 @@ add_library(rnllama SHARED
 target_include_directories(rnllama
     PUBLIC
         $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../cpp>
+        $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../cpp/ggml-cpu>
         $<INSTALL_INTERFACE:include>
 )

package/ios/RNLlama.h CHANGED

@@ -1,6 +1,12 @@
 #import <React/RCTEventEmitter.h>
 #import <React/RCTBridgeModule.h>
+#if RNLLAMA_BUILD_FROM_SOURCE
+#import "json.hpp"
+#else
+#import <rnllama/json.hpp>
+#endif
 // TODO: Use RNLlamaSpec (Need to refactor NSDictionary usage)
 @interface RNLlama : RCTEventEmitter <RCTBridgeModule>

package/ios/RNLlama.mm CHANGED

@@ -108,8 +108,13 @@ RCT_EXPORT_METHOD(getFormattedChat:(double)contextId
         } else {
             resolve([context getFormattedChat:messages withChatTemplate:chatTemplate]);
         }
+    } catch (const nlohmann::json_abi_v3_11_3::detail::parse_error& e) {
+        NSString *errorMessage = [NSString stringWithUTF8String:e.what()];
+        reject(@"llama_error", [NSString stringWithFormat:@"JSON parse error in getFormattedChat: %@", errorMessage], nil);
     } catch (const std::exception& e) { // catch cpp exceptions
         reject(@"llama_error", [NSString stringWithUTF8String:e.what()], nil);
+    } catch (...) {
+        reject(@"llama_error", @"Unknown error in getFormattedChat", nil);
     }
 }

package/ios/RNLlamaContext.mm CHANGED

@@ -82,7 +82,7 @@
     BOOL isAsset = [params[@"is_model_asset"] boolValue];
     NSString *path = modelPath;
     if (isAsset) path = [[NSBundle mainBundle] pathForResource:modelPath ofType:nil];
-    defaultParams.model = {[path UTF8String]};
+    defaultParams.model.path = [path UTF8String];
     NSString *chatTemplate = params[@"chat_template"];
     if (chatTemplate) {
@@ -106,37 +106,27 @@
     NSString *reasonNoMetal = @"";
     defaultParams.n_gpu_layers = 0;
 #ifdef LM_GGML_USE_METAL
-    // Check ggml-metal availability
-    NSError * error = nil;
     id<MTLDevice> device = MTLCreateSystemDefaultDevice();
-    id<MTLLibrary> library = [device
-        newLibraryWithSource:@"#include <metal_stdlib>\n"
-                                "using namespace metal;"
-                                "typedef matrix<bfloat, 4, 4> bfloat4x4;"
-                                "kernel void test() { simd_sum(0); }"
-        options:nil
-        error:&error
-    ];
-    if (error) {
-        reasonNoMetal = [error localizedDescription];
+    // Check ggml-metal availability
+    BOOL supportsGgmlMetal = [device supportsFamily:MTLGPUFamilyApple7];
+    if (@available(iOS 16.0, tvOS 16.0, *)) {
+        supportsGgmlMetal = supportsGgmlMetal && [device supportsFamily:MTLGPUFamilyMetal3];
+    }
+    if (!supportsGgmlMetal) {
+        reasonNoMetal = @"Metal is not supported in this device";
         skipGpuDevices = true;
-    } else {
-        id<MTLFunction> kernel = [library newFunctionWithName:@"test"];
-        id<MTLComputePipelineState> pipeline = [device newComputePipelineStateWithFunction:kernel error:&error];
-        if (pipeline == nil) {
-            reasonNoMetal = [error localizedDescription];
-            skipGpuDevices = true;
-        } else {
+    }
 #if TARGET_OS_SIMULATOR
-            // Use the backend, but no layers because not supported fully on simulator
-            defaultParams.n_gpu_layers = 0;
-            isMetalEnabled = true;
+    // Use the backend, but no layers because not supported fully on simulator
+    defaultParams.n_gpu_layers = 0;
+    isMetalEnabled = true;
 #else
-            defaultParams.n_gpu_layers = [params[@"n_gpu_layers"] intValue];
-            isMetalEnabled = true;
+    defaultParams.n_gpu_layers = [params[@"n_gpu_layers"] intValue];
+    isMetalEnabled = true;
 #endif
-        }
-    }
     device = nil;
 #else
     reasonNoMetal = @"Metal is not enabled in this build";
@@ -158,6 +148,8 @@
         }
         if (cpu_devs.size() > 0) {
             defaultParams.devices = cpu_devs;
+            defaultParams.n_gpu_layers = 0;
+            isMetalEnabled = false;
         }
     }
@@ -184,6 +176,8 @@
     if (params[@"flash_attn"] && [params[@"flash_attn"] boolValue]) defaultParams.flash_attn = true;
+    if (params[@"ctx_shift"]) defaultParams.ctx_shift = [params[@"ctx_shift"] boolValue];
     if (params[@"cache_type_k"]) defaultParams.cache_type_k = rnllama::kv_cache_type_from_str([params[@"cache_type_k"] UTF8String]);
     if (params[@"cache_type_v"]) defaultParams.cache_type_v = rnllama::kv_cache_type_from_str([params[@"cache_type_v"] UTF8String]);
@@ -568,6 +562,9 @@
     }
     llama->beginCompletion();
     llama->loadPrompt();
+    if (llama->context_full) {
+        @throw [NSException exceptionWithName:@"LlamaException" reason:@"Context is full" userInfo:nil];
+    }
     size_t sent_count = 0;
     size_t sent_token_probs_index = 0;
@@ -655,7 +652,7 @@
                 }];
             }
         } catch (const std::exception &e) {
-            // NSLog(@"Error parsing tool calls: %s", e.what());
+        } catch (...) {
         }
     }
@@ -668,6 +665,7 @@
     result[@"tokens_predicted"] = @(llama->num_tokens_predicted);
     result[@"tokens_evaluated"] = @(llama->num_prompt_tokens);
     result[@"truncated"] = @(llama->truncated);
+    result[@"context_full"] = @(llama->context_full);
     result[@"stopped_eos"] = @(llama->stopped_eos);
     result[@"stopped_word"] = @(llama->stopped_word);
     result[@"stopped_limit"] = @(llama->stopped_limit);

package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h CHANGED

@@ -355,8 +355,10 @@ struct common_params {
     common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
-    // multimodal models (see examples/llava)
+    // multimodal models (see tools/llava)
     struct common_params_model mmproj;
+    bool mmproj_use_gpu = true;     // use GPU for multimodal model
+    bool no_mmproj = false;         // explicitly disable multimodal model
     std::vector<std::string> image; // path to image file(s)
     // embedding
@@ -427,8 +429,8 @@ struct common_params {
     int n_pca_batch = 100;
     int n_pca_iterations = 1000;
     dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
-    std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
-    std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
+    std::string cvector_positive_file = "tools/cvector-generator/positive.txt";
+    std::string cvector_negative_file = "tools/cvector-generator/negative.txt";
     bool spm_infill = false; // suffix/prefix/middle pattern for infill
@@ -558,6 +560,8 @@ struct lm_ggml_threadpool_params lm_ggml_threadpool_params_from_cpu_params(const
 // clear LoRA adapters from context, then apply new list of adapters
 void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
+std::string                   get_model_endpoint();
 //
 // Batch utils
 //