npm - @novastera-oss/llamarn - Versions diffs - 0.2.7 → 0.2.9 - Mend

@novastera-oss/llamarn 0.2.7 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (186) hide show

package/android/src/main/cpp/include/llama.h CHANGED Viewed

@@ -390,6 +390,7 @@ extern "C" {
         void * imatrix;                       // pointer to importance matrix data
         void * kv_overrides;                  // pointer to vector containing overrides
         void * tensor_types;                  // pointer to vector containing tensor types
+        void * prune_layers;                  // pointer to vector containing layer indices to prune
     } llama_model_quantize_params;
     typedef struct llama_logit_bias {
@@ -943,12 +944,14 @@ extern "C" {
     // Requires the context to have a memory.
     // For encode-decoder contexts, processes the batch using the decoder.
     // A positive return value does not mean a fatal error, but rather a warning.
-    // Upon non-zero return values, the memory state is restored to the state before this call
+    // Upon fatal-error or abort, the ubatches that managed to be processed will remain in the memory state of the context
+    //   To handle this correctly, query the memory state using llama_memory_seq_pos_min() and llama_memory_seq_pos_max()
+    // Upon other return values, the memory state is restored to the state before this call
     //    0 - success
     //    1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
-    //    2 - aborted
+    //    2 - aborted     (processed ubatches will remain in the context's memory)
     //   -1 - invalid input batch
-    // < -1 - error
+    // < -1 - fatal error (processed ubatches will remain in the context's memory)
     LLAMA_API int32_t llama_decode(
             struct llama_context * ctx,
               struct llama_batch   batch);
@@ -1044,6 +1047,7 @@ extern "C" {
     LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
     LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
+    LLAMA_API bool llama_vocab_get_add_sep(const struct llama_vocab * vocab);
     LLAMA_API llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab);
     LLAMA_API llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab);
@@ -1087,6 +1091,7 @@ extern "C" {
     /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
     /// @return Returns the number of tokens on success, no more than n_tokens_max
     /// @return Returns a negative number on failure - the number of tokens that would have been returned
+    /// @return Returns INT32_MIN on overflow (e.g., tokenization result size exceeds int32_t limit)
     /// @param add_special Allow to add BOS and EOS tokens if model is configured to do so.
     /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
     ///                      as plaintext. Does not insert a leading space.

package/android/src/main/jniLibs/arm64-v8a/libggml-base.so CHANGED Viewed

Binary file

package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so CHANGED Viewed

Binary file

package/android/src/main/jniLibs/arm64-v8a/libggml.so CHANGED Viewed

Binary file

package/android/src/main/jniLibs/arm64-v8a/libllama.so CHANGED Viewed

Binary file

package/android/src/main/jniLibs/x86_64/libggml-base.so CHANGED Viewed

Binary file

package/android/src/main/jniLibs/x86_64/libggml-cpu.so CHANGED Viewed

Binary file

package/android/src/main/jniLibs/x86_64/libggml.so CHANGED Viewed

Binary file

package/android/src/main/jniLibs/x86_64/libllama.so CHANGED Viewed

Binary file

package/cpp/LlamaCppModel.cpp CHANGED Viewed

@@ -5,6 +5,7 @@
 #include <cstdlib>
 #include <ctime>
 #include <chrono>
+#include <thread>
 #include <fstream>
 #include <iostream>
 #include <random>
@@ -50,33 +51,60 @@ LlamaCppModel::~LlamaCppModel() {
 }
 void LlamaCppModel::release() {
-  // Cancel any ongoing predictions
+  // Signal completion to stop and wait for it to finish gracefully
   if (is_predicting_) {
     should_stop_completion_ = true;
-    // Optionally wait a bit for completion to stop
+    // Wait more patiently for completion to stop, with proper backoff
     int retry = 0;
-    while (is_predicting_ && retry < 10) {
-      std::this_thread::sleep_for(std::chrono::milliseconds(10));
+    while (is_predicting_ && retry < 100) { // Increased from 10 to 100
+      std::this_thread::sleep_for(std::chrono::milliseconds(retry < 50 ? 10 : 50));
       retry++;
     }
+    // Force stop if still predicting
+    if (is_predicting_) {
+      is_predicting_ = false;
+    }
   }
-  // Clean up our resources
+  // Clean up our resources with proper mutex protection
   if (rn_ctx_) {
+    std::lock_guard<std::mutex> lock(rn_ctx_->mutex);
+    // Clear KV cache before freeing context (following server.cpp pattern)
     if (rn_ctx_->ctx) {
+      try {
+        llama_memory_clear(llama_get_memory(rn_ctx_->ctx), true);
+      } catch (...) {
+        // Ignore errors during cache clearing
+      }
       llama_free(rn_ctx_->ctx);
       rn_ctx_->ctx = nullptr;
     }
+    // Free model after context (following server.cpp cleanup order)
     if (rn_ctx_->model) {
       llama_model_free(rn_ctx_->model);
       rn_ctx_->model = nullptr;
     }
+    // Clean up additional resources
+    rn_ctx_->vocab = nullptr; // This is owned by the model, so just null it
+    rn_ctx_->chat_templates.reset(); // Clean up chat templates
+    rn_ctx_->lora_adapters.clear(); // Clear LoRA adapters
+    // Reset state flags
+    rn_ctx_->model_loaded = false;
     // Note: rn_ctx_ itself is owned by the module, so we don't delete it here
     rn_ctx_ = nullptr;
   }
+  // Reset our internal state
+  should_stop_completion_ = false;
+  is_predicting_ = false;
 }
 int32_t LlamaCppModel::getVocabSize() const {
@@ -133,6 +161,10 @@ CompletionOptions LlamaCppModel::parseCompletionOptions(jsi::Runtime& rt, const
     options.min_p = obj.getProperty(rt, "min_p").asNumber();
   }
+  if (obj.hasProperty(rt, "presence_penalty") && !obj.getProperty(rt, "presence_penalty").isUndefined()) {
+    options.presence_penalty = obj.getProperty(rt, "presence_penalty").asNumber();
+  }
   if (obj.hasProperty(rt, "n_predict") && !obj.getProperty(rt, "n_predict").isUndefined()) {
     options.n_predict = obj.getProperty(rt, "n_predict").asNumber();
   } else if (obj.hasProperty(rt, "max_tokens") && !obj.getProperty(rt, "max_tokens").isUndefined()) {
@@ -365,13 +397,14 @@ CompletionResult LlamaCppModel::completion(const CompletionOptions& options, std
   std::lock_guard<std::mutex> lock(rn_ctx_->mutex);
   // Clear the context KV cache
-  llama_kv_self_clear(rn_ctx_->ctx);
+  llama_memory_clear(llama_get_memory(rn_ctx_->ctx), true);
   // Store original sampling parameters to restore later
   float orig_temp = rn_ctx_->params.sampling.temp;
   float orig_top_p = rn_ctx_->params.sampling.top_p;
   float orig_top_k = rn_ctx_->params.sampling.top_k;
   float orig_min_p = rn_ctx_->params.sampling.min_p;
+  float orig_presence_penalty = rn_ctx_->params.sampling.penalty_present;
   int orig_n_predict = rn_ctx_->params.n_predict;
   // Set sampling parameters from options
@@ -379,6 +412,7 @@ CompletionResult LlamaCppModel::completion(const CompletionOptions& options, std
   rn_ctx_->params.sampling.top_p = options.top_p;
   rn_ctx_->params.sampling.top_k = options.top_k;
   rn_ctx_->params.sampling.min_p = options.min_p;
+  rn_ctx_->params.sampling.penalty_present = options.presence_penalty;
   rn_ctx_->params.n_predict = options.n_predict;
   // Check for a partial callback
@@ -426,6 +460,7 @@ CompletionResult LlamaCppModel::completion(const CompletionOptions& options, std
   rn_ctx_->params.sampling.top_p = orig_top_p;
   rn_ctx_->params.sampling.top_k = orig_top_k;
   rn_ctx_->params.sampling.min_p = orig_min_p;
+  rn_ctx_->params.sampling.penalty_present = orig_presence_penalty;
   rn_ctx_->params.n_predict = orig_n_predict;
   return result;
@@ -885,29 +920,28 @@ jsi::Value LlamaCppModel::embeddingJsi(jsi::Runtime& rt, const jsi::Value* args,
     }
     // Clear the context KV cache to ensure clean embedding
-    llama_kv_self_clear(rn_ctx_->ctx);
+    llama_memory_clear(llama_get_memory(rn_ctx_->ctx), true);
     // Enable embedding mode
     llama_set_embeddings(rn_ctx_->ctx, true);
-    // Evaluate tokens one by one
+    // Create and populate batch using common_batch functions (following server.cpp pattern)
+    llama_batch batch = llama_batch_init(tokens.size(), 0, 1);
+    common_batch_clear(batch);
     for (int i = 0; i < (int)tokens.size(); i++) {
-      llama_token token = tokens[i];
-      llama_batch batch = {
-        /* n_tokens    */ 1,
-        /* token       */ &token,
-        /* embd        */ nullptr,
-        /* pos         */ &i,
-        /* n_seq_id    */ nullptr,
-        /* seq_id      */ nullptr,
-        /* logits      */ nullptr
-      };
-      if (llama_decode(rn_ctx_->ctx, batch) != 0) {
-        throw std::runtime_error("Failed to decode token for embedding");
-      }
+      // For embeddings, we typically need logits for the last token (for pooling)
+      bool needs_logits = (i == (int)tokens.size() - 1);
+      common_batch_add(batch, tokens[i], i, {0}, needs_logits);
     }
+    if (llama_decode(rn_ctx_->ctx, batch) != 0) {
+      llama_batch_free(batch);
+      throw std::runtime_error("Failed to decode tokens for embedding");
+    }
+    llama_batch_free(batch);
     // Get embedding size from the model
     const int n_embd = llama_model_n_embd(rn_ctx_->model);
     if (n_embd <= 0) {

package/cpp/build-info.cpp CHANGED Viewed

@@ -1,4 +1,4 @@
-int LLAMA_BUILD_NUMBER = 5709;
-char const *LLAMA_COMMIT = "d67341dc";
+int LLAMA_BUILD_NUMBER = 5770;
+char const *LLAMA_COMMIT = "b25e9277";
 char const *LLAMA_COMPILER = "unknown";
 char const *LLAMA_BUILD_TARGET = "unknown";

package/cpp/llama.cpp/CMakeLists.txt CHANGED Viewed

@@ -95,7 +95,7 @@ endif()
 if (NOT DEFINED LLAMA_BUILD_COMMIT)
     set(LLAMA_BUILD_COMMIT        ${BUILD_COMMIT})
 endif()
-set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
+set(LLAMA_INSTALL_VERSION 0.0.${LLAMA_BUILD_NUMBER})
 # override ggml options
 set(GGML_ALL_WARNINGS   ${LLAMA_ALL_WARNINGS})

package/cpp/llama.cpp/common/arg.cpp CHANGED Viewed

@@ -2706,6 +2706,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.embd_sep = value;
         }
     ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
+    add_opt(common_arg(
+        {"--cls-separator"}, "STRING",
+        "separator of classification sequences (default \\t) for example \"<#seq#>\"",
+        [](common_params & params, const std::string & value) {
+            params.cls_sep = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
     add_opt(common_arg(
         {"--host"}, "HOST",
         string_format("ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: %s)", params.hostname.c_str()),

package/cpp/llama.cpp/common/common.cpp CHANGED Viewed

@@ -1290,6 +1290,9 @@ std::vector<llama_token> common_tokenize(
     int n_tokens = text.length() + 2 * add_special;
     std::vector<llama_token> result(n_tokens);
     n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+    if (n_tokens == std::numeric_limits<int32_t>::min()) {
+        throw std::runtime_error("Tokenization failed: input text too large, tokenization result exceeds int32_t limit");
+    }
     if (n_tokens < 0) {
         result.resize(-n_tokens);
         int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);

package/cpp/llama.cpp/common/common.h CHANGED Viewed

@@ -358,6 +358,7 @@ struct common_params {
     int32_t embd_normalize = 2;     // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
     std::string embd_out   = "";    // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
     std::string embd_sep   = "\n";  // separator of embeddings
+    std::string cls_sep    = "\t";  // separator of classification sequences
     // server params
     int32_t port           = 8080;         // server listens on this network port

package/cpp/llama.cpp/common/json-schema-to-grammar.cpp CHANGED Viewed

@@ -41,49 +41,6 @@ static std::string build_repetition(const std::string & item_rule, int min_items
     return result;
 }
-/* Minimalistic replacement for std::string_view, which is only available from C++17 onwards */
-class string_view {
-    const std::string & _str;
-    const size_t _start;
-    const size_t _end;
-public:
-    string_view(const std::string & str, size_t start = 0, size_t end  = std::string::npos) : _str(str), _start(start), _end(end == std::string::npos ? str.length() : end) {}
-    size_t size() const {
-        return _end - _start;
-    }
-    size_t length() const {
-        return size();
-    }
-    operator std::string() const {
-        return str();
-    }
-    std::string str() const {
-        return _str.substr(_start, _end - _start);
-    }
-    string_view substr(size_t pos, size_t len = std::string::npos) const {
-        return string_view(_str, _start + pos, len == std::string::npos ? _end : _start + pos + len);
-    }
-    char operator[](size_t pos) const {
-        auto index = _start + pos;
-        if (index >= _end) {
-            throw std::out_of_range("string_view index out of range");
-        }
-        return _str[_start + pos];
-    }
-    bool operator==(const string_view & other) const {
-        std::string this_str = *this;
-        std::string other_str = other;
-        return this_str == other_str;
-    }
-};
 static void _build_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
     auto has_min = min_value != std::numeric_limits<int>::min();
     auto has_max = max_value != std::numeric_limits<int>::max();
@@ -112,14 +69,14 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
         }
         out << "}";
     };
-    std::function<void(const string_view &, const string_view &)> uniform_range =
-        [&](const string_view & from, const string_view & to) {
+    std::function<void(const std::string_view &, const std::string_view &)> uniform_range =
+        [&](const std::string_view & from, const std::string_view & to) {
             size_t i = 0;
             while (i < from.length() && i < to.length() && from[i] == to[i]) {
                 i++;
             }
             if (i > 0) {
-                out << "\"" << from.substr(0, i).str() << "\"";
+                out << "\"" << from.substr(0, i) << "\"";
             }
             if (i < from.length() && i < to.length()) {
                 if (i > 0) {

package/cpp/llama.cpp/convert_hf_to_gguf.py CHANGED Viewed

@@ -310,6 +310,8 @@ class ModelBase:
                             gguf.MODEL_TENSOR.POSNET_NORM2,
                             gguf.MODEL_TENSOR.V_ENC_EMBD_POS,
                             gguf.MODEL_TENSOR.A_ENC_EMBD_POS,
+                            gguf.MODEL_TENSOR.ALTUP_CORRECT_COEF,
+                            gguf.MODEL_TENSOR.ALTUP_PREDICT_COEF,
                         )
                     )
                     or not new_name.endswith(".weight")
@@ -320,7 +322,11 @@ class ModelBase:
                     self.match_model_tensor_name(new_name, key, bid)
                     for key in (
                         gguf.MODEL_TENSOR.TOKEN_EMBD,
+                        gguf.MODEL_TENSOR.PER_LAYER_TOKEN_EMBD,
                         gguf.MODEL_TENSOR.OUTPUT,
+                        gguf.MODEL_TENSOR.ALTUP_ROUTER,
+                        gguf.MODEL_TENSOR.LAUREL_L,
+                        gguf.MODEL_TENSOR.LAUREL_R,
                     )
                 ):
                     if self.ftype in (
@@ -921,13 +927,20 @@ class TextModel(ModelBase):
         tokenizer = SentencePieceProcessor()
         tokenizer.LoadFromFile(str(tokenizer_path))
-        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+        vocab_size = self.find_hparam([
+            "vocab_size_per_layer_input", # gemma3n
+            "vocab_size",
+        ], optional=True) or tokenizer.vocab_size()
         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
         toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
         for token_id in range(tokenizer.vocab_size()):
+            if token_id >= vocab_size:
+                logger.warning(f'ignore tokens from {token_id}: id is out of range, max={vocab_size - 1}')
+                break
             piece = tokenizer.IdToPiece(token_id)
             text = piece.encode("utf-8")
             score = tokenizer.GetScore(token_id)
@@ -2145,7 +2158,6 @@ class Llama4Model(LlamaModel):
     def set_vocab(self):
         self._set_vocab_gpt2()
-        self.gguf_writer.add_add_bos_token(True)
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
@@ -2194,7 +2206,7 @@ class Llama4VisionModel(MmprojModel):
                 name += ".weight"
             if "multi_modal_projector.linear_1" in name:
                 # despite the name with number postfix, this is a single fully connected layer
-                return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC], data_torch)]
+                return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC] + '.weight', data_torch)]
             return [(self.map_tensor_name(name), data_torch)]
         return []
@@ -3918,9 +3930,6 @@ class BertModel(TextModel):
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         special_vocab.add_to_gguf(self.gguf_writer)
-        self.gguf_writer.add_add_bos_token(True)
-        self.gguf_writer.add_add_eos_token(True)
 @ModelBase.register("DistilBertModel", "DistilBertForMaskedLM", "DistilBertForSequenceClassification")
 class DistilBertModel(BertModel):
@@ -3962,8 +3971,6 @@ class RobertaModel(BertModel):
         bpe_tok_path = self.dir_model / "tokenizer.json"
         if bpe_tok_path.exists():
             self._set_vocab_gpt2()
-            self.gguf_writer.add_add_bos_token(True)
-            self.gguf_writer.add_add_eos_token(True)
             # we need this to validate the size of the token_type embeddings
             # though currently we are passing all zeros to the token_type embeddings
@@ -4223,6 +4230,7 @@ class Gemma2Model(TextModel):
 @ModelBase.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration")
 class Gemma3Model(TextModel):
     model_arch = gguf.MODEL_ARCH.GEMMA3
+    norm_shift = 1.0  # Gemma3RMSNorm adds 1.0 to the norm value
     def set_vocab(self):
         self._set_vocab_sentencepiece()
@@ -4244,9 +4252,8 @@ class Gemma3Model(TextModel):
         self.gguf_writer.add_value_length(hparams.get("head_dim", 256))
         self.gguf_writer.add_file_type(self.ftype)
         self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1_000_000.0)) # for global layers
-        # both attn_logit_softcapping and final_logit_softcapping are removed in Gemma3
+        # attn_logit_softcapping is removed in Gemma3
         assert hparams.get("attn_logit_softcapping") is None
-        assert hparams.get("final_logit_softcapping") is None
         self.gguf_writer.add_sliding_window(hparams["sliding_window"])
         self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4))
         if hparams.get("rope_scaling") is not None:
@@ -4258,7 +4265,7 @@ class Gemma3Model(TextModel):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused
-        if name.startswith("language_model."):
+        if "language_model." in name:
             name = name.replace("language_model.", "")
         elif name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \
@@ -4273,8 +4280,9 @@ class Gemma3Model(TextModel):
         # ref code in Gemma3RMSNorm
         # output = output * (1.0 + self.weight.float())
+        # note: this is not the case on gemma3n
         if name.endswith("norm.weight"):
-            data_torch = data_torch + 1
+            data_torch = data_torch + self.norm_shift
         return [(self.map_tensor_name(name), data_torch)]
@@ -4331,6 +4339,104 @@ class Gemma3VisionModel(MmprojModel):
         return [] # skip other tensors
+@ModelBase.register("Gemma3nForConditionalGeneration")
+class Gemma3NModel(Gemma3Model):
+    model_arch = gguf.MODEL_ARCH.GEMMA3N
+    norm_shift = 0.0 # same value as Gemma3p5RMSNorm scale_shift in the Python code
+    _altup_proj: list[Tensor] = []
+    _altup_unembd: list[Tensor] = []
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        assert self.hparams["altup_num_inputs"] == 4, "Current conversion only supports 4 altup inputs"
+        self._altup_proj = [
+            torch.Tensor(), # to be replaced
+            torch.Tensor(), # to be replaced
+            torch.Tensor(), # to be replaced
+        ]
+        self._altup_unembd = [
+            torch.Tensor(), # to be replaced
+            torch.Tensor(), # to be replaced
+            torch.Tensor(), # to be replaced
+        ]
+    def set_vocab(self):
+        with open(self.dir_model / "chat_template.jinja") as f:
+            # quick hack to make sure chat template is added
+            self.gguf_writer.add_chat_template(f.read())
+        super().set_vocab()
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_altup_active_idx(self.hparams["altup_active_idx"])
+        self.gguf_writer.add_altup_num_inputs(self.hparams["altup_num_inputs"])
+        self.gguf_writer.add_embedding_length_per_layer_input(self.hparams["hidden_size_per_layer_input"])
+        self.gguf_writer.add_shared_kv_layers(self.hparams["num_kv_shared_layers"])
+        activation_sparsity_scale = []
+        for s in self.hparams["activation_sparsity_pattern"]:
+            normal_dist = torch.distributions.normal.Normal(0, 1)
+            std_multiplier = normal_dist.icdf(torch.tensor(s, dtype=torch.float32))
+            activation_sparsity_scale.append(std_multiplier.item())
+        self.gguf_writer.add_activation_sparsity_scale(activation_sparsity_scale)
+        sliding_window_pattern = []
+        for t in self.hparams["layer_types"]:
+            sliding_window_pattern.append(t == "sliding_attention")
+        self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
+    def _stack_matrices(self, matrices: list[Tensor]) -> Tensor | None:
+        has_all = all(m.numel() > 0 for m in matrices)
+        if not has_all:
+            return None
+        else:
+            return torch.stack(matrices, dim=0)
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.endswith("_scale"):
+            name = name + ".weight"
+        # TODO: implement self.prediction_coefs.weight.clamp_(...)
+        if "language_model." not in name:
+            return [] # skip non-language model tensors
+        if "altup_unembed_projections" in name:
+            data_torch = data_torch.to(device="cpu")
+            if ".0." in name:
+                self._altup_unembd[0] = data_torch
+            elif ".1." in name:
+                self._altup_unembd[1] = data_torch
+            elif ".2." in name:
+                self._altup_unembd[2] = data_torch
+            else:
+                raise ValueError(f"Unknown name: {name}")
+            out = self._stack_matrices(self._altup_unembd)
+            if out is not None:
+                return [(self.map_tensor_name("model.altup_unembed_projections.weight"), out)]
+            else:
+                return []
+        if "altup_projections" in name:
+            data_torch = data_torch.to(device="cpu")
+            if ".0." in name:
+                self._altup_proj[0] = data_torch
+            elif ".1." in name:
+                self._altup_proj[1] = data_torch
+            elif ".2." in name:
+                self._altup_proj[2] = data_torch
+            else:
+                raise ValueError(f"Unknown name: {name}")
+            out = self._stack_matrices(self._altup_proj)
+            if out is not None:
+                return [(self.map_tensor_name("model.altup_projections.weight"), out)]
+            else:
+                return []
+        return super().modify_tensors(data_torch, name, bid)
 @ModelBase.register("Starcoder2ForCausalLM")
 class StarCoder2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.STARCODER2
@@ -4848,8 +4954,6 @@ class JinaBertV2Model(BertModel):
             self.gguf_writer.add_token_type_count(2)
         else:
             raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel')
-        self.gguf_writer.add_add_bos_token(True)
-        self.gguf_writer.add_add_eos_token(True)
 @ModelBase.register("OpenELMForCausalLM")
@@ -5451,9 +5555,6 @@ class T5Model(TextModel):
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         special_vocab.add_to_gguf(self.gguf_writer)
-        self.gguf_writer.add_add_bos_token(False)
-        self.gguf_writer.add_add_eos_token(True)
     def set_gguf_parameters(self):
         if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
             logger.warning("Couldn't find context length in config.json, assuming default value of 512")
@@ -5591,9 +5692,6 @@ class T5EncoderModel(TextModel):
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         special_vocab.add_to_gguf(self.gguf_writer)
-        self.gguf_writer.add_add_bos_token(False)
-        self.gguf_writer.add_add_eos_token(True)
     def set_gguf_parameters(self):
         if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
             logger.warning("Couldn't find context length in config.json, assuming default value of 512")

package/cpp/llama.cpp/ggml/CMakeLists.txt CHANGED Viewed

@@ -131,6 +131,7 @@ option(GGML_RVV              "ggml: enable rvv"              ON)
 option(GGML_RV_ZFH           "ggml: enable riscv zfh"        OFF)
 option(GGML_XTHEADVECTOR     "ggml: enable xtheadvector"     OFF)
 option(GGML_VXE              "ggml: enable vxe"              ON)
+option(GGML_NNPA             "ggml: enable nnpa"             ON)
 option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
 set(GGML_CPU_ARM_ARCH        "" CACHE STRING "ggml: CPU architecture for ARM")

package/cpp/llama.cpp/ggml/include/ggml-cpu.h CHANGED Viewed

@@ -101,6 +101,7 @@ extern "C" {
     GGML_BACKEND_API int ggml_cpu_has_riscv_v    (void);
     GGML_BACKEND_API int ggml_cpu_has_vsx        (void);
     GGML_BACKEND_API int ggml_cpu_has_vxe        (void);
+    GGML_BACKEND_API int ggml_cpu_has_nnpa       (void);
     GGML_BACKEND_API int ggml_cpu_has_wasm_simd  (void);
     GGML_BACKEND_API int ggml_cpu_has_llamafile  (void);
@@ -133,6 +134,7 @@ extern "C" {
     GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
+    GGML_BACKEND_API void ggml_cpu_fp32_to_fp32(const float *,       float *, int64_t);
     GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
     GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
     GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);