@fugood/llama.node 1.2.5 → 1.3.0-rc.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -478,7 +478,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_GENERAL_NAME, name, false);
 
     // everything past this point is not vocab-related
-    if (hparams.vocab_only) {
+    // for CLIP models, we only need to load tensors, no hparams
+    if (hparams.vocab_only || ml.get_arch() == LLM_ARCH_CLIP) {
         return;
     }
 
@@ -11358,8 +11359,8 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
     }
 };
 
-struct llm_build_gemma_embedding_iswa : public llm_graph_context {
-    llm_build_gemma_embedding_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+struct llm_build_gemma_embedding : public llm_graph_context {
+    llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_k;
 
         ggml_tensor * cur;
@@ -11376,8 +11377,7 @@ struct llm_build_gemma_embedding_iswa : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        // TODO: support cacheless iSWA embeddings [TAG_NO_CACHE_ISWA]
-        auto * inp_attn = build_attn_inp_kv_iswa();
+        auto * inp_attn = build_attn_inp_no_cache();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -19378,7 +19378,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
         case LLM_ARCH_NOMIC_BERT_MOE:
         case LLM_ARCH_NEO_BERT:
         case LLM_ARCH_WAVTOKENIZER_DEC:
-        //case LLM_ARCH_GEMMA_EMBEDDING: // TODO: disabled until the cacheless SWA logic is fixed [TAG_NO_CACHE_ISWA]
+        case LLM_ARCH_GEMMA_EMBEDDING:
         case LLM_ARCH_DREAM:
         case LLM_ARCH_LLADA:
         case LLM_ARCH_LLADA_MOE:
@@ -19671,7 +19671,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             } break;
         case LLM_ARCH_GEMMA_EMBEDDING:
             {
-                llm = std::make_unique<llm_build_gemma_embedding_iswa>(*this, params);
+                llm = std::make_unique<llm_build_gemma_embedding>(*this, params);
             } break;
         case LLM_ARCH_STARCODER2:
             {
@@ -20014,6 +20014,7 @@ int32_t llama_n_head(const llama_model * model) {
 llama_rope_type llama_model_rope_type(const llama_model * model) {
     switch (model->arch) {
         // these models do not use RoPE
+        case LLM_ARCH_CLIP:
         case LLM_ARCH_GPT2:
         case LLM_ARCH_GPTJ:
         case LLM_ARCH_MPT:
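
The addition above relies on plain C++ case fall-through: architectures that share a property are stacked as consecutive case labels in front of a single return, so adding one architecture to the "no RoPE" group is a one-line change. A minimal standalone sketch of the same pattern, using an illustrative enum rather than the library's real one:

enum Arch { ARCH_CLIP, ARCH_GPT2, ARCH_LLAMA };
enum RopeType { ROPE_NONE, ROPE_NORM };

// Consecutive case labels fall through to the same return value,
// so each architecture only needs to be listed once.
static RopeType rope_type_for(Arch arch) {
    switch (arch) {
        // these architectures do not use RoPE
        case ARCH_CLIP:
        case ARCH_GPT2:
            return ROPE_NONE;
        default:
            return ROPE_NORM;
    }
}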
@@ -701,6 +701,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         });
     }
 
+    bool is_clip_model = false;
     for (const auto * it : tensors) {
         const struct ggml_tensor * tensor = it->tensor;
 
@@ -714,12 +715,14 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
             qs.has_output = true;
         }
+
+        is_clip_model |= name.rfind("mm.", 0) == 0; // check the "mm." prefix
     }
 
     qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
 
     // sanity checks for models that have attention layers
-    if (qs.n_attention_wv != 0)
+    if (qs.n_attention_wv != 0 && !is_clip_model)
     {
         const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
         // attention layers have a non-zero number of kv heads
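
The is_clip_model detection above uses a common C++ idiom: std::string::rfind(prefix, 0) == 0 is a starts-with test (a backwards search anchored at position 0 can only match at position 0), and the result is OR-accumulated into a flag while iterating over every tensor name. A self-contained sketch of the idiom, with made-up tensor names:

#include <cassert>
#include <string>
#include <vector>

int main() {
    // Hypothetical tensor names; "mm."-prefixed tensors belong to a multimodal projector.
    std::vector<std::string> names = {
        "token_embd.weight",
        "mm.input_projection.weight",
        "blk.0.attn_q.weight",
    };

    bool is_clip_model = false;
    for (const std::string & name : names) {
        // rfind(prefix, 0) can only match at index 0, i.e. a starts-with test
        is_clip_model |= name.rfind("mm.", 0) == 0;
    }

    assert(is_clip_model);
    return 0;
}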
@@ -881,6 +884,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         // do not quantize relative position bias (T5)
         quantize &= name.find("attn_rel_b.weight") == std::string::npos;
 
+        // do not quantize specific multimodal tensors
+        quantize &= name.find(".position_embd.") == std::string::npos;
+
         ggml_type new_type;
         void * new_data;
         size_t new_size;
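
The new .position_embd. rule follows the same filtering convention as the neighbouring checks: a quantize flag is AND-ed with one find(...) == std::string::npos test per exclusion, so a tensor is only quantized when its name contains none of the excluded substrings. A self-contained sketch of that style (the example names are illustrative):

#include <cassert>
#include <string>

// Illustrative only: a tensor stays quantizable only if its name contains
// none of the excluded substrings.
static bool should_quantize(const std::string & name) {
    bool quantize = true;
    quantize &= name.find("attn_rel_b.weight") == std::string::npos;
    quantize &= name.find(".position_embd.")   == std::string::npos;
    return quantize;
}

int main() {
    assert(should_quantize("blk.0.ffn_down.weight"));
    assert(!should_quantize("v.position_embd.weight"));
    return 0;
}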
@@ -124,6 +124,9 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
     } catch(const std::exception & e) {
         throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
     }
+    if (model.arch == LLM_ARCH_CLIP) {
+        throw std::runtime_error("CLIP cannot be used as main model, use it with --mmproj instead");
+    }
     try {
         model.load_vocab(ml);
     } catch(const std::exception & e) {
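
The added check above is fail-fast validation: an unsupported configuration is rejected with a descriptive exception as soon as the architecture is known, instead of failing later with a less obvious error. A minimal sketch of the pattern, using illustrative names rather than the library's internal types:

#include <stdexcept>

enum Arch { ARCH_LLAMA, ARCH_CLIP };

// Illustrative only: reject architectures that are valid model files but not
// usable as the main text model, with a message that points at the remedy.
static void check_main_model_arch(Arch arch) {
    if (arch == ARCH_CLIP) {
        throw std::runtime_error("CLIP cannot be used as main model, use it with --mmproj instead");
    }
}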
@@ -312,6 +315,7 @@ struct llama_model * llama_model_load_from_splits(
         LLAMA_LOG_ERROR("%s: list of splits is empty\n", __func__);
         return nullptr;
     }
+    splits.reserve(n_paths);
     for (size_t i = 0; i < n_paths; ++i) {
         splits.push_back(paths[i]);
     }
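
The splits.reserve(n_paths) call added above is a standard std::vector optimization: reserving the final size up front avoids repeated reallocation while the following loop appends a known number of elements. A small standalone illustration of the same pattern:

#include <string>
#include <vector>

// Illustrative only: copy a known number of C strings into a vector,
// reserving capacity once so push_back never has to reallocate.
static std::vector<std::string> collect_paths(const char ** paths, size_t n_paths) {
    std::vector<std::string> splits;
    splits.reserve(n_paths);
    for (size_t i = 0; i < n_paths; ++i) {
        splits.push_back(paths[i]);
    }
    return splits;
}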