@fugood/llama.node 1.2.5 → 1.3.0-rc.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -478,7 +478,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_GENERAL_NAME, name, false);
 
     // everything past this point is not vocab-related
-    if (hparams.vocab_only) {
+    // for CLIP models, we only need to load tensors, no hparams
+    if (hparams.vocab_only || ml.get_arch() == LLM_ARCH_CLIP) {
         return;
     }
 
@@ -11358,8 +11359,8 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
     }
 };
 
-struct llm_build_gemma_embedding_iswa : public llm_graph_context {
-    llm_build_gemma_embedding_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+struct llm_build_gemma_embedding : public llm_graph_context {
+    llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_k;
 
         ggml_tensor * cur;
@@ -11376,8 +11377,7 @@ struct llm_build_gemma_embedding_iswa : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        // TODO: support cacheless iSWA embeddings [TAG_NO_CACHE_ISWA]
-        auto * inp_attn = build_attn_inp_kv_iswa();
+        auto * inp_attn = build_attn_inp_no_cache();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -19378,7 +19378,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
         case LLM_ARCH_NOMIC_BERT_MOE:
         case LLM_ARCH_NEO_BERT:
         case LLM_ARCH_WAVTOKENIZER_DEC:
-        //case LLM_ARCH_GEMMA_EMBEDDING: // TODO: disabled until the cacheless SWA logic is fixed [TAG_NO_CACHE_ISWA]
+        case LLM_ARCH_GEMMA_EMBEDDING:
         case LLM_ARCH_DREAM:
         case LLM_ARCH_LLADA:
         case LLM_ARCH_LLADA_MOE:
@@ -19671,7 +19671,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             } break;
         case LLM_ARCH_GEMMA_EMBEDDING:
             {
-                llm = std::make_unique<llm_build_gemma_embedding_iswa>(*this, params);
+                llm = std::make_unique<llm_build_gemma_embedding>(*this, params);
             } break;
         case LLM_ARCH_STARCODER2:
             {
@@ -20014,6 +20014,7 @@ int32_t llama_n_head(const llama_model * model) {
 llama_rope_type llama_model_rope_type(const llama_model * model) {
     switch (model->arch) {
         // these models do not use RoPE
+        case LLM_ARCH_CLIP:
         case LLM_ARCH_GPT2:
         case LLM_ARCH_GPTJ:
         case LLM_ARCH_MPT:
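
The addition above relies on plain C++ case fall-through: architectures that share a property are stacked as consecutive case labels in front of a single return, so adding one architecture to the "no RoPE" group is a one-line change. A minimal standalone sketch of the same pattern, using an illustrative enum rather than the library's real one:

enum Arch { ARCH_CLIP, ARCH_GPT2, ARCH_LLAMA };
enum RopeType { ROPE_NONE, ROPE_NORM };

// Consecutive case labels fall through to the same return value,
// so each architecture only needs to be listed once.
static RopeType rope_type_for(Arch arch) {
    switch (arch) {
        // these architectures do not use RoPE
        case ARCH_CLIP:
        case ARCH_GPT2:
            return ROPE_NONE;
        default:
            return ROPE_NORM;
    }
}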
@@ -701,6 +701,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         });
     }
 
+    bool is_clip_model = false;
     for (const auto * it : tensors) {
         const struct ggml_tensor * tensor = it->tensor;
 
@@ -714,12 +715,14 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
             qs.has_output = true;
         }
+
+        is_clip_model |= name.rfind("mm.", 0) == 0; // check the "mm." prefix
     }
 
     qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
 
     // sanity checks for models that have attention layers
-    if (qs.n_attention_wv != 0)
+    if (qs.n_attention_wv != 0 && !is_clip_model)
     {
         const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
         // attention layers have a non-zero number of kv heads
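
The is_clip_model detection above uses a common C++ idiom: std::string::rfind(prefix, 0) == 0 is a starts-with test (a backwards search anchored at position 0 can only match at position 0), and the result is OR-accumulated into a flag while iterating over every tensor name. A self-contained sketch of the idiom, with made-up tensor names:

#include <cassert>
#include <string>
#include <vector>

int main() {
    // Hypothetical tensor names; "mm."-prefixed tensors belong to a multimodal projector.
    std::vector<std::string> names = {
        "token_embd.weight",
        "mm.input_projection.weight",
        "blk.0.attn_q.weight",
    };

    bool is_clip_model = false;
    for (const std::string & name : names) {
        // rfind(prefix, 0) can only match at index 0, i.e. a starts-with test
        is_clip_model |= name.rfind("mm.", 0) == 0;
    }

    assert(is_clip_model);
    return 0;
}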
@@ -881,6 +884,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         // do not quantize relative position bias (T5)
         quantize &= name.find("attn_rel_b.weight") == std::string::npos;
 
+        // do not quantize specific multimodal tensors
+        quantize &= name.find(".position_embd.") == std::string::npos;
+
         ggml_type new_type;
         void * new_data;
         size_t new_size;
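
The new .position_embd. rule follows the same filtering convention as the neighbouring checks: a quantize flag is AND-ed with one find(...) == std::string::npos test per exclusion, so a tensor is only quantized when its name contains none of the excluded substrings. A self-contained sketch of that style (the example names are illustrative):

#include <cassert>
#include <string>

// Illustrative only: a tensor stays quantizable only if its name contains
// none of the excluded substrings.
static bool should_quantize(const std::string & name) {
    bool quantize = true;
    quantize &= name.find("attn_rel_b.weight") == std::string::npos;
    quantize &= name.find(".position_embd.")   == std::string::npos;
    return quantize;
}

int main() {
    assert(should_quantize("blk.0.ffn_down.weight"));
    assert(!should_quantize("v.position_embd.weight"));
    return 0;
}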
@@ -124,6 +124,9 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
     } catch(const std::exception & e) {
         throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
     }
+    if (model.arch == LLM_ARCH_CLIP) {
+        throw std::runtime_error("CLIP cannot be used as main model, use it with --mmproj instead");
+    }
     try {
         model.load_vocab(ml);
     } catch(const std::exception & e) {
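
The added check above is fail-fast validation: an unsupported configuration is rejected with a descriptive exception as soon as the architecture is known, instead of failing later with a less obvious error. A minimal sketch of the pattern, using illustrative names rather than the library's internal types:

#include <stdexcept>

enum Arch { ARCH_LLAMA, ARCH_CLIP };

// Illustrative only: reject architectures that are valid model files but not
// usable as the main text model, with a message that points at the remedy.
static void check_main_model_arch(Arch arch) {
    if (arch == ARCH_CLIP) {
        throw std::runtime_error("CLIP cannot be used as main model, use it with --mmproj instead");
    }
}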
@@ -312,6 +315,7 @@ struct llama_model * llama_model_load_from_splits(
         LLAMA_LOG_ERROR("%s: list of splits is empty\n", __func__);
         return nullptr;
     }
+    splits.reserve(n_paths);
     for (size_t i = 0; i < n_paths; ++i) {
         splits.push_back(paths[i]);
     }
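
The splits.reserve(n_paths) call added above is a standard std::vector optimization: reserving the final size up front avoids repeated reallocation while the following loop appends a known number of elements. A small standalone illustration of the same pattern:

#include <string>
#include <vector>

// Illustrative only: copy a known number of C strings into a vector,
// reserving capacity once so push_back never has to reallocate.
static std::vector<std::string> collect_paths(const char ** paths, size_t n_paths) {
    std::vector<std::string> splits;
    splits.reserve(n_paths);
    for (size_t i = 0; i < n_paths; ++i) {
        splits.push_back(paths[i]);
    }
    return splits;
}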