@fugood/llama.node 1.2.5 → 1.3.0-rc.0
- package/CMakeLists.txt +1 -0
- package/lib/binding.ts +96 -1
- package/lib/index.js +4 -2
- package/lib/index.ts +4 -1
- package/lib/parallel.js +214 -0
- package/lib/parallel.ts +273 -0
- package/package.json +14 -14
- package/src/LlamaContext.cpp +34 -1
- package/src/LlamaContext.h +16 -0
- package/src/common.hpp +4 -3
- package/src/llama.cpp/common/arg.cpp +1 -1
- package/src/llama.cpp/ggml/include/ggml.h +44 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +16 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +32 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +1 -1
- package/src/llama.cpp/src/llama-arch.cpp +5 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +74 -43
- package/src/llama.cpp/src/llama-graph.h +7 -3
- package/src/llama.cpp/src/llama-model.cpp +8 -7
- package/src/llama.cpp/src/llama-quant.cpp +7 -1
- package/src/llama.cpp/src/llama.cpp +4 -0
package/src/llama.cpp/src/llama-model.cpp

@@ -478,7 +478,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_GENERAL_NAME, name, false);

     // everything past this point is not vocab-related
-    if (hparams.vocab_only) {
+    // for CLIP models, we only need to load tensors, no hparams
+    if (hparams.vocab_only || ml.get_arch() == LLM_ARCH_CLIP) {
         return;
     }

@@ -11358,8 +11359,8 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
     }
 };

-struct llm_build_gemma_embedding_iswa : public llm_graph_context {
-    llm_build_gemma_embedding_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+struct llm_build_gemma_embedding : public llm_graph_context {
+    llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_k;

         ggml_tensor * cur;
@@ -11376,8 +11377,7 @@ struct llm_build_gemma_embedding_iswa : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

-
-        auto * inp_attn = build_attn_inp_kv_iswa();
+        auto * inp_attn = build_attn_inp_no_cache();

         ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -19378,7 +19378,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
         case LLM_ARCH_NOMIC_BERT_MOE:
         case LLM_ARCH_NEO_BERT:
         case LLM_ARCH_WAVTOKENIZER_DEC:
-
+        case LLM_ARCH_GEMMA_EMBEDDING:
         case LLM_ARCH_DREAM:
         case LLM_ARCH_LLADA:
         case LLM_ARCH_LLADA_MOE:
@@ -19671,7 +19671,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             } break;
         case LLM_ARCH_GEMMA_EMBEDDING:
            {
-                llm = std::make_unique<llm_build_gemma_embedding_iswa>(*this, params);
+                llm = std::make_unique<llm_build_gemma_embedding>(*this, params);
            } break;
         case LLM_ARCH_STARCODER2:
            {
@@ -20014,6 +20014,7 @@ int32_t llama_n_head(const llama_model * model) {
 llama_rope_type llama_model_rope_type(const llama_model * model) {
     switch (model->arch) {
         // these models do not use RoPE
+        case LLM_ARCH_CLIP:
         case LLM_ARCH_GPT2:
         case LLM_ARCH_GPTJ:
         case LLM_ARCH_MPT:
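The build_graph hunk above swaps one builder type for another inside a per-architecture factory switch: each case constructs its graph builder with std::make_unique and hands it back through a base-class pointer. Below is a minimal, self-contained sketch of that dispatch pattern; the builder and enum names are made up for illustration and only stand in for the real llm_build_* structs.

#include <iostream>
#include <memory>

// Toy stand-ins for the real llm_graph_context hierarchy.
struct graph_builder {
    virtual ~graph_builder() = default;
    virtual const char * name() const = 0;
};

struct build_decoder   : graph_builder { const char * name() const override { return "decoder"; } };
struct build_embedding : graph_builder { const char * name() const override { return "embedding"; } };

enum class arch { decoder, gemma_embedding };

// Factory switch: pick the concrete builder per architecture, the same shape
// as the switch shown in the hunk above.
static std::unique_ptr<graph_builder> make_builder(arch a) {
    switch (a) {
        case arch::decoder:         return std::make_unique<build_decoder>();
        case arch::gemma_embedding: return std::make_unique<build_embedding>();
    }
    return nullptr; // unreachable with the enum values above
}

int main() {
    auto llm = make_builder(arch::gemma_embedding);
    std::cout << "selected builder: " << llm->name() << "\n"; // prints "selected builder: embedding"
    return 0;
}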
package/src/llama.cpp/src/llama-quant.cpp

@@ -701,6 +701,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         });
     }

+    bool is_clip_model = false;
     for (const auto * it : tensors) {
         const struct ggml_tensor * tensor = it->tensor;

@@ -714,12 +715,14 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
             qs.has_output = true;
         }
+
+        is_clip_model |= name.rfind("mm.", 0) == 0; // check the "mm." prefix
     }

     qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;

     // sanity checks for models that have attention layers
-    if (qs.n_attention_wv != 0)
+    if (qs.n_attention_wv != 0 && !is_clip_model)
     {
         const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
         // attention layers have a non-zero number of kv heads
@@ -881,6 +884,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         // do not quantize relative position bias (T5)
         quantize &= name.find("attn_rel_b.weight") == std::string::npos;

+        // do not quantize specific multimodal tensors
+        quantize &= name.find(".position_embd.") == std::string::npos;
+
         ggml_type new_type;
         void * new_data;
         size_t new_size;
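The new is_clip_model flag above hinges on a common C++ starts-with idiom: std::string::rfind(prefix, 0) only looks for a match at or before position 0, so it returns 0 exactly when the string begins with the prefix. A small standalone sketch of that check, using made-up tensor names purely for illustration:

#include <iostream>
#include <string>
#include <vector>

// rfind(prefix, 0) can only match at position 0, which makes it a cheap
// "starts with" test without needing C++20's std::string::starts_with.
static bool has_prefix(const std::string & name, const std::string & prefix) {
    return name.rfind(prefix, 0) == 0;
}

int main() {
    // Hypothetical tensor names, just to exercise the check.
    const std::vector<std::string> names = {
        "mm.input_projection.weight", // multimodal projector tensor -> matches "mm."
        "blk.0.attn_q.weight",        // regular transformer tensor  -> no match
        "token_embd.weight",
    };

    bool is_clip_model = false;
    for (const auto & name : names) {
        is_clip_model |= has_prefix(name, "mm.");
    }
    std::cout << "is_clip_model = " << std::boolalpha << is_clip_model << "\n"; // prints true
    return 0;
}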
package/src/llama.cpp/src/llama.cpp

@@ -124,6 +124,9 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
     } catch(const std::exception & e) {
         throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
     }
+    if (model.arch == LLM_ARCH_CLIP) {
+        throw std::runtime_error("CLIP cannot be used as main model, use it with --mmproj instead");
+    }
     try {
         model.load_vocab(ml);
     } catch(const std::exception & e) {
@@ -312,6 +315,7 @@ struct llama_model * llama_model_load_from_splits(
         LLAMA_LOG_ERROR("%s: list of splits is empty\n", __func__);
         return nullptr;
     }
+    splits.reserve(n_paths);
     for (size_t i = 0; i < n_paths; ++i) {
         splits.push_back(paths[i]);
     }
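The splits.reserve(n_paths) line added above is the usual std::vector pre-allocation idiom: reserving capacity once before a push_back loop avoids repeated reallocation and element moves as the vector grows. A self-contained sketch of the pattern follows; the split file names and the (paths, n_paths) pair are made up to mirror the loader's inputs.

#include <cstddef>
#include <cstdio>
#include <string>
#include <vector>

int main() {
    // Stand-ins for the (const char **, size_t) pair the split loader receives.
    const char * paths[] = {
        "model-00001-of-00003.gguf",
        "model-00002-of-00003.gguf",
        "model-00003-of-00003.gguf",
    };
    const std::size_t n_paths = sizeof(paths) / sizeof(paths[0]);

    std::vector<std::string> splits;
    splits.reserve(n_paths);        // one allocation up front instead of growth on demand
    for (std::size_t i = 0; i < n_paths; ++i) {
        splits.push_back(paths[i]); // copies each C string into the pre-reserved storage
    }

    std::printf("collected %zu split paths\n", splits.size());
    return 0;
}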