@fugood/llama.node 0.3.7 → 0.3.9
This diff shows the changes between the two publicly released package versions as they appear in their public registries; it is provided for informational purposes only.
- package/README.md +17 -2
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +8 -0
- package/lib/index.js +16 -1
- package/lib/index.ts +16 -0
- package/package.json +1 -1
- package/src/EmbeddingWorker.cpp +4 -3
- package/src/LlamaCompletionWorker.cpp +4 -2
- package/src/LlamaContext.cpp +156 -6
- package/src/LlamaContext.h +5 -0
- package/src/common.hpp +6 -11
- package/src/llama.cpp/.github/workflows/build.yml +19 -17
- package/src/llama.cpp/.github/workflows/docker.yml +77 -30
- package/src/llama.cpp/.github/workflows/editorconfig.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +22 -3
- package/src/llama.cpp/CMakeLists.txt +49 -24
- package/src/llama.cpp/common/arg.cpp +82 -26
- package/src/llama.cpp/common/arg.h +3 -0
- package/src/llama.cpp/common/common.cpp +192 -72
- package/src/llama.cpp/common/common.h +51 -18
- package/src/llama.cpp/common/ngram-cache.cpp +12 -12
- package/src/llama.cpp/common/ngram-cache.h +2 -2
- package/src/llama.cpp/common/sampling.cpp +11 -6
- package/src/llama.cpp/common/speculative.cpp +18 -15
- package/src/llama.cpp/docs/build.md +2 -0
- package/src/llama.cpp/examples/batched/batched.cpp +9 -7
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +3 -3
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +10 -8
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +11 -8
- package/src/llama.cpp/examples/cvector-generator/mean.hpp +1 -1
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +8 -7
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +7 -6
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +8 -7
- package/src/llama.cpp/examples/gguf/gguf.cpp +10 -6
- package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +1 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +8 -7
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +13 -10
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +13 -12
- package/src/llama.cpp/examples/infill/infill.cpp +23 -24
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +44 -13
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -6
- package/src/llama.cpp/examples/llava/clip.cpp +4 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +9 -6
- package/src/llama.cpp/examples/llava/llava.cpp +2 -2
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +8 -4
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +11 -8
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +6 -7
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +4 -9
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +3 -7
- package/src/llama.cpp/examples/lookup/lookup.cpp +5 -6
- package/src/llama.cpp/examples/main/main.cpp +51 -29
- package/src/llama.cpp/examples/parallel/parallel.cpp +5 -6
- package/src/llama.cpp/examples/passkey/passkey.cpp +7 -5
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +37 -23
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -14
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +8 -8
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +12 -0
- package/src/llama.cpp/examples/run/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +1351 -0
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +114 -0
- package/src/llama.cpp/examples/run/run.cpp +175 -61
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -25
- package/src/llama.cpp/examples/server/CMakeLists.txt +1 -0
- package/src/llama.cpp/examples/server/httplib.h +1295 -409
- package/src/llama.cpp/examples/server/server.cpp +387 -181
- package/src/llama.cpp/examples/server/tests/requirements.txt +1 -0
- package/src/llama.cpp/examples/server/utils.hpp +170 -58
- package/src/llama.cpp/examples/simple/simple.cpp +9 -8
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +16 -12
- package/src/llama.cpp/examples/speculative/speculative.cpp +22 -23
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +8 -12
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +17 -5
- package/src/llama.cpp/examples/tts/tts.cpp +64 -23
- package/src/llama.cpp/ggml/CMakeLists.txt +5 -21
- package/src/llama.cpp/ggml/include/ggml-backend.h +2 -0
- package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -0
- package/src/llama.cpp/ggml/include/ggml.h +36 -145
- package/src/llama.cpp/ggml/include/gguf.h +202 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +6 -3
- package/src/llama.cpp/ggml/src/ggml-alloc.c +5 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +0 -1
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +79 -49
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +5 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +33 -23
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +57 -72
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +87 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +335 -66
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +10 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1090 -378
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +2 -2
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +3 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +3 -1
- package/src/llama.cpp/ggml/src/ggml-impl.h +11 -16
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +16 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +6 -6
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +154 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +9 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +18 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +40 -95
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +48 -48
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +24 -24
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -164
- package/src/llama.cpp/ggml/src/ggml-sycl/gla.cpp +105 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gla.hpp +8 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +3 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +7 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +74 -4
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +314 -116
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -2
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +9 -3
- package/src/llama.cpp/ggml/src/ggml.c +117 -1327
- package/src/llama.cpp/ggml/src/gguf.cpp +1329 -0
- package/src/llama.cpp/include/llama-cpp.h +6 -1
- package/src/llama.cpp/include/llama.h +138 -75
- package/src/llama.cpp/src/CMakeLists.txt +13 -1
- package/src/llama.cpp/src/llama-adapter.cpp +347 -0
- package/src/llama.cpp/src/llama-adapter.h +74 -0
- package/src/llama.cpp/src/llama-arch.cpp +1487 -0
- package/src/llama.cpp/src/llama-arch.h +400 -0
- package/src/llama.cpp/src/llama-batch.cpp +368 -0
- package/src/llama.cpp/src/llama-batch.h +88 -0
- package/src/llama.cpp/src/llama-chat.cpp +578 -0
- package/src/llama.cpp/src/llama-chat.h +52 -0
- package/src/llama.cpp/src/llama-context.cpp +1775 -0
- package/src/llama.cpp/src/llama-context.h +128 -0
- package/src/llama.cpp/src/llama-cparams.cpp +1 -0
- package/src/llama.cpp/src/llama-cparams.h +37 -0
- package/src/llama.cpp/src/llama-grammar.cpp +5 -4
- package/src/llama.cpp/src/llama-grammar.h +3 -1
- package/src/llama.cpp/src/llama-hparams.cpp +71 -0
- package/src/llama.cpp/src/llama-hparams.h +139 -0
- package/src/llama.cpp/src/llama-impl.cpp +167 -0
- package/src/llama.cpp/src/llama-impl.h +16 -136
- package/src/llama.cpp/src/llama-kv-cache.cpp +718 -0
- package/src/llama.cpp/src/llama-kv-cache.h +218 -0
- package/src/llama.cpp/src/llama-mmap.cpp +589 -0
- package/src/llama.cpp/src/llama-mmap.h +67 -0
- package/src/llama.cpp/src/llama-model-loader.cpp +1124 -0
- package/src/llama.cpp/src/llama-model-loader.h +167 -0
- package/src/llama.cpp/src/llama-model.cpp +3953 -0
- package/src/llama.cpp/src/llama-model.h +370 -0
- package/src/llama.cpp/src/llama-quant.cpp +934 -0
- package/src/llama.cpp/src/llama-quant.h +1 -0
- package/src/llama.cpp/src/llama-sampling.cpp +147 -32
- package/src/llama.cpp/src/llama-sampling.h +3 -19
- package/src/llama.cpp/src/llama-vocab.cpp +1832 -575
- package/src/llama.cpp/src/llama-vocab.h +97 -142
- package/src/llama.cpp/src/llama.cpp +7160 -20314
- package/src/llama.cpp/src/unicode.cpp +8 -3
- package/src/llama.cpp/tests/CMakeLists.txt +2 -0
- package/src/llama.cpp/tests/test-autorelease.cpp +3 -3
- package/src/llama.cpp/tests/test-backend-ops.cpp +370 -59
- package/src/llama.cpp/tests/test-chat-template.cpp +162 -125
- package/src/llama.cpp/tests/test-gguf.cpp +222 -187
- package/src/llama.cpp/tests/test-model-load-cancel.cpp +1 -1
- package/src/llama.cpp/tests/test-sampling.cpp +0 -1
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +4 -4
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +9 -7
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +8 -6
Expanded diff for package/src/llama.cpp/common/common.cpp:

```diff
@@ -2,6 +2,9 @@
 #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
 #endif
 
+#include "ggml.h"
+#include "gguf.h"
+
 #include "common.h"
 #include "log.h"
 // Change JSON_ASSERT from assert() to GGML_ASSERT:
@@ -18,6 +21,7 @@
 #include <cstdarg>
 #include <cstring>
 #include <ctime>
+#include <filesystem>
 #include <fstream>
 #include <iostream>
 #include <iterator>
@@ -62,11 +66,29 @@
 #ifdef __linux__
 #include <linux/limits.h>
 #elif defined(_WIN32)
-#
+#   if !defined(PATH_MAX)
+#   define PATH_MAX MAX_PATH
+#   endif
 #else
 #include <sys/syslimits.h>
 #endif
 #define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
+
+//
+// CURL utils
+//
+
+using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
+
+// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
+struct curl_slist_ptr {
+    struct curl_slist * ptr = nullptr;
+    ~curl_slist_ptr() {
+        if (ptr) {
+            curl_slist_free_all(ptr);
+        }
+    }
+};
 #endif // LLAMA_USE_CURL
 
 using json = nlohmann::ordered_json;
```
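The new `curl_slist_ptr` wrapper above holds a raw `curl_slist *` instead of a `std::unique_ptr` because `curl_slist_append()` returns the (possibly new) list head, which has to be stored back into the same holder; the destructor then frees the whole list once. A minimal usage sketch of the two wrappers, assuming libcurl is available; the URL and header strings are illustrative, not taken from the package:

```cpp
#include <curl/curl.h>
#include <memory>

// same aliases as in the diff above
using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;

struct curl_slist_ptr {
    struct curl_slist * ptr = nullptr;
    ~curl_slist_ptr() {
        if (ptr) {
            curl_slist_free_all(ptr);
        }
    }
};

int main() {
    curl_ptr       curl(curl_easy_init(), &curl_easy_cleanup);
    curl_slist_ptr headers;

    // curl_slist_append returns the list head (a new node when starting from nullptr),
    // so the result is written back into .ptr; the list is freed exactly once at scope exit
    headers.ptr = curl_slist_append(headers.ptr, "Accept: application/json");
    headers.ptr = curl_slist_append(headers.ptr, "User-Agent: llama-cpp");

    curl_easy_setopt(curl.get(), CURLOPT_URL, "https://example.com/");
    curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, headers.ptr);

    return curl_easy_perform(curl.get()) == CURLE_OK ? 0 : 1;
}
```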
```diff
@@ -843,7 +865,7 @@ struct common_init_result common_init_from_params(common_params & params) {
     } else if (!params.model_url.empty()) {
         model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
     } else {
-        model =
+        model = llama_model_load_from_file(params.model.c_str(), mparams);
     }
 
     if (model == NULL) {
@@ -851,26 +873,28 @@ struct common_init_result common_init_from_params(common_params & params) {
         return iparams;
     }
 
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
     if (params.reranking) {
         bool ok = true;
 
-        if (
-            LOG_WRN("%s: warning:
+        if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
             ok = false;
         }
 
-        if (
-            LOG_WRN("%s: warning:
+        if (llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have an EOS token, reranking will not work\n", __func__);
             ok = false;
         }
 
-        if (
-            LOG_WRN("%s: warning:
+        if (llama_vocab_sep(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
             ok = false;
         }
 
         if (!ok) {
-
+            llama_model_free(model);
 
             return iparams;
         }
@@ -878,40 +902,40 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     auto cparams = common_context_params_to_llama(params);
 
-    llama_context * lctx =
+    llama_context * lctx = llama_init_from_model(model, cparams);
     if (lctx == NULL) {
         LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
-
+        llama_model_free(model);
         return iparams;
     }
 
     if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
-
-
-        return iparams;
+        LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__);
+        params.ctx_shift = false;
     }
 
     if (!params.control_vectors.empty()) {
         if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
-        if (params.control_vector_layer_end <= 0) params.control_vector_layer_end =
+        if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_model_n_layer(model);
 
         const auto cvec = common_control_vector_load(params.control_vectors);
         if (cvec.n_embd == -1) {
             llama_free(lctx);
-
+            llama_model_free(model);
 
             return iparams;
         }
 
-        int err =
-
-
-
-
-
+        int err = llama_apply_adapter_cvec(
+                lctx,
+                cvec.data.data(),
+                cvec.data.size(),
+                cvec.n_embd,
+                params.control_vector_layer_start,
+                params.control_vector_layer_end);
         if (err) {
             llama_free(lctx);
-
+            llama_model_free(model);
 
             return iparams;
         }
@@ -919,30 +943,31 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     // load and optionally apply lora adapters
     for (auto & la : params.lora_adapters) {
-
-
-
-        loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
-        if (loaded_la.adapter == nullptr) {
+        llama_adapter_lora_ptr lora;
+        lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
+        if (lora == nullptr) {
             LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
             llama_free(lctx);
-
+            llama_model_free(model);
             return iparams;
         }
-
+
+        la.ptr = lora.get();
+        iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
     }
+
     if (!params.lora_init_without_apply) {
-
+        common_set_adapter_lora(lctx, params.lora_adapters);
     }
 
-    if (params.sampling.ignore_eos &&
-        LOG_WRN("%s: warning:
+    if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
+        LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
         params.sampling.ignore_eos = false;
     }
 
     if (params.sampling.ignore_eos) {
-        for (llama_token i = 0; i <
-            if (
+        for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
+            if (llama_vocab_is_eog(vocab, i)) {
                 LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
                 params.sampling.logit_bias.push_back({i, -INFINITY});
             }
```
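The hunks above move `common_init_from_params()` to the vocab-centric API: the vocab handle is fetched once with `llama_model_get_vocab()` and then queried through `llama_vocab_bos()`, `llama_vocab_eos()`, `llama_vocab_n_tokens()` and `llama_vocab_is_eog()`, while model teardown goes through `llama_model_free()`. A minimal sketch of that pattern against the bundled `llama.h`; the model path is illustrative:

```cpp
#include "llama.h"

#include <cstdio>

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("model.gguf", mparams); // illustrative path
    if (model == NULL) {
        return 1;
    }

    // one vocab handle, queried instead of the model itself
    const llama_vocab * vocab = llama_model_get_vocab(model);
    printf("bos: %d, eos: %d, n_tokens: %d\n",
           llama_vocab_bos(vocab), llama_vocab_eos(vocab), llama_vocab_n_tokens(vocab));

    // e.g. collect all end-of-generation tokens, as common_init_from_params() does for --ignore-eos
    for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
        if (llama_vocab_is_eog(vocab, i)) {
            printf("eog token: %d\n", i);
        }
    }

    llama_model_free(model);
    llama_backend_free();
    return 0;
}
```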
```diff
@@ -963,8 +988,9 @@ struct common_init_result common_init_from_params(common_params & params) {
         LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
 
         std::vector<llama_token> tmp;
-        llama_token bos =
-        llama_token eos =
+        llama_token bos = llama_vocab_bos(vocab);
+        llama_token eos = llama_vocab_eos(vocab);
+
         // some models (e.g. T5) don't have a BOS token
         if (bos != LLAMA_TOKEN_NULL) {
             tmp.push_back(bos);
@@ -979,7 +1005,7 @@ struct common_init_result common_init_from_params(common_params & params) {
         if (llama_model_has_encoder(model)) {
             llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
             llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
-            if (decoder_start_token_id ==
+            if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
                 decoder_start_token_id = bos;
             }
             tmp.clear();
@@ -993,17 +1019,17 @@ struct common_init_result common_init_from_params(common_params & params) {
         llama_perf_context_reset(lctx);
     }
 
-    iparams.model
-    iparams.context
+    iparams.model.reset(model);
+    iparams.context.reset(lctx);
 
     return iparams;
 }
 
-void
-
-    for (auto & la :
+void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
+    llama_clear_adapter_lora(ctx);
+    for (auto & la : lora) {
         if (la.scale != 0.0f) {
-
+            llama_set_adapter_lora(ctx, la.ptr, la.scale);
         }
     }
 }
@@ -1017,7 +1043,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     if (params.n_gpu_layers != -1) {
         mparams.n_gpu_layers = params.n_gpu_layers;
     }
-    mparams.rpc_servers = params.rpc_servers.c_str();
     mparams.main_gpu = params.main_gpu;
     mparams.split_mode = params.split_mode;
     mparams.tensor_split = params.tensor_split;
@@ -1120,7 +1145,8 @@ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int ma
 
 static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
     // Initialize libcurl
-
+    curl_ptr       curl(curl_easy_init(), &curl_easy_cleanup);
+    curl_slist_ptr http_headers;
     if (!curl) {
         LOG_ERR("%s: error initializing libcurl\n", __func__);
         return false;
@@ -1134,11 +1160,9 @@ static bool common_download_file(const std::string & url, const std::string & pa
 
     // Check if hf-token or bearer-token was specified
    if (!hf_token.empty()) {
-
-
-
-        http_headers = curl_slist_append(http_headers, auth_header.c_str());
-        curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers);
+        std::string auth_header = "Authorization: Bearer " + hf_token;
+        http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
+        curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
     }
 
 #if defined(_WIN32)
@@ -1148,8 +1172,7 @@ static bool common_download_file(const std::string & url, const std::string & pa
 #endif
 
     // Check if the file already exists locally
-
-    auto file_exists = (stat(path.c_str(), &model_file_info) == 0);
+    auto file_exists = std::filesystem::exists(path);
 
     // If the file exists, check its JSON metadata companion file.
     std::string metadata_path = path + ".json";
@@ -1409,7 +1432,7 @@ struct llama_model * common_load_model_from_url(
         }
     }
 
-    return
+    return llama_model_load_from_file(local_path.c_str(), params);
 }
 
 struct llama_model * common_load_model_from_hf(
```
```diff
@@ -1435,6 +1458,80 @@ struct llama_model * common_load_model_from_hf(
     return common_load_model_from_url(model_url, local_path, hf_token, params);
 }
 
+/**
+ * Allow getting the HF file from the HF repo with tag (like ollama), for example:
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
+ * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
+ *
+ * Return pair of <repo, file> (with "repo" already having tag removed)
+ *
+ * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
+ */
+std::pair<std::string, std::string> common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) {
+    auto parts = string_split<std::string>(hf_repo_with_tag, ':');
+    std::string tag = parts.size() > 1 ? parts.back() : "latest";
+    std::string hf_repo = parts[0];
+    if (string_split<std::string>(hf_repo, '/').size() != 2) {
+        throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
+    }
+
+    // fetch model info from Hugging Face Hub API
+    json model_info;
+    curl_ptr       curl(curl_easy_init(), &curl_easy_cleanup);
+    curl_slist_ptr http_headers;
+    std::string res_str;
+    std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag;
+    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
+    curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
+    typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
+    auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
+        static_cast<std::string *>(data)->append((char * ) ptr, size * nmemb);
+        return size * nmemb;
+    };
+    curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
+    curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str);
+#if defined(_WIN32)
+    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
+#endif
+    if (!hf_token.empty()) {
+        std::string auth_header = "Authorization: Bearer " + hf_token;
+        http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
+    }
+    // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
+    http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
+    http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json");
+    curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
+
+    CURLcode res = curl_easy_perform(curl.get());
+
+    if (res != CURLE_OK) {
+        throw std::runtime_error("error: cannot make GET request to HF API");
+    }
+
+    long res_code;
+    curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
+    if (res_code == 200) {
+        model_info = json::parse(res_str);
+    } else if (res_code == 401) {
+        throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
+    } else {
+        throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
+    }
+
+    // check response
+    if (!model_info.contains("ggufFile")) {
+        throw std::runtime_error("error: model does not have ggufFile");
+    }
+    json & gguf_file = model_info.at("ggufFile");
+    if (!gguf_file.contains("rfilename")) {
+        throw std::runtime_error("error: ggufFile does not have rfilename");
+    }
+
+    return std::make_pair(hf_repo, gguf_file.at("rfilename"));
+}
+
 #else
 
 struct llama_model * common_load_model_from_url(
```
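The comment block above documents the `<user>/<model>[:quant]` shorthand that `common_get_hf_file()` resolves through the Ollama-compatible Hugging Face manifest endpoint. A hedged usage sketch, assuming a build with `LLAMA_USE_CURL` and that the package's `common.h` exposes a matching declaration for this helper; the repo name is the one from the comment above:

```cpp
#include "common.h"

#include <cstdio>
#include <exception>
#include <string>
#include <utility>

int main() {
    try {
        // "<user>/<model>[:quant]"; the tag defaults to "latest" when omitted
        std::pair<std::string, std::string> repo_file =
            common_get_hf_file("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", /* hf_token = */ "");
        printf("repo: %s\nfile: %s\n", repo_file.first.c_str(), repo_file.second.c_str());
    } catch (const std::exception & e) {
        // thrown for a malformed repo string or an HF API error (see the hunk above)
        fprintf(stderr, "%s\n", e.what());
        return 1;
    }
    return 0;
}
```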
```diff
@@ -1456,6 +1553,11 @@ struct llama_model * common_load_model_from_hf(
     return nullptr;
 }
 
+std::pair<std::string, std::string> common_get_hf_file(const std::string &, const std::string &) {
+    LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
+    return std::make_pair("", "");
+}
+
 #endif // LLAMA_USE_CURL
 
 //
@@ -1554,21 +1656,23 @@ std::vector<llama_token> common_tokenize(
         const std::string & text,
         bool add_special,
         bool parse_special) {
-
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    return common_tokenize(vocab, text, add_special, parse_special);
 }
 
 std::vector<llama_token> common_tokenize(
-        const struct
+        const struct llama_vocab * vocab,
         const std::string & text,
         bool add_special,
         bool parse_special) {
     // upper limit for the number of tokens
     int n_tokens = text.length() + 2 * add_special;
     std::vector<llama_token> result(n_tokens);
-    n_tokens = llama_tokenize(
+    n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_tokenize(
+        int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
@@ -1577,12 +1681,18 @@ std::vector<llama_token> common_tokenize(
 }
 
 std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    return common_token_to_piece(vocab, token, special);
+}
+
+std::string common_token_to_piece(const struct llama_vocab * vocab, llama_token token, bool special) {
     std::string piece;
     piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
-    const int n_chars = llama_token_to_piece(
+    const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
     if (n_chars < 0) {
         piece.resize(-n_chars);
-        int check = llama_token_to_piece(
+        int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
         GGML_ASSERT(check == -n_chars);
     }
     else {
@@ -1592,13 +1702,19 @@ std::string common_token_to_piece(const struct llama_context * ctx, llama_token
     return piece;
 }
 
-std::string common_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+std::string common_detokenize(const struct llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    return common_detokenize(vocab, tokens, special);
+}
+
+std::string common_detokenize(const struct llama_vocab * vocab, const std::vector<llama_token> & tokens, bool special) {
     std::string text;
     text.resize(std::max(text.capacity(), tokens.size()));
-    int32_t n_chars = llama_detokenize(
+    int32_t n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
     if (n_chars < 0) {
         text.resize(-n_chars);
-        n_chars = llama_detokenize(
+        n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
         GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
     }
 
```
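The tokenizer helpers above gain `llama_vocab`-based overloads, with the `llama_context` versions now just looking up the vocab and forwarding. A small round-trip sketch using the new overloads; the model path is illustrative and the same bundled `common.h`/`llama.h` headers are assumed:

```cpp
#include "common.h"
#include "llama.h"

#include <cstdio>
#include <string>
#include <vector>

int main() {
    llama_backend_init();

    llama_model * model = llama_model_load_from_file("model.gguf", llama_model_default_params()); // illustrative path
    if (model == NULL) {
        return 1;
    }

    // tokenize / detokenize straight against the vocab; no llama_context is required
    const llama_vocab * vocab = llama_model_get_vocab(model);
    std::vector<llama_token> toks = common_tokenize(vocab, "Hello world", /* add_special = */ true, /* parse_special = */ false);
    std::string round_trip = common_detokenize(vocab, toks, /* special = */ false);
    printf("%zu tokens, round trip: '%s'\n", toks.size(), round_trip.c_str());

    llama_model_free(model);
    llama_backend_free();
    return 0;
}
```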
```diff
@@ -1612,9 +1728,14 @@ std::string common_detokenize(llama_context * ctx, const std::vector<llama_token
 // Chat template utils
 //
 
+std::string common_get_builtin_chat_template(const struct llama_model * model) {
+    const char * ptr_tmpl = llama_model_chat_template(model);
+    return ptr_tmpl == nullptr ? "" : ptr_tmpl;
+}
+
 bool common_chat_verify_template(const std::string & tmpl) {
     llama_chat_message chat[] = {{"user", "test"}};
-    int res = llama_chat_apply_template(
+    const int res = llama_chat_apply_template(tmpl.c_str(), chat, 1, true, nullptr, 0);
     return res >= 0;
 }
 
@@ -1625,16 +1746,16 @@ std::string common_chat_apply_template(const struct llama_model * model,
     int alloc_size = 0;
     bool fallback = false; // indicate if we must fallback to default chatml
     std::vector<llama_chat_message> chat;
-    for (auto & msg : msgs) {
+    for (const auto & msg : msgs) {
         chat.push_back({msg.role.c_str(), msg.content.c_str()});
         alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
     }
 
-    const char * ptr_tmpl = tmpl.empty() ?
+    const char * ptr_tmpl = tmpl.empty() ? llama_model_chat_template(model) : tmpl.c_str();
     std::vector<char> buf(alloc_size);
 
     // run the first time to get the total output length
-    int32_t res = llama_chat_apply_template(
+    int32_t res = llama_chat_apply_template(ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
 
     // error: chat template is not supported
     if (res < 0) {
@@ -1642,18 +1763,17 @@ std::string common_chat_apply_template(const struct llama_model * model,
             // if the custom "tmpl" is not supported, we throw an error
             // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
             throw std::runtime_error("this custom template is not supported");
-        } else {
-            // If the built-in template is not supported, we default to chatml
-            res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
-            fallback = true;
         }
+
+        // If the built-in template is not supported, we default to chatml
+        res = llama_chat_apply_template("chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+        fallback = true;
     }
 
     // if it turns out that our buffer is too small, we resize it
     if ((size_t) res > buf.size()) {
         buf.resize(res);
         res = llama_chat_apply_template(
-            fallback ? nullptr : model,
             fallback ? "chatml" : ptr_tmpl,
             chat.data(), chat.size(), add_ass, buf.data(), buf.size());
     }
```
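The final two hunks drop the model pointer from `llama_chat_apply_template()`: the template string (built-in or custom) is passed directly, and on failure the code retries with the literal `"chatml"` name rather than `nullptr` plus the model. A condensed sketch of that retry shape, assuming the updated `llama.h` signature shown above; unlike `common_chat_apply_template()`, this sketch does not throw for unsupported custom templates, it always falls back:

```cpp
#include "llama.h"

#include <cstdint>
#include <string>
#include <vector>

// apply a chat template by name/string, falling back to "chatml" as the diff above does
std::string apply_with_chatml_fallback(const char * tmpl,
                                       const std::vector<llama_chat_message> & chat,
                                       bool add_ass) {
    std::vector<char> buf(1024);

    // first pass: may fail if the template is not supported
    int32_t res = llama_chat_apply_template(tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());

    bool fallback = false;
    if (res < 0) {
        // no model pointer anymore: the fallback is requested by template name
        res = llama_chat_apply_template("chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
        fallback = true;
    }

    // second pass with a larger buffer if the formatted output did not fit
    if (res > (int32_t) buf.size()) {
        buf.resize(res);
        res = llama_chat_apply_template(fallback ? "chatml" : tmpl,
                                        chat.data(), chat.size(), add_ass, buf.data(), buf.size());
    }

    return res < 0 ? std::string() : std::string(buf.data(), res);
}
```

Callers would populate `chat` from `{role, content}` pairs exactly as `common_chat_apply_template()` does in the hunk above.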