@fugood/llama.node 0.3.3 → 0.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +5 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +29 -1
- package/package.json +1 -1
- package/src/EmbeddingWorker.cpp +15 -5
- package/src/EmbeddingWorker.h +2 -1
- package/src/LlamaCompletionWorker.cpp +17 -1
- package/src/LlamaContext.cpp +86 -18
- package/src/LlamaContext.h +2 -0
- package/src/llama.cpp/.github/workflows/build.yml +197 -159
- package/src/llama.cpp/.github/workflows/docker.yml +5 -8
- package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
- package/src/llama.cpp/.github/workflows/server.yml +21 -14
- package/src/llama.cpp/CMakeLists.txt +11 -6
- package/src/llama.cpp/Sources/llama/llama.h +4 -0
- package/src/llama.cpp/cmake/common.cmake +33 -0
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
- package/src/llama.cpp/common/CMakeLists.txt +6 -2
- package/src/llama.cpp/common/arg.cpp +426 -245
- package/src/llama.cpp/common/common.cpp +143 -80
- package/src/llama.cpp/common/common.h +81 -24
- package/src/llama.cpp/common/sampling.cpp +53 -19
- package/src/llama.cpp/common/sampling.h +22 -1
- package/src/llama.cpp/common/speculative.cpp +274 -0
- package/src/llama.cpp/common/speculative.h +28 -0
- package/src/llama.cpp/docs/build.md +101 -148
- package/src/llama.cpp/examples/CMakeLists.txt +32 -13
- package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +5 -4
- package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
- package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
- package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
- package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
- package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
- package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +11 -2
- package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/infill/infill.cpp +1 -1
- package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +405 -316
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
- package/src/llama.cpp/examples/llava/clip.cpp +262 -66
- package/src/llama.cpp/examples/llava/clip.h +8 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +1 -1
- package/src/llama.cpp/examples/llava/llava.cpp +46 -19
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +1 -1
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
- package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
- package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -1
- package/src/llama.cpp/examples/lookup/lookup.cpp +2 -2
- package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/main/main.cpp +9 -5
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
- package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +4 -4
- package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/run/run.cpp +911 -0
- package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -4
- package/src/llama.cpp/examples/server/CMakeLists.txt +3 -7
- package/src/llama.cpp/examples/server/server.cpp +1758 -886
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
- package/src/llama.cpp/examples/server/utils.hpp +94 -304
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +4 -0
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +3 -0
- package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +16 -15
- package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
- package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +1 -1
- package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/tts/tts.cpp +932 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +46 -34
- package/src/llama.cpp/ggml/include/ggml-backend.h +16 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +7 -49
- package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
- package/src/llama.cpp/ggml/include/ggml.h +106 -24
- package/src/llama.cpp/ggml/src/CMakeLists.txt +73 -24
- package/src/llama.cpp/ggml/src/ggml-alloc.c +0 -1
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +51 -11
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +379 -22
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +3 -7
- package/src/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +5 -2
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +33 -3
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +95 -35
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +288 -213
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
- package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/common.h +19 -22
- package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/mmq.cpp +93 -92
- package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/mmq.h +2 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.c → ggml-cpu-aarch64.cpp} +892 -190
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +2 -24
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +15 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +38 -25
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +552 -399
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +101 -136
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +7 -10
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -6
- package/src/llama.cpp/ggml/src/ggml-impl.h +32 -11
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +13 -9
- package/src/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +131 -64
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +3 -6
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +39 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +14 -7
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +67 -80
- package/src/llama.cpp/ggml/src/ggml-quants.c +0 -9
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +3 -5
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +5 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +13 -10
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +2 -11
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +32 -13
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +80 -61
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +159 -114
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +6 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +4 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +8 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +4 -1
- package/src/llama.cpp/ggml/src/ggml-threading.h +4 -2
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +21 -7
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1718 -399
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +3 -1
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +105 -31
- package/src/llama.cpp/ggml/src/ggml.c +367 -207
- package/src/llama.cpp/include/llama-cpp.h +25 -0
- package/src/llama.cpp/include/llama.h +26 -19
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
- package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
- package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
- package/src/llama.cpp/src/CMakeLists.txt +2 -7
- package/src/llama.cpp/src/llama-grammar.cpp +15 -15
- package/src/llama.cpp/src/llama-grammar.h +2 -5
- package/src/llama.cpp/src/llama-sampling.cpp +35 -90
- package/src/llama.cpp/src/llama-vocab.cpp +6 -1
- package/src/llama.cpp/src/llama.cpp +1748 -640
- package/src/llama.cpp/src/unicode.cpp +62 -51
- package/src/llama.cpp/src/unicode.h +9 -10
- package/src/llama.cpp/tests/CMakeLists.txt +48 -37
- package/src/llama.cpp/tests/test-arg-parser.cpp +2 -2
- package/src/llama.cpp/tests/test-backend-ops.cpp +140 -21
- package/src/llama.cpp/tests/test-chat-template.cpp +50 -4
- package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
- package/src/llama.cpp/tests/test-quantize-fns.cpp +3 -3
- package/src/llama.cpp/tests/test-rope.cpp +61 -20
- package/src/llama.cpp/tests/test-sampling.cpp +2 -2
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
- package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
- package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
- package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
- package/src/llama.cpp/ggml/include/ggml-amx.h +0 -25
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +0 -129
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -19
- package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +0 -107
- package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
package/src/llama.cpp/common/common.cpp

@@ -536,12 +536,12 @@ std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch)
                 [](const unsigned char c) { return !std::isprint(c); }),
             detokenized.end());

-        buf << "\n"
-            << "
-            << "
-            << "
-            << "
-            << "
+        buf << "\n" << std::to_string(i)
+            << ", token '" << detokenized << "'"
+            << ", pos " << std::to_string(batch.pos[i])
+            << ", n_seq_id " << std::to_string(batch.n_seq_id[i])
+            << ", seq_id " << std::to_string(batch.seq_id[i][0])
+            << ", logits " << std::to_string(batch.logits[i]);
     }

     buf << " ]";
@@ -652,7 +652,17 @@ bool fs_validate_filename(const std::string & filename) {

     std::u32string filename_utf32;
     try {
+#if defined(__clang__)
+        // disable C++17 deprecation warning for std::codecvt_utf8
+# pragma clang diagnostic push
+# pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#endif
         std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
+
+#if defined(__clang__)
+# pragma clang diagnostic pop
+#endif
+
         filename_utf32 = converter.from_bytes(filename);

         // If the reverse conversion mismatches, it means overlong UTF-8 sequences were used,
@@ -829,9 +839,9 @@ struct common_init_result common_init_from_params(common_params & params) {
     llama_model * model = nullptr;

     if (!params.hf_repo.empty() && !params.hf_file.empty()) {
-        model = common_load_model_from_hf(params.hf_repo
+        model = common_load_model_from_hf(params.hf_repo, params.hf_file, params.model, params.hf_token, mparams);
     } else if (!params.model_url.empty()) {
-        model = common_load_model_from_url(params.model_url
+        model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
     } else {
         model = llama_load_model_from_file(params.model.c_str(), mparams);
     }
@@ -875,6 +885,12 @@ struct common_init_result common_init_from_params(common_params & params) {
         return iparams;
     }

+    if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
+        LOG_ERR("%s: KV cache shifting is not supported for this model (--no-context-shift to disable)'\n", __func__);
+        llama_free_model(model);
+        return iparams;
+    }
+
     if (!params.control_vectors.empty()) {
         if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
         if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
@@ -919,9 +935,28 @@ struct common_init_result common_init_from_params(common_params & params) {
         common_lora_adapters_apply(lctx, iparams.lora_adapters);
     }

-    if (params.
+    if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
         LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
-        params.
+        params.sampling.ignore_eos = false;
+    }
+
+    if (params.sampling.ignore_eos) {
+        for (llama_token i = 0; i < llama_n_vocab(model); i++) {
+            if (llama_token_is_eog(model, i)) {
+                LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
+                params.sampling.logit_bias.push_back({i, -INFINITY});
+            }
+        }
+    }
+
+    if (params.sampling.penalty_last_n == -1) {
+        LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+        params.sampling.penalty_last_n = llama_n_ctx(lctx);
+    }
+
+    if (params.sampling.dry_penalty_last_n == -1) {
+        LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+        params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
     }

     if (params.warmup) {
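Aside (not part of the package diff): the new `ignore_eos` path above works by pushing `{token, -INFINITY}` entries into `params.sampling.logit_bias` for every end-of-generation token. A minimal standalone sketch of that mechanism, using a made-up logits array and bias list rather than llama.cpp's sampler code:

```cpp
#include <cstdio>
#include <limits>
#include <utility>
#include <vector>

int main() {
    // one logit per vocab token (illustrative values only)
    std::vector<float> logits = {1.2f, 0.3f, 2.5f, 0.9f};

    // hypothetical stand-in for the logit-bias list: pairs of (token id, bias)
    std::vector<std::pair<int, float>> bias = {{2, -std::numeric_limits<float>::infinity()}};

    for (const auto & b : bias) {
        logits[b.first] += b.second;  // adding -inf drives the logit to -inf, so the token can never be picked
    }
    for (size_t i = 0; i < logits.size(); i++) {
        std::printf("token %zu -> logit %f\n", i, logits[i]);
    }
    return 0;
}
```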
@@ -973,9 +1008,12 @@ void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters)
     }
 }

-struct llama_model_params common_model_params_to_llama(
+struct llama_model_params common_model_params_to_llama(common_params & params) {
     auto mparams = llama_model_default_params();

+    if (!params.devices.empty()) {
+        mparams.devices = params.devices.data();
+    }
     if (params.n_gpu_layers != -1) {
         mparams.n_gpu_layers = params.n_gpu_layers;
     }
@@ -996,38 +1034,6 @@ struct llama_model_params common_model_params_to_llama(const common_params & params)
     return mparams;
 }

-static ggml_type kv_cache_type_from_str(const std::string & s) {
-    if (s == "f32") {
-        return GGML_TYPE_F32;
-    }
-    if (s == "f16") {
-        return GGML_TYPE_F16;
-    }
-    if (s == "bf16") {
-        return GGML_TYPE_BF16;
-    }
-    if (s == "q8_0") {
-        return GGML_TYPE_Q8_0;
-    }
-    if (s == "q4_0") {
-        return GGML_TYPE_Q4_0;
-    }
-    if (s == "q4_1") {
-        return GGML_TYPE_Q4_1;
-    }
-    if (s == "iq4_nl") {
-        return GGML_TYPE_IQ4_NL;
-    }
-    if (s == "q5_0") {
-        return GGML_TYPE_Q5_0;
-    }
-    if (s == "q5_1") {
-        return GGML_TYPE_Q5_1;
-    }
-
-    throw std::runtime_error("Unsupported cache type: " + s);
-}
-
 struct llama_context_params common_context_params_to_llama(const common_params & params) {
     auto cparams = llama_context_default_params();

@@ -1062,8 +1068,8 @@ struct llama_context_params common_context_params_to_llama(const common_params & params)
         cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
     }

-    cparams.type_k =
-    cparams.type_v =
+    cparams.type_k = params.cache_type_k;
+    cparams.type_v = params.cache_type_v;

     return cparams;
 }
@@ -1089,13 +1095,7 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params)
 #define CURL_MAX_RETRY 3
 #define CURL_RETRY_DELAY_SECONDS 2

-
-static bool starts_with(const std::string & str, const std::string & prefix) {
-    // While we wait for C++20's std::string::starts_with...
-    return str.rfind(prefix, 0) == 0;
-}
-
-static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
+static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
     int remaining_attempts = max_attempts;

     while (remaining_attempts > 0) {
@@ -1119,7 +1119,6 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds)
 }

 static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
-
     // Initialize libcurl
     std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
     if (!curl) {
@@ -1192,11 +1191,13 @@ static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token)
         std::string etag;
         std::string last_modified;
     };
+
     common_load_model_from_url_headers headers;
+
     {
         typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
         auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
-            common_load_model_from_url_headers *headers = (common_load_model_from_url_headers *) userdata;
+            common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;

             static std::regex header_regex("([^:]+): (.*)\r\n");
             static std::regex etag_regex("ETag", std::regex_constants::icase);
@@ -1333,17 +1334,17 @@ static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token)
 }

 struct llama_model * common_load_model_from_url(
-    const
-    const
-    const
+    const std::string & model_url,
+    const std::string & local_path,
+    const std::string & hf_token,
     const struct llama_model_params & params) {
     // Basic validation of the model_url
-    if (
+    if (model_url.empty()) {
         LOG_ERR("%s: invalid model_url\n", __func__);
         return NULL;
     }

-    if (!common_download_file(model_url,
+    if (!common_download_file(model_url, local_path, hf_token)) {
         return NULL;
     }

@@ -1354,9 +1355,9 @@ struct llama_model * common_load_model_from_url(
         /*.no_alloc = */ true,
         /*.ctx      = */ NULL,
     };
-    auto * ctx_gguf = gguf_init_from_file(
+    auto * ctx_gguf = gguf_init_from_file(local_path.c_str(), gguf_params);
     if (!ctx_gguf) {
-        LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__,
+        LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, local_path.c_str());
         return NULL;
     }

@@ -1375,13 +1376,13 @@ struct llama_model * common_load_model_from_url(
     // Verify the first split file format
     // and extract split URL and PATH prefixes
     {
-        if (!llama_split_prefix(split_prefix, sizeof(split_prefix),
-            LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__,
+        if (!llama_split_prefix(split_prefix, sizeof(split_prefix), local_path.c_str(), 0, n_split)) {
+            LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, local_path.c_str(), n_split);
             return NULL;
         }

-        if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
-            LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url, n_split);
+        if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url.c_str(), 0, n_split)) {
+            LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url.c_str(), n_split);
             return NULL;
         }
     }
@@ -1408,14 +1409,14 @@ struct llama_model * common_load_model_from_url(
         }
     }

-    return llama_load_model_from_file(
+    return llama_load_model_from_file(local_path.c_str(), params);
 }

 struct llama_model * common_load_model_from_hf(
-    const
-    const
-    const
-    const
+    const std::string & repo,
+    const std::string & remote_path,
+    const std::string & local_path,
+    const std::string & hf_token,
     const struct llama_model_params & params) {
     // construct hugging face model url:
     //
@@ -1429,27 +1430,27 @@ struct llama_model * common_load_model_from_hf(
     std::string model_url = "https://huggingface.co/";
     model_url += repo;
     model_url += "/resolve/main/";
-    model_url +=
+    model_url += remote_path;

-    return common_load_model_from_url(model_url
+    return common_load_model_from_url(model_url, local_path, hf_token, params);
 }

 #else

 struct llama_model * common_load_model_from_url(
-    const
-    const
-    const
+    const std::string & /*model_url*/,
+    const std::string & /*local_path*/,
+    const std::string & /*hf_token*/,
     const struct llama_model_params & /*params*/) {
     LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
     return nullptr;
 }

 struct llama_model * common_load_model_from_hf(
-    const
-    const
-    const
-    const
+    const std::string & /*repo*/,
+    const std::string & /*remote_path*/,
+    const std::string & /*local_path*/,
+    const std::string & /*hf_token*/,
     const struct llama_model_params & /*params*/) {
     LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
     return nullptr;
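Aside (not part of the package diff): `common_load_model_from_hf` and `common_load_model_from_url` now take `std::string` references and an explicit `hf_token`. A hedged usage sketch against the signatures shown above; the repo, file, and cache path below are placeholder values, and the includes assume the llama.cpp `common` and `llama` headers from this package:

```cpp
#include <string>

#include "common.h"  // declares common_load_model_from_hf() as shown in this diff
#include "llama.h"

int main() {
    llama_model_params mparams = llama_model_default_params();

    // all four string arguments are illustrative placeholders only
    llama_model * model = common_load_model_from_hf(
        "example-org/example-model-GGUF",   // repo
        "example-model-Q4_K_M.gguf",        // remote_path (file inside the repo)
        "/tmp/example-model-Q4_K_M.gguf",   // local_path used to cache the download
        "",                                 // hf_token (empty = anonymous)
        mparams);

    if (model == nullptr) {
        return 1;  // download or load failed (or llama.cpp was built without libcurl)
    }
    llama_free_model(model);
    return 0;
}
```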
@@ -1484,6 +1485,66 @@ void common_batch_add(
     batch.n_tokens++;
 }

+//
+// Token utils
+//
+
+size_t common_lcp(const llama_tokens & a, const llama_tokens & b) {
+    size_t i;
+    for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
+
+    return i;
+}
+
+size_t common_lcs(const llama_tokens & a, const llama_tokens & b) {
+    // check for empty sequences
+    if (a.empty() || b.empty()) {
+        return 0;
+    }
+
+    // get the lengths of the input sequences
+    size_t a_len = a.size();
+    size_t b_len = b.size();
+
+    // initialize the maximum length of the longest common subsequence (LCS)
+    size_t max_length = 0;
+
+    // use two rows instead of a 2D matrix to optimize space
+    std::vector<size_t> prev_row(b_len + 1, 0);
+    std::vector<size_t> curr_row(b_len + 1, 0);
+
+    // iterate through the elements of a
+    for (size_t i = 1; i <= a_len; i++) {
+        // iterate through the elements of b
+        for (size_t j = 1; j <= b_len; j++) {
+            // if elements at the current positions match
+            if (a[i - 1] == b[j - 1]) {
+                // if it's the first element of either sequences, set LCS length to 1
+                if (i == 1 || j == 1) {
+                    curr_row[j] = 1;
+                } else {
+                    // increment LCS length by 1 compared to the previous element
+                    curr_row[j] = prev_row[j - 1] + 1;
+                }
+
+                // update max_length if necessary
+                if (curr_row[j] > max_length) {
+                    max_length = curr_row[j];
+                }
+            } else {
+                // reset LCS length if elements don't match
+                curr_row[j] = 0;
+            }
+        }
+
+        // update the previous row for the next iteration
+        prev_row = curr_row;
+    }
+
+    // return the maximum length of the LCS
+    return max_length;
+}
+
 //
 // Vocab utils
 //
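Aside (not part of the package diff): a small sketch of calling the two new token helpers, assuming the declarations added to common.h later in this diff (`llama_tokens` is `std::vector<llama_token>`); the token ids are arbitrary. As implemented above, `common_lcs` returns the length of the longest common contiguous run, since the counter resets on any mismatch:

```cpp
#include <cstdio>

#include "common.h"  // common_lcp(), common_lcs(), llama_tokens

int main() {
    const llama_tokens a = {9, 4, 5, 6, 1};
    const llama_tokens b = {1, 4, 5, 6, 2};

    std::printf("lcp = %zu\n", common_lcp(a, b));  // 0 -- the first tokens already differ
    std::printf("lcs = %zu\n", common_lcs(a, b));  // 3 -- shared contiguous run {4, 5, 6}
    return 0;
}
```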
@@ -1720,7 +1781,9 @@ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm)
             break;
         case 0: // max absolute
             for (int i = 0; i < n; i++) {
-                if (sum < std::abs(inp[i]))
+                if (sum < std::abs(inp[i])) {
+                    sum = std::abs(inp[i]);
+                }
             }
             sum /= 32760.0; // make an int16 range
             break;
package/src/llama.cpp/common/common.h

@@ -33,11 +33,13 @@ struct common_lora_adapter_container : common_lora_adapter_info {
     struct llama_lora_adapter * adapter;
 };

+using llama_tokens = std::vector<llama_token>;
+
 // build info
 extern int LLAMA_BUILD_NUMBER;
-extern char
-extern char
-extern char
+extern const char * LLAMA_COMMIT;
+extern const char * LLAMA_COMPILER;
+extern const char * LLAMA_BUILD_TARGET;

 struct common_control_vector_load_info;

@@ -78,6 +80,7 @@ enum llama_example {
     LLAMA_EXAMPLE_LLAVA,
     LLAMA_EXAMPLE_LOOKUP,
     LLAMA_EXAMPLE_PARALLEL,
+    LLAMA_EXAMPLE_TTS,

     LLAMA_EXAMPLE_COUNT,
 };
@@ -93,6 +96,7 @@ enum common_sampler_type {
     COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
     COMMON_SAMPLER_TYPE_XTC = 8,
     COMMON_SAMPLER_TYPE_INFILL = 9,
+    COMMON_SAMPLER_TYPE_PENALTIES = 10,
 };

 // dimensionality reduction methods, used by cvector-generator
@@ -101,8 +105,8 @@ enum dimre_method {
     DIMRE_METHOD_MEAN,
 };

-//
-struct
+// sampling parameters
+struct common_params_sampling {
     uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler

     int32_t n_prev = 64; // number of previous tokens to remember
@@ -128,14 +132,15 @@ struct common_sampler_params {
     int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
     float mirostat_tau = 5.00f; // target entropy
     float mirostat_eta = 0.10f; // learning rate
-    bool penalize_nl = false; // consider newlines as a repeatable token
     bool ignore_eos = false;
     bool no_perf = false; // disable performance metrics
+    bool timing_per_token = false;

     std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY


     std::vector<enum common_sampler_type> samplers = {
+        COMMON_SAMPLER_TYPE_PENALTIES,
         COMMON_SAMPLER_TYPE_DRY,
         COMMON_SAMPLER_TYPE_TOP_K,
         COMMON_SAMPLER_TYPE_TYPICAL_P,
@@ -153,21 +158,39 @@ struct common_sampler_params {
     std::string print() const;
 };

+struct common_params_speculative {
+    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
+
+    int32_t n_ctx = 0; // draft context size
+    int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
+    int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding
+    int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
+    float p_split = 0.1f; // speculative decoding split probability
+    float p_min = 0.9f; // minimum speculative decoding probability (greedy)
+
+    struct cpu_params cpuparams;
+    struct cpu_params cpuparams_batch;
+
+    std::string model = ""; // draft model for speculative decoding // NOLINT
+};
+
+struct common_params_vocoder {
+    std::string hf_repo = ""; // HF repo // NOLINT
+    std::string hf_file = ""; // HF file // NOLINT
+
+    std::string model = ""; // model path // NOLINT
+    std::string model_url = ""; // model url to download // NOLINT
+};
+
 struct common_params {
     int32_t n_predict = -1; // new tokens to predict
     int32_t n_ctx = 4096; // context size
     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
-    int32_t n_draft = 5; // number of tokens to draft during speculative decoding
     int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
     int32_t n_parallel = 1; // number of parallel sequences to decode
     int32_t n_sequences = 1; // number of sequences to decode
-    float p_split = 0.1f; // speculative decoding split probability
-    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
-    int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
-    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
-    float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
     int32_t grp_attn_n = 1; // group-attention factor
     int32_t grp_attn_w = 512; // group-attention width
     int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
@@ -180,26 +203,33 @@ struct common_params {
     int32_t yarn_orig_ctx = 0; // YaRN original context length
     float defrag_thold = 0.1f; // KV cache defragmentation threshold

+    // offload params
+    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
+
+    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
+    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
+    float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+
+    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
+
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
-    struct cpu_params draft_cpuparams;
-    struct cpu_params draft_cpuparams_batch;

     ggml_backend_sched_eval_callback cb_eval = nullptr;
     void * cb_eval_user_data = nullptr;

     ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;

-    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
     enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings

-    struct
+    struct common_params_sampling sampling;
+    struct common_params_speculative speculative;
+    struct common_params_vocoder vocoder;

     std::string model = ""; // model path // NOLINT
-    std::string
-    std::string model_alias = "unknown"; // model alias // NOLINT
+    std::string model_alias = ""; // model alias // NOLINT
     std::string model_url = ""; // model url to download // NOLINT
     std::string hf_token = ""; // HF token // NOLINT
     std::string hf_repo = ""; // HF repo // NOLINT
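Aside (not part of the package diff): the speculative-decoding fields removed from `common_params` above (`n_draft`, `p_split`, `n_gpu_layers_draft`, `draft_cpuparams`, ...) now live in the nested `common_params_speculative` struct declared earlier in this file and referenced here as `params.speculative`. A hedged sketch of filling it in, assuming the layout shown in these hunks; the model paths are placeholders:

```cpp
#include "common.h"  // common_params and common_params_speculative as declared in this diff

int main() {
    common_params params;

    params.model = "/models/target-model.gguf";              // placeholder path

    // speculative-decoding settings now live in their own sub-struct
    params.speculative.model = "/models/draft-model.gguf";   // placeholder path
    params.speculative.n_max = 16;   // max tokens drafted per step
    params.speculative.n_min = 5;    // min draft tokens worth speculating on
    params.speculative.n_gpu_layers = -1;

    return 0;
}
```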
@@ -270,8 +300,8 @@ struct common_params {
     bool warmup = true; // warmup run
     bool check_tensors = false; // validate tensor data

-
-
+    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
+    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V

     // multimodal models (see examples/llava)
     std::string mmproj = ""; // path to multimodal projector // NOLINT
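Aside (not part of the package diff): with `cache_type_k`/`cache_type_v` now typed as `ggml_type`, the string-to-type mapping (`kv_cache_type_from_str`, removed from common.cpp above) is no longer needed when setting the KV cache type programmatically; `common_context_params_to_llama()` copies the enums straight into the context params (see the common.cpp hunk at line 1062). A minimal sketch assuming the declarations shown in this diff:

```cpp
#include "common.h"
#include "llama.h"

int main() {
    common_params params;

    // quantize the K cache, keep the V cache in f16 -- no string parsing involved any more
    params.cache_type_k = GGML_TYPE_Q8_0;
    params.cache_type_v = GGML_TYPE_F16;

    llama_context_params cparams = common_context_params_to_llama(params);
    // cparams.type_k / cparams.type_v now carry the ggml_type values directly
    (void) cparams;
    return 0;
}
```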
@@ -421,6 +451,11 @@ std::vector<std::string> string_split<std::string>(const std::string & input, char separator)
     return parts;
 }

+static bool string_starts_with(const std::string & str,
+                               const std::string & prefix) {  // While we wait for C++20's std::string::starts_with...
+    return str.rfind(prefix, 0) == 0;
+}
+
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);

@@ -451,17 +486,28 @@ struct common_init_result {

 struct common_init_result common_init_from_params(common_params & params);

-struct llama_model_params common_model_params_to_llama (
+struct llama_model_params common_model_params_to_llama ( common_params & params);
 struct llama_context_params common_context_params_to_llama(const common_params & params);
 struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);

-struct llama_model * common_load_model_from_url(
-
+struct llama_model * common_load_model_from_url(
+    const std::string & model_url,
+    const std::string & local_path,
+    const std::string & hf_token,
+    const struct llama_model_params & params);
+struct llama_model * common_load_model_from_hf(
+    const std::string & repo,
+    const std::string & remote_path,
+    const std::string & local_path,
+    const std::string & hf_token,
+    const struct llama_model_params & params);

 // clear LoRA adapters from context, then apply new list of adapters
 void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);

+//
 // Batch utils
+//

 void common_batch_clear(struct llama_batch & batch);

@@ -472,6 +518,16 @@ void common_batch_add(
     const std::vector<llama_seq_id> & seq_ids,
     bool logits);

+//
+// Token utils
+//
+
+// longest common prefix
+size_t common_lcp(const llama_tokens & a, const llama_tokens & b);
+
+// longet common subsequence
+size_t common_lcs(const llama_tokens & a, const llama_tokens & b);
+
 //
 // Vocab utils
 //
@@ -551,7 +607,8 @@ void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size)
 // Embedding utils
 //

-
+// TODO: repace embd_norm with an enum
+void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);

 float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
