@fugood/llama.node 0.3.2 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +2 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +1 -1
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +2 -2
- package/src/LlamaCompletionWorker.cpp +8 -8
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +8 -9
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +4 -4
- package/src/llama.cpp/.github/workflows/build.yml +43 -9
- package/src/llama.cpp/.github/workflows/docker.yml +3 -0
- package/src/llama.cpp/CMakeLists.txt +7 -4
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/common/CMakeLists.txt +0 -2
- package/src/llama.cpp/common/arg.cpp +642 -607
- package/src/llama.cpp/common/arg.h +22 -22
- package/src/llama.cpp/common/common.cpp +79 -281
- package/src/llama.cpp/common/common.h +130 -100
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +50 -50
- package/src/llama.cpp/common/log.h +18 -18
- package/src/llama.cpp/common/ngram-cache.cpp +36 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +116 -108
- package/src/llama.cpp/common/sampling.h +20 -20
- package/src/llama.cpp/docs/build.md +37 -17
- package/src/llama.cpp/examples/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +14 -14
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
- package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +20 -11
- package/src/llama.cpp/examples/infill/infill.cpp +40 -86
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -151
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
- package/src/llama.cpp/examples/llava/clip.cpp +1 -0
- package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
- package/src/llama.cpp/examples/llava/llava.cpp +37 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +14 -14
- package/src/llama.cpp/examples/lookup/lookup.cpp +29 -29
- package/src/llama.cpp/examples/main/main.cpp +64 -109
- package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
- package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +13 -13
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +34 -17
- package/src/llama.cpp/examples/server/CMakeLists.txt +4 -13
- package/src/llama.cpp/examples/server/server.cpp +553 -691
- package/src/llama.cpp/examples/server/utils.hpp +312 -25
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +128 -96
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
- package/src/llama.cpp/examples/speculative/speculative.cpp +54 -51
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +2 -2
- package/src/llama.cpp/ggml/CMakeLists.txt +15 -9
- package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +46 -33
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +53 -393
- package/src/llama.cpp/ggml/src/CMakeLists.txt +66 -1149
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +46 -3126
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
- package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -27
- package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
- package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +6 -25
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +303 -864
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +213 -65
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +255 -149
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
- package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -243
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
- package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +667 -1
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +366 -16
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +238 -72
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +187 -10692
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +475 -300
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +40 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +258 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +2 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3584 -4142
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +69 -67
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +3 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +555 -623
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +125 -206
- package/src/llama.cpp/ggml/src/ggml.c +4032 -19890
- package/src/llama.cpp/include/llama.h +67 -33
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-sampling.cpp +745 -105
- package/src/llama.cpp/src/llama-sampling.h +21 -2
- package/src/llama.cpp/src/llama-vocab.cpp +49 -9
- package/src/llama.cpp/src/llama-vocab.h +35 -11
- package/src/llama.cpp/src/llama.cpp +2636 -2406
- package/src/llama.cpp/src/unicode-data.cpp +2 -2
- package/src/llama.cpp/tests/CMakeLists.txt +1 -2
- package/src/llama.cpp/tests/test-arg-parser.cpp +14 -14
- package/src/llama.cpp/tests/test-backend-ops.cpp +185 -60
- package/src/llama.cpp/tests/test-barrier.cpp +1 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
- package/src/llama.cpp/tests/test-log.cpp +2 -2
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +22 -19
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +1 -0
- package/src/llama.cpp/tests/test-sampling.cpp +162 -137
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/src/llama.cpp/common/train.cpp +0 -1515
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
- /package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +0 -0

--- a/package/src/llama.cpp/common/common.cpp
+++ b/package/src/llama.cpp/common/common.cpp
@@ -12,6 +12,7 @@
 
 #include <algorithm>
 #include <cinttypes>
+#include <climits>
 #include <cmath>
 #include <codecvt>
 #include <cstdarg>
@@ -23,10 +24,10 @@
 #include <regex>
 #include <sstream>
 #include <string>
+#include <thread>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
-#include <thread>
 
 #if defined(__APPLE__) && defined(__MACH__)
 #include <sys/types.h>
@@ -362,10 +363,10 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
     return true;
 }
 
-void
+void common_init() {
     llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
-        if (LOG_DEFAULT_LLAMA <=
-
+        if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) {
+            common_log_add(common_log_main(), level, "%s", text);
         }
     }, NULL);
 
@@ -378,7 +379,7 @@ void gpt_init() {
     LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
 }
 
-std::string
+std::string common_params_get_system_info(const common_params & params) {
     std::ostringstream os;
 
     os << "system_info: n_threads = " << params.cpuparams.n_threads;
@@ -400,17 +401,19 @@ std::string gpt_params_get_system_info(const gpt_params & params) {
 // String utils
 //
 
-std::
-
-
-
-
-
-
-
-
-
-
+std::string string_format(const char * fmt, ...) {
+    va_list ap;
+    va_list ap2;
+    va_start(ap, fmt);
+    va_copy(ap2, ap);
+    int size = vsnprintf(NULL, 0, fmt, ap);
+    GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
+    std::vector<char> buf(size + 1);
+    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+    GGML_ASSERT(size2 == size);
+    va_end(ap2);
+    va_end(ap);
+    return std::string(buf.data(), size);
 }
 
 std::string string_strip(const std::string & str) {
@@ -493,7 +496,7 @@ std::string string_from(const struct llama_context * ctx, const std::vector<llam
         first = false;
     }
 
-        auto detokenized =
+        auto detokenized = common_token_to_piece(ctx, token);
 
         detokenized.erase(
             std::remove_if(
@@ -524,7 +527,7 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat
         first = false;
     }
 
-        auto detokenized =
+        auto detokenized = common_token_to_piece(ctx, batch.token[i]);
 
         detokenized.erase(
             std::remove_if(
@@ -819,16 +822,16 @@ std::string fs_get_cache_file(const std::string & filename) {
 //
 // Model utils
 //
-struct
-
-    auto mparams =
+struct common_init_result common_init_from_params(common_params & params) {
+    common_init_result iparams;
+    auto mparams = common_model_params_to_llama(params);
 
     llama_model * model = nullptr;
 
     if (!params.hf_repo.empty() && !params.hf_file.empty()) {
-        model =
+        model = common_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
     } else if (!params.model_url.empty()) {
-        model =
+        model = common_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
     } else {
         model = llama_load_model_from_file(params.model.c_str(), mparams);
     }
@@ -863,7 +866,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         }
     }
 
-    auto cparams =
+    auto cparams = common_context_params_to_llama(params);
 
     llama_context * lctx = llama_new_context_with_model(model, cparams);
     if (lctx == NULL) {
@@ -876,7 +879,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
         if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
 
-        const auto cvec =
+        const auto cvec = common_control_vector_load(params.control_vectors);
         if (cvec.n_embd == -1) {
             llama_free(lctx);
             llama_free_model(model);
@@ -900,7 +903,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
 
     // load and optionally apply lora adapters
     for (auto & la : params.lora_adapters) {
-
+        common_lora_adapter_container loaded_la;
         loaded_la.path = la.path;
        loaded_la.scale = la.scale;
         loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
@@ -913,7 +916,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
     }
     if (!params.lora_init_without_apply) {
-
+        common_lora_adapters_apply(lctx, iparams.lora_adapters);
     }
 
     if (params.sparams.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
@@ -939,7 +942,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         }
 
         if (llama_model_has_encoder(model)) {
-            llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()
+            llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
             llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
             if (decoder_start_token_id == -1) {
                 decoder_start_token_id = bos;
@@ -948,7 +951,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
             tmp.push_back(decoder_start_token_id);
         }
         if (llama_model_has_decoder(model)) {
-            llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)
+            llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
        }
         llama_kv_cache_clear(lctx);
         llama_synchronize(lctx);
@@ -961,7 +964,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     return iparams;
 }
 
-void
+void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters) {
     llama_lora_adapter_clear(ctx);
     for (auto & la : lora_adapters) {
         if (la.scale != 0.0f) {
@@ -970,7 +973,7 @@ void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lor
     }
 }
 
-struct llama_model_params
+struct llama_model_params common_model_params_to_llama(const common_params & params) {
     auto mparams = llama_model_default_params();
 
     if (params.n_gpu_layers != -1) {
@@ -1000,6 +1003,9 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
     if (s == "f16") {
         return GGML_TYPE_F16;
     }
+    if (s == "bf16") {
+        return GGML_TYPE_BF16;
+    }
     if (s == "q8_0") {
         return GGML_TYPE_Q8_0;
     }
@@ -1019,10 +1025,10 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
         return GGML_TYPE_Q5_1;
     }
 
-    throw std::runtime_error("
+    throw std::runtime_error("Unsupported cache type: " + s);
 }
 
-struct llama_context_params
+struct llama_context_params common_context_params_to_llama(const common_params & params) {
     auto cparams = llama_context_default_params();
 
     cparams.n_ctx = params.n_ctx;
@@ -1031,7 +1037,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.n_ubatch = params.n_ubatch;
     cparams.n_threads = params.cpuparams.n_threads;
     cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
-
+        params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
     cparams.logits_all = params.logits_all;
     cparams.embeddings = params.embedding;
     cparams.rope_scaling_type = params.rope_scaling_type;
@@ -1112,7 +1118,7 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_
     return false;
 }
 
-static bool
+static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
 
     // Initialize libcurl
     std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
@@ -1182,15 +1188,15 @@ static bool llama_download_file(const std::string & url, const std::string & pat
     }
 
     // Send a HEAD request to retrieve the etag and last-modified headers
-    struct
+    struct common_load_model_from_url_headers {
         std::string etag;
         std::string last_modified;
     };
-
+    common_load_model_from_url_headers headers;
     {
         typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
         auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
-
+            common_load_model_from_url_headers *headers = (common_load_model_from_url_headers *) userdata;
 
             static std::regex header_regex("([^:]+): (.*)\r\n");
             static std::regex etag_regex("ETag", std::regex_constants::icase);
@@ -1326,7 +1332,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat
     return true;
 }
 
-struct llama_model *
+struct llama_model * common_load_model_from_url(
         const char * model_url,
         const char * path_model,
         const char * hf_token,
@@ -1337,7 +1343,7 @@ struct llama_model * llama_load_model_from_url(
         return NULL;
     }
 
-    if (!
+    if (!common_download_file(model_url, path_model, hf_token)) {
         return NULL;
     }
 
@@ -1390,7 +1396,7 @@ struct llama_model * llama_load_model_from_url(
                 char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
                 llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
 
-                return
+                return common_download_file(split_url, split_path, hf_token);
             }, idx));
         }
 
@@ -1405,7 +1411,7 @@ struct llama_model * llama_load_model_from_url(
     return llama_load_model_from_file(path_model, params);
 }
 
-struct llama_model *
+struct llama_model * common_load_model_from_hf(
         const char * repo,
         const char * model,
         const char * path_model,
@@ -1425,12 +1431,12 @@ struct llama_model * llama_load_model_from_hf(
     model_url += "/resolve/main/";
     model_url += model;
 
-    return
+    return common_load_model_from_url(model_url.c_str(), path_model, hf_token, params);
 }
 
 #else
 
-struct llama_model *
+struct llama_model * common_load_model_from_url(
         const char * /*model_url*/,
         const char * /*path_model*/,
         const char * /*hf_token*/,
@@ -1439,7 +1445,7 @@ struct llama_model * llama_load_model_from_url(
     return nullptr;
 }
 
-struct llama_model *
+struct llama_model * common_load_model_from_hf(
        const char * /*repo*/,
         const char * /*model*/,
         const char * /*path_model*/,
@@ -1455,11 +1461,11 @@ struct llama_model * llama_load_model_from_hf(
 // Batch utils
 //
 
-void
+void common_batch_clear(struct llama_batch & batch) {
     batch.n_tokens = 0;
 }
 
-void
+void common_batch_add(
         struct llama_batch & batch,
         llama_token id,
         llama_pos pos,
@@ -1482,15 +1488,15 @@ void llama_batch_add(
 // Vocab utils
 //
 
-std::vector<llama_token>
+std::vector<llama_token> common_tokenize(
         const struct llama_context * ctx,
         const std::string & text,
         bool add_special,
         bool parse_special) {
-    return
+    return common_tokenize(llama_get_model(ctx), text, add_special, parse_special);
 }
 
-std::vector<llama_token>
+std::vector<llama_token> common_tokenize(
         const struct llama_model * model,
         const std::string & text,
         bool add_special,
@@ -1509,7 +1515,7 @@ std::vector<llama_token> llama_tokenize(
     return result;
 }
 
-std::string
+std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
     std::string piece;
     piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
     const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
@@ -1525,7 +1531,7 @@ std::string llama_token_to_piece(const struct llama_context * ctx, llama_token t
     return piece;
 }
 
-std::string
+std::string common_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
     std::string text;
     text.resize(std::max(text.capacity(), tokens.size()));
     int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
@@ -1545,15 +1551,15 @@ std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token>
 // Chat template utils
 //
 
-bool
+bool common_chat_verify_template(const std::string & tmpl) {
     llama_chat_message chat[] = {{"user", "test"}};
     int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
     return res >= 0;
 }
 
-std::string
+std::string common_chat_apply_template(const struct llama_model * model,
         const std::string & tmpl,
-        const std::vector<
+        const std::vector<common_chat_msg> & msgs,
         bool add_ass) {
     int alloc_size = 0;
     bool fallback = false; // indicate if we must fallback to default chatml
@@ -1595,42 +1601,42 @@ std::string llama_chat_apply_template(const struct llama_model * model,
     return formatted_chat;
 }
 
-std::string
+std::string common_chat_format_single(const struct llama_model * model,
         const std::string & tmpl,
-        const std::vector<
-        const
+        const std::vector<common_chat_msg> & past_msg,
+        const common_chat_msg & new_msg,
         bool add_ass) {
     std::ostringstream ss;
-    auto fmt_past_msg = past_msg.empty() ? "" :
-    std::vector<
+    auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(model, tmpl, past_msg, false);
+    std::vector<common_chat_msg> chat_new(past_msg);
     // if the past_msg ends with a newline, we must preserve it in the formatted version
     if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
         ss << "\n";
     };
     // format chat with new_msg
     chat_new.push_back(new_msg);
-    auto fmt_new_msg =
+    auto fmt_new_msg = common_chat_apply_template(model, tmpl, chat_new, add_ass);
     // get the diff part
     ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
     return ss.str();
 }
 
-std::string
+std::string common_chat_format_example(const struct llama_model * model,
         const std::string & tmpl) {
-    std::vector<
+    std::vector<common_chat_msg> msgs = {
        {"system", "You are a helpful assistant"},
         {"user", "Hello"},
         {"assistant", "Hi there"},
         {"user", "How are you?"},
     };
-    return
+    return common_chat_apply_template(model, tmpl, msgs, true);
 }
 
 //
 // KV cache utils
 //
 
-void
+void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
     static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
 
     printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
@@ -1653,7 +1659,7 @@ void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
     printf("\n=== Done dumping\n");
 }
 
-void
+void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
     static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
 
     printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
@@ -1705,7 +1711,7 @@ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_siz
 // Embedding utils
 //
 
-void
+void common_embd_normalize(const float * inp, float * out, int n, int embd_norm) {
     double sum = 0.0;
 
     switch (embd_norm) {
@@ -1739,7 +1745,7 @@ void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm)
     }
 }
 
-float
+float common_embd_similarity_cos(const float * embd1, const float * embd2, int n){
     double sum = 0.0;
     double sum1 = 0.0;
     double sum2 = 0.0;
@@ -1765,8 +1771,8 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n)
 // Control vector utils
 //
 
-static
-
+static common_control_vector_data common_control_vector_load_one(const common_control_vector_load_info & load_info) {
+    common_control_vector_data result = { -1, {} };
 
     ggml_context * ctx = nullptr;
     struct gguf_init_params meta_gguf_params = {
@@ -1850,11 +1856,11 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
     return result;
 }
 
-
-
+common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos) {
+    common_control_vector_data result = { -1, {} };
 
     for (const auto & info : load_infos) {
-        auto cur =
+        auto cur = common_control_vector_load_one(info);
 
         if (cur.n_embd == -1) {
             result.n_embd = -1;
@@ -1884,211 +1890,3 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
     return result;
 }
 
-//
-// YAML utils
-//
-
-void yaml_dump_vector_float(FILE * stream, const char * prop_name, const std::vector<float> & data) {
-    if (data.empty()) {
-        fprintf(stream, "%s:\n", prop_name);
-        return;
-    }
-
-    fprintf(stream, "%s: [", prop_name);
-    for (size_t i = 0; i < data.size() - 1; ++i) {
-        fprintf(stream, "%e, ", data[i]);
-    }
-    fprintf(stream, "%e]\n", data.back());
-}
-
-void yaml_dump_vector_int(FILE * stream, const char * prop_name, const std::vector<int> & data) {
-    if (data.empty()) {
-        fprintf(stream, "%s:\n", prop_name);
-        return;
-    }
-
-    fprintf(stream, "%s: [", prop_name);
-    for (size_t i = 0; i < data.size() - 1; ++i) {
-        fprintf(stream, "%d, ", data[i]);
-    }
-    fprintf(stream, "%d]\n", data.back());
-}
-
-void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data) {
-    std::string data_str(data == NULL ? "" : data);
-
-    if (data_str.empty()) {
-        fprintf(stream, "%s:\n", prop_name);
-        return;
-    }
-
-    size_t pos_start = 0;
-    size_t pos_found = 0;
-
-    if (std::isspace(data_str[0]) || std::isspace(data_str.back())) {
-        data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
-        data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
-        data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)");
-        data_str = "\"" + data_str + "\"";
-        fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
-        return;
-    }
-
-    if (data_str.find('\n') == std::string::npos) {
-        fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
-        return;
-    }
-
-    fprintf(stream, "%s: |\n", prop_name);
-    while ((pos_found = data_str.find('\n', pos_start)) != std::string::npos) {
-        fprintf(stream, " %s\n", data_str.substr(pos_start, pos_found-pos_start).c_str());
-        pos_start = pos_found + 1;
-    }
-}
-
-void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const llama_context * lctx,
-                               const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
-    const auto & sparams = params.sparams;
-
-    fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
-    fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER);
-    fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
-    fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
-    fprintf(stream, "cpu_has_avx_vnni: %s\n", ggml_cpu_has_avx_vnni() ? "true" : "false");
-    fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
-    fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
-    fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
-    fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
-    fprintf(stream, "cpu_has_cuda: %s\n", ggml_cpu_has_cuda() ? "true" : "false");
-    fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false");
-    fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false");
-    fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
-    fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
-    fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
-    fprintf(stream, "cpu_has_sve: %s\n", ggml_cpu_has_sve() ? "true" : "false");
-    fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
-    fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
-    fprintf(stream, "cpu_has_riscv_v: %s\n", ggml_cpu_has_riscv_v() ? "true" : "false");
-    fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
-    fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
-    fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
-    fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
-    fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false");
-
-#ifdef NDEBUG
-    fprintf(stream, "debug: false\n");
-#else
-    fprintf(stream, "debug: true\n");
-#endif // NDEBUG
-
-    fprintf(stream, "model_desc: %s\n", model_desc);
-    fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx)));
-
-#ifdef __OPTIMIZE__
-    fprintf(stream, "optimize: true\n");
-#else
-    fprintf(stream, "optimize: false\n");
-#endif // __OPTIMIZE__
-
-    fprintf(stream, "time: %s\n", timestamp.c_str());
-
-    fprintf(stream, "\n");
-    fprintf(stream, "###############\n");
-    fprintf(stream, "# User Inputs #\n");
-    fprintf(stream, "###############\n");
-    fprintf(stream, "\n");
-
-    fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
-    fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
-    fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
-    fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
-    fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
-    fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
-    fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
-    fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);
-    yaml_dump_string_multiline(stream, "grammar", sparams.grammar.c_str());
-    fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
-    fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
-    fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
-    fprintf(stream, "ignore_eos: %s # default: false\n", sparams.ignore_eos ? "true" : "false");
-
-    yaml_dump_string_multiline(stream, "in_prefix", params.input_prefix.c_str());
-    fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false");
-    yaml_dump_string_multiline(stream, "in_suffix", params.input_prefix.c_str());
-    fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
-    fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
-    fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
-    fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
-
-    fprintf(stream, "logit_bias:\n");
-    for (const auto & logit_bias : sparams.logit_bias) {
-        fprintf(stream, " %d: %f", logit_bias.token, logit_bias.bias);
-    }
-
-    fprintf(stream, "lora:\n");
-    for (auto & la : params.lora_adapters) {
-        if (la.scale == 1.0f) {
-            fprintf(stream, " - %s\n", la.path.c_str());
-        }
-    }
-    fprintf(stream, "lora_scaled:\n");
-    for (auto & la : params.lora_adapters) {
-        if (la.scale != 1.0f) {
-            fprintf(stream, " - %s: %f\n", la.path.c_str(), la.scale);
-        }
-    }
-    fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false");
-    fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
-    fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
-    fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
-    fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
-    fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
-    fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
-    fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH);
-    fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
-    fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
-    fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
-    fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
-    fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
-    fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
-    fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false");
-    fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
-    fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
-    fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present);
-    yaml_dump_string_multiline(stream, "prompt", params.prompt.c_str());
-    fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
-    fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
-    fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
-    yaml_dump_vector_int(stream, "prompt_tokens", prompt_tokens);
-    fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat);
-
-    fprintf(stream, "reverse_prompt:\n");
-    for (std::string ap : params.antiprompt) {
-        size_t pos = 0;
-        while ((pos = ap.find('\n', pos)) != std::string::npos) {
-            ap.replace(pos, 1, "\\n");
-            pos += 1;
-        }
-
-        fprintf(stream, " - %s\n", ap.c_str());
-    }
-
-    fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
-    fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
-    fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
-    fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
-    fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
-    fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
-
-    const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
-    yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);
-
-    fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
-    fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency());
-    fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
-    fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
-    fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
-    fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p);
-    fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
-    fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
-}