@fugood/llama.node 0.3.1 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -8
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +4 -2
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +2 -2
- package/src/LlamaCompletionWorker.cpp +10 -10
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +14 -17
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +5 -4
- package/src/llama.cpp/.github/workflows/build.yml +137 -29
- package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
- package/src/llama.cpp/.github/workflows/docker.yml +46 -34
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
- package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +7 -0
- package/src/llama.cpp/CMakeLists.txt +26 -11
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/common/CMakeLists.txt +10 -10
- package/src/llama.cpp/common/arg.cpp +2041 -0
- package/src/llama.cpp/common/arg.h +77 -0
- package/src/llama.cpp/common/common.cpp +523 -1861
- package/src/llama.cpp/common/common.h +234 -106
- package/src/llama.cpp/common/console.cpp +3 -0
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +401 -0
- package/src/llama.cpp/common/log.h +66 -698
- package/src/llama.cpp/common/ngram-cache.cpp +39 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +356 -350
- package/src/llama.cpp/common/sampling.h +62 -139
- package/src/llama.cpp/common/stb_image.h +5990 -6398
- package/src/llama.cpp/docs/build.md +72 -17
- package/src/llama.cpp/examples/CMakeLists.txt +1 -2
- package/src/llama.cpp/examples/batched/batched.cpp +49 -65
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
- package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
- package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
- package/src/llama.cpp/examples/infill/infill.cpp +131 -192
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +686 -150
- package/src/llama.cpp/examples/llava/clip.h +11 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
- package/src/llama.cpp/examples/llava/llava.cpp +146 -26
- package/src/llama.cpp/examples/llava/llava.h +2 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
- package/src/llama.cpp/examples/llava/requirements.txt +1 -0
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
- package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
- package/src/llama.cpp/examples/main/main.cpp +216 -313
- package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
- package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
- package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
- package/src/llama.cpp/examples/server/server.cpp +1347 -1531
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
- package/src/llama.cpp/examples/server/utils.hpp +396 -107
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +132 -106
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
- package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
- package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
- package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
- package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +272 -505
- package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
- package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
- package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
- package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
- package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
- package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
- package/src/llama.cpp/include/llama.h +296 -285
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-grammar.cpp +721 -122
- package/src/llama.cpp/src/llama-grammar.h +120 -15
- package/src/llama.cpp/src/llama-impl.h +156 -1
- package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
- package/src/llama.cpp/src/llama-sampling.h +39 -47
- package/src/llama.cpp/src/llama-vocab.cpp +390 -127
- package/src/llama.cpp/src/llama-vocab.h +60 -20
- package/src/llama.cpp/src/llama.cpp +6215 -3263
- package/src/llama.cpp/src/unicode-data.cpp +6 -4
- package/src/llama.cpp/src/unicode-data.h +4 -4
- package/src/llama.cpp/src/unicode.cpp +15 -7
- package/src/llama.cpp/tests/CMakeLists.txt +4 -2
- package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
- package/src/llama.cpp/tests/test-barrier.cpp +94 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
- package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
- package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
- package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
- package/src/llama.cpp/tests/test-log.cpp +39 -0
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +2 -1
- package/src/llama.cpp/tests/test-sampling.cpp +226 -142
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/patches/llama.patch +0 -22
- package/src/llama.cpp/.github/workflows/bench.yml +0 -310
- package/src/llama.cpp/common/grammar-parser.cpp +0 -536
- package/src/llama.cpp/common/grammar-parser.h +0 -29
- package/src/llama.cpp/common/train.cpp +0 -1513
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
- package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
Inline diff preview (partial): package/src/llama.cpp/examples/main/main.cpp

@@ -1,11 +1,11 @@
+#include "arg.h"
 #include "common.h"
-
 #include "console.h"
+#include "log.h"
+#include "sampling.h"
 #include "llama.h"

 #include <cassert>
-#include <cinttypes>
-#include <cmath>
 #include <cstdio>
 #include <cstring>
 #include <ctime>
@@ -33,13 +33,23 @@

 static llama_context ** g_ctx;
 static llama_model ** g_model;
-static
+static common_sampler ** g_smpl;
+static common_params * g_params;
 static std::vector<llama_token> * g_input_tokens;
 static std::ostringstream * g_output_ss;
 static std::vector<llama_token> * g_output_tokens;
 static bool is_interacting = false;
 static bool need_insert_eot = false;

+static void print_usage(int argc, char ** argv) {
+(void) argc;
+
+LOG("\nexample usage:\n");
+LOG("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]);
+LOG("\n chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]);
+LOG("\n");
+}
+
 static bool file_exists(const std::string & path) {
 std::ifstream f(path.c_str());
 return f.good();
@@ -52,50 +62,6 @@ static bool file_is_empty(const std::string & path) {
 return f.tellg() == 0;
 }

-static void write_logfile(
-const llama_context * ctx, const gpt_params & params, const llama_model * model,
-const std::vector<llama_token> & input_tokens, const std::string & output,
-const std::vector<llama_token> & output_tokens
-) {
-if (params.logdir.empty()) {
-return;
-}
-
-const std::string timestamp = string_get_sortable_timestamp();
-
-const bool success = fs_create_directory_with_parents(params.logdir);
-if (!success) {
-fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
-__func__, params.logdir.c_str());
-return;
-}
-
-const std::string logfile_path = params.logdir + timestamp + ".yml";
-FILE * logfile = fopen(logfile_path.c_str(), "w");
-
-if (logfile == NULL) {
-fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
-return;
-}
-
-fprintf(logfile, "binary: main\n");
-char model_desc[128];
-llama_model_desc(model, model_desc, sizeof(model_desc));
-yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);
-
-fprintf(logfile, "\n");
-fprintf(logfile, "######################\n");
-fprintf(logfile, "# Generation Results #\n");
-fprintf(logfile, "######################\n");
-fprintf(logfile, "\n");
-
-yaml_dump_string_multiline(logfile, "output", output.c_str());
-yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
-
-llama_dump_timing_info_yaml(logfile, ctx);
-fclose(logfile);
-}
-
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
 static void sigint_handler(int signo) {
 if (signo == SIGINT) {
@@ -104,50 +70,37 @@ static void sigint_handler(int signo) {
 need_insert_eot = true;
 } else {
 console::cleanup();
+LOG("\n");
+common_perf_print(*g_ctx, *g_smpl);
+
+// make sure all logs are flushed
+LOG("Interrupted by user\n");
+common_log_pause(common_log_main());
+
 _exit(130);
 }
 }
 }
 #endif

-static
-(
-LOG_TEE("%s", text);
-}
-static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, std::string role, std::string content) {
-llama_chat_msg new_msg{role, content};
-auto formatted = llama_chat_format_single(
-model, g_params->chat_template, chat_msgs, new_msg, role == "user");
+static std::string chat_add_and_format(struct llama_model * model, std::vector<common_chat_msg> & chat_msgs, const std::string & role, const std::string & content) {
+common_chat_msg new_msg{role, content};
+auto formatted = common_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user");
 chat_msgs.push_back({role, content});
+LOG_DBG("formatted: '%s'\n", formatted.c_str());
 return formatted;
 }

 int main(int argc, char ** argv) {
+common_params params;
 g_params = &params;
-if (!gpt_params_parse(argc, argv, params)) {
-gpt_params_print_usage(argc, argv, params);
+if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
 return 1;
 }

-#ifndef LOG_DISABLE_LOGS
-log_set_target(log_filename_generator("main", "log"));
-LOG_TEE("Log start\n");
-log_dump_cmdline(argc, argv);
-llama_log_set(llama_log_callback_logTee, nullptr);
-#endif // LOG_DISABLE_LOGS
+common_init();

-//LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity));
+auto & sparams = params.sparams;

 // save choice to use color for later
 // (note for later: this is a slightly awkward choice)
@@ -155,120 +108,141 @@ int main(int argc, char ** argv) {
 atexit([]() { console::cleanup(); });

 if (params.logits_all) {
+LOG_ERR("************\n");
+LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
+LOG_ERR("************\n\n");

 return 0;
 }

 if (params.embedding) {
+LOG_ERR("************\n");
+LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
+LOG_ERR("************\n\n");

 return 0;
 }

 if (params.n_ctx != 0 && params.n_ctx < 8) {
+LOG_WRN("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
 params.n_ctx = 8;
 }

 if (params.rope_freq_base != 0.0) {
+LOG_WRN("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
 }

 if (params.rope_freq_scale != 0.0) {
-}
-
-LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
-LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
-
-if (params.seed == LLAMA_DEFAULT_SEED) {
-params.seed = time(NULL);
+LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
 }

-std::mt19937 rng(params.seed);
+LOG_INF("%s: llama backend init\n", __func__);

-LOG("%s: llama backend init\n", __func__);
 llama_backend_init();
 llama_numa_init(params.numa);

-llama_model * model;
-llama_context * ctx;
+llama_model * model = nullptr;
+llama_context * ctx = nullptr;
+common_sampler * smpl = nullptr;
+
+std::vector<common_chat_msg> chat_msgs;
+
 g_model = &model;
 g_ctx = &ctx;
+g_smpl = &smpl;

 // load the model and apply lora adapter, if any
-}
+LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
+common_init_result llama_init = common_init_from_params(params);
+
+model = llama_init.model;
+ctx = llama_init.context;

 if (model == NULL) {
+LOG_ERR("%s: error: unable to load model\n", __func__);
+return 1;
+}
+
+LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
+
+struct ggml_threadpool_params tpp_batch =
+ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
+struct ggml_threadpool_params tpp =
+ggml_threadpool_params_from_cpu_params(params.cpuparams);
+
+set_process_priority(params.cpuparams.priority);
+
+struct ggml_threadpool * threadpool_batch = NULL;
+if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
+threadpool_batch = ggml_threadpool_new(&tpp_batch);
+if (!threadpool_batch) {
+LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
+return 1;
+}
+
+// Start the non-batch threadpool in the paused state
+tpp.paused = true;
+}
+
+struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
+if (!threadpool) {
+LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
 return 1;
 }

+llama_attach_threadpool(ctx, threadpool, threadpool_batch);
+
 const int n_ctx_train = llama_n_ctx_train(model);
 const int n_ctx = llama_n_ctx(ctx);
-LOG("n_ctx: %d\n", n_ctx);

 if (n_ctx > n_ctx_train) {
-__func__, n_ctx_train, n_ctx);
+LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
 }

 // print chat template example in conversation mode
 if (params.conversation) {
 if (params.enable_chat_template) {
+LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(model, params.chat_template).c_str());
 } else {
+LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
 }
 }

 // print system information
 {
+LOG_INF("\n");
+LOG_INF("%s\n", common_params_get_system_info(params).c_str());
+LOG_INF("\n");
 }

 std::string path_session = params.path_prompt_cache;
 std::vector<llama_token> session_tokens;

 if (!path_session.empty()) {
+LOG_INF("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
 if (!file_exists(path_session)) {
+LOG_INF("%s: session file does not exist, will create.\n", __func__);
 } else if (file_is_empty(path_session)) {
+LOG_INF("%s: The session file is empty. A new session will be initialized.\n", __func__);
 } else {
 // The file exists and is not empty
 session_tokens.resize(n_ctx);
 size_t n_token_count_out = 0;
 if (!llama_state_load_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
+LOG_ERR("%s: failed to load session file '%s'\n", __func__, path_session.c_str());
 return 1;
 }
 session_tokens.resize(n_token_count_out);
+LOG_INF("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
 }
 }

-const bool add_bos =
+const bool add_bos = llama_add_bos_token(model);
 if (!llama_model_has_encoder(model)) {
-GGML_ASSERT(llama_add_eos_token(model)
+GGML_ASSERT(!llama_add_eos_token(model));
 }
+
+LOG_DBG("n_ctx: %d, add_bos: %d\n", n_ctx, add_bos);

 std::vector<llama_token> embd_inp;

@@ -277,49 +251,31 @@ int main(int argc, char ** argv) {
 ? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
 : params.prompt;
 if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
-embd_inp =
+LOG_DBG("tokenize the prompt\n");
+embd_inp = common_tokenize(ctx, prompt, true, true);
 } else {
+LOG_DBG("use session tokens\n");
 embd_inp = session_tokens;
 }

+LOG_DBG("prompt: \"%s\"\n", prompt.c_str());
+LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());
 }

 // Should not run without any tokens
 if (embd_inp.empty()) {
 if (add_bos) {
 embd_inp.push_back(llama_token_bos(model));
+LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
 } else {
+LOG_ERR("input is empty\n");
 return -1;
 }
 }

 // Tokenize negative prompt
-std::vector<llama_token> guidance_inp;
-int guidance_offset = 0;
-int original_prompt_len = 0;
-if (ctx_guidance) {
-LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
-guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true);
-LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
-std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true, true);
-LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
-original_prompt_len = original_inp.size();
-guidance_offset = (int)guidance_inp.size() - original_prompt_len;
-LOG("original_prompt_len: %s", log_tostr(original_prompt_len));
-LOG("guidance_offset: %s", log_tostr(guidance_offset));
-}
 if ((int) embd_inp.size() > n_ctx - 4) {
+LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
 return 1;
 }

@@ -333,29 +289,28 @@ int main(int argc, char ** argv) {
 n_matching_session_tokens++;
 }
 if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) {
+LOG_INF("%s: using full prompt from session file\n", __func__);
 } else if (n_matching_session_tokens >= embd_inp.size()) {
+LOG_INF("%s: session file has exact match for prompt!\n", __func__);
 } else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
+LOG_WRN("%s: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
+__func__, n_matching_session_tokens, embd_inp.size());
 } else {
+LOG_INF("%s: session file matches %zu / %zu tokens of prompt\n",
+__func__, n_matching_session_tokens, embd_inp.size());
 }

 // remove any "future" tokens that we might have inherited from the previous session
 llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1);
 }

-log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size(), embd_inp.size());
+LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n",
+embd_inp.size(), n_matching_session_tokens, embd_inp.size(), session_tokens.size());

 // if we will use the cache for the full prompt without reaching the end of the cache, force
 // reevaluation of the last token to recalculate the cached logits
 if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) {
+LOG_DBG("recalculate the cached logits (do): session_tokens.resize( %zu )\n", embd_inp.size() - 1);

 session_tokens.resize(embd_inp.size() - 1);
 }
@@ -377,30 +332,20 @@ int main(int argc, char ** argv) {
 }

 if (params.verbose_prompt) {
-LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
+LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
+LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
 for (int i = 0; i < (int) embd_inp.size(); i++) {
-}
-if (ctx_guidance) {
-LOG_TEE("\n");
-LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
-LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
-for (int i = 0; i < (int) guidance_inp.size(); i++) {
-LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
-}
+LOG_INF("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str());
 }

 if (params.n_keep > add_bos) {
+LOG_INF("%s: static prompt based on n_keep: '", __func__);
 for (int i = 0; i < params.n_keep; i++) {
+LOG_CNT("%s", common_token_to_piece(ctx, embd_inp[i]).c_str());
 }
+LOG_CNT("'\n");
 }
+LOG_INF("\n");
 }

 // ctrl+C handling
@@ -420,47 +365,56 @@ int main(int argc, char ** argv) {
 }

 if (params.interactive) {
+LOG_INF("%s: interactive mode on.\n", __func__);

 if (!params.antiprompt.empty()) {
 for (const auto & antiprompt : params.antiprompt) {
+LOG_INF("Reverse prompt: '%s'\n", antiprompt.c_str());
 if (params.verbose_prompt) {
-auto tmp =
+auto tmp = common_tokenize(ctx, antiprompt, false, true);
 for (int i = 0; i < (int) tmp.size(); i++) {
+LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str());
 }
 }
 }
 }

 if (params.input_prefix_bos) {
+LOG_INF("Input prefix with BOS\n");
 }

 if (!params.input_prefix.empty()) {
+LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str());
 if (params.verbose_prompt) {
-auto tmp =
+auto tmp = common_tokenize(ctx, params.input_prefix, true, true);
 for (int i = 0; i < (int) tmp.size(); i++) {
+LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str());
 }
 }
 }

 if (!params.input_suffix.empty()) {
+LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
 if (params.verbose_prompt) {
-auto tmp =
+auto tmp = common_tokenize(ctx, params.input_suffix, false, true);
 for (int i = 0; i < (int) tmp.size(); i++) {
+LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str());
 }
 }
 }
 }
+
+smpl = common_sampler_init(model, sparams);
+if (!smpl) {
+LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
+return 1;
+}
+
+LOG_INF("sampler seed: %u\n", common_sampler_get_seed(smpl));
+LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
+LOG_INF("sampler chain: %s\n", common_sampler_print(smpl).c_str());
+
+LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);

 // group-attention state
 // number of grouped KV tokens so far (used only if params.grp_attn_n > 1)
@@ -474,9 +428,9 @@ int main(int argc, char ** argv) {
 GGML_ASSERT(ga_w % ga_n == 0 && "grp_attn_w must be a multiple of grp_attn_n"); // NOLINT
 //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w"); // NOLINT
 //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT
+LOG_INF("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w);
 }
+LOG_INF("\n");

 if (params.interactive) {
 const char * control_message;
@@ -488,11 +442,11 @@ int main(int argc, char ** argv) {
 " - To return control without starting a new line, end your input with '/'.\n"
 " - If you want to submit another line, end your input with '\\'.\n";
 }
+LOG_INF("== Running in interactive mode. ==\n");
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
+LOG_INF( " - Press Ctrl+C to interject at any time.\n");
 #endif
+LOG_INF( "%s\n", control_message);

 is_interacting = params.interactive_first;
 }
@@ -506,7 +460,6 @@ int main(int argc, char ** argv) {
 int n_remain = params.n_predict;
 int n_consumed = 0;
 int n_session_consumed = 0;
-int n_past_guidance = 0;

 std::vector<int> input_tokens; g_input_tokens = &input_tokens;
 std::vector<int> output_tokens; g_output_tokens = &output_tokens;
@@ -518,28 +471,21 @@ int main(int argc, char ** argv) {
 display = params.display_prompt;

 std::vector<llama_token> embd;
-std::vector<llama_token> embd_guidance;

 // tokenized antiprompts
 std::vector<std::vector<llama_token>> antiprompt_ids;

 antiprompt_ids.reserve(params.antiprompt.size());
 for (const std::string & antiprompt : params.antiprompt) {
-antiprompt_ids.emplace_back(::
-}
-struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
-if (!ctx_sampling) {
-fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
-exit(1);
+antiprompt_ids.emplace_back(::common_tokenize(ctx, antiprompt, false, true));
 }

 if (llama_model_has_encoder(model)) {
 int enc_input_size = embd_inp.size();
 llama_token * enc_input_buf = embd_inp.data();

-if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size
+if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size))) {
+LOG_ERR("%s : failed to eval\n", __func__);
 return 1;
 }

@@ -565,9 +511,8 @@ int main(int argc, char ** argv) {
 embd.resize(max_embd_size);

 console::set_display(console::error);
+LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
 console::set_display(console::reset);
-fflush(stdout);
 }

 if (ga_n == 1) {
@@ -575,16 +520,22 @@ int main(int argc, char ** argv) {
 // if we run out of context:
 // - take the n_keep first tokens from the original prompt (via n_past)
 // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
+
+if (n_past + (int) embd.size() >= n_ctx) {
+if (!params.ctx_shift){
+LOG_DBG("\n\n%s: context full and context shift is disabled => stopping\n", __func__);
+break;
+}
+
 if (params.n_predict == -2) {
+LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
 break;
 }

 const int n_left = n_past - params.n_keep;
 const int n_discard = n_left/2;

+LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
 n_past, n_left, n_ctx, params.n_keep, n_discard);

 llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
@@ -592,15 +543,11 @@ int main(int argc, char ** argv) {

 n_past -= n_discard;

-n_past_guidance -= n_discard;
-}
-LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
+LOG_DBG("after swap: n_past = %d\n", n_past);

+LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());

+LOG_DBG("clear session path\n");
 path_session.clear();
 }
 } else {
@@ -610,10 +557,10 @@ int main(int argc, char ** argv) {
 const int bd = (ga_w/ga_n)*(ga_n - 1);
 const int dd = (ga_w/ga_n) - ib*bd - ga_w;

+LOG_DBG("\n");
+LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd);
+LOG_DBG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
+LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);

 llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib*bd);
 llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
@@ -623,7 +570,7 @@ int main(int argc, char ** argv) {

 ga_i += ga_w/ga_n;

+LOG_DBG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i);
 }
 }

@@ -649,65 +596,25 @@ int main(int argc, char ** argv) {
 }
 }

-// evaluate tokens in batches
-// embd is typically prepared beforehand to fit within a batch, but not always
-if (ctx_guidance) {
-int input_size = 0;
-llama_token * input_buf = NULL;
-
-if (n_past_guidance < (int) guidance_inp.size()) {
-// Guidance context should have the same data with these modifications:
-//
-// * Replace the initial prompt
-// * Shift everything by guidance_offset
-embd_guidance = guidance_inp;
-if (embd.begin() + original_prompt_len < embd.end()) {
-embd_guidance.insert(
-embd_guidance.end(),
-embd.begin() + original_prompt_len,
-embd.end()
-);
-}
-
-input_buf = embd_guidance.data();
-input_size = embd_guidance.size();
-
-LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str());
-} else {
-input_buf = embd.data();
-input_size = embd.size();
-}
-
-for (int i = 0; i < input_size; i += params.n_batch) {
-int n_eval = std::min(input_size - i, params.n_batch);
-if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) {
-LOG_TEE("%s : failed to eval\n", __func__);
-return 1;
-}
-
-n_past_guidance += n_eval;
-}
-}
-
 for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
 int n_eval = (int) embd.size() - i;
 if (n_eval > params.n_batch) {
 n_eval = params.n_batch;
 }

+LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());

-if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval
+if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) {
+LOG_ERR("%s : failed to eval\n", __func__);
 return 1;
 }

 n_past += n_eval;

+LOG_DBG("n_past = %d\n", n_past);
 // Display total tokens alongside total time
 if (params.n_print > 0 && n_past % params.n_print == 0) {
+LOG_DBG("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx);
 }
 }

@@ -718,7 +625,6 @@ int main(int argc, char ** argv) {
 }

 embd.clear();
-embd_guidance.clear();

 if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
 // optionally save the session on first sample (for faster prompt loading next time)
@@ -726,14 +632,14 @@ int main(int argc, char ** argv) {
 need_to_save_session = false;
 llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());

+LOG_DBG("saved session to %s\n", path_session.c_str());
 }

-const llama_token id =
+const llama_token id = common_sampler_sample(smpl, ctx, -1);

+common_sampler_accept(smpl, id, /* accept_grammar= */ true);

+// LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());

 embd.push_back(id);

@@ -743,16 +649,16 @@ int main(int argc, char ** argv) {
 // decrement remaining sampling budget
 --n_remain;

+LOG_DBG("n_remain: %d\n", n_remain);
 } else {
 // some user input remains from prompt or interaction, forward it to processing
+LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
 while ((int) embd_inp.size() > n_consumed) {
 embd.push_back(embd_inp[n_consumed]);

 // push the prompt in the sampling context in order to apply repetition penalties later
 // for the prompt, we don't apply grammar rules
+common_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false);

 ++n_consumed;
 if ((int) embd.size() >= params.n_batch) {
@@ -764,10 +670,10 @@ int main(int argc, char ** argv) {
 // display text
 if (input_echo && display) {
 for (auto id : embd) {
-const std::string token_str =
+const std::string token_str = common_token_to_piece(ctx, id, params.special);

 // Console/Stream Output
+LOG("%s", token_str.c_str());

 // Record Displayed Tokens To Log
 // Note: Generated tokens are created one by one hence this check
@@ -779,8 +685,6 @@ int main(int argc, char ** argv) {
 output_tokens.push_back(id);
 output_ss << token_str;
 }
-fflush(stdout);
 }
 }

@@ -795,7 +699,7 @@ int main(int argc, char ** argv) {
 // check for reverse prompt in the last n_prev tokens
 if (!params.antiprompt.empty()) {
 const int n_prev = 32;
-const std::string last_output =
+const std::string last_output = common_sampler_prev_str(smpl, ctx, n_prev);

 is_antiprompt = false;
 // Check if each of the reverse prompts appears at the end of the output.
@@ -817,7 +721,7 @@ int main(int argc, char ** argv) {
 }

 // check for reverse prompt using special tokens
-llama_token last_token =
+llama_token last_token = common_sampler_last(smpl);
 for (std::vector<llama_token> ids : antiprompt_ids) {
 if (ids.size() == 1 && last_token == ids[0]) {
 if (params.interactive) {
@@ -829,18 +733,18 @@ int main(int argc, char ** argv) {
 }

 if (is_antiprompt) {
+LOG_DBG("found antiprompt: %s\n", last_output.c_str());
 }
 }

 // deal with end of generation tokens in interactive mode
-if (llama_token_is_eog(model,
+if (llama_token_is_eog(model, common_sampler_last(smpl))) {
+LOG_DBG("found an EOG token\n");

 if (params.interactive) {
 if (!params.antiprompt.empty()) {
 // tokenize and inject first reverse prompt
-const auto first_antiprompt =
+const auto first_antiprompt = common_tokenize(ctx, params.antiprompt.front(), false, true);
 embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
 is_antiprompt = true;
 }
@@ -849,32 +753,32 @@ int main(int argc, char ** argv) {
 chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str());
 }
 is_interacting = true;
+LOG("\n");
 }
 }

 // if current token is not EOG, we add it to current assistant message
 if (params.conversation) {
-auto id =
-assistant_ss <<
+const auto id = common_sampler_last(smpl);
+assistant_ss << common_token_to_piece(ctx, id, false);
 }

 if (n_past > 0 && is_interacting) {
+LOG_DBG("waiting for user input\n");

 if (params.conversation) {
+LOG("\n> ");
 }

 if (params.input_prefix_bos) {
+LOG_DBG("adding input prefix BOS token\n");
 embd_inp.push_back(llama_token_bos(model));
 }

 std::string buffer;
 if (!params.input_prefix.empty() && !params.conversation) {
+LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
+LOG("%s", params.input_prefix.c_str());
 }

 // color user input only
@@ -897,11 +801,11 @@ int main(int argc, char ** argv) {
 if (buffer.length() > 1) {
 // append input suffix if any
 if (!params.input_suffix.empty() && !params.conversation) {
+LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
+LOG("%s", params.input_suffix.c_str());
 }

+LOG_DBG("buffer: '%s'\n", buffer.c_str());

 const size_t original_size = embd_inp.size();

@@ -914,11 +818,11 @@ int main(int argc, char ** argv) {
 ? chat_add_and_format(model, chat_msgs, "user", std::move(buffer))
 : std::move(buffer);
 // TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
-const auto line_pfx =
-const auto line_inp =
-const auto line_sfx =
+const auto line_pfx = common_tokenize(ctx, params.input_prefix, false, true);
+const auto line_inp = common_tokenize(ctx, user_inp, false, format_chat);
+const auto line_sfx = common_tokenize(ctx, params.input_suffix, false, true);

+LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());

 // if user stop generation mid-way, we must add EOT to finish model's last response
 if (need_insert_eot && format_chat) {
@@ -934,16 +838,16 @@ int main(int argc, char ** argv) {
 for (size_t i = original_size; i < embd_inp.size(); ++i) {
 const llama_token token = embd_inp[i];
 output_tokens.push_back(token);
-output_ss <<
+output_ss << common_token_to_piece(ctx, token);
 }

 // reset assistant message
 assistant_ss.str("");

 n_remain -= line_inp.size();
+LOG_DBG("n_remain: %d\n", n_remain);
 } else {
+LOG_DBG("empty line, passing control back\n");
 }

 input_echo = false; // do not echo this again
@@ -951,7 +855,7 @@ int main(int argc, char ** argv) {

 if (n_past > 0) {
 if (is_interacting) {
+common_sampler_reset(smpl);
 }
 is_interacting = false;
 }
@@ -959,7 +863,7 @@ int main(int argc, char ** argv) {

 // end of generation
 if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.interactive)) {
+LOG(" [end of text]\n");
 break;
 }

@@ -972,23 +876,22 @@ int main(int argc, char ** argv) {
 }

 if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) {
+LOG("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
 llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
 }

+LOG("\n\n");
+common_perf_print(ctx, smpl);
+
+common_sampler_free(smpl);

-if (ctx_guidance) { llama_free(ctx_guidance); }
 llama_free(ctx);
 llama_free_model(model);

-llama_sampling_free(ctx_sampling);
 llama_backend_free();

-#endif // LOG_DISABLE_LOGS
+ggml_threadpool_free(threadpool);
+ggml_threadpool_free(threadpool_batch);

 return 0;
 }