@fugood/llama.node 0.3.1 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -8
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +4 -2
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +2 -2
- package/src/LlamaCompletionWorker.cpp +10 -10
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +14 -17
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +5 -4
- package/src/llama.cpp/.github/workflows/build.yml +137 -29
- package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
- package/src/llama.cpp/.github/workflows/docker.yml +46 -34
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
- package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +7 -0
- package/src/llama.cpp/CMakeLists.txt +26 -11
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/common/CMakeLists.txt +10 -10
- package/src/llama.cpp/common/arg.cpp +2041 -0
- package/src/llama.cpp/common/arg.h +77 -0
- package/src/llama.cpp/common/common.cpp +523 -1861
- package/src/llama.cpp/common/common.h +234 -106
- package/src/llama.cpp/common/console.cpp +3 -0
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +401 -0
- package/src/llama.cpp/common/log.h +66 -698
- package/src/llama.cpp/common/ngram-cache.cpp +39 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +356 -350
- package/src/llama.cpp/common/sampling.h +62 -139
- package/src/llama.cpp/common/stb_image.h +5990 -6398
- package/src/llama.cpp/docs/build.md +72 -17
- package/src/llama.cpp/examples/CMakeLists.txt +1 -2
- package/src/llama.cpp/examples/batched/batched.cpp +49 -65
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
- package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
- package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
- package/src/llama.cpp/examples/infill/infill.cpp +131 -192
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +686 -150
- package/src/llama.cpp/examples/llava/clip.h +11 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
- package/src/llama.cpp/examples/llava/llava.cpp +146 -26
- package/src/llama.cpp/examples/llava/llava.h +2 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
- package/src/llama.cpp/examples/llava/requirements.txt +1 -0
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
- package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
- package/src/llama.cpp/examples/main/main.cpp +216 -313
- package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
- package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
- package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
- package/src/llama.cpp/examples/server/server.cpp +1347 -1531
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
- package/src/llama.cpp/examples/server/utils.hpp +396 -107
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +132 -106
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
- package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
- package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
- package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
- package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +272 -505
- package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
- package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
- package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
- package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
- package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
- package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
- package/src/llama.cpp/include/llama.h +296 -285
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-grammar.cpp +721 -122
- package/src/llama.cpp/src/llama-grammar.h +120 -15
- package/src/llama.cpp/src/llama-impl.h +156 -1
- package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
- package/src/llama.cpp/src/llama-sampling.h +39 -47
- package/src/llama.cpp/src/llama-vocab.cpp +390 -127
- package/src/llama.cpp/src/llama-vocab.h +60 -20
- package/src/llama.cpp/src/llama.cpp +6215 -3263
- package/src/llama.cpp/src/unicode-data.cpp +6 -4
- package/src/llama.cpp/src/unicode-data.h +4 -4
- package/src/llama.cpp/src/unicode.cpp +15 -7
- package/src/llama.cpp/tests/CMakeLists.txt +4 -2
- package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
- package/src/llama.cpp/tests/test-barrier.cpp +94 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
- package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
- package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
- package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
- package/src/llama.cpp/tests/test-log.cpp +39 -0
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +2 -1
- package/src/llama.cpp/tests/test-sampling.cpp +226 -142
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/patches/llama.patch +0 -22
- package/src/llama.cpp/.github/workflows/bench.yml +0 -310
- package/src/llama.cpp/common/grammar-parser.cpp +0 -536
- package/src/llama.cpp/common/grammar-parser.h +0 -29
- package/src/llama.cpp/common/train.cpp +0 -1513
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
- package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
- package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
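The remainder of the report is the rendered per-file diff for package/src/llama.cpp/examples/infill/infill.cpp (recognizable from the removed `write_logfile` helper, which prints "binary: infill"); removed lines whose text the diff viewer did not capture appear below as bare `-` markers. The change that dominates it is llama.cpp's common-library refactor: `gpt_params` becomes `common_params`, the `llama_sampling_*` context becomes `common_sampler_*`, and `LOG_TEE`/`printf` give way to the leveled `LOG_INF`/`LOG_WRN`/`LOG_ERR`/`LOG_DBG` macros. As orientation, here is a minimal sketch of the new call sequence; the function names are taken from the added lines of the diff, but the surrounding scaffolding (including the `LLAMA_EXAMPLE_COMMON` parsing context and the BOS/tokenization choices) is illustrative, not the package's actual code:

```cpp
// Hypothetical minimal driver, not code from the package: it strings together
// the 0.3.3-era calls introduced in the diff below (common_params_parse,
// common_init_from_params, common_sampler_*) so the renames are easier to follow.
#include "arg.h"
#include "common.h"
#include "sampling.h"
#include "log.h"
#include "llama.h"

#include <vector>

int main(int argc, char ** argv) {
    common_params params;                       // was: gpt_params
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
        return 1;
    }
    common_init();                              // replaces the old LOG_DISABLE_LOGS setup

    llama_backend_init();
    llama_numa_init(params.numa);

    common_init_result llama_init = common_init_from_params(params);
    llama_model   * model = llama_init.model;
    llama_context * ctx   = llama_init.context;
    if (model == NULL) {
        LOG_ERR("%s: unable to load model\n", __func__);
        return 1;
    }

    // was: llama_sampling_context * ctx_sampling = llama_sampling_init(sparams)
    common_sampler * smpl = common_sampler_init(model, params.sparams);

    // assumed here: tokenize the plain prompt with special tokens enabled
    std::vector<llama_token> embd_inp = common_tokenize(ctx, params.prompt, true);
    if (llama_decode(ctx, llama_batch_get_one(embd_inp.data(), (int32_t) embd_inp.size()))) {
        LOG_ERR("%s: failed to eval\n", __func__);
        return 1;
    }

    for (int i = 0; i < params.n_predict; i++) {
        llama_token id = common_sampler_sample(smpl, ctx, -1);
        common_sampler_accept(smpl, id, /* accept_grammar */ true);
        if (llama_token_is_eog(model, id)) {
            break;
        }
        LOG("%s", common_token_to_piece(ctx, id).c_str());
        if (llama_decode(ctx, llama_batch_get_one(&id, 1))) {
            return 1;
        }
    }
    LOG("\n");

    common_sampler_free(smpl);
    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```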
```diff
--- a/package/src/llama.cpp/examples/infill/infill.cpp
+++ b/package/src/llama.cpp/examples/infill/infill.cpp
@@ -1,8 +1,9 @@
+#include "arg.h"
 #include "common.h"
-
 #include "console.h"
+#include "sampling.h"
+#include "log.h"
 #include "llama.h"
-#include "grammar-parser.h"
 
 #include <cassert>
 #include <cinttypes>
@@ -34,57 +35,14 @@
 
 static llama_context ** g_ctx;
 static llama_model ** g_model;
-static
+static common_sampler ** g_smpl;
+static common_params * g_params;
 static std::vector<llama_token> * g_input_tokens;
 static std::ostringstream * g_output_ss;
 static std::vector<llama_token> * g_output_tokens;
 
 static bool is_interacting = false;
 
-static void write_logfile(
-    const llama_context * ctx, const gpt_params & params, const llama_model * model,
-    const std::vector<llama_token> & input_tokens, const std::string & output,
-    const std::vector<llama_token> & output_tokens
-) {
-    if (params.logdir.empty()) {
-        return;
-    }
-
-    const std::string timestamp = string_get_sortable_timestamp();
-
-    const bool success = fs_create_directory_with_parents(params.logdir);
-    if (!success) {
-        fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
-                __func__, params.logdir.c_str());
-        return;
-    }
-
-    const std::string logfile_path = params.logdir + timestamp + ".yml";
-    FILE * logfile = fopen(logfile_path.c_str(), "w");
-
-    if (logfile == NULL) {
-        fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
-        return;
-    }
-
-    fprintf(logfile, "binary: infill\n");
-    char model_desc[128];
-    llama_model_desc(model, model_desc, sizeof(model_desc));
-    yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);
-
-    fprintf(logfile, "\n");
-    fprintf(logfile, "######################\n");
-    fprintf(logfile, "# Generation Results #\n");
-    fprintf(logfile, "######################\n");
-    fprintf(logfile, "\n");
-
-    yaml_dump_string_multiline(logfile, "output", output.c_str());
-    yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
-
-    llama_dump_timing_info_yaml(logfile, ctx);
-    fclose(logfile);
-}
-
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
 static void sigint_handler(int signo) {
     if (signo == SIGINT) {
@@ -92,9 +50,13 @@ static void sigint_handler(int signo) {
             is_interacting = true;
         } else {
             console::cleanup();
-
-
-
+            LOG("\n");
+            common_perf_print(*g_ctx, *g_smpl);
+
+            // make sure all logs are flushed
+            LOG("Interrupted by user\n");
+            common_log_pause(common_log_main());
+
             _exit(130);
         }
     }
@@ -102,118 +64,107 @@ static void sigint_handler(int signo) {
 #endif
 
 int main(int argc, char ** argv) {
-
-    llama_sampling_params & sparams = params.sparams;
+    common_params params;
     g_params = &params;
 
-    if (!
-        gpt_params_print_usage(argc, argv, params);
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) {
        return 1;
     }
 
-
-
-
-    log_dump_cmdline(argc, argv);
-#endif // LOG_DISABLE_LOGS
+    common_init();
+
+    auto & sparams = params.sparams;
 
     console::init(params.simple_io, params.use_color);
     atexit([]() { console::cleanup(); });
 
     if (params.logits_all) {
-
-
-
+        LOG_ERR("\n************\n");
+        LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
+        LOG_ERR("************\n\n");
 
         return 0;
     }
 
     if (params.embedding) {
-
-
-
+        LOG_ERR("\n************\n");
+        LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
+        LOG_ERR("************\n\n");
 
         return 0;
     }
 
     if (params.n_ctx != 0 && params.n_ctx < 8) {
-
+        LOG_WRN("%s: minimum context size is 8, using minimum size.\n", __func__);
         params.n_ctx = 8;
     }
+
     if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
-
-
-
+        LOG_ERR("\n************\n");
+        LOG_ERR("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
+        LOG_ERR("************\n\n");
 
         return 0;
     }
 
     if (params.rope_freq_base != 0.0) {
-
+        LOG_WRN("%s: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
     }
 
     if (params.rope_freq_scale != 0.0) {
-
-    }
-
-    LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
-    LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
-
-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
+        LOG_WRN("%s: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
     }
 
-
-
-    std::mt19937 rng(params.seed);
-
-    LOG("%s: llama backend init\n", __func__);
+    LOG_INF("%s: llama backend init\n", __func__);
     llama_backend_init();
     llama_numa_init(params.numa);
 
-    llama_model * model;
-    llama_context * ctx;
+    llama_model * model = nullptr;
+    llama_context * ctx = nullptr;
+    common_sampler * smpl = nullptr;
 
     g_model = &model;
     g_ctx = &ctx;
+    g_smpl = &smpl;
 
     // load the model and apply lora adapter, if any
-
-
+    LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
+    common_init_result llama_init = common_init_from_params(params);
+
+    model = llama_init.model;
+    ctx = llama_init.context;
 
     if (model == NULL) {
-
+        LOG_ERR("%s: unable to load model\n", __func__);
         return 1;
     }
 
     const int n_ctx_train = llama_n_ctx_train(model);
     const int n_ctx = llama_n_ctx(ctx);
-
+    LOG_DBG("n_ctx: %d\n", n_ctx);
 
     if (n_ctx > n_ctx_train) {
-
-            __func__, n_ctx_train, n_ctx);
+        LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
     }
 
     // print system information
     {
-
-
+        LOG_INF("\n");
+        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
     }
-    const bool add_bos =
-    GGML_ASSERT(llama_add_eos_token(model)
-    LOG("add_bos: %d\n", add_bos);
+    const bool add_bos = llama_add_bos_token(model);
+    GGML_ASSERT(!llama_add_eos_token(model));
 
     std::vector<llama_token> embd_inp;
     std::vector<llama_token> embd_end;
-    std::vector<llama_token> inp_pfx =
-    std::vector<llama_token> inp_sfx =
+    std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
+    std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);
 
-    GGML_ASSERT(
-    GGML_ASSERT(
+    GGML_ASSERT(llama_token_fim_pre(model) >= 0);
+    GGML_ASSERT(llama_token_fim_suf(model) >= 0);
 
-    inp_pfx.insert(inp_pfx.begin(),
-    inp_sfx.insert(inp_sfx.begin(),
+    inp_pfx.insert(inp_pfx.begin(), llama_token_fim_pre(model));
+    inp_sfx.insert(inp_sfx.begin(), llama_token_fim_suf(model));
 
     embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
     embd_end = params.spm_infill ? inp_pfx : inp_sfx;
@@ -222,23 +173,24 @@ int main(int argc, char ** argv) {
     }
     embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
 
-    const llama_token middle_token =
+    const llama_token middle_token = llama_token_fim_mid(model);
     if (middle_token >= 0) {
         embd_inp.push_back(middle_token);
     }
 
-
-
-
+    LOG_DBG("add_bos: %d\n", add_bos);
+    LOG_DBG("prefix: \"%s\"\n", params.input_prefix.c_str());
+    LOG_DBG("suffix: \"%s\"\n", params.input_suffix.c_str());
+    LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());
 
     // Should not run without any tokens
     if (embd_inp.empty()) {
         embd_inp.push_back(llama_token_bos(model));
-
+        LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
     }
 
     if ((int) embd_inp.size() > n_ctx - 4) {
-
+        LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
         return 1;
     }
 
@@ -247,9 +199,8 @@ int main(int argc, char ** argv) {
         params.n_keep = (int)embd_inp.size();
     }
 
-
-
-
+    LOG_INF("inp_pfx: %s\n", string_from(ctx, inp_pfx).c_str());
+    LOG_INF("inp_sfx: %s\n", string_from(ctx, inp_sfx).c_str());
 
     // enable interactive mode if interactive start is specified
     if (params.interactive_first) {
@@ -257,21 +208,21 @@ int main(int argc, char ** argv) {
     }
 
     if (params.verbose_prompt) {
-
-
-
+        LOG_INF("\n");
+        LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
+        LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
         for (int i = 0; i < (int) embd_inp.size(); i++) {
-
+            LOG_INF("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str());
         }
 
         if (params.n_keep > 0) {
-
+            LOG_INF("%s: static prompt based on n_keep: '", __func__);
             for (int i = 0; i < params.n_keep; i++) {
-
+                LOG_CNT("%s", common_token_to_piece(ctx, embd_inp[i]).c_str());
             }
-
+            LOG_CNT("'\n");
         }
-
+        LOG_INF("\n");
     }
 
     if (params.interactive) {
@@ -288,30 +239,30 @@ int main(int argc, char ** argv) {
         SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
 #endif
 
-
+        LOG_INF("%s: interactive mode on.\n", __func__);
 
         if (params.input_prefix_bos) {
-
+            LOG_INF("Input prefix with BOS\n");
         }
 
         if (!params.input_prefix.empty()) {
-
+            LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str());
         }
 
         if (!params.input_suffix.empty()) {
-
+            LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
         }
     }
-
-
-
-
-
-
-
-
-
-
+    smpl = common_sampler_init(model, sparams);
+
+    LOG_INF("sampler seed: %u\n", common_sampler_get_seed(smpl));
+    LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
+    LOG_INF("sampler chain: %s\n", common_sampler_print(smpl).c_str());
+
+    LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
+
+    LOG_INF("\n");
+    LOG_INF("\n##### Infill mode #####\n\n");
     if (params.interactive) {
         const char *control_message;
         if (params.multiline_input) {
@@ -322,11 +273,11 @@ int main(int argc, char ** argv) {
                 " - To return control without starting a new line, end your input with '/'.\n"
                 " - If you want to submit another line, end your input with '\\'.\n";
         }
-
+        LOG_INF("== Running in interactive mode. ==\n");
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
-
+        LOG_INF( " - Press Ctrl+C to interject at any time.\n");
 #endif
-
+        LOG_INF( "%s\n", control_message);
 
         is_interacting = params.interactive_first;
     }
@@ -346,8 +297,6 @@ int main(int argc, char ** argv) {
 
     std::vector<llama_token> embd;
 
-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
-
     while (n_remain != 0 || params.interactive) {
         // predict
         if (!embd.empty()) {
@@ -361,9 +310,8 @@ int main(int argc, char ** argv) {
                 embd.resize(max_embd_size);
 
                 console::set_display(console::error);
-
+                LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
                 console::set_display(console::reset);
-                fflush(stdout);
             }
 
             // infinite text generation via context swapping
@@ -372,14 +320,14 @@ int main(int argc, char ** argv) {
             // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
             if (n_past + (int) embd.size() > n_ctx) {
                 if (params.n_predict == -2) {
-
+                    LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
                     break;
                 }
 
                 const int n_left = n_past - params.n_keep - 1;
                 const int n_discard = n_left/2;
 
-
+                LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
                         n_past, n_left, n_ctx, params.n_keep, n_discard);
 
                 llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
@@ -387,9 +335,9 @@ int main(int argc, char ** argv) {
 
                 n_past -= n_discard;
 
-
+                LOG_DBG("after swap: n_past = %d\n", n_past);
 
-
+                LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
 
             }
 
@@ -401,16 +349,16 @@ int main(int argc, char ** argv) {
                     n_eval = params.n_batch;
                 }
 
-
+                LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
 
-                if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval
-
+                if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) {
+                    LOG_ERR("%s : failed to eval\n", __func__);
                    return 1;
                 }
 
                 n_past += n_eval;
 
-
+                LOG_DBG("n_past = %d\n", n_past);
             }
 
         }
@@ -418,11 +366,11 @@ int main(int argc, char ** argv) {
         embd.clear();
 
        if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
-            const llama_token id =
+            const llama_token id = common_sampler_sample(smpl, ctx, -1);
 
-
+            common_sampler_accept(smpl, id, true);
 
-
+            // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
 
             embd.push_back(id);
 
@@ -432,16 +380,16 @@ int main(int argc, char ** argv) {
             // decrement remaining sampling budget
             --n_remain;
 
-
+            LOG_DBG("n_remain: %d\n", n_remain);
         } else {
             // some user input remains from prompt or interaction, forward it to processing
-
+            LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
             while ((int) embd_inp.size() > n_consumed) {
                 embd.push_back(embd_inp[n_consumed]);
 
                 // push the prompt in the sampling context in order to apply repetition penalties later
                 // for the prompt, we don't apply grammar rules
-
+                common_sampler_accept(smpl, embd_inp[n_consumed], false);
 
                 ++n_consumed;
                 if ((int) embd.size() >= params.n_batch) {
@@ -453,8 +401,8 @@ int main(int argc, char ** argv) {
         // display text
         if (input_echo) {
             for (auto id : embd) {
-                const std::string token_str =
-
+                const std::string token_str = common_token_to_piece(ctx, id);
+                LOG("%s", token_str.c_str());
 
                 if (embd.size() > 1) {
                     input_tokens.push_back(id);
@@ -463,7 +411,6 @@ int main(int argc, char ** argv) {
                     output_ss << token_str;
                 }
             }
-            fflush(stdout);
         }
         // reset color to default if we there is no pending user input
         if (input_echo && (int) embd_inp.size() == n_consumed) {
@@ -473,13 +420,12 @@ int main(int argc, char ** argv) {
         // if not currently processing queued inputs;
        if ((int) embd_inp.size() <= n_consumed) {
             // deal with eot token in infill mode
-            if ((
+            if ((common_sampler_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){
                 if (is_interacting && !params.interactive_first) {
                     // print an eot token
-
+                    LOG("%s", common_token_to_piece(ctx, llama_token_eot(model)).c_str());
                 }
-
-                printf("\n");
+                LOG("\n");
                 console::set_display(console::user_input);
                 std::string buffer;
                 std::string line;
@@ -514,11 +460,11 @@ int main(int argc, char ** argv) {
                 }
 
                 // tokenize new prefix and suffix
-                std::vector<llama_token> inp_pfx =
-                std::vector<llama_token> inp_sfx =
+                std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
+                std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);
 
-                inp_pfx.insert(inp_pfx.begin(),
-                inp_sfx.insert(inp_sfx.begin(),
+                inp_pfx.insert(inp_pfx.begin(), llama_token_fim_pre(model));
+                inp_sfx.insert(inp_sfx.begin(), llama_token_fim_suf(model));
 
                 embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
                 embd_end = params.spm_infill ? inp_pfx : inp_sfx;
@@ -535,35 +481,33 @@ int main(int argc, char ** argv) {
                 n_remain = params.n_predict;
                 n_past = 0;
                 n_consumed = 0;
-                // LOG_TEE("took new input\n");
                 is_interacting = false;
             }
             // deal with end of generation tokens in interactive mode
-            else if (llama_token_is_eog(model,
-
+            else if (llama_token_is_eog(model, common_sampler_last(smpl))) {
+                LOG_DBG("found EOS token\n");
 
                 if (params.interactive) {
 
                     is_interacting = true;
-
+                    LOG("\n");
                     console::set_display(console::user_input);
-                    fflush(stdout);
                 }
             }
 
             if (n_past > 0 && is_interacting && !params.interactive) {
-
+                LOG_DBG("waiting for user input\n");
 
                 if (params.input_prefix_bos) {
-
+                    LOG_DBG("adding input prefix BOS token\n");
                    embd_inp.push_back(llama_token_bos(model));
                }
 
                 std::string buffer;
                 if (!params.input_prefix.empty()) {
-
+                    LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
                     buffer += params.input_prefix;
-
+                    LOG("%s", buffer.c_str());
                 }
 
                 std::string line;
@@ -581,30 +525,30 @@ int main(int argc, char ** argv) {
                 if (buffer.length() > 1) {
                     // append input suffix if any
                     if (!params.input_suffix.empty()) {
-
+                        LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
                        buffer += params.input_suffix;
-
+                        LOG("%s", params.input_suffix.c_str());
                     }
 
-
+                    LOG_DBG("buffer: '%s'\n", buffer.c_str());
 
                     const size_t original_size = embd_inp.size();
 
-                    const auto line_inp =
-
+                    const auto line_inp = common_tokenize(ctx, buffer, false);
+                    LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
 
                     embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
 
                     for (size_t i = original_size; i < embd_inp.size(); ++i) {
                         const llama_token token = embd_inp[i];
                         output_tokens.push_back(token);
-                        output_ss <<
+                        output_ss << common_token_to_piece(ctx, token);
                     }
 
                     n_remain -= line_inp.size();
-
+                    LOG_DBG("n_remain: %d\n", n_remain);
                 } else {
-
+                    LOG_DBG("empty line, passing control back\n");
                 }
 
                 input_echo = false; // do not echo this again
@@ -612,7 +556,7 @@ int main(int argc, char ** argv) {
 
         if (n_past > 0) {
             if (is_interacting) {
-
+                common_sampler_reset(smpl);
            }
             is_interacting = false;
         }
@@ -631,22 +575,17 @@ int main(int argc, char ** argv) {
         }
     }
     if (!params.interactive && n_remain <= 0) {
-
-        fflush(stdout);
+        LOG("%s", common_token_to_piece(ctx, llama_token_eot(model)).c_str());
    }
 
-
-
+    LOG("\n");
+    common_perf_print(ctx, smpl);
 
     llama_free(ctx);
     llama_free_model(model);
 
-
+    common_sampler_free(smpl);
     llama_backend_free();
 
-#ifndef LOG_DISABLE_LOGS
-    LOG_TEE("Log end\n");
-#endif // LOG_DISABLE_LOGS
-
     return 0;
 }
```