@fugood/llama.node 0.3.1 → 0.3.3
This diff shows the publicly released contents of these package versions as they appear in their respective public registries. It is provided for informational purposes only.
- package/CMakeLists.txt +1 -8
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +4 -2
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +2 -2
- package/src/LlamaCompletionWorker.cpp +10 -10
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +14 -17
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +5 -4
- package/src/llama.cpp/.github/workflows/build.yml +137 -29
- package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
- package/src/llama.cpp/.github/workflows/docker.yml +46 -34
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
- package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +7 -0
- package/src/llama.cpp/CMakeLists.txt +26 -11
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/common/CMakeLists.txt +10 -10
- package/src/llama.cpp/common/arg.cpp +2041 -0
- package/src/llama.cpp/common/arg.h +77 -0
- package/src/llama.cpp/common/common.cpp +523 -1861
- package/src/llama.cpp/common/common.h +234 -106
- package/src/llama.cpp/common/console.cpp +3 -0
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +401 -0
- package/src/llama.cpp/common/log.h +66 -698
- package/src/llama.cpp/common/ngram-cache.cpp +39 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +356 -350
- package/src/llama.cpp/common/sampling.h +62 -139
- package/src/llama.cpp/common/stb_image.h +5990 -6398
- package/src/llama.cpp/docs/build.md +72 -17
- package/src/llama.cpp/examples/CMakeLists.txt +1 -2
- package/src/llama.cpp/examples/batched/batched.cpp +49 -65
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
- package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
- package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
- package/src/llama.cpp/examples/infill/infill.cpp +131 -192
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +686 -150
- package/src/llama.cpp/examples/llava/clip.h +11 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
- package/src/llama.cpp/examples/llava/llava.cpp +146 -26
- package/src/llama.cpp/examples/llava/llava.h +2 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
- package/src/llama.cpp/examples/llava/requirements.txt +1 -0
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
- package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
- package/src/llama.cpp/examples/main/main.cpp +216 -313
- package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
- package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
- package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
- package/src/llama.cpp/examples/server/server.cpp +1347 -1531
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
- package/src/llama.cpp/examples/server/utils.hpp +396 -107
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +132 -106
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
- package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
- package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
- package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
- package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +272 -505
- package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
- package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
- package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
- package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
- package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
- package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
- package/src/llama.cpp/include/llama.h +296 -285
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-grammar.cpp +721 -122
- package/src/llama.cpp/src/llama-grammar.h +120 -15
- package/src/llama.cpp/src/llama-impl.h +156 -1
- package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
- package/src/llama.cpp/src/llama-sampling.h +39 -47
- package/src/llama.cpp/src/llama-vocab.cpp +390 -127
- package/src/llama.cpp/src/llama-vocab.h +60 -20
- package/src/llama.cpp/src/llama.cpp +6215 -3263
- package/src/llama.cpp/src/unicode-data.cpp +6 -4
- package/src/llama.cpp/src/unicode-data.h +4 -4
- package/src/llama.cpp/src/unicode.cpp +15 -7
- package/src/llama.cpp/tests/CMakeLists.txt +4 -2
- package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
- package/src/llama.cpp/tests/test-barrier.cpp +94 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
- package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
- package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
- package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
- package/src/llama.cpp/tests/test-log.cpp +39 -0
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +2 -1
- package/src/llama.cpp/tests/test-sampling.cpp +226 -142
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/patches/llama.patch +0 -22
- package/src/llama.cpp/.github/workflows/bench.yml +0 -310
- package/src/llama.cpp/common/grammar-parser.cpp +0 -536
- package/src/llama.cpp/common/grammar-parser.h +0 -29
- package/src/llama.cpp/common/train.cpp +0 -1513
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
- package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
package/src/llama.cpp/examples/lookup/lookup.cpp

```diff
@@ -1,72 +1,68 @@
+#include "arg.h"
 #include "ggml.h"
-#include "llama.h"
 #include "common.h"
 #include "ngram-cache.h"
+#include "sampling.h"
+#include "log.h"
+#include "llama.h"
 
-#include <cmath>
 #include <cstdint>
 #include <cstdio>
 #include <fstream>
 #include <string>
 #include <vector>
-#include <unordered_map>
 
 int main(int argc, char ** argv){
-
+    common_params params;
 
-    if (!
-        gpt_params_print_usage(argc, argv, params);
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
         return 1;
     }
 
+    common_init();
+
     // max. number of additional tokens to draft if match is found
     const int n_draft = params.n_draft;
 
     const bool dump_kv_cache = params.dump_kv_cache;
 
-#ifndef LOG_DISABLE_LOGS
-    log_set_target(log_filename_generator("lookup", "log"));
-    LOG_TEE("Log start\n");
-    log_dump_cmdline(argc, argv);
-#endif // LOG_DISABLE_LOGS
-
     // init llama.cpp
     llama_backend_init();
     llama_numa_init(params.numa);
 
-    llama_model * model = NULL;
-    llama_context * ctx = NULL;
-
     // load the model
-
+    common_init_result llama_init = common_init_from_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
 
     // tokenize the prompt
     std::vector<llama_token> inp;
-    inp =
+    inp = common_tokenize(ctx, params.prompt, true, true);
 
-
-
-
+    common_ngram_cache ngram_cache_context;
+    common_ngram_cache ngram_cache_dynamic;
+    common_ngram_cache ngram_cache_static;
     int64_t t_draft_flat_us = 0;
     int64_t t_draft_us = 0;
 
     {
         // Fill up context ngram cache with tokens from user input:
         const int64_t t_start_draft_us = ggml_time_us();
-
+        common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, inp.size(), false);
 
         if (!params.lookup_cache_static.empty()) {
             try {
-                ngram_cache_static =
+                ngram_cache_static = common_ngram_cache_load(params.lookup_cache_static);
             } catch (std::ifstream::failure const &) {
-
+                LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
                 exit(1);
             }
         }
 
         if (!params.lookup_cache_dynamic.empty()) {
             try {
-                ngram_cache_dynamic =
+                ngram_cache_dynamic = common_ngram_cache_load(params.lookup_cache_dynamic);
             } catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
         }
 
@@ -77,14 +73,14 @@ int main(int argc, char ** argv){
     const int max_tokens_list_size = max_context_size - 4;
 
     if ((int) inp.size() > max_tokens_list_size) {
-
+        LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
         return 1;
     }
 
-
+    LOG("\n\n");
 
     for (auto id : inp) {
-
+        LOG("%s", common_token_to_piece(ctx, id).c_str());
     }
 
     fflush(stderr);
@@ -93,8 +89,8 @@ int main(int argc, char ** argv){
 
     const auto t_enc_start = ggml_time_us();
 
-    llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1
-    llama_decode(ctx, llama_batch_get_one(&inp.back(), 1
+    llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1));
+    llama_decode(ctx, llama_batch_get_one(&inp.back(), 1));
 
     const auto t_enc_end = ggml_time_us();
 
@@ -106,7 +102,7 @@ int main(int argc, char ** argv){
 
     bool has_eos = false;
 
-    struct
+    struct common_sampler * smpl = common_sampler_init(model, params.sparams);
 
     std::vector<llama_token> draft;
 
@@ -121,23 +117,23 @@ int main(int argc, char ** argv){
         // debug
         if (dump_kv_cache) {
            llama_kv_cache_view_update(ctx, &kvc_view);
-
+            common_kv_cache_dump_view_seqs(kvc_view, 40);
        }
 
         // print current draft sequence
-
+        LOG_DBG("drafted %s\n", string_from(ctx, draft).c_str());
 
         int i_dft = 0;
         while (true) {
             // sample from the target model
-            llama_token id =
+            llama_token id = common_sampler_sample(smpl, ctx, i_dft);
 
-
+            common_sampler_accept(smpl, id, true);
 
-            const std::string token_str =
+            const std::string token_str = common_token_to_piece(ctx, id);
 
             if (!params.use_color) {
-
+                LOG("%s", token_str.c_str());
             }
 
             if (llama_token_is_eog(model, id)) {
@@ -148,7 +144,7 @@ int main(int argc, char ** argv){
 
             // check if the target token matches the draft
             if (i_dft < (int) draft.size() && id == draft[i_dft]) {
-
+                LOG_DBG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
                 ++n_accept;
                 ++n_past;
                 ++i_dft;
@@ -156,25 +152,25 @@ int main(int argc, char ** argv){
                 {
                     // Update context ngram cache with the newly accepted token:
                     const int64_t t_start_draft_us = ggml_time_us();
-
+                    common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
                     t_draft_us += ggml_time_us() - t_start_draft_us;
                 }
 
                 if (params.use_color) {
                     // color accepted draft token
-
+                    LOG("\033[34m%s\033[0m", token_str.c_str());
                     fflush(stdout);
                 }
                 continue;
             }
 
             if (params.use_color) {
-
+                LOG("%s", token_str.c_str());
             }
             fflush(stdout);
 
 
-
+            LOG_DBG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
 
             draft.clear();
             draft.push_back(id);
@@ -182,7 +178,7 @@ int main(int argc, char ** argv){
             {
                 // Update context ngram cache with the newly accepted token:
                 const int64_t t_start_draft_us = ggml_time_us();
-
+                common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
                 t_draft_us += ggml_time_us() - t_start_draft_us;
             }
             break;
@@ -196,18 +192,18 @@ int main(int argc, char ** argv){
         // clean the cache of draft tokens that weren't accepted
         llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
 
-
-
+        common_batch_clear(batch_tgt);
+        common_batch_add(batch_tgt, draft[0], n_past, { 0 }, true);
 
         // Draft already contains a single token sampled from the model:
         GGML_ASSERT(draft.size() == 1);
         GGML_ASSERT(draft[0] == inp.back());
         const int64_t t_start_draft_us = ggml_time_us();
 
-
+        common_ngram_cache_draft(inp, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
 
         for (size_t i = 1; i < draft.size(); ++i) {
-
+            common_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true);
         }
 
         t_draft_us += ggml_time_us() - t_start_draft_us;
@@ -222,28 +218,29 @@ int main(int argc, char ** argv){
     auto t_dec_end = ggml_time_us();
 
     // Update dynamic ngram cache with context ngram cache and save it to disk:
-
-
+    common_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
+    common_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic);
 
-
+    LOG("\n\n");
 
-
-
+    LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
+    LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
 
-
-
-
-
-
-
+    LOG_INF("\n");
+    LOG_INF("n_draft = %d\n", n_draft);
+    LOG_INF("n_predict = %d\n", n_predict);
+    LOG_INF("n_drafted = %d\n", n_drafted);
+    LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
+    LOG_INF("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
             t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
-
-
+    LOG_INF("n_accept = %d\n", n_accept);
+    LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
+
+    LOG_INF("\ntarget:\n\n");
+    common_perf_print(ctx, smpl);
 
-
-    llama_print_timings(ctx);
+    common_sampler_free(smpl);
 
-    llama_sampling_free(ctx_sampling);
     llama_batch_free(batch_tgt);
 
     llama_free(ctx);
@@ -251,7 +248,7 @@ int main(int argc, char ** argv){
 
     llama_backend_free();
 
-
+    LOG("\n\n");
 
     return 0;
 }
```