@fugood/llama.node 0.3.1 → 0.3.3
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -8
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +4 -2
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +2 -2
- package/src/LlamaCompletionWorker.cpp +10 -10
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +14 -17
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +5 -4
- package/src/llama.cpp/.github/workflows/build.yml +137 -29
- package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
- package/src/llama.cpp/.github/workflows/docker.yml +46 -34
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
- package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +7 -0
- package/src/llama.cpp/CMakeLists.txt +26 -11
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/common/CMakeLists.txt +10 -10
- package/src/llama.cpp/common/arg.cpp +2041 -0
- package/src/llama.cpp/common/arg.h +77 -0
- package/src/llama.cpp/common/common.cpp +523 -1861
- package/src/llama.cpp/common/common.h +234 -106
- package/src/llama.cpp/common/console.cpp +3 -0
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +401 -0
- package/src/llama.cpp/common/log.h +66 -698
- package/src/llama.cpp/common/ngram-cache.cpp +39 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +356 -350
- package/src/llama.cpp/common/sampling.h +62 -139
- package/src/llama.cpp/common/stb_image.h +5990 -6398
- package/src/llama.cpp/docs/build.md +72 -17
- package/src/llama.cpp/examples/CMakeLists.txt +1 -2
- package/src/llama.cpp/examples/batched/batched.cpp +49 -65
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
- package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
- package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
- package/src/llama.cpp/examples/infill/infill.cpp +131 -192
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +686 -150
- package/src/llama.cpp/examples/llava/clip.h +11 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
- package/src/llama.cpp/examples/llava/llava.cpp +146 -26
- package/src/llama.cpp/examples/llava/llava.h +2 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
- package/src/llama.cpp/examples/llava/requirements.txt +1 -0
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
- package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
- package/src/llama.cpp/examples/main/main.cpp +216 -313
- package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
- package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
- package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
- package/src/llama.cpp/examples/server/server.cpp +1347 -1531
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
- package/src/llama.cpp/examples/server/utils.hpp +396 -107
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +132 -106
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
- package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
- package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
- package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
- package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +272 -505
- package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
- package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
- package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
- package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
- package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
- package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
- package/src/llama.cpp/include/llama.h +296 -285
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-grammar.cpp +721 -122
- package/src/llama.cpp/src/llama-grammar.h +120 -15
- package/src/llama.cpp/src/llama-impl.h +156 -1
- package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
- package/src/llama.cpp/src/llama-sampling.h +39 -47
- package/src/llama.cpp/src/llama-vocab.cpp +390 -127
- package/src/llama.cpp/src/llama-vocab.h +60 -20
- package/src/llama.cpp/src/llama.cpp +6215 -3263
- package/src/llama.cpp/src/unicode-data.cpp +6 -4
- package/src/llama.cpp/src/unicode-data.h +4 -4
- package/src/llama.cpp/src/unicode.cpp +15 -7
- package/src/llama.cpp/tests/CMakeLists.txt +4 -2
- package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
- package/src/llama.cpp/tests/test-barrier.cpp +94 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
- package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
- package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
- package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
- package/src/llama.cpp/tests/test-log.cpp +39 -0
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +2 -1
- package/src/llama.cpp/tests/test-sampling.cpp +226 -142
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/patches/llama.patch +0 -22
- package/src/llama.cpp/.github/workflows/bench.yml +0 -310
- package/src/llama.cpp/common/grammar-parser.cpp +0 -536
- package/src/llama.cpp/common/grammar-parser.h +0 -29
- package/src/llama.cpp/common/train.cpp +0 -1513
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
- package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
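Most of the churn in the `examples/` diff excerpts below comes from llama.cpp's reorganized `common` helpers: argument parsing moves into the new `common/arg.h`/`arg.cpp` (`common_params` and `common_params_parse` replace the removed `gpt_params_print_usage` path), logging moves from `LOG_TEE`/`log_set_target` to the rewritten `common/log.h` macros (`LOG`, `LOG_INF`, `LOG_ERR`), and model/context setup goes through `common_init_from_params`. A minimal sketch of the new setup path, assembled only from calls that appear verbatim in the diffs below; it assumes a llama.cpp tree at this revision and is not a complete example:

```cpp
// Sketch only: pieced together from the example diffs in this package update.
#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"

#include <vector>

int main(int argc, char ** argv) {
    common_params params;

    // replaces the removed gpt_params parsing / gpt_params_print_usage() path
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
        return 1;
    }

    common_init(); // new common/log.h based logging setup

    llama_backend_init();
    llama_numa_init(params.numa);

    // replaces the old manual model/context initialization
    common_init_result llama_init = common_init_from_params(params);

    llama_model   * model = llama_init.model;
    llama_context * ctx   = llama_init.context;

    std::vector<llama_token> inp = common_tokenize(ctx, params.prompt, true, true);
    LOG_INF("%s: prompt tokenized into %d tokens\n", __func__, (int) inp.size());

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();

    return 0;
}
```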
package/src/llama.cpp/examples/lookahead/lookahead.cpp:

@@ -1,7 +1,9 @@
+#include "arg.h"
 #include "common.h"
+#include "sampling.h"
+#include "log.h"
 #include "llama.h"

-#include <cmath>
 #include <cstdio>
 #include <string>
 #include <vector>

@@ -35,54 +37,49 @@ struct ngram_container {
 };

 int main(int argc, char ** argv) {
-
+common_params params;

-if (!
-gpt_params_print_usage(argc, argv, params);
+if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
 return 1;
 }

+common_init();
+
 const int W = 15; // lookahead window
 const int N = 5; // n-gram size
 const int G = 15; // max verification n-grams

 const bool dump_kv_cache = params.dump_kv_cache;

-#ifndef LOG_DISABLE_LOGS
-log_set_target(log_filename_generator("lookahead", "log"));
-LOG_TEE("Log start\n");
-log_dump_cmdline(argc, argv);
-#endif // LOG_DISABLE_LOGS
-
 // init llama.cpp
 llama_backend_init();
 llama_numa_init(params.numa);

-llama_model * model = NULL;
-llama_context * ctx = NULL;
-
 // load the target model
-
+common_init_result llama_init = common_init_from_params(params);
+
+llama_model * model = llama_init.model;
+llama_context * ctx = llama_init.context;

 // Tokenize the prompt
 std::vector<llama_token> inp;
 std::vector<llama_token> all;

-inp =
+inp = common_tokenize(ctx, params.prompt, true, true);
 all = inp;

 const int max_context_size = llama_n_ctx(ctx);
 const int max_tokens_list_size = max_context_size - 4;

 if ((int) inp.size() > max_tokens_list_size) {
-
+LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
 return 1;
 }

-
+LOG("\n\n");

 for (auto id : inp) {
-
+LOG("%s", common_token_to_piece(ctx, id).c_str());
 }

 fflush(stderr);

@@ -92,8 +89,8 @@ int main(int argc, char ** argv) {
 const auto t_enc_start = ggml_time_us();

 // eval the prompt
-llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1
-llama_decode(ctx, llama_batch_get_one(&inp.back(), 1
+llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1));
+llama_decode(ctx, llama_batch_get_one(&inp.back(), 1));

 for (int s = 1; s < W + G + 1; ++s) {
 llama_kv_cache_seq_cp(ctx, 0, s, -1, -1);

@@ -118,7 +115,7 @@ int main(int argc, char ** argv) {
 llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);

 // target model sampling context
-struct
+struct common_sampler * smpl = common_sampler_init(model, params.sparams);

 // verification n-grams
 std::vector<ngram_data> ngrams_cur(G);

@@ -159,14 +156,14 @@ int main(int argc, char ** argv) {

 // sample first token
 {
-id =
+id = common_sampler_sample(smpl, ctx, 0);

-
+common_sampler_accept(smpl, id, true);

 {
-const std::string token_str =
+const std::string token_str = common_token_to_piece(ctx, id);

-
+LOG("%s", token_str.c_str());
 fflush(stdout);
 }
 }

@@ -175,7 +172,7 @@ int main(int argc, char ** argv) {
 // debug
 if (dump_kv_cache) {
 llama_kv_cache_view_update(ctx, &kvc_view);
-
+common_kv_cache_dump_view_seqs(kvc_view, 40);
 }

 // build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/

@@ -204,10 +201,10 @@ int main(int argc, char ** argv) {
 // V V V V V V
 // id
 {
-
+common_batch_clear(batch);

 // current token - first token of the first level
-
+common_batch_add(batch, id, n_past, seq_id_all, true);

 // verification n-grams - queue this before the lookahead tokens for less KV cache fragmentation
 {

@@ -232,7 +229,7 @@ int main(int argc, char ** argv) {
 ngrams_cur[g].tokens [j + 1] = t;
 ngrams_cur[g].i_batch[j + 1] = batch.n_tokens;

-
+common_batch_add(batch, t, n_past + j + 1, { W + 1 + g }, true);
 }
 }
 }

@@ -244,19 +241,19 @@ int main(int argc, char ** argv) {
 seq_id_look[j] = i + j + 1;
 }

-
+common_batch_add(batch, tokens_j[0][i], n_past + i, seq_id_look, false);
 }

 // fill the rest of the levels
 for (int j = 1; j < N - 1; j++) {
 for (int i = 0; i < W; i++) {
-
+common_batch_add(batch, tokens_j[j][i], n_past + j + i, { i + 1 }, j == N - 2);
 }
 }
 }

 if (llama_decode(ctx, batch) != 0) {
-
+LOG_ERR("\n\n%s: llama_decode failed - increase KV cache size\n", __func__);
 return 1;
 }

@@ -284,19 +281,19 @@ int main(int argc, char ** argv) {
 }

 // sample the next token
-id =
+id = common_sampler_sample(smpl, ctx, i_batch);

-
+common_sampler_accept(smpl, id, true);

 // print
 {
-const std::string token_str =
+const std::string token_str = common_token_to_piece(ctx, id);

 if (v == 0) {
-
+LOG("%s", token_str.c_str());
 } else {
 // print light cyan
-
+LOG("\033[0;96m%s\033[0m", token_str.c_str());
 }
 fflush(stdout);

@@ -330,21 +327,21 @@ int main(int argc, char ** argv) {
 // print known n-grams starting with token id (debug)
 if (0 && v == 0) {
 if (ngrams_observed.cnt[id] > 0) {
-
+LOG("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], common_token_to_piece(ctx, id).c_str());
 }

 for (int i = 0; i < ngrams_observed.cnt[id]; i++) {
-
+LOG(" - ngram %2d: ", i);

 const int idx = id*(N - 1)*G + i*(N - 1);

 for (int j = 0; j < N - 1; j++) {
-const std::string token_str =
+const std::string token_str = common_token_to_piece(ctx, ngrams_observed.tokens[idx + j]);

-
+LOG("%s", token_str.c_str());
 }

-
+LOG("\n");
 }
 }

@@ -361,7 +358,7 @@ int main(int argc, char ** argv) {
 if (v == 0) {
 // sample from the last level
 for (int i = 0; i < W; i++) {
-tokens_j[N - 2][i] =
+tokens_j[N - 2][i] = common_sampler_sample(smpl, ctx, ngrams_cur.size()*(N-1) + W*(N - 2) + i);
 }
 } else {
 for (int i = 0; i < W; i++) {

@@ -455,23 +452,25 @@ int main(int argc, char ** argv) {

 auto t_dec_end = ggml_time_us();

-
+LOG("\n\n");
+
+LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
+LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));

-
-
+LOG_INF("\n");
+LOG_INF("W = %2d\n", W);
+LOG_INF("N = %2d\n", N);
+LOG_INF("G = %2d\n", G);
+LOG_INF("\n");
+LOG_INF("n_predict = %d\n", n_predict);
+LOG_INF("n_accept = %d\n", n_accept);

-
-
-LOG_TEE("N = %2d\n", N);
-LOG_TEE("G = %2d\n", G);
-LOG_TEE("\n");
-LOG_TEE("n_predict = %d\n", n_predict);
-LOG_TEE("n_accept = %d\n", n_accept);
+LOG_INF("\n");
+common_perf_print(ctx, smpl);

-
+common_sampler_free(smpl);

 llama_kv_cache_view_free(&kvc_view);
-llama_sampling_free(ctx_sampling);

 llama_batch_free(batch);

@@ -480,7 +479,7 @@ int main(int argc, char ** argv) {

 llama_backend_free();

-
+LOG("\n\n");

 return 0;
 }
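The lookahead diff above also swaps the old `llama_sampling_*` context (note the removed `llama_sampling_free(ctx_sampling)`) for the new `common_sampler` API. A rough sketch of that lifecycle, restricted to the calls visible in the hunks above; the model, context, and params are assumed to be set up as in the earlier sketch:

```cpp
#include "common.h"
#include "sampling.h"
#include "log.h"
#include "llama.h"

// Sketch: one sampler round-trip as used by the updated lookahead example.
static void sample_one(llama_model * model, llama_context * ctx, const common_params & params) {
    struct common_sampler * smpl = common_sampler_init(model, params.sparams);

    // sample from the logits of batch position 0, then record the token as
    // accepted (the boolean flag mirrors the usage shown in the diff)
    llama_token id = common_sampler_sample(smpl, ctx, 0);
    common_sampler_accept(smpl, id, true);

    LOG("%s", common_token_to_piece(ctx, id).c_str());

    // report timings, then release the sampler
    common_perf_print(ctx, smpl);
    common_sampler_free(smpl);
}
```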
package/src/llama.cpp/examples/lookup/lookup-create.cpp:

@@ -1,7 +1,8 @@
-#include "
-#include "llama.h"
+#include "arg.h"
 #include "common.h"
 #include "ngram-cache.h"
+#include "ggml.h"
+#include "llama.h"

 #include <cstdint>
 #include <fstream>

@@ -11,10 +12,9 @@
 #include <vector>

 int main(int argc, char ** argv){
-
+common_params params;

-if (!
-gpt_params_print_usage(argc, argv, params);
+if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
 return 1;
 }

@@ -22,22 +22,24 @@ int main(int argc, char ** argv){
 llama_backend_init();
 llama_numa_init(params.numa);

-llama_model * model = NULL;
-llama_context * ctx = NULL;
-
 // load the model
-
+common_init_result llama_init = common_init_from_params(params);
+
+llama_model * model = llama_init.model;
+llama_context * ctx = llama_init.context;
 GGML_ASSERT(model != nullptr);

 // tokenize the prompt
 std::vector<llama_token> inp;
-inp =
+inp = common_tokenize(ctx, params.prompt, true, true);
 fprintf(stderr, "%s: tokenization done\n", __func__);


-
-
+common_ngram_cache ngram_cache;
+common_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true);
 fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str());

-
+common_ngram_cache_save(ngram_cache, params.lookup_cache_static);
+
+return 0;
 }
package/src/llama.cpp/examples/lookup/lookup-merge.cpp:

@@ -33,15 +33,15 @@ int main(int argc, char ** argv){
 }

 fprintf(stderr, "lookup-merge: loading file %s\n", args[0].c_str());
-
+common_ngram_cache ngram_cache_merged = common_ngram_cache_load(args[0]);

 for (size_t i = 1; i < args.size()-1; ++i) {
 fprintf(stderr, "lookup-merge: loading file %s\n", args[i].c_str());
-
+common_ngram_cache ngram_cache = common_ngram_cache_load(args[i]);

-
+common_ngram_cache_merge(ngram_cache_merged, ngram_cache);
 }

 fprintf(stderr, "lookup-merge: saving file %s\n", args.back().c_str());
-
+common_ngram_cache_save(ngram_cache_merged, args.back());
 }
package/src/llama.cpp/examples/lookup/lookup-stats.cpp:

@@ -1,44 +1,45 @@
-#include "
+#include "arg.h"
 #include "common.h"
-#include "llama.h"
 #include "log.h"
 #include "ngram-cache.h"
+#include "llama.h"
+#include "ggml.h"

-#include <cmath>
 #include <cstdint>
 #include <cstdio>
+#include <cinttypes>
 #include <fstream>
 #include <string>
 #include <vector>
-#include <unordered_map>

 int main(int argc, char ** argv){
-
+common_params params;

-if (!
-gpt_params_print_usage(argc, argv, params);
+if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
 return 1;
 }

+common_init();
+
 const int n_draft = params.n_draft;

 // init llama.cpp
 llama_backend_init();
 llama_numa_init(params.numa);

-llama_model * model = NULL;
-llama_context * ctx = NULL;
-
 // load the model
-
+common_init_result llama_init = common_init_from_params(params);
+
+llama_model * model = llama_init.model;
+llama_context * ctx = llama_init.context;

 // tokenize the prompt
 std::vector<llama_token> inp;
-inp =
+inp = common_tokenize(ctx, params.prompt, true, true);

-
-
-
+common_ngram_cache ngram_cache_context;
+common_ngram_cache ngram_cache_dynamic;
+common_ngram_cache ngram_cache_static;
 int64_t t_draft_flat_us = 0;
 int64_t t_draft_us = 0;

@@ -47,16 +48,16 @@ int main(int argc, char ** argv){

 if (!params.lookup_cache_static.empty()) {
 try {
-ngram_cache_static =
+ngram_cache_static = common_ngram_cache_load(params.lookup_cache_static);
 } catch (std::ifstream::failure const &) {
-
+LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
 exit(1);
 }
 }

 if (!params.lookup_cache_dynamic.empty()) {
 try {
-ngram_cache_dynamic =
+ngram_cache_dynamic = common_ngram_cache_load(params.lookup_cache_dynamic);
 } catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
 }

@@ -85,7 +86,7 @@ int main(int argc, char ** argv){

 {
 const int64_t t_start_draft_us = ggml_time_us();
-
+common_ngram_cache_draft(pseudo_output, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
 t_draft_us += ggml_time_us() - t_start_draft_us;
 }

@@ -104,7 +105,7 @@ int main(int argc, char ** argv){

 {
 const int64_t t_start_draft_us = ggml_time_us();
-
+common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
 t_draft_us += ggml_time_us() - t_start_draft_us;
 }
 }

@@ -114,7 +115,7 @@ int main(int argc, char ** argv){
 pseudo_output.push_back(inp_slice[pseudo_output.size()]);
 {
 const int64_t t_start_draft_us = ggml_time_us();
-
+common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
 t_draft_us += ggml_time_us() - t_start_draft_us;
 }
 }

@@ -128,32 +129,32 @@ int main(int argc, char ** argv){
 const int64_t eta_min = eta_ms / (60*1000);
 const int64_t eta_s = (eta_ms - 60*1000*eta_min) / 1000;

-
+LOG_INF("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s);
 }

 // After each chunk, update the dynamic ngram cache with the context ngram cache:
-
+common_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
 ngram_cache_context.clear();
 }

-
+LOG("\n");

-
-
-
-
-
-
+LOG_INF("\n");
+LOG_INF("n_draft = %d\n", n_draft);
+LOG_INF("n_predict = %d\n", n_input - n_input % n_ctx);
+LOG_INF("n_drafted = %d\n", n_drafted);
+LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
+LOG_INF("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
 t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
-
-
+LOG_INF("n_accept = %d\n", n_accept);
+LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);

 llama_free(ctx);
 llama_free_model(model);

 llama_backend_free();

-
+LOG("\n\n");

 return 0;
 }
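The lookup-create, lookup-merge, and lookup-stats diffs above all move to the renamed `common_ngram_cache_*` helpers. A condensed sketch of the create/save/load/draft flow they implement, again limited to calls that appear in these diffs; the cache file path is a placeholder and the context and params are assumed to come from the setup sketch near the top:

```cpp
#include "common.h"
#include "ngram-cache.h"
#include "llama.h"

#include <string>
#include <vector>

// Sketch: create/save/load/draft flow of the renamed n-gram cache helpers.
static void ngram_cache_demo(llama_context * ctx, const common_params & params) {
    std::vector<llama_token> inp = common_tokenize(ctx, params.prompt, true, true);

    // lookup-create: hash the whole prompt into a static cache and write it out
    common_ngram_cache ngram_cache;
    common_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true);

    std::string cache_file = "ngrams.bin"; // placeholder path
    common_ngram_cache_save(ngram_cache, cache_file);

    // lookup-stats: load the caches back and draft tokens from them
    common_ngram_cache ngram_cache_context;
    common_ngram_cache ngram_cache_dynamic;
    common_ngram_cache ngram_cache_static = common_ngram_cache_load(cache_file);

    std::vector<llama_token> pseudo_output = { inp.back() }; // tokens generated so far
    std::vector<llama_token> draft;                          // receives drafted tokens
    common_ngram_cache_draft(pseudo_output, draft, params.n_draft,
                             LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX,
                             ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);

    // lookup-stats: fold the per-chunk context cache into the dynamic one
    common_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
}
```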