@fugood/llama.node 0.3.1 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -8
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +4 -2
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +2 -2
- package/src/LlamaCompletionWorker.cpp +10 -10
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +14 -17
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +5 -4
- package/src/llama.cpp/.github/workflows/build.yml +137 -29
- package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
- package/src/llama.cpp/.github/workflows/docker.yml +46 -34
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
- package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +7 -0
- package/src/llama.cpp/CMakeLists.txt +26 -11
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/common/CMakeLists.txt +10 -10
- package/src/llama.cpp/common/arg.cpp +2041 -0
- package/src/llama.cpp/common/arg.h +77 -0
- package/src/llama.cpp/common/common.cpp +523 -1861
- package/src/llama.cpp/common/common.h +234 -106
- package/src/llama.cpp/common/console.cpp +3 -0
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +401 -0
- package/src/llama.cpp/common/log.h +66 -698
- package/src/llama.cpp/common/ngram-cache.cpp +39 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +356 -350
- package/src/llama.cpp/common/sampling.h +62 -139
- package/src/llama.cpp/common/stb_image.h +5990 -6398
- package/src/llama.cpp/docs/build.md +72 -17
- package/src/llama.cpp/examples/CMakeLists.txt +1 -2
- package/src/llama.cpp/examples/batched/batched.cpp +49 -65
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
- package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
- package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
- package/src/llama.cpp/examples/infill/infill.cpp +131 -192
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +686 -150
- package/src/llama.cpp/examples/llava/clip.h +11 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
- package/src/llama.cpp/examples/llava/llava.cpp +146 -26
- package/src/llama.cpp/examples/llava/llava.h +2 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
- package/src/llama.cpp/examples/llava/requirements.txt +1 -0
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
- package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
- package/src/llama.cpp/examples/main/main.cpp +216 -313
- package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
- package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
- package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
- package/src/llama.cpp/examples/server/server.cpp +1347 -1531
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
- package/src/llama.cpp/examples/server/utils.hpp +396 -107
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +132 -106
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
- package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
- package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
- package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
- package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +272 -505
- package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
- package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
- package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
- package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
- package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
- package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
- package/src/llama.cpp/include/llama.h +296 -285
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-grammar.cpp +721 -122
- package/src/llama.cpp/src/llama-grammar.h +120 -15
- package/src/llama.cpp/src/llama-impl.h +156 -1
- package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
- package/src/llama.cpp/src/llama-sampling.h +39 -47
- package/src/llama.cpp/src/llama-vocab.cpp +390 -127
- package/src/llama.cpp/src/llama-vocab.h +60 -20
- package/src/llama.cpp/src/llama.cpp +6215 -3263
- package/src/llama.cpp/src/unicode-data.cpp +6 -4
- package/src/llama.cpp/src/unicode-data.h +4 -4
- package/src/llama.cpp/src/unicode.cpp +15 -7
- package/src/llama.cpp/tests/CMakeLists.txt +4 -2
- package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
- package/src/llama.cpp/tests/test-barrier.cpp +94 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
- package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
- package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
- package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
- package/src/llama.cpp/tests/test-log.cpp +39 -0
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +2 -1
- package/src/llama.cpp/tests/test-sampling.cpp +226 -142
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/patches/llama.patch +0 -22
- package/src/llama.cpp/.github/workflows/bench.yml +0 -310
- package/src/llama.cpp/common/grammar-parser.cpp +0 -536
- package/src/llama.cpp/common/grammar-parser.h +0 -29
- package/src/llama.cpp/common/train.cpp +0 -1513
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
- package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
package/src/llama.cpp/examples/simple/simple.cpp
@@ -1,175 +1,201 @@
-#include "common.h"
 #include "llama.h"
-
-#include <cmath>
 #include <cstdio>
+#include <cstring>
 #include <string>
 #include <vector>

-static void print_usage(int
-
-
-
-    LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]);
-    LOG_TEE("\n");
+static void print_usage(int, char ** argv) {
+    printf("\nexample usage:\n");
+    printf("\n %s -m model.gguf [-n n_predict] [-ngl n_gpu_layers] [prompt]\n", argv[0]);
+    printf("\n");
 }

 int main(int argc, char ** argv) {
-
-
-
-
-
-
-
-
+    // path to the model gguf file
+    std::string model_path;
+    // prompt to generate text from
+    std::string prompt = "Hello my name is";
+    // number of layers to offload to the GPU
+    int ngl = 99;
+    // number of tokens to predict
+    int n_predict = 32;
+
+    // parse command line arguments
+
+    {
+        int i = 1;
+        for (; i < argc; i++) {
+            if (strcmp(argv[i], "-m") == 0) {
+                if (i + 1 < argc) {
+                    model_path = argv[++i];
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else if (strcmp(argv[i], "-n") == 0) {
+                if (i + 1 < argc) {
+                    try {
+                        n_predict = std::stoi(argv[++i]);
+                    } catch (...) {
+                        print_usage(argc, argv);
+                        return 1;
+                    }
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else if (strcmp(argv[i], "-ngl") == 0) {
+                if (i + 1 < argc) {
+                    try {
+                        ngl = std::stoi(argv[++i]);
+                    } catch (...) {
+                        print_usage(argc, argv);
+                        return 1;
+                    }
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else {
+                // prompt starts here
+                break;
+            }
+        }
+        if (model_path.empty()) {
+            print_usage(argc, argv);
+            return 1;
+        }
+        if (i < argc) {
+            prompt = argv[i++];
+            for (; i < argc; i++) {
+                prompt += " ";
+                prompt += argv[i];
+            }
+        }
     }

-    // total length of the sequence including the prompt
-    const int n_predict = params.n_predict;
-
-    // init LLM
-
-    llama_backend_init();
-    llama_numa_init(params.numa);
-
     // initialize the model

-    llama_model_params model_params =
+    llama_model_params model_params = llama_model_default_params();
+    model_params.n_gpu_layers = ngl;

-    llama_model * model = llama_load_model_from_file(
+    llama_model * model = llama_load_model_from_file(model_path.c_str(), model_params);

     if (model == NULL) {
         fprintf(stderr , "%s: error: unable to load model\n" , __func__);
         return 1;
     }

-    //
-
-    llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
+    // tokenize the prompt

-
+    // find the number of tokens in the prompt
+    const int n_prompt = -llama_tokenize(model, prompt.c_str(), prompt.size(), NULL, 0, true, true);

-
-
+    // allocate space for the tokens and tokenize the prompt
+    std::vector<llama_token> prompt_tokens(n_prompt);
+    if (llama_tokenize(model, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) {
+        fprintf(stderr, "%s: error: failed to tokenize the prompt\n", __func__);
         return 1;
     }

-    //
-
-    std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize(ctx, params.prompt, true);
+    // initialize the context

-
-
+    llama_context_params ctx_params = llama_context_default_params();
+    // n_ctx is the context size
+    ctx_params.n_ctx = n_prompt + n_predict - 1;
+    // n_batch is the maximum number of tokens that can be processed in a single call to llama_decode
+    ctx_params.n_batch = n_prompt;
+    // enable performance counters
+    ctx_params.no_perf = false;

-
+    llama_context * ctx = llama_new_context_with_model(model, ctx_params);

-
-
-        LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
-        LOG_TEE("%s: either reduce n_predict or increase n_ctx\n", __func__);
+    if (ctx == NULL) {
+        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
         return 1;
     }

-    //
-
-    fprintf(stderr, "\n");
-
-    for (auto id : tokens_list) {
-        fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
-    }
+    // initialize the sampler

-
+    auto sparams = llama_sampler_chain_default_params();
+    sparams.no_perf = false;
+    llama_sampler * smpl = llama_sampler_chain_init(sparams);

-
-    // we use this object to submit token data for decoding
+    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());

-
+    // print the prompt token-by-token

-
-
-
+    for (auto id : prompt_tokens) {
+        char buf[128];
+        int n = llama_token_to_piece(model, id, buf, sizeof(buf), 0, true);
+        if (n < 0) {
+            fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);
+            return 1;
+        }
+        std::string s(buf, n);
+        printf("%s", s.c_str());
     }

-    //
-    batch.logits[batch.n_tokens - 1] = true;
+    // prepare a batch for the prompt

-
-        LOG_TEE("%s: llama_decode() failed\n", __func__);
-        return 1;
-    }
+    llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());

     // main loop

-
+    const auto t_main_start = ggml_time_us();
     int n_decode = 0;
+    llama_token new_token_id;

-
+    for (int n_pos = 0; n_pos + batch.n_tokens < n_prompt + n_predict; ) {
+        // evaluate the current batch with the transformer model
+        if (llama_decode(ctx, batch)) {
+            fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
+            return 1;
+        }
+
+        n_pos += batch.n_tokens;

-    while (n_cur <= n_predict) {
         // sample the next token
         {
-
-            auto * logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);
-
-            std::vector<llama_token_data> candidates;
-            candidates.reserve(n_vocab);
-
-            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-                candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
-            }
-
-            llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-
-            // sample the most likely token
-            const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
+            new_token_id = llama_sampler_sample(smpl, ctx, -1);

             // is it an end of generation?
-            if (llama_token_is_eog(model, new_token_id)
-                LOG_TEE("\n");
-
+            if (llama_token_is_eog(model, new_token_id)) {
                 break;
             }

-
+            char buf[128];
+            int n = llama_token_to_piece(model, new_token_id, buf, sizeof(buf), 0, true);
+            if (n < 0) {
+                fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);
+                return 1;
+            }
+            std::string s(buf, n);
+            printf("%s", s.c_str());
             fflush(stdout);

-            // prepare the next batch
-
-
-            // push this new token for next evaluation
-            llama_batch_add(batch, new_token_id, n_cur, { 0 }, true);
+            // prepare the next batch with the sampled token
+            batch = llama_batch_get_one(&new_token_id, 1);

             n_decode += 1;
         }
-
-        n_cur += 1;
-
-        // evaluate the current batch with the transformer model
-        if (llama_decode(ctx, batch)) {
-            fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
-            return 1;
-        }
     }

-
+    printf("\n");

     const auto t_main_end = ggml_time_us();

-
+    fprintf(stderr, "%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
             __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));

-
-
+    fprintf(stderr, "\n");
+    llama_perf_sampler_print(smpl);
+    llama_perf_context_print(ctx);
     fprintf(stderr, "\n");

-
-
+    llama_sampler_free(smpl);
     llama_free(ctx);
     llama_free_model(model);

-    llama_backend_free();
-
     return 0;
 }
package/src/llama.cpp/examples/simple-chat/simple-chat.cpp (new file)
@@ -0,0 +1,197 @@
+#include "llama.h"
+#include <cstdio>
+#include <cstring>
+#include <iostream>
+#include <string>
+#include <vector>
+
+static void print_usage(int, char ** argv) {
+    printf("\nexample usage:\n");
+    printf("\n %s -m model.gguf [-c context_size] [-ngl n_gpu_layers]\n", argv[0]);
+    printf("\n");
+}
+
+int main(int argc, char ** argv) {
+    std::string model_path;
+    int ngl = 99;
+    int n_ctx = 2048;
+
+    // parse command line arguments
+    for (int i = 1; i < argc; i++) {
+        try {
+            if (strcmp(argv[i], "-m") == 0) {
+                if (i + 1 < argc) {
+                    model_path = argv[++i];
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else if (strcmp(argv[i], "-c") == 0) {
+                if (i + 1 < argc) {
+                    n_ctx = std::stoi(argv[++i]);
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else if (strcmp(argv[i], "-ngl") == 0) {
+                if (i + 1 < argc) {
+                    ngl = std::stoi(argv[++i]);
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else {
+                print_usage(argc, argv);
+                return 1;
+            }
+        } catch (std::exception & e) {
+            fprintf(stderr, "error: %s\n", e.what());
+            print_usage(argc, argv);
+            return 1;
+        }
+    }
+    if (model_path.empty()) {
+        print_usage(argc, argv);
+        return 1;
+    }
+
+    // only print errors
+    llama_log_set([](enum ggml_log_level level, const char * text, void * /* user_data */) {
+        if (level >= GGML_LOG_LEVEL_ERROR) {
+            fprintf(stderr, "%s", text);
+        }
+    }, nullptr);
+
+    // initialize the model
+    llama_model_params model_params = llama_model_default_params();
+    model_params.n_gpu_layers = ngl;
+
+    llama_model * model = llama_load_model_from_file(model_path.c_str(), model_params);
+    if (!model) {
+        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
+        return 1;
+    }
+
+    // initialize the context
+    llama_context_params ctx_params = llama_context_default_params();
+    ctx_params.n_ctx = n_ctx;
+    ctx_params.n_batch = n_ctx;
+
+    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+    if (!ctx) {
+        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+        return 1;
+    }
+
+    // initialize the sampler
+    llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params());
+    llama_sampler_chain_add(smpl, llama_sampler_init_min_p(0.05f, 1));
+    llama_sampler_chain_add(smpl, llama_sampler_init_temp(0.8f));
+    llama_sampler_chain_add(smpl, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
+
+    // helper function to evaluate a prompt and generate a response
+    auto generate = [&](const std::string & prompt) {
+        std::string response;
+
+        // tokenize the prompt
+        const int n_prompt_tokens = -llama_tokenize(model, prompt.c_str(), prompt.size(), NULL, 0, true, true);
+        std::vector<llama_token> prompt_tokens(n_prompt_tokens);
+        if (llama_tokenize(model, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), llama_get_kv_cache_used_cells(ctx) == 0, true) < 0) {
+            GGML_ABORT("failed to tokenize the prompt\n");
+        }
+
+        // prepare a batch for the prompt
+        llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
+        llama_token new_token_id;
+        while (true) {
+            // check if we have enough space in the context to evaluate this batch
+            int n_ctx = llama_n_ctx(ctx);
+            int n_ctx_used = llama_get_kv_cache_used_cells(ctx);
+            if (n_ctx_used + batch.n_tokens > n_ctx) {
+                printf("\033[0m\n");
+                fprintf(stderr, "context size exceeded\n");
+                exit(0);
+            }
+
+            if (llama_decode(ctx, batch)) {
+                GGML_ABORT("failed to decode\n");
+            }
+
+            // sample the next token
+            new_token_id = llama_sampler_sample(smpl, ctx, -1);
+
+            // is it an end of generation?
+            if (llama_token_is_eog(model, new_token_id)) {
+                break;
+            }
+
+            // convert the token to a string, print it and add it to the response
+            char buf[256];
+            int n = llama_token_to_piece(model, new_token_id, buf, sizeof(buf), 0, true);
+            if (n < 0) {
+                GGML_ABORT("failed to convert token to piece\n");
+            }
+            std::string piece(buf, n);
+            printf("%s", piece.c_str());
+            fflush(stdout);
+            response += piece;
+
+            // prepare the next batch with the sampled token
+            batch = llama_batch_get_one(&new_token_id, 1);
+        }
+
+        return response;
+    };
+
+    std::vector<llama_chat_message> messages;
+    std::vector<char> formatted(llama_n_ctx(ctx));
+    int prev_len = 0;
+    while (true) {
+        // get user input
+        printf("\033[32m> \033[0m");
+        std::string user;
+        std::getline(std::cin, user);
+
+        if (user.empty()) {
+            break;
+        }
+
+        // add the user input to the message list and format it
+        messages.push_back({"user", strdup(user.c_str())});
+        int new_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), true, formatted.data(), formatted.size());
+        if (new_len > (int)formatted.size()) {
+            formatted.resize(new_len);
+            new_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), true, formatted.data(), formatted.size());
+        }
+        if (new_len < 0) {
+            fprintf(stderr, "failed to apply the chat template\n");
+            return 1;
+        }
+
+        // remove previous messages to obtain the prompt to generate the response
+        std::string prompt(formatted.begin() + prev_len, formatted.begin() + new_len);
+
+        // generate a response
+        printf("\033[33m");
+        std::string response = generate(prompt);
+        printf("\n\033[0m");
+
+        // add the response to the messages
+        messages.push_back({"assistant", strdup(response.c_str())});
+        prev_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), false, nullptr, 0);
+        if (prev_len < 0) {
+            fprintf(stderr, "failed to apply the chat template\n");
+            return 1;
+        }
+    }
+
+    // free resources
+    for (auto & msg : messages) {
+        free(const_cast<char *>(msg.content));
+    }
+    llama_sampler_free(smpl);
+    llama_free(ctx);
+    llama_free_model(model);
+
+    return 0;
+}
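
The llama.cpp sources vendored in this release replace the old llama_token_data_array / llama_sample_token_greedy sampling flow with the llama_sampler chain API, as the simple.cpp diff above shows. The following is only a minimal sketch of that setup, not part of the package diff: it assumes a llama_context has already been created as in the example, uses only calls that appear in the diff, and the helper name sample_one is illustrative.

#include "llama.h"

// Sketch only: greedy sampling via the sampler-chain API introduced by this llama.cpp bump.
static llama_token sample_one(llama_context * ctx) {
    auto sparams = llama_sampler_chain_default_params();
    llama_sampler * smpl = llama_sampler_chain_init(sparams);
    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());

    // index -1 samples from the logits of the last token decoded in ctx
    const llama_token id = llama_sampler_sample(smpl, ctx, -1);

    llama_sampler_free(smpl);
    return id;
}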