@fugood/llama.node 0.3.1 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -8
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +4 -2
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +2 -2
- package/src/LlamaCompletionWorker.cpp +10 -10
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +14 -17
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +5 -4
- package/src/llama.cpp/.github/workflows/build.yml +137 -29
- package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
- package/src/llama.cpp/.github/workflows/docker.yml +46 -34
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
- package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +7 -0
- package/src/llama.cpp/CMakeLists.txt +26 -11
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/common/CMakeLists.txt +10 -10
- package/src/llama.cpp/common/arg.cpp +2041 -0
- package/src/llama.cpp/common/arg.h +77 -0
- package/src/llama.cpp/common/common.cpp +523 -1861
- package/src/llama.cpp/common/common.h +234 -106
- package/src/llama.cpp/common/console.cpp +3 -0
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +401 -0
- package/src/llama.cpp/common/log.h +66 -698
- package/src/llama.cpp/common/ngram-cache.cpp +39 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +356 -350
- package/src/llama.cpp/common/sampling.h +62 -139
- package/src/llama.cpp/common/stb_image.h +5990 -6398
- package/src/llama.cpp/docs/build.md +72 -17
- package/src/llama.cpp/examples/CMakeLists.txt +1 -2
- package/src/llama.cpp/examples/batched/batched.cpp +49 -65
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
- package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
- package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
- package/src/llama.cpp/examples/infill/infill.cpp +131 -192
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +686 -150
- package/src/llama.cpp/examples/llava/clip.h +11 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
- package/src/llama.cpp/examples/llava/llava.cpp +146 -26
- package/src/llama.cpp/examples/llava/llava.h +2 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
- package/src/llama.cpp/examples/llava/requirements.txt +1 -0
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
- package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
- package/src/llama.cpp/examples/main/main.cpp +216 -313
- package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
- package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
- package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
- package/src/llama.cpp/examples/server/server.cpp +1347 -1531
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
- package/src/llama.cpp/examples/server/utils.hpp +396 -107
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +132 -106
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
- package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
- package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
- package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
- package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +272 -505
- package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
- package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
- package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
- package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
- package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
- package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
- package/src/llama.cpp/include/llama.h +296 -285
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-grammar.cpp +721 -122
- package/src/llama.cpp/src/llama-grammar.h +120 -15
- package/src/llama.cpp/src/llama-impl.h +156 -1
- package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
- package/src/llama.cpp/src/llama-sampling.h +39 -47
- package/src/llama.cpp/src/llama-vocab.cpp +390 -127
- package/src/llama.cpp/src/llama-vocab.h +60 -20
- package/src/llama.cpp/src/llama.cpp +6215 -3263
- package/src/llama.cpp/src/unicode-data.cpp +6 -4
- package/src/llama.cpp/src/unicode-data.h +4 -4
- package/src/llama.cpp/src/unicode.cpp +15 -7
- package/src/llama.cpp/tests/CMakeLists.txt +4 -2
- package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
- package/src/llama.cpp/tests/test-barrier.cpp +94 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
- package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
- package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
- package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
- package/src/llama.cpp/tests/test-log.cpp +39 -0
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +2 -1
- package/src/llama.cpp/tests/test-sampling.cpp +226 -142
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/patches/llama.patch +0 -22
- package/src/llama.cpp/.github/workflows/bench.yml +0 -310
- package/src/llama.cpp/common/grammar-parser.cpp +0 -536
- package/src/llama.cpp/common/grammar-parser.h +0 -29
- package/src/llama.cpp/common/train.cpp +0 -1513
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
- package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
--- a/package/src/llama.cpp/examples/tokenize/tokenize.cpp
+++ b/package/src/llama.cpp/examples/tokenize/tokenize.cpp
@@ -1,11 +1,13 @@
 #include "common.h"
+//#include "log.h" // TODO: start using log.h
 #include "llama.h"

-#include <cmath>
 #include <cstdio>
+#include <cstring>
 #include <fstream>
 #include <string>
 #include <vector>
+#include <iostream> // TODO: remove me

 #if defined(_WIN32)
 #define WIN32_LEAN_AND_MEAN
@@ -13,25 +15,25 @@
 #include <shellapi.h> // For CommandLineToArgvW
 #endif

-static void print_usage_information(const char * argv0 …
-… (18 more lines of the old help text, truncated in the source view)
+static void print_usage_information(const char * argv0) {
+    printf("usage: %s [options]\n\n", argv0);
+    printf("The tokenize program tokenizes a prompt using a given model,\n");
+    printf("and prints the resulting tokens to standard output.\n\n");
+    printf("It needs a model file, a prompt, and optionally other flags\n");
+    printf("to control the behavior of the tokenizer.\n\n");
+    printf("    The possible options are:\n");
+    printf("\n");
+    printf("    -h, --help                           print this help and exit\n");
+    printf("    -m MODEL_PATH, --model MODEL_PATH    path to model.\n");
+    printf("    --ids                                if given, only print numerical token IDs, and not token strings.\n");
+    printf("                                         The output format looks like [1, 2, 3], i.e. parseable by Python.\n");
+    printf("    -f PROMPT_FNAME, --file PROMPT_FNAME read prompt from a file.\n");
+    printf("    -p PROMPT, --prompt PROMPT           read prompt from the argument.\n");
+    printf("    --stdin                              read prompt from standard input.\n");
+    printf("    --no-bos                             do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
+    printf("    --no-parse-special                   do not parse control tokens.\n");
+    printf("    --log-disable                        disable logs. Makes stderr quiet when loading the model.\n");
+    printf("    --show-count                         print the total number of tokens.\n");
 }

 static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data) {
@@ -185,7 +187,7 @@ int main(int raw_argc, char ** raw_argv) {
     const int argc = argv.size();

     if (argc <= 1) {
-        print_usage_information(argv[0].c_str() …
+        print_usage_information(argv[0].c_str());
         return 1;
     }

@@ -214,7 +216,7 @@ int main(int raw_argc, char ** raw_argv) {
     for (; iarg < argc; ++iarg) {
         std::string arg{argv[iarg]};
         if (arg == "-h" || arg == "--help") {
-            print_usage_information(argv[0].c_str() …
+            print_usage_information(argv[0].c_str());
             return 0;
         }
         else if (arg == "--ids") {
@@ -323,10 +325,6 @@ int main(int raw_argc, char ** raw_argv) {
     // Start actually doing the tokenizing stuff.
     //////

-#ifdef LOG_DISABLE_LOGS
-    disable_logging = true;
-#endif
-
     if (disable_logging) {
         llama_log_set(llama_log_callback_null, NULL);
     }
@@ -362,12 +360,12 @@ int main(int raw_argc, char ** raw_argv) {
         prompt = stdin_buffer.str();
     }

-    const bool model_wants_add_bos = …
+    const bool model_wants_add_bos = llama_add_bos_token(model);
     const bool add_bos = model_wants_add_bos && !no_bos;
     const bool parse_special = !no_parse_special;

     std::vector<llama_token> tokens;
-    tokens = …
+    tokens = common_tokenize(model, prompt, add_bos, parse_special);

     if (printing_ids) {
         printf("[");
@@ -382,7 +380,7 @@ int main(int raw_argc, char ** raw_argv) {
     } else {
         bool invalid_utf8 = false;
         printf("%6d -> '", tokens[i]);
-        write_utf8_cstr_to_stdout( …
+        write_utf8_cstr_to_stdout(common_token_to_piece(ctx, tokens[i]).c_str(), invalid_utf8);
         if (invalid_utf8) {
             printf("' (utf-8 decode failure)\n");
         } else {
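Note on the tokenize changes above: this range swaps the `common` helpers for their new `common_`-prefixed names (`common_tokenize`, `common_token_to_piece`). A minimal sketch of the new call pattern, assuming a `llama_model * model` and `llama_context * ctx` that have already been loaded elsewhere:

    // Sketch only: assumes `model` and `ctx` were created beforehand
    // (model load / context creation is unchanged by this diff).
    #include "common.h"
    #include "llama.h"

    #include <cstdio>
    #include <string>
    #include <vector>

    static void dump_tokens(const llama_model * model, llama_context * ctx, const std::string & prompt) {
        const bool add_bos       = llama_add_bos_token(model); // respect the model's BOS preference
        const bool parse_special = true;

        // formerly the llama_tokenize(...) helper in common; now common_tokenize(...)
        std::vector<llama_token> tokens = common_tokenize(model, prompt, add_bos, parse_special);

        for (const llama_token tok : tokens) {
            // formerly llama_token_to_piece(...) in common; now common_token_to_piece(...)
            printf("%6d -> '%s'\n", tok, common_token_to_piece(ctx, tok).c_str());
        }
    }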
--- a/package/src/llama.cpp/ggml/CMakeLists.txt
+++ b/package/src/llama.cpp/ggml/CMakeLists.txt
@@ -56,6 +56,15 @@ else()
     set(GGML_NATIVE_DEFAULT ON)
 endif()

+# defaults
+if (NOT GGML_LLAMAFILE_DEFAULT)
+    set(GGML_LLAMAFILE_DEFAULT OFF)
+endif()
+
+if (NOT GGML_CUDA_GRAPHS_DEFAULT)
+    set(GGML_CUDA_GRAPHS_DEFAULT OFF)
+endif()
+
 # general
 option(GGML_STATIC "ggml: static link libraries" OFF)
 option(GGML_NATIVE "ggml: enable -march=native flag" ${GGML_NATIVE_DEFAULT})
@@ -83,6 +92,7 @@ else()
 endif()

 option(GGML_CPU_HBM     "ggml: use memkind for CPU HBM" OFF)
+option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)

 option(GGML_AVX         "ggml: enable AVX"              ${INS_ENB})
 option(GGML_AVX2        "ggml: enable AVX2"             ${INS_ENB})
@@ -90,6 +100,9 @@ option(GGML_AVX512      "ggml: enable AVX512"           OFF)
 option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI"      OFF)
 option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI"      OFF)
 option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16"      OFF)
+option(GGML_AMX_TILE    "ggml: enable AMX-TILE"         OFF)
+option(GGML_AMX_INT8    "ggml: enable AMX-INT8"         OFF)
+option(GGML_AMX_BF16    "ggml: enable AMX-BF16"         OFF)
 option(GGML_FMA         "ggml: enable FMA"              ${INS_ENB})
 if (NOT MSVC)
     option(GGML_F16C    "ggml: enable F16C"             ${INS_ENB}) # in MSVC F16C is implied with AVX2/AVX512
@@ -104,42 +117,40 @@ endif()

 # ggml core
 set(GGML_SCHED_MAX_COPIES "4" CACHE STRING "ggml: max input copies for pipeline parallelism")
+option(GGML_CPU "ggml: enable CPU backend" ON)

 # 3rd party libs / backends
 option(GGML_ACCELERATE "ggml: enable Accelerate framework" ON)
 option(GGML_BLAS       "ggml: use BLAS"                    ${GGML_BLAS_DEFAULT})
 set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
     "ggml: BLAS library vendor")
-option(GGML_LLAMAFILE  "ggml: use LLAMAFILE" …
+option(GGML_LLAMAFILE  "ggml: use LLAMAFILE"               ${GGML_LLAMAFILE_DEFAULT})

 option(GGML_CUDA "ggml: use CUDA" OFF)
 option(GGML_MUSA "ggml: use MUSA" OFF)
-option(GGML_CUDA_FORCE_DMMV   "ggml: use dmmv instead of mmvq CUDA kernels" OFF)
 option(GGML_CUDA_FORCE_MMQ    "ggml: use mmq kernels instead of cuBLAS" OFF)
 option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF)
-set   (GGML_CUDA_DMMV_X "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels")
-set   (GGML_CUDA_MMV_Y   "1" CACHE STRING "ggml: y block size for mmv CUDA kernels")
 option(GGML_CUDA_F16          "ggml: use 16 bit floats for some calculations" OFF)
-set   (GGML_CUDA_KQUANTS_ITER "2" CACHE STRING
-       "ggml: iters./thread per block for Q2_K/Q6_K")
 set   (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
        "ggml: max. batch size for using peer access")
 option(GGML_CUDA_NO_PEER_COPY  "ggml: do not use peer to peer copies" OFF)
 option(GGML_CUDA_NO_VMM        "ggml: do not try to use CUDA VMM" OFF)
 option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
-option( …
+option(GGML_CUDA_GRAPHS        "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT})

-option( …
-option(GGML_HIPBLAS "ggml: use hipBLAS" OFF)
+option(GGML_HIP     "ggml: use HIP" OFF)
 option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
 option(GGML_VULKAN                   "ggml: use Vulkan" OFF)
 option(GGML_VULKAN_CHECK_RESULTS     "ggml: run Vulkan op checks" OFF)
 option(GGML_VULKAN_DEBUG             "ggml: enable Vulkan debug output" OFF)
 option(GGML_VULKAN_MEMORY_DEBUG      "ggml: enable Vulkan memory debug output" OFF)
+option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug info" OFF)
+option(GGML_VULKAN_PERF              "ggml: enable Vulkan perf output" OFF)
 option(GGML_VULKAN_VALIDATE          "ggml: enable Vulkan validation" OFF)
 option(GGML_VULKAN_RUN_TESTS         "ggml: run Vulkan tests" OFF)
 option(GGML_KOMPUTE                  "ggml: use Kompute" OFF)
 option(GGML_METAL                    "ggml: use Metal" ${GGML_METAL_DEFAULT})
+option(GGML_METAL_USE_BF16           "ggml: use bfloat if available" OFF)
 option(GGML_METAL_NDEBUG             "ggml: disable Metal debugging" OFF)
 option(GGML_METAL_SHADER_DEBUG       "ggml: compile Metal with -fno-fast-math" OFF)
 option(GGML_METAL_EMBED_LIBRARY      "ggml: embed Metal library" ${GGML_METAL})
@@ -148,6 +159,7 @@ set (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
 set (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)")
 option(GGML_OPENMP "ggml: use OpenMP" ON)
 option(GGML_RPC    "ggml: use RPC"    OFF)
+option(GGML_AMX    "ggml: use AMX"    OFF)
 option(GGML_SYCL     "ggml: use SYCL" OFF)
 option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
 set (GGML_SYCL_TARGET "INTEL" CACHE STRING
@@ -204,12 +216,14 @@ include(CMakePackageConfigHelpers)
 # all public headers
 set(GGML_PUBLIC_HEADERS
     include/ggml.h
+    include/ggml-cpu.h
     include/ggml-alloc.h
     include/ggml-backend.h
     include/ggml-blas.h
+    include/ggml-cann.h
     include/ggml-cuda.h
-    include/ggml.h
     include/ggml-kompute.h
+    include/ggml-opt.h
     include/ggml-metal.h
     include/ggml-rpc.h
     include/ggml-sycl.h
@@ -222,12 +236,15 @@ set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
 install(TARGETS ggml PUBLIC_HEADER)

 if (BUILD_SHARED_LIBS)
-    install(TARGETS ggml …
+    install(TARGETS ggml LIBRARY)
+    install(TARGETS ggml-base LIBRARY)
 endif()

+# FIXME: this should be done in the backend cmake files
 if (GGML_METAL)
+    # FIXME: does this need to be installed with GGML_METAL_EMBED_LIBRARY?
     install(
-        FILES src/ggml-metal.metal
+        FILES src/ggml-metal/ggml-metal.metal
         PERMISSIONS
             OWNER_READ
             OWNER_WRITE
--- a/package/src/llama.cpp/ggml/include/ggml-alloc.h
+++ b/package/src/llama.cpp/ggml/include/ggml-alloc.h
@@ -7,8 +7,8 @@ extern "C" {
 #endif

 typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
-typedef struct …
-typedef struct …
+typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
+typedef struct ggml_backend * ggml_backend_t;

 // Tensor allocator
 struct ggml_tallocr {
@@ -24,7 +24,7 @@ GGML_API void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor);
 // Graph allocator
 /*
   Example usage:
-    ggml_gallocr_t galloc = ggml_gallocr_new( …
+    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());

     // optional: create a worst-case graph and reserve the buffers to avoid reallocations
     ggml_gallocr_reserve(galloc, build_graph(max_batch));
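The example-usage comment patched above now constructs the graph allocator from a concrete buffer type. A compilable sketch of roughly that flow, assuming only that the CPU backend is available in the build:

    // Sketch: allocate a tiny graph with the graph allocator on CPU buffers.
    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"

    int main() {
        struct ggml_init_params params = {
            /*.mem_size   =*/ ggml_tensor_overhead() * 8 + ggml_graph_overhead(),
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true, // tensor data comes from the gallocr, not the context
        };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * a  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16);
        struct ggml_tensor * b  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16);
        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, ggml_add(ctx, a, b));

        ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
        ggml_gallocr_alloc_graph(galloc, gf); // reserves on first use, then allocates the graph

        // ... set a/b via ggml_backend_tensor_set and compute with a CPU backend ...

        ggml_gallocr_free(galloc);
        ggml_free(ctx);
        return 0;
    }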
--- /dev/null
+++ b/package/src/llama.cpp/ggml/include/ggml-amx.h
@@ -0,0 +1,25 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// buffer_type API
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
+
+GGML_BACKEND_API bool ggml_backend_is_amx(ggml_backend_t backend);
+
+// backend API
+GGML_BACKEND_API ggml_backend_t ggml_backend_amx_init(void);
+
+GGML_BACKEND_API void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_amx_reg(void);
+
+#ifdef __cplusplus
+}
+#endif
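ggml-amx.h is a new public header for the Intel AMX backend added in this range. A minimal bring-up sketch using only the entry points declared in it; it assumes a build configured with the new GGML_AMX option (see the ggml/CMakeLists.txt hunk above) running on AMX-capable hardware:

    // Sketch: initialize the AMX backend declared in ggml-amx.h.
    // Assumes ggml was built with the GGML_AMX option enabled on an AMX-capable CPU;
    // otherwise init is expected to fail and we fall back.
    #include "ggml-amx.h"

    #include <cstdio>

    int main() {
        ggml_backend_t backend = ggml_backend_amx_init();
        if (backend == NULL || !ggml_backend_is_amx(backend)) {
            fprintf(stderr, "AMX backend not available\n");
            return 1;
        }

        ggml_backend_amx_set_n_threads(backend, 8); // worker threads for AMX mat-muls

        // ... build a graph and run it with ggml_backend_graph_compute(backend, graph) ...

        ggml_backend_free(backend);
        return 0;
    }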
--- a/package/src/llama.cpp/ggml/include/ggml-backend.h
+++ b/package/src/llama.cpp/ggml/include/ggml-backend.h
@@ -3,6 +3,20 @@
 #include "ggml.h"
 #include "ggml-alloc.h"

+#ifdef GGML_BACKEND_SHARED
+#    if defined(_WIN32) && !defined(__MINGW32__)
+#        ifdef GGML_BACKEND_BUILD
+#            define GGML_BACKEND_API __declspec(dllexport) extern
+#        else
+#            define GGML_BACKEND_API __declspec(dllimport) extern
+#        endif
+#    else
+#        define GGML_BACKEND_API __attribute__ ((visibility ("default"))) extern
+#    endif
+#else
+#    define GGML_BACKEND_API extern
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
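The new GGML_BACKEND_API macro is the usual shared-library export/import selector: dllexport while a backend DLL is being built on Windows (GGML_BACKEND_BUILD defined), dllimport when it is consumed, default visibility on other platforms, and plain extern for static builds. An illustrative sketch of how a backend header uses it; the "mybackend" names here are hypothetical, not from this diff:

    // Illustrative only: a hypothetical backend header relying on GGML_BACKEND_API.
    // GGML_BACKEND_SHARED is defined when backends are built as shared libraries,
    // GGML_BACKEND_BUILD while compiling the backend itself (so its symbols export).
    #include "ggml-backend.h"

    #ifdef __cplusplus
    extern "C" {
    #endif

    // hypothetical backend entry points
    GGML_BACKEND_API ggml_backend_t     ggml_backend_mybackend_init(void);
    GGML_BACKEND_API ggml_backend_reg_t ggml_backend_mybackend_reg(void);

    #ifdef __cplusplus
    }
    #endif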
@@ -12,43 +26,52 @@ extern "C" {
     typedef struct ggml_backend_event * ggml_backend_event_t;
     typedef struct ggml_backend * ggml_backend_t;
     typedef void * ggml_backend_graph_plan_t;
+    typedef struct ggml_backend_reg * ggml_backend_reg_t;
+    typedef struct ggml_backend_device * ggml_backend_dev_t;
+

     //
-    // Backend buffer
+    // Backend buffer type
     //

-
-    GGML_API … (6 old buffer-type declarations, truncated in the source view)
+    GGML_API const char *          ggml_backend_buft_name          (ggml_backend_buffer_type_t buft);
+    GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer  (ggml_backend_buffer_type_t buft, size_t size);
+    GGML_API size_t                ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
+    GGML_API size_t                ggml_backend_buft_get_max_size  (ggml_backend_buffer_type_t buft);
+    GGML_API size_t                ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
+    GGML_API bool                  ggml_backend_buft_is_host       (ggml_backend_buffer_type_t buft);
+    GGML_API ggml_backend_dev_t    ggml_backend_buft_get_device    (ggml_backend_buffer_type_t buft);
+
+    //
+    // Backend buffer
+    //

-    // buffer
     enum ggml_backend_buffer_usage {
         GGML_BACKEND_BUFFER_USAGE_ANY = 0,
         GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
         GGML_BACKEND_BUFFER_USAGE_COMPUTE = 2,
     };

-    GGML_API … (14 old buffer declarations, truncated in the source view)
+    GGML_API const char *                   ggml_backend_buffer_name          (ggml_backend_buffer_t buffer);
+    GGML_API void                           ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
+    GGML_API void *                         ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
+    GGML_API size_t                         ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
+    GGML_API void                           ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API size_t                         ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
+    GGML_API size_t                         ggml_backend_buffer_get_max_size  (ggml_backend_buffer_t buffer);
+    GGML_API size_t                         ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API void                           ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
+    GGML_API bool                           ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);
+    GGML_API void                           ggml_backend_buffer_set_usage     (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
+    GGML_API enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage     (ggml_backend_buffer_t buffer);
+    GGML_API ggml_backend_buffer_type_t     ggml_backend_buffer_get_type      (ggml_backend_buffer_t buffer);
+    GGML_API void                           ggml_backend_buffer_reset         (ggml_backend_buffer_t buffer);
+
+    // tensor copy between different backends
+    GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);

     //
-    // Backend
+    // Backend (stream)
     //

     GGML_API ggml_guid_t ggml_backend_guid(ggml_backend_t backend);
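The buffer-type and buffer accessors above can be exercised end to end with the always-available CPU buffer type (declared at the bottom of this header). A small sketch:

    // Sketch: allocate a 1 MiB buffer from the CPU buffer type and query it.
    #include "ggml-backend.h"

    #include <cstdio>

    int main() {
        ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();

        ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, 1024 * 1024);

        printf("buffer '%s': size=%zu align=%zu host=%d\n",
               ggml_backend_buffer_name(buf),
               ggml_backend_buffer_get_size(buf),
               ggml_backend_buft_get_alignment(buft),
               ggml_backend_buffer_is_host(buf) ? 1 : 0);

        ggml_backend_buffer_free(buf);
        return 0;
    }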
@@ -63,8 +86,10 @@ extern "C" {
     GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
     GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);

-
-    GGML_API …
+    // "offset" refers to the offset in tensor->data for setting/getting data
+    GGML_API void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+    GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+    GGML_API void ggml_backend_tensor_memset(   struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);

     GGML_API void ggml_backend_synchronize(ggml_backend_t backend);

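The new comment pins down that `offset` is a byte offset into `tensor->data`. A sketch of the round trip for one row of a contiguous 2-D F32 tensor; it assumes `t` was already allocated in a backend buffer (e.g. via ggml_backend_alloc_ctx_tensors from ggml-alloc):

    // Sketch: write one row into an allocated F32 tensor, then read it back.
    // Assumes `t` is a contiguous 2-D GGML_TYPE_F32 tensor that already lives
    // in a backend buffer, so nb[1] equals ne[0] * sizeof(float).
    #include "ggml.h"
    #include "ggml-backend.h"

    #include <vector>

    void roundtrip_row(struct ggml_tensor * t, int row) {
        const size_t row_bytes = t->nb[1]; // stride of one row in bytes

        std::vector<float> src(t->ne[0], 1.0f);
        // offset is a byte offset into tensor->data
        ggml_backend_tensor_set(t, src.data(), row * row_bytes, row_bytes);

        std::vector<float> dst(t->ne[0]);
        ggml_backend_tensor_get(t, dst.data(), row * row_bytes, row_bytes);
    }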
@@ -74,64 +99,126 @@ extern "C" {
     GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
     GGML_API enum ggml_status ggml_backend_graph_compute      (ggml_backend_t backend, struct ggml_cgraph * cgraph);
     GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+
+    // NOTE: will be removed, use device version instead
     GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
     GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
     GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);

-    // tensor copy between different backends
-    GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
-
     // asynchronous copy
     // the copy is performed after all the currently queued operations in backend_src
     // backend_dst will wait for the copy to complete before performing other operations
     // automatic fallback to sync copy if async is not supported
     GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);

-
-    GGML_API ggml_backend_event_t ggml_backend_event_new        (ggml_backend_t backend);
-    GGML_API void                 ggml_backend_event_free       (ggml_backend_event_t event);
-    GGML_API void                 ggml_backend_event_record     (ggml_backend_event_t event);
-    GGML_API void                 ggml_backend_event_synchronize(ggml_backend_event_t event);
-    GGML_API void                 ggml_backend_event_wait       (ggml_backend_t backend, ggml_backend_event_t event);
+    GGML_API ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend);

     //
-    //
+    // Events
     //

-    GGML_API … (truncated in the source view)
+    GGML_API ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device);
+    GGML_API void                 ggml_backend_event_free(ggml_backend_event_t event);
+    GGML_API void                 ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend);
+    GGML_API void                 ggml_backend_event_synchronize(ggml_backend_event_t event);
+    GGML_API void                 ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event);

-
+    //
+    // Backend device
+    //

-
+    enum ggml_backend_dev_type {
+        // CPU device using system memory
+        GGML_BACKEND_DEVICE_TYPE_CPU,
+        // GPU device using dedicated memory
+        GGML_BACKEND_DEVICE_TYPE_GPU,
+        // accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
+        GGML_BACKEND_DEVICE_TYPE_ACCEL
+    };

-
+    // functionality supported by the device
+    struct ggml_backend_dev_caps {
+        // asynchronous operations
+        bool async;
+        // pinned host buffer
+        bool host_buffer;
+        // creating buffers from host ptr
+        bool buffer_from_host_ptr;
+        // event synchronization
+        bool events;
+    };

-
+    // all the device properties
+    struct ggml_backend_dev_props {
+        const char * name;
+        const char * description;
+        size_t memory_free;
+        size_t memory_total;
+        enum ggml_backend_dev_type type;
+        struct ggml_backend_dev_caps caps;
+    };
+
+    GGML_API const char *               ggml_backend_dev_name(ggml_backend_dev_t device);
+    GGML_API const char *               ggml_backend_dev_description(ggml_backend_dev_t device);
+    GGML_API void                       ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total);
+    GGML_API enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device);
+    GGML_API void                       ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props);
+    GGML_API ggml_backend_reg_t         ggml_backend_dev_backend_reg(ggml_backend_dev_t device);
+    GGML_API ggml_backend_t             ggml_backend_dev_init(ggml_backend_dev_t device, const char * params);
+    GGML_API ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device);
+    GGML_API ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device);
+    GGML_API ggml_backend_buffer_t      ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size);
+
+    GGML_API bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op);
+    GGML_API bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft);
+    GGML_API bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op);

     //
-    // Backend
+    // Backend (reg)
     //

-
+    GGML_API const char *       ggml_backend_reg_name(ggml_backend_reg_t reg);
+    GGML_API size_t             ggml_backend_reg_dev_count(ggml_backend_reg_t reg);
+    GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index);
+    GGML_API void *             ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name);
+
+    // Common functions that may be obtained using ggml_backend_reg_get_proc_address
+
+    // Split buffer type for tensor parallelism
+    typedef ggml_backend_buffer_type_t   (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
+    // Set the number of threads for the backend
+    typedef void                         (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
+    // Get additional buffer types provided by the device (returns a NULL-terminated array)
+    typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);
+
+    //
+    // Backend registry
+    //

-    GGML_API size_t … (old registry declarations, truncated in the source view)
+    // Backend (reg) enumeration
+    GGML_API size_t             ggml_backend_reg_count(void);
+    GGML_API ggml_backend_reg_t ggml_backend_reg_get(size_t index);
+    GGML_API ggml_backend_reg_t ggml_backend_reg_by_name(const char * name);
+
+    // Device enumeration
+    GGML_API size_t             ggml_backend_dev_count(void);
+    GGML_API ggml_backend_dev_t ggml_backend_dev_get(size_t index);
+    GGML_API ggml_backend_dev_t ggml_backend_dev_by_name(const char * name);
+    GGML_API ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type);
+
+    // Direct backend (stream) initialization
+    // = ggml_backend_dev_init(ggml_backend_dev_by_name(name), params)
+    GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
+    // = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params)
+    GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params);
+    // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL)
+    GGML_API ggml_backend_t ggml_backend_init_best(void);

     //
     // Backend scheduler
     //

-    // The backend scheduler allows for multiple …
+    // The backend scheduler allows for multiple backend devices to be used together
     // Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
     // The backends are selected based on:
     // - the backend that supports the operation
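The device and registry APIs above replace the per-backend init entry points. A short sketch of enumerating devices and initializing a backend through them:

    // Sketch: list all registered devices, then initialize the best available backend.
    #include "ggml-backend.h"

    #include <cstdio>

    int main() {
        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
            ggml_backend_dev_t dev = ggml_backend_dev_get(i);

            size_t free_mem, total_mem;
            ggml_backend_dev_memory(dev, &free_mem, &total_mem);

            printf("device %zu: %s (%s), %zu/%zu bytes free\n",
                   i, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
                   free_mem, total_mem);
        }

        // GPU if present, otherwise CPU (per the comment in the header)
        ggml_backend_t backend = ggml_backend_init_best();
        if (backend) {
            // ... use the backend ...
            ggml_backend_free(backend);
        }
        return 0;
    }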
@@ -155,20 +242,26 @@ extern "C" {
         ggml_backend_sched_reserve(sched, reserve_graph);

         // compute
-        graph = build_graph(sched);
-
+        graph = build_graph(sched); // the graph and its tensors are single-use in terms of allocation, multi-use in terms of computation
+        for (int i = 0; i < 10; ++i) {
+            ggml_backend_sched_graph_compute(sched, graph); // on the first iteration the graph is allocated automatically
+        }

         // if there are graph inputs:
-        … (4 lines truncated in the source view)
+        graph = build_graph(sched); // get a new graph that is not allocated (the metadata for the old graph is freed once ggml_free is called)
+        ggml_backend_sched_reset(sched); // clear the allocation of the previous graph
+        ggml_backend_sched_alloc_graph(sched, graph); // explicitly allocate the new graph but do not execute it
+        ggml_backend_tensor_set(input_tensor, ...); // copy data to the newly allocated graph tensors
+        ggml_backend_sched_graph_compute(sched, graph); // execute the graph
+
+        // as an alternative to the above it is also possible to assign the inputs to a dedicated context and
+        // allocate them statically via ggml_backend_alloc_ctx_tensors
     }
     */

-    struct ggml_backend_sched;
     typedef struct ggml_backend_sched * ggml_backend_sched_t;

+    // Evaluation callback for each node in the graph (set with ggml_backend_sched_set_eval_callback)
     // when ask == true, the scheduler wants to know if the user wants to observe this node
     // this allows the scheduler to batch nodes together in order to evaluate them in a single call
     //
@@ -177,12 +270,12 @@ extern "C" {
     //
     typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);

-    // Initialize a backend scheduler
+    // Initialize a backend scheduler, backends with low index are given priority over backends with high index
     GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
     GGML_API void                 ggml_backend_sched_free(ggml_backend_sched_t sched);

     // Initialize backend buffers from a measure graph
-    GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
+    GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success

     GGML_API int            ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
     GGML_API ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i);
@@ -197,12 +290,14 @@ extern "C" {
     GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);

     // Allocate and compute graph on the backend scheduler
-    GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
+    GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); // returns success
     GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
     GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
     GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched);

-    // Reset all assignments and allocators - must be called before changing the node backends
+    // Reset all assignments and allocators - must be called before changing the node backends or allocating a new graph.
+    // This in effect deallocates all tensors that were previously allocated and leaves them with dangling pointers.
+    // The correct way to use this API is to discard the deallocated tensors and create new ones.
     GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);

     // Set a callback to be called for each resulting node during graph compute
@@ -223,7 +318,7 @@ extern "C" {
     GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
     GGML_API void                           ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);

-    typedef bool (* …
+    typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);

     // Compare the output of two backends
     GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
@@ -232,6 +327,9 @@ extern "C" {
     GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
     GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);

+    // CPU buffer types are always available
+    GGML_API ggml_backend_buffer_t      ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
+    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);

 #ifdef __cplusplus
 }