@fugood/llama.node 0.3.1 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -8
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +4 -2
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +2 -2
- package/src/LlamaCompletionWorker.cpp +10 -10
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +14 -17
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +5 -4
- package/src/llama.cpp/.github/workflows/build.yml +137 -29
- package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
- package/src/llama.cpp/.github/workflows/docker.yml +46 -34
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
- package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +7 -0
- package/src/llama.cpp/CMakeLists.txt +26 -11
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/common/CMakeLists.txt +10 -10
- package/src/llama.cpp/common/arg.cpp +2041 -0
- package/src/llama.cpp/common/arg.h +77 -0
- package/src/llama.cpp/common/common.cpp +523 -1861
- package/src/llama.cpp/common/common.h +234 -106
- package/src/llama.cpp/common/console.cpp +3 -0
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +401 -0
- package/src/llama.cpp/common/log.h +66 -698
- package/src/llama.cpp/common/ngram-cache.cpp +39 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +356 -350
- package/src/llama.cpp/common/sampling.h +62 -139
- package/src/llama.cpp/common/stb_image.h +5990 -6398
- package/src/llama.cpp/docs/build.md +72 -17
- package/src/llama.cpp/examples/CMakeLists.txt +1 -2
- package/src/llama.cpp/examples/batched/batched.cpp +49 -65
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
- package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
- package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
- package/src/llama.cpp/examples/infill/infill.cpp +131 -192
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +686 -150
- package/src/llama.cpp/examples/llava/clip.h +11 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
- package/src/llama.cpp/examples/llava/llava.cpp +146 -26
- package/src/llama.cpp/examples/llava/llava.h +2 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
- package/src/llama.cpp/examples/llava/requirements.txt +1 -0
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
- package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
- package/src/llama.cpp/examples/main/main.cpp +216 -313
- package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
- package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
- package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
- package/src/llama.cpp/examples/server/server.cpp +1347 -1531
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
- package/src/llama.cpp/examples/server/utils.hpp +396 -107
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +132 -106
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
- package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
- package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
- package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
- package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +272 -505
- package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
- package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
- package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
- package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
- package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
- package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
- package/src/llama.cpp/include/llama.h +296 -285
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-grammar.cpp +721 -122
- package/src/llama.cpp/src/llama-grammar.h +120 -15
- package/src/llama.cpp/src/llama-impl.h +156 -1
- package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
- package/src/llama.cpp/src/llama-sampling.h +39 -47
- package/src/llama.cpp/src/llama-vocab.cpp +390 -127
- package/src/llama.cpp/src/llama-vocab.h +60 -20
- package/src/llama.cpp/src/llama.cpp +6215 -3263
- package/src/llama.cpp/src/unicode-data.cpp +6 -4
- package/src/llama.cpp/src/unicode-data.h +4 -4
- package/src/llama.cpp/src/unicode.cpp +15 -7
- package/src/llama.cpp/tests/CMakeLists.txt +4 -2
- package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
- package/src/llama.cpp/tests/test-barrier.cpp +94 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
- package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
- package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
- package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
- package/src/llama.cpp/tests/test-log.cpp +39 -0
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +2 -1
- package/src/llama.cpp/tests/test-sampling.cpp +226 -142
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/patches/llama.patch +0 -22
- package/src/llama.cpp/.github/workflows/bench.yml +0 -310
- package/src/llama.cpp/common/grammar-parser.cpp +0 -536
- package/src/llama.cpp/common/grammar-parser.h +0 -29
- package/src/llama.cpp/common/train.cpp +0 -1513
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
- package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
|
@@ -16,15 +16,18 @@
|
|
|
16
16
|
#include <sstream>
|
|
17
17
|
#include <string>
|
|
18
18
|
#include <vector>
|
|
19
|
+
#include <thread>
|
|
19
20
|
|
|
20
21
|
#include "ggml.h"
|
|
21
22
|
#include "llama.h"
|
|
22
23
|
#include "common.h"
|
|
23
|
-
#include "ggml-cuda.h"
|
|
24
|
-
#include "ggml-sycl.h"
|
|
25
24
|
|
|
26
|
-
#ifdef
|
|
27
|
-
#
|
|
25
|
+
#ifdef _WIN32
|
|
26
|
+
#define WIN32_LEAN_AND_MEAN
|
|
27
|
+
#ifndef NOMINMAX
|
|
28
|
+
# define NOMINMAX
|
|
29
|
+
#endif
|
|
30
|
+
#include <windows.h>
|
|
28
31
|
#endif
|
|
29
32
|
|
|
30
33
|
// utils
|
|
@@ -73,81 +76,38 @@ static T stdev(const std::vector<T> & v) {
|
|
|
73
76
|
}
|
|
74
77
|
|
|
75
78
|
static std::string get_cpu_info() {
|
|
76
|
-
std::string
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
if (strncmp(buf, "model name", 10) == 0) {
|
|
83
|
-
char * p = strchr(buf, ':');
|
|
84
|
-
if (p) {
|
|
85
|
-
p++;
|
|
86
|
-
while (std::isspace(*p)) {
|
|
87
|
-
p++;
|
|
88
|
-
}
|
|
89
|
-
while (std::isspace(p[strlen(p) - 1])) {
|
|
90
|
-
p[strlen(p) - 1] = '\0';
|
|
91
|
-
}
|
|
92
|
-
id = p;
|
|
93
|
-
break;
|
|
94
|
-
}
|
|
95
|
-
}
|
|
79
|
+
std::vector<std::string> cpu_list;
|
|
80
|
+
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
|
|
81
|
+
auto * dev = ggml_backend_dev_get(i);
|
|
82
|
+
auto dev_type = ggml_backend_dev_type(dev);
|
|
83
|
+
if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU || dev_type == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
|
|
84
|
+
cpu_list.push_back(ggml_backend_dev_description(dev));
|
|
96
85
|
}
|
|
97
|
-
fclose(f);
|
|
98
86
|
}
|
|
99
|
-
|
|
100
|
-
// TODO: other platforms
|
|
101
|
-
return id;
|
|
87
|
+
return join(cpu_list, ", ");
|
|
102
88
|
}
|
|
103
89
|
|
|
104
90
|
static std::string get_gpu_info() {
|
|
105
|
-
std::string
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
id += buf;
|
|
112
|
-
if (i < count - 1) {
|
|
113
|
-
id += "/";
|
|
91
|
+
std::vector<std::string> gpu_list;
|
|
92
|
+
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
|
|
93
|
+
auto * dev = ggml_backend_dev_get(i);
|
|
94
|
+
auto dev_type = ggml_backend_dev_type(dev);
|
|
95
|
+
if (dev_type == GGML_BACKEND_DEVICE_TYPE_GPU) {
|
|
96
|
+
gpu_list.push_back(ggml_backend_dev_description(dev));
|
|
114
97
|
}
|
|
115
98
|
}
|
|
116
|
-
|
|
117
|
-
#ifdef GGML_USE_SYCL
|
|
118
|
-
int count = ggml_backend_sycl_get_device_count();
|
|
119
|
-
for (int i = 0; i < count; i++) {
|
|
120
|
-
char buf[128];
|
|
121
|
-
ggml_sycl_get_device_description(i, buf, sizeof(buf));
|
|
122
|
-
id += buf;
|
|
123
|
-
if (i < count - 1) {
|
|
124
|
-
id += "/";
|
|
125
|
-
}
|
|
126
|
-
}
|
|
127
|
-
#endif
|
|
128
|
-
#ifdef GGML_USE_CANN
|
|
129
|
-
uint32_t count = ggml_backend_cann_get_device_count();
|
|
130
|
-
for (uint32_t i = 0; i < count; i++) {
|
|
131
|
-
char buf[128];
|
|
132
|
-
ggml_backend_cann_get_device_description(i, buf, sizeof(buf));
|
|
133
|
-
id += buf;
|
|
134
|
-
if (i < count - 1) {
|
|
135
|
-
id += "/";
|
|
136
|
-
}
|
|
137
|
-
}
|
|
138
|
-
#endif
|
|
139
|
-
// TODO: other backends
|
|
140
|
-
return id;
|
|
99
|
+
return join(gpu_list, ", ");
|
|
141
100
|
}
|
|
142
101
|
|
|
143
102
|
// command line params
|
|
144
|
-
enum output_formats {NONE, CSV, JSON, MARKDOWN, SQL};
|
|
103
|
+
enum output_formats {NONE, CSV, JSON, JSONL, MARKDOWN, SQL};
|
|
145
104
|
|
|
146
105
|
static const char * output_format_str(output_formats format) {
|
|
147
106
|
switch (format) {
|
|
148
107
|
case NONE: return "none";
|
|
149
108
|
case CSV: return "csv";
|
|
150
109
|
case JSON: return "json";
|
|
110
|
+
case JSONL: return "jsonl";
|
|
151
111
|
case MARKDOWN: return "md";
|
|
152
112
|
case SQL: return "sql";
|
|
153
113
|
default: GGML_ABORT("invalid output format");
|
|
@@ -161,6 +121,8 @@ static bool output_format_from_str(const std::string & s, output_formats & forma
|
|
|
161
121
|
format = CSV;
|
|
162
122
|
} else if (s == "json") {
|
|
163
123
|
format = JSON;
|
|
124
|
+
} else if (s == "jsonl") {
|
|
125
|
+
format = JSONL;
|
|
164
126
|
} else if (s == "md") {
|
|
165
127
|
format = MARKDOWN;
|
|
166
128
|
} else if (s == "sql") {
|
|
@@ -196,6 +158,9 @@ struct cmd_params {
|
|
|
196
158
|
std::vector<ggml_type> type_k;
|
|
197
159
|
std::vector<ggml_type> type_v;
|
|
198
160
|
std::vector<int> n_threads;
|
|
161
|
+
std::vector<std::string> cpu_mask;
|
|
162
|
+
std::vector<bool> cpu_strict;
|
|
163
|
+
std::vector<int> poll;
|
|
199
164
|
std::vector<int> n_gpu_layers;
|
|
200
165
|
std::vector<std::string> rpc_servers;
|
|
201
166
|
std::vector<llama_split_mode> split_mode;
|
|
@@ -207,7 +172,10 @@ struct cmd_params {
|
|
|
207
172
|
std::vector<bool> embeddings;
|
|
208
173
|
ggml_numa_strategy numa;
|
|
209
174
|
int reps;
|
|
175
|
+
ggml_sched_priority prio;
|
|
176
|
+
int delay;
|
|
210
177
|
bool verbose;
|
|
178
|
+
bool progress;
|
|
211
179
|
output_formats output_format;
|
|
212
180
|
output_formats output_format_stderr;
|
|
213
181
|
};
|
|
@@ -222,6 +190,9 @@ static const cmd_params cmd_params_defaults = {
|
|
|
222
190
|
/* type_k */ {GGML_TYPE_F16},
|
|
223
191
|
/* type_v */ {GGML_TYPE_F16},
|
|
224
192
|
/* n_threads */ {cpu_get_num_math()},
|
|
193
|
+
/* cpu_mask */ {"0x0"},
|
|
194
|
+
/* cpu_strict */ {false},
|
|
195
|
+
/* poll */ {50},
|
|
225
196
|
/* n_gpu_layers */ {99},
|
|
226
197
|
/* rpc_servers */ {""},
|
|
227
198
|
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
|
|
@@ -233,7 +204,10 @@ static const cmd_params cmd_params_defaults = {
|
|
|
233
204
|
/* embeddings */ {false},
|
|
234
205
|
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
|
|
235
206
|
/* reps */ 5,
|
|
207
|
+
/* prio */ GGML_SCHED_PRIO_NORMAL,
|
|
208
|
+
/* delay */ 0,
|
|
236
209
|
/* verbose */ false,
|
|
210
|
+
/* progress */ false,
|
|
237
211
|
/* output_format */ MARKDOWN,
|
|
238
212
|
/* output_format_stderr */ NONE,
|
|
239
213
|
};
|
|
@@ -243,29 +217,37 @@ static void print_usage(int /* argc */, char ** argv) {
|
|
|
243
217
|
printf("\n");
|
|
244
218
|
printf("options:\n");
|
|
245
219
|
printf(" -h, --help\n");
|
|
246
|
-
printf(" -m, --model <filename>
|
|
247
|
-
printf(" -p, --n-prompt <n>
|
|
248
|
-
printf(" -n, --n-gen <n>
|
|
249
|
-
printf(" -pg <pp,tg>
|
|
250
|
-
printf(" -b, --batch-size <n>
|
|
251
|
-
printf(" -ub, --ubatch-size <n>
|
|
252
|
-
printf(" -ctk, --cache-type-k <t>
|
|
253
|
-
printf(" -ctv, --cache-type-v <t>
|
|
254
|
-
printf(" -t, --threads <n>
|
|
255
|
-
printf(" -
|
|
256
|
-
printf(" -
|
|
257
|
-
printf("
|
|
258
|
-
printf(" -
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
printf(" --
|
|
263
|
-
printf(" -
|
|
264
|
-
printf(" -
|
|
265
|
-
printf(" -
|
|
266
|
-
printf(" -
|
|
267
|
-
printf("
|
|
268
|
-
printf(" -
|
|
220
|
+
printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
|
|
221
|
+
printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
|
|
222
|
+
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
|
|
223
|
+
printf(" -pg <pp,tg> (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
|
|
224
|
+
printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
|
|
225
|
+
printf(" -ub, --ubatch-size <n> (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
|
|
226
|
+
printf(" -ctk, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
|
|
227
|
+
printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
|
|
228
|
+
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
|
|
229
|
+
printf(" -C, --cpu-mask <hex,hex> (default: %s)\n", join(cmd_params_defaults.cpu_mask, ",").c_str());
|
|
230
|
+
printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str());
|
|
231
|
+
printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
|
|
232
|
+
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
|
|
233
|
+
if (llama_supports_rpc()) {
|
|
234
|
+
printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
|
|
235
|
+
}
|
|
236
|
+
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
|
|
237
|
+
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
|
|
238
|
+
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
|
|
239
|
+
printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
|
|
240
|
+
printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
|
|
241
|
+
printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
|
|
242
|
+
printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
|
|
243
|
+
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
|
|
244
|
+
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
|
|
245
|
+
printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.prio);
|
|
246
|
+
printf(" --delay <0...N> (seconds) (default: %d)\n", cmd_params_defaults.delay);
|
|
247
|
+
printf(" -o, --output <csv|json|jsonl|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
|
|
248
|
+
printf(" -oe, --output-err <csv|json|jsonl|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
|
|
249
|
+
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
|
|
250
|
+
printf(" --progress (default: %s)\n", cmd_params_defaults.progress ? "1" : "0");
|
|
269
251
|
printf("\n");
|
|
270
252
|
printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
|
|
271
253
|
}
|
|
@@ -274,6 +256,9 @@ static ggml_type ggml_type_from_name(const std::string & s) {
|
|
|
274
256
|
if (s == "f16") {
|
|
275
257
|
return GGML_TYPE_F16;
|
|
276
258
|
}
|
|
259
|
+
if (s == "bf16") {
|
|
260
|
+
return GGML_TYPE_BF16;
|
|
261
|
+
}
|
|
277
262
|
if (s == "q8_0") {
|
|
278
263
|
return GGML_TYPE_Q8_0;
|
|
279
264
|
}
|
|
@@ -309,6 +294,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|
|
309
294
|
params.output_format_stderr = cmd_params_defaults.output_format_stderr;
|
|
310
295
|
params.reps = cmd_params_defaults.reps;
|
|
311
296
|
params.numa = cmd_params_defaults.numa;
|
|
297
|
+
params.prio = cmd_params_defaults.prio;
|
|
298
|
+
params.delay = cmd_params_defaults.delay;
|
|
299
|
+
params.progress = cmd_params_defaults.progress;
|
|
312
300
|
|
|
313
301
|
for (int i = 1; i < argc; i++) {
|
|
314
302
|
arg = argv[i];
|
|
@@ -380,6 +368,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|
|
380
368
|
}
|
|
381
369
|
types.push_back(gt);
|
|
382
370
|
}
|
|
371
|
+
if (invalid_param) {
|
|
372
|
+
break;
|
|
373
|
+
}
|
|
383
374
|
params.type_k.insert(params.type_k.end(), types.begin(), types.end());
|
|
384
375
|
} else if (arg == "-ctv" || arg == "--cache-type-v") {
|
|
385
376
|
if (++i >= argc) {
|
|
@@ -396,6 +387,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|
|
396
387
|
}
|
|
397
388
|
types.push_back(gt);
|
|
398
389
|
}
|
|
390
|
+
if (invalid_param) {
|
|
391
|
+
break;
|
|
392
|
+
}
|
|
399
393
|
params.type_v.insert(params.type_v.end(), types.begin(), types.end());
|
|
400
394
|
} else if (arg == "-t" || arg == "--threads") {
|
|
401
395
|
if (++i >= argc) {
|
|
@@ -404,6 +398,27 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|
|
404
398
|
}
|
|
405
399
|
auto p = string_split<int>(argv[i], split_delim);
|
|
406
400
|
params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
|
|
401
|
+
} else if (arg == "-C" || arg == "--cpu-mask") {
|
|
402
|
+
if (++i >= argc) {
|
|
403
|
+
invalid_param = true;
|
|
404
|
+
break;
|
|
405
|
+
}
|
|
406
|
+
auto p = string_split<std::string>(argv[i], split_delim);
|
|
407
|
+
params.cpu_mask.insert(params.cpu_mask.end(), p.begin(), p.end());
|
|
408
|
+
} else if (arg == "--cpu-strict") {
|
|
409
|
+
if (++i >= argc) {
|
|
410
|
+
invalid_param = true;
|
|
411
|
+
break;
|
|
412
|
+
}
|
|
413
|
+
auto p = string_split<bool>(argv[i], split_delim);
|
|
414
|
+
params.cpu_strict.insert(params.cpu_strict.end(), p.begin(), p.end());
|
|
415
|
+
} else if (arg == "--poll") {
|
|
416
|
+
if (++i >= argc) {
|
|
417
|
+
invalid_param = true;
|
|
418
|
+
break;
|
|
419
|
+
}
|
|
420
|
+
auto p = string_split<int>(argv[i], split_delim);
|
|
421
|
+
params.poll.insert(params.poll.end(), p.begin(), p.end());
|
|
407
422
|
} else if (arg == "-ngl" || arg == "--n-gpu-layers") {
|
|
408
423
|
if (++i >= argc) {
|
|
409
424
|
invalid_param = true;
|
|
@@ -411,7 +426,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|
|
411
426
|
}
|
|
412
427
|
auto p = string_split<int>(argv[i], split_delim);
|
|
413
428
|
params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
|
|
414
|
-
} else if (arg == "-rpc" || arg == "--rpc") {
|
|
429
|
+
} else if (llama_supports_rpc() && (arg == "-rpc" || arg == "--rpc")) {
|
|
415
430
|
if (++i >= argc) {
|
|
416
431
|
invalid_param = true;
|
|
417
432
|
break;
|
|
@@ -438,6 +453,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|
|
438
453
|
}
|
|
439
454
|
modes.push_back(mode);
|
|
440
455
|
}
|
|
456
|
+
if (invalid_param) {
|
|
457
|
+
break;
|
|
458
|
+
}
|
|
441
459
|
params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end());
|
|
442
460
|
} else if (arg == "-mg" || arg == "--main-gpu") {
|
|
443
461
|
if (++i >= argc) {
|
|
@@ -512,6 +530,18 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|
|
512
530
|
break;
|
|
513
531
|
}
|
|
514
532
|
params.reps = std::stoi(argv[i]);
|
|
533
|
+
} else if (arg == "--prio") {
|
|
534
|
+
if (++i >= argc) {
|
|
535
|
+
invalid_param = true;
|
|
536
|
+
break;
|
|
537
|
+
}
|
|
538
|
+
params.prio = (enum ggml_sched_priority) std::stoi(argv[i]);
|
|
539
|
+
} else if (arg == "--delay") {
|
|
540
|
+
if (++i >= argc) {
|
|
541
|
+
invalid_param = true;
|
|
542
|
+
break;
|
|
543
|
+
}
|
|
544
|
+
params.delay = std::stoi(argv[i]);
|
|
515
545
|
} else if (arg == "-o" || arg == "--output") {
|
|
516
546
|
if (++i >= argc) {
|
|
517
547
|
invalid_param = true;
|
|
@@ -526,6 +556,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|
|
526
556
|
invalid_param = !output_format_from_str(argv[i], params.output_format_stderr);
|
|
527
557
|
} else if (arg == "-v" || arg == "--verbose") {
|
|
528
558
|
params.verbose = true;
|
|
559
|
+
} else if (arg == "--progress") {
|
|
560
|
+
params.progress = true;
|
|
529
561
|
} else {
|
|
530
562
|
invalid_param = true;
|
|
531
563
|
break;
|
|
@@ -556,6 +588,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|
|
556
588
|
if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
|
|
557
589
|
if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; }
|
|
558
590
|
if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
|
|
591
|
+
if (params.cpu_mask.empty()) { params.cpu_mask = cmd_params_defaults.cpu_mask; }
|
|
592
|
+
if (params.cpu_strict.empty()) { params.cpu_strict = cmd_params_defaults.cpu_strict; }
|
|
593
|
+
if (params.poll.empty()) { params.poll = cmd_params_defaults.poll; }
|
|
559
594
|
|
|
560
595
|
return params;
|
|
561
596
|
}
|
|
@@ -569,6 +604,9 @@ struct cmd_params_instance {
|
|
|
569
604
|
ggml_type type_k;
|
|
570
605
|
ggml_type type_v;
|
|
571
606
|
int n_threads;
|
|
607
|
+
std::string cpu_mask;
|
|
608
|
+
bool cpu_strict;
|
|
609
|
+
int poll;
|
|
572
610
|
int n_gpu_layers;
|
|
573
611
|
std::string rpc_servers;
|
|
574
612
|
llama_split_mode split_mode;
|
|
@@ -638,7 +676,10 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|
|
638
676
|
for (const auto & tv : params.type_v)
|
|
639
677
|
for (const auto & nkvo : params.no_kv_offload)
|
|
640
678
|
for (const auto & fa : params.flash_attn)
|
|
641
|
-
for (const auto & nt : params.n_threads)
|
|
679
|
+
for (const auto & nt : params.n_threads)
|
|
680
|
+
for (const auto & cm : params.cpu_mask)
|
|
681
|
+
for (const auto & cs : params.cpu_strict)
|
|
682
|
+
for (const auto & pl : params.poll) {
|
|
642
683
|
for (const auto & n_prompt : params.n_prompt) {
|
|
643
684
|
if (n_prompt == 0) {
|
|
644
685
|
continue;
|
|
@@ -652,6 +693,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|
|
652
693
|
/* .type_k = */ tk,
|
|
653
694
|
/* .type_v = */ tv,
|
|
654
695
|
/* .n_threads = */ nt,
|
|
696
|
+
/* .cpu_mask = */ cm,
|
|
697
|
+
/* .cpu_strict = */ cs,
|
|
698
|
+
/* .poll = */ pl,
|
|
655
699
|
/* .n_gpu_layers = */ nl,
|
|
656
700
|
/* .rpc_servers = */ rpc,
|
|
657
701
|
/* .split_mode = */ sm,
|
|
@@ -678,6 +722,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|
|
678
722
|
/* .type_k = */ tk,
|
|
679
723
|
/* .type_v = */ tv,
|
|
680
724
|
/* .n_threads = */ nt,
|
|
725
|
+
/* .cpu_mask = */ cm,
|
|
726
|
+
/* .cpu_strict = */ cs,
|
|
727
|
+
/* .poll = */ pl,
|
|
681
728
|
/* .n_gpu_layers = */ nl,
|
|
682
729
|
/* .rpc_servers = */ rpc,
|
|
683
730
|
/* .split_mode = */ sm,
|
|
@@ -704,6 +751,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|
|
704
751
|
/* .type_k = */ tk,
|
|
705
752
|
/* .type_v = */ tv,
|
|
706
753
|
/* .n_threads = */ nt,
|
|
754
|
+
/* .cpu_mask = */ cm,
|
|
755
|
+
/* .cpu_strict = */ cs,
|
|
756
|
+
/* .poll = */ pl,
|
|
707
757
|
/* .n_gpu_layers = */ nl,
|
|
708
758
|
/* .rpc_servers = */ rpc,
|
|
709
759
|
/* .split_mode = */ sm,
|
|
@@ -724,13 +774,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|
|
724
774
|
struct test {
|
|
725
775
|
static const std::string build_commit;
|
|
726
776
|
static const int build_number;
|
|
727
|
-
static const bool cuda;
|
|
728
|
-
static const bool vulkan;
|
|
729
|
-
static const bool kompute;
|
|
730
|
-
static const bool metal;
|
|
731
|
-
static const bool sycl;
|
|
732
|
-
static const bool gpu_blas;
|
|
733
|
-
static const bool blas;
|
|
734
777
|
static const std::string cpu_info;
|
|
735
778
|
static const std::string gpu_info;
|
|
736
779
|
std::string model_filename;
|
|
@@ -740,7 +783,9 @@ struct test {
|
|
|
740
783
|
int n_batch;
|
|
741
784
|
int n_ubatch;
|
|
742
785
|
int n_threads;
|
|
743
|
-
|
|
786
|
+
std::string cpu_mask;
|
|
787
|
+
bool cpu_strict;
|
|
788
|
+
int poll;
|
|
744
789
|
ggml_type type_k;
|
|
745
790
|
ggml_type type_v;
|
|
746
791
|
int n_gpu_layers;
|
|
@@ -766,7 +811,9 @@ struct test {
|
|
|
766
811
|
n_batch = inst.n_batch;
|
|
767
812
|
n_ubatch = inst.n_ubatch;
|
|
768
813
|
n_threads = inst.n_threads;
|
|
769
|
-
|
|
814
|
+
cpu_mask = inst.cpu_mask;
|
|
815
|
+
cpu_strict = inst.cpu_strict;
|
|
816
|
+
poll = inst.poll;
|
|
770
817
|
type_k = inst.type_k;
|
|
771
818
|
type_v = inst.type_v;
|
|
772
819
|
n_gpu_layers = inst.n_gpu_layers;
|
|
@@ -811,45 +858,31 @@ struct test {
|
|
|
811
858
|
}
|
|
812
859
|
|
|
813
860
|
static std::string get_backend() {
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
|
|
817
|
-
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
return "Kompute";
|
|
822
|
-
}
|
|
823
|
-
if (metal) {
|
|
824
|
-
return "Metal";
|
|
825
|
-
}
|
|
826
|
-
if (sycl) {
|
|
827
|
-
return GGML_SYCL_NAME;
|
|
828
|
-
}
|
|
829
|
-
if (gpu_blas) {
|
|
830
|
-
return "GPU BLAS";
|
|
831
|
-
}
|
|
832
|
-
if (blas) {
|
|
833
|
-
return "BLAS";
|
|
861
|
+
std::vector<std::string> backends;
|
|
862
|
+
for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
|
|
863
|
+
auto * reg = ggml_backend_reg_get(i);
|
|
864
|
+
std::string name = ggml_backend_reg_name(reg);
|
|
865
|
+
if (name != "CPU") {
|
|
866
|
+
backends.push_back(ggml_backend_reg_name(reg));
|
|
867
|
+
}
|
|
834
868
|
}
|
|
835
|
-
|
|
836
|
-
return "CPU";
|
|
869
|
+
return backends.empty() ? "CPU" : join(backends, ",");
|
|
837
870
|
}
|
|
838
871
|
|
|
839
872
|
static const std::vector<std::string> & get_fields() {
|
|
840
873
|
static const std::vector<std::string> fields = {
|
|
841
874
|
"build_commit", "build_number",
|
|
842
|
-
"
|
|
843
|
-
"cpu_info", "gpu_info",
|
|
875
|
+
"cpu_info", "gpu_info", "backends",
|
|
844
876
|
"model_filename", "model_type", "model_size", "model_n_params",
|
|
845
877
|
"n_batch", "n_ubatch",
|
|
846
|
-
"n_threads", "
|
|
878
|
+
"n_threads", "cpu_mask", "cpu_strict", "poll",
|
|
879
|
+
"type_k", "type_v",
|
|
847
880
|
"n_gpu_layers", "split_mode",
|
|
848
881
|
"main_gpu", "no_kv_offload", "flash_attn",
|
|
849
882
|
"tensor_split", "use_mmap", "embeddings",
|
|
850
883
|
"n_prompt", "n_gen", "test_time",
|
|
851
884
|
"avg_ns", "stddev_ns",
|
|
852
|
-
"avg_ts", "stddev_ts"
|
|
885
|
+
"avg_ts", "stddev_ts",
|
|
853
886
|
};
|
|
854
887
|
return fields;
|
|
855
888
|
}
|
|
@@ -858,15 +891,15 @@ struct test {
|
|
|
858
891
|
|
|
859
892
|
static field_type get_field_type(const std::string & field) {
|
|
860
893
|
if (field == "build_number" || field == "n_batch" || field == "n_ubatch" ||
|
|
861
|
-
field == "n_threads" ||
|
|
894
|
+
field == "n_threads" || field == "poll" ||
|
|
862
895
|
field == "model_size" || field == "model_n_params" ||
|
|
863
896
|
field == "n_gpu_layers" || field == "main_gpu" ||
|
|
864
897
|
field == "n_prompt" || field == "n_gen" ||
|
|
865
898
|
field == "avg_ns" || field == "stddev_ns") {
|
|
866
899
|
return INT;
|
|
867
900
|
}
|
|
868
|
-
if (field == "
|
|
869
|
-
field == "
|
|
901
|
+
if (field == "f16_kv" || field == "no_kv_offload" ||
|
|
902
|
+
field == "cpu_strict" ||
|
|
870
903
|
field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
|
|
871
904
|
return BOOL;
|
|
872
905
|
}
|
|
@@ -894,12 +927,11 @@ struct test {
|
|
|
894
927
|
}
|
|
895
928
|
std::vector<std::string> values = {
|
|
896
929
|
build_commit, std::to_string(build_number),
|
|
897
|
-
|
|
898
|
-
std::to_string(metal), std::to_string(sycl), std::to_string(has_rpc), std::to_string(gpu_blas), std::to_string(blas),
|
|
899
|
-
cpu_info, gpu_info,
|
|
930
|
+
cpu_info, gpu_info, get_backend(),
|
|
900
931
|
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
|
|
901
932
|
std::to_string(n_batch), std::to_string(n_ubatch),
|
|
902
|
-
std::to_string(n_threads),
|
|
933
|
+
std::to_string(n_threads), cpu_mask, std::to_string(cpu_strict), std::to_string(poll),
|
|
934
|
+
ggml_type_name(type_k), ggml_type_name(type_v),
|
|
903
935
|
std::to_string(n_gpu_layers), split_mode_str(split_mode),
|
|
904
936
|
std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
|
|
905
937
|
tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
|
|
@@ -922,13 +954,6 @@ struct test {
|
|
|
922
954
|
|
|
923
955
|
const std::string test::build_commit = LLAMA_COMMIT;
|
|
924
956
|
const int test::build_number = LLAMA_BUILD_NUMBER;
|
|
925
|
-
const bool test::cuda = !!ggml_cpu_has_cuda();
|
|
926
|
-
const bool test::vulkan = !!ggml_cpu_has_vulkan();
|
|
927
|
-
const bool test::kompute = !!ggml_cpu_has_kompute();
|
|
928
|
-
const bool test::metal = !!ggml_cpu_has_metal();
|
|
929
|
-
const bool test::gpu_blas = !!ggml_cpu_has_gpublas();
|
|
930
|
-
const bool test::blas = !!ggml_cpu_has_blas();
|
|
931
|
-
const bool test::sycl = !!ggml_cpu_has_sycl();
|
|
932
957
|
const std::string test::cpu_info = get_cpu_info();
|
|
933
958
|
const std::string test::gpu_info = get_gpu_info();
|
|
934
959
|
|
|
@@ -967,37 +992,38 @@ struct csv_printer : public printer {
|
|
|
967
992
|
}
|
|
968
993
|
};
|
|
969
994
|
|
|
970
|
-
struct json_printer : public printer {
|
|
971
|
-
bool first = true;
|
|
972
995
|
|
|
973
|
-
|
|
974
|
-
|
|
975
|
-
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
|
|
981
|
-
|
|
982
|
-
|
|
983
|
-
|
|
984
|
-
|
|
985
|
-
|
|
986
|
-
}
|
|
996
|
+
static std::string escape_json(const std::string & value) {
|
|
997
|
+
std::string escaped;
|
|
998
|
+
for (auto c : value) {
|
|
999
|
+
if (c == '"') {
|
|
1000
|
+
escaped += "\\\"";
|
|
1001
|
+
} else if (c == '\\') {
|
|
1002
|
+
escaped += "\\\\";
|
|
1003
|
+
} else if (c <= 0x1f) {
|
|
1004
|
+
char buf[8];
|
|
1005
|
+
snprintf(buf, sizeof(buf), "\\u%04x", c);
|
|
1006
|
+
escaped += buf;
|
|
1007
|
+
} else {
|
|
1008
|
+
escaped += c;
|
|
987
1009
|
}
|
|
988
|
-
return escaped;
|
|
989
1010
|
}
|
|
1011
|
+
return escaped;
|
|
1012
|
+
}
|
|
990
1013
|
|
|
991
|
-
|
|
992
|
-
|
|
993
|
-
|
|
994
|
-
|
|
995
|
-
|
|
996
|
-
|
|
997
|
-
|
|
998
|
-
|
|
999
|
-
}
|
|
1014
|
+
static std::string format_json_value(const std::string & field, const std::string & value) {
|
|
1015
|
+
switch (test::get_field_type(field)) {
|
|
1016
|
+
case test::STRING:
|
|
1017
|
+
return "\"" + escape_json(value) + "\"";
|
|
1018
|
+
case test::BOOL:
|
|
1019
|
+
return value == "0" ? "false" : "true";
|
|
1020
|
+
default:
|
|
1021
|
+
return value;
|
|
1000
1022
|
}
|
|
1023
|
+
}
|
|
1024
|
+
|
|
1025
|
+
struct json_printer : public printer {
|
|
1026
|
+
bool first = true;
|
|
1001
1027
|
|
|
1002
1028
|
void print_header(const cmd_params & params) override {
|
|
1003
1029
|
fprintf(fout, "[\n");
|
|
@@ -1007,7 +1033,7 @@ struct json_printer : public printer {
|
|
|
1007
1033
|
void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
|
|
1008
1034
|
assert(fields.size() == values.size());
|
|
1009
1035
|
for (size_t i = 0; i < fields.size(); i++) {
|
|
1010
|
-
fprintf(fout, " \"%s\": %s,\n", fields.at(i).c_str(),
|
|
1036
|
+
fprintf(fout, " \"%s\": %s,\n", fields.at(i).c_str(), format_json_value(fields.at(i), values.at(i)).c_str());
|
|
1011
1037
|
}
|
|
1012
1038
|
}
|
|
1013
1039
|
|
|
@@ -1030,6 +1056,25 @@ struct json_printer : public printer {
|
|
|
1030
1056
|
}
|
|
1031
1057
|
};
|
|
1032
1058
|
|
|
1059
|
+
|
|
1060
|
+
struct jsonl_printer : public printer {
|
|
1061
|
+
void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
|
|
1062
|
+
assert(fields.size() == values.size());
|
|
1063
|
+
for (size_t i = 0; i < fields.size(); i++) {
|
|
1064
|
+
fprintf(fout, "\"%s\": %s, ", fields.at(i).c_str(), format_json_value(fields.at(i), values.at(i)).c_str());
|
|
1065
|
+
}
|
|
1066
|
+
}
|
|
1067
|
+
|
|
1068
|
+
void print_test(const test & t) override {
|
|
1069
|
+
fprintf(fout, "{");
|
|
1070
|
+
print_fields(test::get_fields(), t.get_values());
|
|
1071
|
+
fprintf(fout, "\"samples_ns\": [ %s ],", join(t.samples_ns, ", ").c_str());
|
|
1072
|
+
fprintf(fout, "\"samples_ts\": [ %s ]", join(t.get_ts(), ", ").c_str());
|
|
1073
|
+
fprintf(fout, "}\n");
|
|
1074
|
+
fflush(fout);
|
|
1075
|
+
}
|
|
1076
|
+
};
|
|
1077
|
+
|
|
1033
1078
|
struct markdown_printer : public printer {
|
|
1034
1079
|
std::vector<std::string> fields;
|
|
1035
1080
|
|
|
@@ -1038,7 +1083,7 @@ struct markdown_printer : public printer {
|
|
|
1038
1083
|
return -30;
|
|
1039
1084
|
}
|
|
1040
1085
|
if (field == "t/s") {
|
|
1041
|
-
return
|
|
1086
|
+
return 20;
|
|
1042
1087
|
}
|
|
1043
1088
|
if (field == "size" || field == "params") {
|
|
1044
1089
|
return 10;
|
|
@@ -1113,13 +1158,23 @@ struct markdown_printer : public printer {
|
|
|
1113
1158
|
fields.emplace_back("size");
|
|
1114
1159
|
fields.emplace_back("params");
|
|
1115
1160
|
fields.emplace_back("backend");
|
|
1116
|
-
bool is_cpu_backend = test::get_backend()
|
|
1161
|
+
bool is_cpu_backend = test::get_backend().find("CPU") != std::string::npos ||
|
|
1162
|
+
test::get_backend().find("BLAS") != std::string::npos;
|
|
1117
1163
|
if (!is_cpu_backend) {
|
|
1118
1164
|
fields.emplace_back("n_gpu_layers");
|
|
1119
1165
|
}
|
|
1120
1166
|
if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
|
|
1121
1167
|
fields.emplace_back("n_threads");
|
|
1122
1168
|
}
|
|
1169
|
+
if (params.cpu_mask.size() > 1 || params.cpu_mask != cmd_params_defaults.cpu_mask) {
|
|
1170
|
+
fields.emplace_back("cpu_mask");
|
|
1171
|
+
}
|
|
1172
|
+
if (params.cpu_strict.size() > 1 || params.cpu_strict != cmd_params_defaults.cpu_strict) {
|
|
1173
|
+
fields.emplace_back("cpu_strict");
|
|
1174
|
+
}
|
|
1175
|
+
if (params.poll.size() > 1 || params.poll != cmd_params_defaults.poll) {
|
|
1176
|
+
fields.emplace_back("poll");
|
|
1177
|
+
}
|
|
1123
1178
|
if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
|
|
1124
1179
|
fields.emplace_back("n_batch");
|
|
1125
1180
|
}
|
|
@@ -1194,9 +1249,6 @@ struct markdown_printer : public printer {
|
|
|
1194
1249
|
value = buf;
|
|
1195
1250
|
} else if (field == "backend") {
|
|
1196
1251
|
value = test::get_backend();
|
|
1197
|
-
if (t.has_rpc) {
|
|
1198
|
-
value += "+RPC";
|
|
1199
|
-
}
|
|
1200
1252
|
} else if (field == "test") {
|
|
1201
1253
|
if (t.n_prompt > 0 && t.n_gen == 0) {
|
|
1202
1254
|
snprintf(buf, sizeof(buf), "pp%d", t.n_prompt);
|
|
@@ -1269,7 +1321,7 @@ struct sql_printer : public printer {
|
|
|
1269
1321
|
}
|
|
1270
1322
|
};
|
|
1271
1323
|
|
|
1272
|
-
static void test_prompt(llama_context * ctx, int n_prompt, int
|
|
1324
|
+
static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_threads) {
|
|
1273
1325
|
llama_set_n_threads(ctx, n_threads, n_threads);
|
|
1274
1326
|
|
|
1275
1327
|
const llama_model * model = llama_get_model(ctx);
|
|
@@ -1285,14 +1337,14 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat
|
|
|
1285
1337
|
for (int i = 1; i < n_tokens; i++) {
|
|
1286
1338
|
tokens[i] = std::rand() % n_vocab;
|
|
1287
1339
|
}
|
|
1288
|
-
llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens
|
|
1340
|
+
llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens));
|
|
1289
1341
|
n_processed += n_tokens;
|
|
1290
1342
|
}
|
|
1291
1343
|
|
|
1292
1344
|
llama_synchronize(ctx);
|
|
1293
1345
|
}
|
|
1294
1346
|
|
|
1295
|
-
static void test_gen(llama_context * ctx, int n_gen, int
|
|
1347
|
+
static void test_gen(llama_context * ctx, int n_gen, int n_threads) {
|
|
1296
1348
|
llama_set_n_threads(ctx, n_threads, n_threads);
|
|
1297
1349
|
|
|
1298
1350
|
const llama_model * model = llama_get_model(ctx);
|
|
@@ -1301,7 +1353,7 @@ static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads)
|
|
|
1301
1353
|
llama_token token = llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;
|
|
1302
1354
|
|
|
1303
1355
|
for (int i = 0; i < n_gen; i++) {
|
|
1304
|
-
llama_decode(ctx, llama_batch_get_one(&token, 1
|
|
1356
|
+
llama_decode(ctx, llama_batch_get_one(&token, 1));
|
|
1305
1357
|
llama_synchronize(ctx);
|
|
1306
1358
|
token = std::rand() % n_vocab;
|
|
1307
1359
|
}
|
|
@@ -1321,6 +1373,8 @@ static std::unique_ptr<printer> create_printer(output_formats format) {
|
|
|
1321
1373
|
return std::unique_ptr<printer>(new csv_printer());
|
|
1322
1374
|
case JSON:
|
|
1323
1375
|
return std::unique_ptr<printer>(new json_printer());
|
|
1376
|
+
case JSONL:
|
|
1377
|
+
return std::unique_ptr<printer>(new jsonl_printer());
|
|
1324
1378
|
case MARKDOWN:
|
|
1325
1379
|
return std::unique_ptr<printer>(new markdown_printer());
|
|
1326
1380
|
case SQL:
|
|
@@ -1354,6 +1408,8 @@ int main(int argc, char ** argv) {
|
|
|
1354
1408
|
llama_backend_init();
|
|
1355
1409
|
llama_numa_init(params.numa);
|
|
1356
1410
|
|
|
1411
|
+
set_process_priority(params.prio);
|
|
1412
|
+
|
|
1357
1413
|
// initialize printer
|
|
1358
1414
|
std::unique_ptr<printer> p = create_printer(params.output_format);
|
|
1359
1415
|
std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr);
|
|
@@ -1373,7 +1429,13 @@ int main(int argc, char ** argv) {
|
|
|
1373
1429
|
llama_model * lmodel = nullptr;
|
|
1374
1430
|
const cmd_params_instance * prev_inst = nullptr;
|
|
1375
1431
|
|
|
1432
|
+
int params_idx = 0;
|
|
1433
|
+
auto params_count = params_instances.size();
|
|
1376
1434
|
for (const auto & inst : params_instances) {
|
|
1435
|
+
params_idx ++;
|
|
1436
|
+
if (params.progress) {
|
|
1437
|
+
fprintf(stderr, "llama-bench: benchmark %d/%ld: starting\n", params_idx, params_count);
|
|
1438
|
+
}
|
|
1377
1439
|
// keep the same model between tests when possible
|
|
1378
1440
|
if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
|
|
1379
1441
|
if (lmodel) {
|
|
@@ -1399,13 +1461,41 @@ int main(int argc, char ** argv) {
|
|
|
1399
1461
|
|
|
1400
1462
|
llama_kv_cache_clear(ctx);
|
|
1401
1463
|
|
|
1464
|
+
// cool off before the test
|
|
1465
|
+
if (params.delay) {
|
|
1466
|
+
std::this_thread::sleep_for(std::chrono::seconds(params.delay));
|
|
1467
|
+
}
|
|
1468
|
+
|
|
1469
|
+
struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads);
|
|
1470
|
+
if (!parse_cpu_mask(t.cpu_mask, tpp.cpumask)) {
|
|
1471
|
+
fprintf(stderr, "%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str());
|
|
1472
|
+
exit(1);
|
|
1473
|
+
}
|
|
1474
|
+
tpp.strict_cpu = t.cpu_strict;
|
|
1475
|
+
tpp.poll = t.poll;
|
|
1476
|
+
tpp.prio = params.prio;
|
|
1477
|
+
|
|
1478
|
+
struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
|
|
1479
|
+
if (!threadpool) {
|
|
1480
|
+
fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
|
|
1481
|
+
exit(1);
|
|
1482
|
+
}
|
|
1483
|
+
|
|
1484
|
+
llama_attach_threadpool(ctx, threadpool, NULL);
|
|
1485
|
+
|
|
1402
1486
|
// warmup run
|
|
1403
1487
|
if (t.n_prompt > 0) {
|
|
1488
|
+
if (params.progress) {
|
|
1489
|
+
fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup prompt run\n", params_idx, params_count);
|
|
1490
|
+
}
|
|
1404
1491
|
//test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
|
|
1405
|
-
test_prompt(ctx, t.n_prompt,
|
|
1492
|
+
test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
|
|
1406
1493
|
}
|
|
1407
1494
|
if (t.n_gen > 0) {
|
|
1408
|
-
|
|
1495
|
+
if (params.progress) {
|
|
1496
|
+
fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup generation run\n", params_idx, params_count);
|
|
1497
|
+
}
|
|
1498
|
+
test_gen(ctx, 1, t.n_threads);
|
|
1409
1499
|
}
|
|
1410
1500
|
|
|
1411
1501
|
for (int i = 0; i < params.reps; i++) {
|
|
@@ -1414,10 +1504,16 @@ int main(int argc, char ** argv) {
|
|
|
1414
1504
|
uint64_t t_start = get_time_ns();
|
|
1415
1505
|
|
|
1416
1506
|
if (t.n_prompt > 0) {
|
|
1417
|
-
|
|
1507
|
+
if (params.progress) {
|
|
1508
|
+
fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count, i + 1, params.reps);
|
|
1509
|
+
}
|
|
1510
|
+
test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
|
|
1418
1511
|
}
|
|
1419
1512
|
if (t.n_gen > 0) {
|
|
1420
|
-
|
|
1513
|
+
if (params.progress) {
|
|
1514
|
+
fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count, i + 1, params.reps);
|
|
1515
|
+
}
|
|
1516
|
+
test_gen(ctx, t.n_gen, t.n_threads);
|
|
1421
1517
|
}
|
|
1422
1518
|
|
|
1423
1519
|
uint64_t t_ns = get_time_ns() - t_start;
|
|
@@ -1434,9 +1530,11 @@ int main(int argc, char ** argv) {
|
|
|
1434
1530
|
fflush(p_err->fout);
|
|
1435
1531
|
}
|
|
1436
1532
|
|
|
1437
|
-
|
|
1533
|
+
llama_perf_context_print(ctx);
|
|
1438
1534
|
|
|
1439
1535
|
llama_free(ctx);
|
|
1536
|
+
|
|
1537
|
+
ggml_threadpool_free(threadpool);
|
|
1440
1538
|
}
|
|
1441
1539
|
|
|
1442
1540
|
llama_free_model(lmodel);
|