@fugood/llama.node 0.3.3 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +5 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +18 -1
- package/package.json +1 -1
- package/src/EmbeddingWorker.cpp +15 -5
- package/src/EmbeddingWorker.h +2 -1
- package/src/LlamaCompletionWorker.cpp +1 -1
- package/src/LlamaContext.cpp +81 -18
- package/src/LlamaContext.h +2 -0
- package/src/llama.cpp/.github/workflows/build.yml +197 -159
- package/src/llama.cpp/.github/workflows/docker.yml +5 -8
- package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
- package/src/llama.cpp/.github/workflows/server.yml +21 -14
- package/src/llama.cpp/CMakeLists.txt +11 -6
- package/src/llama.cpp/Sources/llama/llama.h +4 -0
- package/src/llama.cpp/cmake/common.cmake +33 -0
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
- package/src/llama.cpp/common/CMakeLists.txt +6 -2
- package/src/llama.cpp/common/arg.cpp +426 -245
- package/src/llama.cpp/common/common.cpp +143 -80
- package/src/llama.cpp/common/common.h +81 -24
- package/src/llama.cpp/common/sampling.cpp +53 -19
- package/src/llama.cpp/common/sampling.h +22 -1
- package/src/llama.cpp/common/speculative.cpp +274 -0
- package/src/llama.cpp/common/speculative.h +28 -0
- package/src/llama.cpp/docs/build.md +101 -148
- package/src/llama.cpp/examples/CMakeLists.txt +32 -13
- package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +5 -4
- package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
- package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
- package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
- package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
- package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
- package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +11 -2
- package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/infill/infill.cpp +1 -1
- package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +405 -316
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
- package/src/llama.cpp/examples/llava/clip.cpp +262 -66
- package/src/llama.cpp/examples/llava/clip.h +8 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +1 -1
- package/src/llama.cpp/examples/llava/llava.cpp +46 -19
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +1 -1
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
- package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
- package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -1
- package/src/llama.cpp/examples/lookup/lookup.cpp +2 -2
- package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/main/main.cpp +9 -5
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
- package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +4 -4
- package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/run/run.cpp +911 -0
- package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -4
- package/src/llama.cpp/examples/server/CMakeLists.txt +3 -7
- package/src/llama.cpp/examples/server/server.cpp +1758 -886
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
- package/src/llama.cpp/examples/server/utils.hpp +94 -304
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +4 -0
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +3 -0
- package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +16 -15
- package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
- package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +1 -1
- package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/tts/tts.cpp +932 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +46 -34
- package/src/llama.cpp/ggml/include/ggml-backend.h +16 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +7 -49
- package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
- package/src/llama.cpp/ggml/include/ggml.h +106 -24
- package/src/llama.cpp/ggml/src/CMakeLists.txt +73 -24
- package/src/llama.cpp/ggml/src/ggml-alloc.c +0 -1
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +51 -11
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +379 -22
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +3 -7
- package/src/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +5 -2
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +33 -3
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +95 -35
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +288 -213
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
- package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/common.h +19 -22
- package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/mmq.cpp +93 -92
- package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/mmq.h +2 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.c → ggml-cpu-aarch64.cpp} +892 -190
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +2 -24
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +15 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +38 -25
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +552 -399
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +101 -136
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +7 -10
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -6
- package/src/llama.cpp/ggml/src/ggml-impl.h +32 -11
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +13 -9
- package/src/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +131 -64
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +3 -6
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +39 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +14 -7
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +67 -80
- package/src/llama.cpp/ggml/src/ggml-quants.c +0 -9
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +3 -5
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +5 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +13 -10
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +2 -11
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +32 -13
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +80 -61
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +159 -114
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +6 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +4 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +8 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +4 -1
- package/src/llama.cpp/ggml/src/ggml-threading.h +4 -2
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +21 -7
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1718 -399
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +3 -1
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +105 -31
- package/src/llama.cpp/ggml/src/ggml.c +367 -207
- package/src/llama.cpp/include/llama-cpp.h +25 -0
- package/src/llama.cpp/include/llama.h +26 -19
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
- package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
- package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
- package/src/llama.cpp/src/CMakeLists.txt +2 -7
- package/src/llama.cpp/src/llama-grammar.cpp +15 -15
- package/src/llama.cpp/src/llama-grammar.h +2 -5
- package/src/llama.cpp/src/llama-sampling.cpp +35 -90
- package/src/llama.cpp/src/llama-vocab.cpp +6 -1
- package/src/llama.cpp/src/llama.cpp +1748 -640
- package/src/llama.cpp/src/unicode.cpp +62 -51
- package/src/llama.cpp/src/unicode.h +9 -10
- package/src/llama.cpp/tests/CMakeLists.txt +48 -37
- package/src/llama.cpp/tests/test-arg-parser.cpp +2 -2
- package/src/llama.cpp/tests/test-backend-ops.cpp +140 -21
- package/src/llama.cpp/tests/test-chat-template.cpp +50 -4
- package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
- package/src/llama.cpp/tests/test-quantize-fns.cpp +3 -3
- package/src/llama.cpp/tests/test-rope.cpp +61 -20
- package/src/llama.cpp/tests/test-sampling.cpp +2 -2
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
- package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
- package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
- package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
- package/src/llama.cpp/ggml/include/ggml-amx.h +0 -25
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +0 -129
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -19
- package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +0 -107
- package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
|
@@ -6,28 +6,28 @@
|
|
|
6
6
|
#include <clocale>
|
|
7
7
|
#include <cmath>
|
|
8
8
|
#include <cstdio>
|
|
9
|
+
#include <cstdlib>
|
|
9
10
|
#include <cstring>
|
|
10
11
|
#include <ctime>
|
|
11
|
-
#include <cstdlib>
|
|
12
12
|
#include <iterator>
|
|
13
13
|
#include <map>
|
|
14
14
|
#include <numeric>
|
|
15
15
|
#include <regex>
|
|
16
16
|
#include <sstream>
|
|
17
17
|
#include <string>
|
|
18
|
-
#include <vector>
|
|
19
18
|
#include <thread>
|
|
19
|
+
#include <vector>
|
|
20
20
|
|
|
21
|
+
#include "common.h"
|
|
21
22
|
#include "ggml.h"
|
|
22
23
|
#include "llama.h"
|
|
23
|
-
#include "common.h"
|
|
24
24
|
|
|
25
25
|
#ifdef _WIN32
|
|
26
|
-
#define WIN32_LEAN_AND_MEAN
|
|
27
|
-
#ifndef NOMINMAX
|
|
28
|
-
# define NOMINMAX
|
|
29
|
-
#endif
|
|
30
|
-
#include <windows.h>
|
|
26
|
+
# define WIN32_LEAN_AND_MEAN
|
|
27
|
+
# ifndef NOMINMAX
|
|
28
|
+
# define NOMINMAX
|
|
29
|
+
# endif
|
|
30
|
+
# include <windows.h>
|
|
31
31
|
#endif
|
|
32
32
|
|
|
33
33
|
// utils
|
|
@@ -36,8 +36,7 @@ static uint64_t get_time_ns() {
|
|
|
36
36
|
return std::chrono::nanoseconds(clock::now().time_since_epoch()).count();
|
|
37
37
|
}
|
|
38
38
|
|
|
39
|
-
template<class T>
|
|
40
|
-
static std::string join(const std::vector<T> & values, const std::string & delim) {
|
|
39
|
+
template <class T> static std::string join(const std::vector<T> & values, const std::string & delim) {
|
|
41
40
|
std::ostringstream str;
|
|
42
41
|
for (size_t i = 0; i < values.size(); i++) {
|
|
43
42
|
str << values[i];
|
|
@@ -48,38 +47,35 @@ static std::string join(const std::vector<T> & values, const std::string & delim
|
|
|
48
47
|
return str.str();
|
|
49
48
|
}
|
|
50
49
|
|
|
51
|
-
template<typename T, typename F>
|
|
52
|
-
static std::vector<std::string> transform_to_str(const std::vector<T> & values, F f) {
|
|
50
|
+
template <typename T, typename F> static std::vector<std::string> transform_to_str(const std::vector<T> & values, F f) {
|
|
53
51
|
std::vector<std::string> str_values;
|
|
54
52
|
std::transform(values.begin(), values.end(), std::back_inserter(str_values), f);
|
|
55
53
|
return str_values;
|
|
56
54
|
}
|
|
57
55
|
|
|
58
|
-
template<typename T>
|
|
59
|
-
static T avg(const std::vector<T> & v) {
|
|
56
|
+
template <typename T> static T avg(const std::vector<T> & v) {
|
|
60
57
|
if (v.empty()) {
|
|
61
58
|
return 0;
|
|
62
59
|
}
|
|
63
60
|
T sum = std::accumulate(v.begin(), v.end(), T(0));
|
|
64
|
-
return sum / (T)v.size();
|
|
61
|
+
return sum / (T) v.size();
|
|
65
62
|
}
|
|
66
63
|
|
|
67
|
-
template<typename T>
|
|
68
|
-
static T stdev(const std::vector<T> & v) {
|
|
64
|
+
template <typename T> static T stdev(const std::vector<T> & v) {
|
|
69
65
|
if (v.size() <= 1) {
|
|
70
66
|
return 0;
|
|
71
67
|
}
|
|
72
|
-
T mean = avg(v);
|
|
68
|
+
T mean = avg(v);
|
|
73
69
|
T sq_sum = std::inner_product(v.begin(), v.end(), v.begin(), T(0));
|
|
74
|
-
T stdev = std::sqrt(sq_sum / (T)(v.size() - 1) - mean * mean * (T)v.size() / (T)(v.size() - 1));
|
|
70
|
+
T stdev = std::sqrt(sq_sum / (T) (v.size() - 1) - mean * mean * (T) v.size() / (T) (v.size() - 1));
|
|
75
71
|
return stdev;
|
|
76
72
|
}
|
|
77
73
|
|
|
78
74
|
static std::string get_cpu_info() {
|
|
79
75
|
std::vector<std::string> cpu_list;
|
|
80
76
|
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
|
|
81
|
-
auto * dev = ggml_backend_dev_get(i);
|
|
82
|
-
auto dev_type = ggml_backend_dev_type(dev);
|
|
77
|
+
auto * dev = ggml_backend_dev_get(i);
|
|
78
|
+
auto dev_type = ggml_backend_dev_type(dev);
|
|
83
79
|
if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU || dev_type == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
|
|
84
80
|
cpu_list.push_back(ggml_backend_dev_description(dev));
|
|
85
81
|
}
|
|
@@ -90,8 +86,8 @@ static std::string get_cpu_info() {
|
|
|
90
86
|
static std::string get_gpu_info() {
|
|
91
87
|
std::vector<std::string> gpu_list;
|
|
92
88
|
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
|
|
93
|
-
auto * dev = ggml_backend_dev_get(i);
|
|
94
|
-
auto dev_type = ggml_backend_dev_type(dev);
|
|
89
|
+
auto * dev = ggml_backend_dev_get(i);
|
|
90
|
+
auto dev_type = ggml_backend_dev_type(dev);
|
|
95
91
|
if (dev_type == GGML_BACKEND_DEVICE_TYPE_GPU) {
|
|
96
92
|
gpu_list.push_back(ggml_backend_dev_description(dev));
|
|
97
93
|
}
|
|
@@ -100,17 +96,24 @@ static std::string get_gpu_info() {
|
|
|
100
96
|
}
|
|
101
97
|
|
|
102
98
|
// command line params
|
|
103
|
-
enum output_formats {NONE, CSV, JSON, JSONL, MARKDOWN, SQL};
|
|
99
|
+
enum output_formats { NONE, CSV, JSON, JSONL, MARKDOWN, SQL };
|
|
104
100
|
|
|
105
101
|
static const char * output_format_str(output_formats format) {
|
|
106
102
|
switch (format) {
|
|
107
|
-
case NONE:
|
|
108
|
-
|
|
109
|
-
case
|
|
110
|
-
|
|
111
|
-
case
|
|
112
|
-
|
|
113
|
-
|
|
103
|
+
case NONE:
|
|
104
|
+
return "none";
|
|
105
|
+
case CSV:
|
|
106
|
+
return "csv";
|
|
107
|
+
case JSON:
|
|
108
|
+
return "json";
|
|
109
|
+
case JSONL:
|
|
110
|
+
return "jsonl";
|
|
111
|
+
case MARKDOWN:
|
|
112
|
+
return "md";
|
|
113
|
+
case SQL:
|
|
114
|
+
return "sql";
|
|
115
|
+
default:
|
|
116
|
+
GGML_ABORT("invalid output format");
|
|
114
117
|
}
|
|
115
118
|
}
|
|
116
119
|
|
|
@@ -135,10 +138,14 @@ static bool output_format_from_str(const std::string & s, output_formats & forma
|
|
|
135
138
|
|
|
136
139
|
static const char * split_mode_str(llama_split_mode mode) {
|
|
137
140
|
switch (mode) {
|
|
138
|
-
case LLAMA_SPLIT_MODE_NONE:
|
|
139
|
-
|
|
140
|
-
case
|
|
141
|
-
|
|
141
|
+
case LLAMA_SPLIT_MODE_NONE:
|
|
142
|
+
return "none";
|
|
143
|
+
case LLAMA_SPLIT_MODE_LAYER:
|
|
144
|
+
return "layer";
|
|
145
|
+
case LLAMA_SPLIT_MODE_ROW:
|
|
146
|
+
return "row";
|
|
147
|
+
default:
|
|
148
|
+
GGML_ABORT("invalid split mode");
|
|
142
149
|
}
|
|
143
150
|
}
|
|
144
151
|
|
|
@@ -149,59 +156,59 @@ static std::string pair_str(const std::pair<int, int> & p) {
|
|
|
149
156
|
}
|
|
150
157
|
|
|
151
158
|
struct cmd_params {
|
|
152
|
-
std::vector<std::string>
|
|
153
|
-
std::vector<int>
|
|
154
|
-
std::vector<int>
|
|
159
|
+
std::vector<std::string> model;
|
|
160
|
+
std::vector<int> n_prompt;
|
|
161
|
+
std::vector<int> n_gen;
|
|
155
162
|
std::vector<std::pair<int, int>> n_pg;
|
|
156
|
-
std::vector<int>
|
|
157
|
-
std::vector<int>
|
|
158
|
-
std::vector<ggml_type>
|
|
159
|
-
std::vector<ggml_type>
|
|
160
|
-
std::vector<int>
|
|
161
|
-
std::vector<std::string>
|
|
162
|
-
std::vector<bool>
|
|
163
|
-
std::vector<int>
|
|
164
|
-
std::vector<int>
|
|
165
|
-
std::vector<std::string>
|
|
166
|
-
std::vector<llama_split_mode>
|
|
167
|
-
std::vector<int>
|
|
168
|
-
std::vector<bool>
|
|
169
|
-
std::vector<bool>
|
|
170
|
-
std::vector<std::vector<float>>
|
|
171
|
-
std::vector<bool>
|
|
172
|
-
std::vector<bool>
|
|
173
|
-
ggml_numa_strategy
|
|
174
|
-
int
|
|
175
|
-
ggml_sched_priority
|
|
176
|
-
int
|
|
177
|
-
bool
|
|
178
|
-
bool
|
|
179
|
-
output_formats
|
|
180
|
-
output_formats
|
|
163
|
+
std::vector<int> n_batch;
|
|
164
|
+
std::vector<int> n_ubatch;
|
|
165
|
+
std::vector<ggml_type> type_k;
|
|
166
|
+
std::vector<ggml_type> type_v;
|
|
167
|
+
std::vector<int> n_threads;
|
|
168
|
+
std::vector<std::string> cpu_mask;
|
|
169
|
+
std::vector<bool> cpu_strict;
|
|
170
|
+
std::vector<int> poll;
|
|
171
|
+
std::vector<int> n_gpu_layers;
|
|
172
|
+
std::vector<std::string> rpc_servers;
|
|
173
|
+
std::vector<llama_split_mode> split_mode;
|
|
174
|
+
std::vector<int> main_gpu;
|
|
175
|
+
std::vector<bool> no_kv_offload;
|
|
176
|
+
std::vector<bool> flash_attn;
|
|
177
|
+
std::vector<std::vector<float>> tensor_split;
|
|
178
|
+
std::vector<bool> use_mmap;
|
|
179
|
+
std::vector<bool> embeddings;
|
|
180
|
+
ggml_numa_strategy numa;
|
|
181
|
+
int reps;
|
|
182
|
+
ggml_sched_priority prio;
|
|
183
|
+
int delay;
|
|
184
|
+
bool verbose;
|
|
185
|
+
bool progress;
|
|
186
|
+
output_formats output_format;
|
|
187
|
+
output_formats output_format_stderr;
|
|
181
188
|
};
|
|
182
189
|
|
|
183
190
|
static const cmd_params cmd_params_defaults = {
|
|
184
|
-
/* model */ {"models/7B/ggml-model-q4_0.gguf"},
|
|
185
|
-
/* n_prompt */ {512},
|
|
186
|
-
/* n_gen */ {128},
|
|
191
|
+
/* model */ { "models/7B/ggml-model-q4_0.gguf" },
|
|
192
|
+
/* n_prompt */ { 512 },
|
|
193
|
+
/* n_gen */ { 128 },
|
|
187
194
|
/* n_pg */ {},
|
|
188
|
-
/* n_batch */ {2048},
|
|
189
|
-
/* n_ubatch */ {512},
|
|
190
|
-
/* type_k */ {GGML_TYPE_F16},
|
|
191
|
-
/* type_v */ {GGML_TYPE_F16},
|
|
192
|
-
/* n_threads */ {cpu_get_num_math()},
|
|
193
|
-
/* cpu_mask */ {"0x0"},
|
|
194
|
-
/* cpu_strict */ {false},
|
|
195
|
-
/* poll */ {50},
|
|
196
|
-
/* n_gpu_layers */ {99},
|
|
197
|
-
/* rpc_servers */ {""},
|
|
198
|
-
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
|
|
199
|
-
/* main_gpu */ {0},
|
|
200
|
-
/* no_kv_offload */ {false},
|
|
201
|
-
/* flash_attn */ {false},
|
|
202
|
-
/* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
|
|
203
|
-
/* use_mmap */ {true},
|
|
204
|
-
/* embeddings */ {false},
|
|
195
|
+
/* n_batch */ { 2048 },
|
|
196
|
+
/* n_ubatch */ { 512 },
|
|
197
|
+
/* type_k */ { GGML_TYPE_F16 },
|
|
198
|
+
/* type_v */ { GGML_TYPE_F16 },
|
|
199
|
+
/* n_threads */ { cpu_get_num_math() },
|
|
200
|
+
/* cpu_mask */ { "0x0" },
|
|
201
|
+
/* cpu_strict */ { false },
|
|
202
|
+
/* poll */ { 50 },
|
|
203
|
+
/* n_gpu_layers */ { 99 },
|
|
204
|
+
/* rpc_servers */ { "" },
|
|
205
|
+
/* split_mode */ { LLAMA_SPLIT_MODE_LAYER },
|
|
206
|
+
/* main_gpu */ { 0 },
|
|
207
|
+
/* no_kv_offload */ { false },
|
|
208
|
+
/* flash_attn */ { false },
|
|
209
|
+
/* tensor_split */ { std::vector<float>(llama_max_devices(), 0.0f) },
|
|
210
|
+
/* use_mmap */ { true },
|
|
211
|
+
/* embeddings */ { false },
|
|
205
212
|
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
|
|
206
213
|
/* reps */ 5,
|
|
207
214
|
/* prio */ GGML_SCHED_PRIO_NORMAL,
|
|
@@ -218,38 +225,59 @@ static void print_usage(int /* argc */, char ** argv) {
|
|
|
218
225
|
printf("options:\n");
|
|
219
226
|
printf(" -h, --help\n");
|
|
220
227
|
printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
|
|
221
|
-
printf(" -p, --n-prompt <n> (default: %s)\n",
|
|
228
|
+
printf(" -p, --n-prompt <n> (default: %s)\n",
|
|
229
|
+
join(cmd_params_defaults.n_prompt, ",").c_str());
|
|
222
230
|
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
|
|
223
|
-
printf(" -pg <pp,tg> (default: %s)\n",
|
|
224
|
-
|
|
225
|
-
printf(" -
|
|
226
|
-
|
|
227
|
-
printf(" -
|
|
228
|
-
|
|
229
|
-
printf(" -
|
|
230
|
-
|
|
231
|
+
printf(" -pg <pp,tg> (default: %s)\n",
|
|
232
|
+
join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
|
|
233
|
+
printf(" -b, --batch-size <n> (default: %s)\n",
|
|
234
|
+
join(cmd_params_defaults.n_batch, ",").c_str());
|
|
235
|
+
printf(" -ub, --ubatch-size <n> (default: %s)\n",
|
|
236
|
+
join(cmd_params_defaults.n_ubatch, ",").c_str());
|
|
237
|
+
printf(" -ctk, --cache-type-k <t> (default: %s)\n",
|
|
238
|
+
join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
|
|
239
|
+
printf(" -ctv, --cache-type-v <t> (default: %s)\n",
|
|
240
|
+
join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
|
|
241
|
+
printf(" -t, --threads <n> (default: %s)\n",
|
|
242
|
+
join(cmd_params_defaults.n_threads, ",").c_str());
|
|
243
|
+
printf(" -C, --cpu-mask <hex,hex> (default: %s)\n",
|
|
244
|
+
join(cmd_params_defaults.cpu_mask, ",").c_str());
|
|
245
|
+
printf(" --cpu-strict <0|1> (default: %s)\n",
|
|
246
|
+
join(cmd_params_defaults.cpu_strict, ",").c_str());
|
|
231
247
|
printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
|
|
232
|
-
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n",
|
|
248
|
+
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n",
|
|
249
|
+
join(cmd_params_defaults.n_gpu_layers, ",").c_str());
|
|
233
250
|
if (llama_supports_rpc()) {
|
|
234
|
-
printf(" -rpc, --rpc <rpc_servers> (default: %s)\n",
|
|
251
|
+
printf(" -rpc, --rpc <rpc_servers> (default: %s)\n",
|
|
252
|
+
join(cmd_params_defaults.rpc_servers, ",").c_str());
|
|
235
253
|
}
|
|
236
|
-
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n",
|
|
237
|
-
|
|
238
|
-
printf(" -
|
|
239
|
-
|
|
240
|
-
printf(" -
|
|
254
|
+
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n",
|
|
255
|
+
join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
|
|
256
|
+
printf(" -mg, --main-gpu <i> (default: %s)\n",
|
|
257
|
+
join(cmd_params_defaults.main_gpu, ",").c_str());
|
|
258
|
+
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n",
|
|
259
|
+
join(cmd_params_defaults.no_kv_offload, ",").c_str());
|
|
260
|
+
printf(" -fa, --flash-attn <0|1> (default: %s)\n",
|
|
261
|
+
join(cmd_params_defaults.flash_attn, ",").c_str());
|
|
262
|
+
printf(" -mmp, --mmap <0|1> (default: %s)\n",
|
|
263
|
+
join(cmd_params_defaults.use_mmap, ",").c_str());
|
|
241
264
|
printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
|
|
242
|
-
printf(" -embd, --embeddings <0|1> (default: %s)\n",
|
|
265
|
+
printf(" -embd, --embeddings <0|1> (default: %s)\n",
|
|
266
|
+
join(cmd_params_defaults.embeddings, ",").c_str());
|
|
243
267
|
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
|
|
244
268
|
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
|
|
245
269
|
printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.prio);
|
|
246
270
|
printf(" --delay <0...N> (seconds) (default: %d)\n", cmd_params_defaults.delay);
|
|
247
|
-
printf(" -o, --output <csv|json|jsonl|md|sql> (default: %s)\n",
|
|
248
|
-
|
|
271
|
+
printf(" -o, --output <csv|json|jsonl|md|sql> (default: %s)\n",
|
|
272
|
+
output_format_str(cmd_params_defaults.output_format));
|
|
273
|
+
printf(" -oe, --output-err <csv|json|jsonl|md|sql> (default: %s)\n",
|
|
274
|
+
output_format_str(cmd_params_defaults.output_format_stderr));
|
|
249
275
|
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
|
|
250
276
|
printf(" --progress (default: %s)\n", cmd_params_defaults.progress ? "1" : "0");
|
|
251
277
|
printf("\n");
|
|
252
|
-
printf(
|
|
278
|
+
printf(
|
|
279
|
+
"Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter "
|
|
280
|
+
"multiple times.\n");
|
|
253
281
|
}
|
|
254
282
|
|
|
255
283
|
static ggml_type ggml_type_from_name(const std::string & s) {
|
|
@@ -281,22 +309,21 @@ static ggml_type ggml_type_from_name(const std::string & s) {
|
|
|
281
309
|
return GGML_TYPE_COUNT;
|
|
282
310
|
}
|
|
283
311
|
|
|
284
|
-
|
|
285
312
|
static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|
286
|
-
cmd_params
|
|
287
|
-
std::string
|
|
288
|
-
bool
|
|
289
|
-
const std::string arg_prefix
|
|
290
|
-
const char
|
|
291
|
-
|
|
292
|
-
params.verbose
|
|
293
|
-
params.output_format
|
|
313
|
+
cmd_params params;
|
|
314
|
+
std::string arg;
|
|
315
|
+
bool invalid_param = false;
|
|
316
|
+
const std::string arg_prefix = "--";
|
|
317
|
+
const char split_delim = ',';
|
|
318
|
+
|
|
319
|
+
params.verbose = cmd_params_defaults.verbose;
|
|
320
|
+
params.output_format = cmd_params_defaults.output_format;
|
|
294
321
|
params.output_format_stderr = cmd_params_defaults.output_format_stderr;
|
|
295
|
-
params.reps
|
|
296
|
-
params.numa
|
|
297
|
-
params.prio
|
|
298
|
-
params.delay
|
|
299
|
-
params.progress
|
|
322
|
+
params.reps = cmd_params_defaults.reps;
|
|
323
|
+
params.numa = cmd_params_defaults.numa;
|
|
324
|
+
params.prio = cmd_params_defaults.prio;
|
|
325
|
+
params.delay = cmd_params_defaults.delay;
|
|
326
|
+
params.progress = cmd_params_defaults.progress;
|
|
300
327
|
|
|
301
328
|
for (int i = 1; i < argc; i++) {
|
|
302
329
|
arg = argv[i];
|
|
@@ -338,7 +365,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|
|
338
365
|
invalid_param = true;
|
|
339
366
|
break;
|
|
340
367
|
}
|
|
341
|
-
params.n_pg.push_back({std::stoi(p[0]), std::stoi(p[1])});
|
|
368
|
+
params.n_pg.push_back({ std::stoi(p[0]), std::stoi(p[1]) });
|
|
342
369
|
} else if (arg == "-b" || arg == "--batch-size") {
|
|
343
370
|
if (++i >= argc) {
|
|
344
371
|
invalid_param = true;
|
|
@@ -358,7 +385,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|
|
358
385
|
invalid_param = true;
|
|
359
386
|
break;
|
|
360
387
|
}
|
|
361
|
-
auto p = string_split<std::string>(argv[i], split_delim);
|
|
388
|
+
auto p = string_split<std::string>(argv[i], split_delim);
|
|
362
389
|
std::vector<ggml_type> types;
|
|
363
390
|
for (const auto & t : p) {
|
|
364
391
|
ggml_type gt = ggml_type_from_name(t);
|
|
@@ -377,7 +404,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|
|
377
404
|
invalid_param = true;
|
|
378
405
|
break;
|
|
379
406
|
}
|
|
380
|
-
auto p = string_split<std::string>(argv[i], split_delim);
|
|
407
|
+
auto p = string_split<std::string>(argv[i], split_delim);
|
|
381
408
|
std::vector<ggml_type> types;
|
|
382
409
|
for (const auto & t : p) {
|
|
383
410
|
ggml_type gt = ggml_type_from_name(t);
|
|
@@ -437,7 +464,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|
|
437
464
|
invalid_param = true;
|
|
438
465
|
break;
|
|
439
466
|
}
|
|
440
|
-
auto p = string_split<std::string>(argv[i], split_delim);
|
|
467
|
+
auto p = string_split<std::string>(argv[i], split_delim);
|
|
441
468
|
std::vector<llama_split_mode> modes;
|
|
442
469
|
for (const auto & m : p) {
|
|
443
470
|
llama_split_mode mode;
|
|
@@ -476,10 +503,16 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|
|
476
503
|
break;
|
|
477
504
|
} else {
|
|
478
505
|
std::string value(argv[i]);
|
|
479
|
-
/**/ if (value == "distribute" || value == ""
|
|
480
|
-
|
|
481
|
-
else if (value == "
|
|
482
|
-
|
|
506
|
+
/**/ if (value == "distribute" || value == "") {
|
|
507
|
+
params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE;
|
|
508
|
+
} else if (value == "isolate") {
|
|
509
|
+
params.numa = GGML_NUMA_STRATEGY_ISOLATE;
|
|
510
|
+
} else if (value == "numactl") {
|
|
511
|
+
params.numa = GGML_NUMA_STRATEGY_NUMACTL;
|
|
512
|
+
} else {
|
|
513
|
+
invalid_param = true;
|
|
514
|
+
break;
|
|
515
|
+
}
|
|
483
516
|
}
|
|
484
517
|
} else if (arg == "-fa" || arg == "--flash-attn") {
|
|
485
518
|
if (++i >= argc) {
|
|
@@ -509,9 +542,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|
|
509
542
|
}
|
|
510
543
|
for (auto ts : string_split<std::string>(argv[i], split_delim)) {
|
|
511
544
|
// split string by ; and /
|
|
512
|
-
const std::regex
|
|
513
|
-
std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1};
|
|
514
|
-
std::vector<std::string>
|
|
545
|
+
const std::regex regex{ R"([;/]+)" };
|
|
546
|
+
std::sregex_token_iterator it{ ts.begin(), ts.end(), regex, -1 };
|
|
547
|
+
std::vector<std::string> split_arg{ it, {} };
|
|
515
548
|
GGML_ASSERT(split_arg.size() <= llama_max_devices());
|
|
516
549
|
|
|
517
550
|
std::vector<float> tensor_split(llama_max_devices());
|
|
@@ -570,52 +603,94 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|
|
570
603
|
}
|
|
571
604
|
|
|
572
605
|
// set defaults
|
|
573
|
-
if (params.model.empty())
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
if (params.
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
if (params.
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
if (params.
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
if (params.
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
if (params.
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
if (params.
|
|
592
|
-
|
|
593
|
-
|
|
606
|
+
if (params.model.empty()) {
|
|
607
|
+
params.model = cmd_params_defaults.model;
|
|
608
|
+
}
|
|
609
|
+
if (params.n_prompt.empty()) {
|
|
610
|
+
params.n_prompt = cmd_params_defaults.n_prompt;
|
|
611
|
+
}
|
|
612
|
+
if (params.n_gen.empty()) {
|
|
613
|
+
params.n_gen = cmd_params_defaults.n_gen;
|
|
614
|
+
}
|
|
615
|
+
if (params.n_pg.empty()) {
|
|
616
|
+
params.n_pg = cmd_params_defaults.n_pg;
|
|
617
|
+
}
|
|
618
|
+
if (params.n_batch.empty()) {
|
|
619
|
+
params.n_batch = cmd_params_defaults.n_batch;
|
|
620
|
+
}
|
|
621
|
+
if (params.n_ubatch.empty()) {
|
|
622
|
+
params.n_ubatch = cmd_params_defaults.n_ubatch;
|
|
623
|
+
}
|
|
624
|
+
if (params.type_k.empty()) {
|
|
625
|
+
params.type_k = cmd_params_defaults.type_k;
|
|
626
|
+
}
|
|
627
|
+
if (params.type_v.empty()) {
|
|
628
|
+
params.type_v = cmd_params_defaults.type_v;
|
|
629
|
+
}
|
|
630
|
+
if (params.n_gpu_layers.empty()) {
|
|
631
|
+
params.n_gpu_layers = cmd_params_defaults.n_gpu_layers;
|
|
632
|
+
}
|
|
633
|
+
if (params.rpc_servers.empty()) {
|
|
634
|
+
params.rpc_servers = cmd_params_defaults.rpc_servers;
|
|
635
|
+
}
|
|
636
|
+
if (params.split_mode.empty()) {
|
|
637
|
+
params.split_mode = cmd_params_defaults.split_mode;
|
|
638
|
+
}
|
|
639
|
+
if (params.main_gpu.empty()) {
|
|
640
|
+
params.main_gpu = cmd_params_defaults.main_gpu;
|
|
641
|
+
}
|
|
642
|
+
if (params.no_kv_offload.empty()) {
|
|
643
|
+
params.no_kv_offload = cmd_params_defaults.no_kv_offload;
|
|
644
|
+
}
|
|
645
|
+
if (params.flash_attn.empty()) {
|
|
646
|
+
params.flash_attn = cmd_params_defaults.flash_attn;
|
|
647
|
+
}
|
|
648
|
+
if (params.tensor_split.empty()) {
|
|
649
|
+
params.tensor_split = cmd_params_defaults.tensor_split;
|
|
650
|
+
}
|
|
651
|
+
if (params.use_mmap.empty()) {
|
|
652
|
+
params.use_mmap = cmd_params_defaults.use_mmap;
|
|
653
|
+
}
|
|
654
|
+
if (params.embeddings.empty()) {
|
|
655
|
+
params.embeddings = cmd_params_defaults.embeddings;
|
|
656
|
+
}
|
|
657
|
+
if (params.n_threads.empty()) {
|
|
658
|
+
params.n_threads = cmd_params_defaults.n_threads;
|
|
659
|
+
}
|
|
660
|
+
if (params.cpu_mask.empty()) {
|
|
661
|
+
params.cpu_mask = cmd_params_defaults.cpu_mask;
|
|
662
|
+
}
|
|
663
|
+
if (params.cpu_strict.empty()) {
|
|
664
|
+
params.cpu_strict = cmd_params_defaults.cpu_strict;
|
|
665
|
+
}
|
|
666
|
+
if (params.poll.empty()) {
|
|
667
|
+
params.poll = cmd_params_defaults.poll;
|
|
668
|
+
}
|
|
594
669
|
|
|
595
670
|
return params;
|
|
596
671
|
}
|
|
597
672
|
|
|
598
673
|
struct cmd_params_instance {
|
|
599
|
-
std::string
|
|
600
|
-
int
|
|
601
|
-
int
|
|
602
|
-
int
|
|
603
|
-
int
|
|
604
|
-
ggml_type
|
|
605
|
-
ggml_type
|
|
606
|
-
int
|
|
607
|
-
std::string
|
|
608
|
-
bool
|
|
609
|
-
int
|
|
610
|
-
int
|
|
611
|
-
std::string
|
|
612
|
-
llama_split_mode
|
|
613
|
-
int
|
|
614
|
-
bool
|
|
615
|
-
bool
|
|
674
|
+
std::string model;
|
|
675
|
+
int n_prompt;
|
|
676
|
+
int n_gen;
|
|
677
|
+
int n_batch;
|
|
678
|
+
int n_ubatch;
|
|
679
|
+
ggml_type type_k;
|
|
680
|
+
ggml_type type_v;
|
|
681
|
+
int n_threads;
|
|
682
|
+
std::string cpu_mask;
|
|
683
|
+
bool cpu_strict;
|
|
684
|
+
int poll;
|
|
685
|
+
int n_gpu_layers;
|
|
686
|
+
std::string rpc_servers;
|
|
687
|
+
llama_split_mode split_mode;
|
|
688
|
+
int main_gpu;
|
|
689
|
+
bool no_kv_offload;
|
|
690
|
+
bool flash_attn;
|
|
616
691
|
std::vector<float> tensor_split;
|
|
617
|
-
bool
|
|
618
|
-
bool
|
|
692
|
+
bool use_mmap;
|
|
693
|
+
bool embeddings;
|
|
619
694
|
|
|
620
695
|
llama_model_params to_llama_mparams() const {
|
|
621
696
|
llama_model_params mparams = llama_model_default_params();
|
|
@@ -624,35 +699,31 @@ struct cmd_params_instance {
|
|
|
624
699
|
if (!rpc_servers.empty()) {
|
|
625
700
|
mparams.rpc_servers = rpc_servers.c_str();
|
|
626
701
|
}
|
|
627
|
-
mparams.split_mode
|
|
628
|
-
mparams.main_gpu
|
|
702
|
+
mparams.split_mode = split_mode;
|
|
703
|
+
mparams.main_gpu = main_gpu;
|
|
629
704
|
mparams.tensor_split = tensor_split.data();
|
|
630
|
-
mparams.use_mmap
|
|
705
|
+
mparams.use_mmap = use_mmap;
|
|
631
706
|
|
|
632
707
|
return mparams;
|
|
633
708
|
}
|
|
634
709
|
|
|
635
710
|
bool equal_mparams(const cmd_params_instance & other) const {
|
|
636
|
-
return model == other.model &&
|
|
637
|
-
|
|
638
|
-
rpc_servers == other.rpc_servers &&
|
|
639
|
-
split_mode == other.split_mode &&
|
|
640
|
-
main_gpu == other.main_gpu &&
|
|
641
|
-
use_mmap == other.use_mmap &&
|
|
711
|
+
return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers == other.rpc_servers &&
|
|
712
|
+
split_mode == other.split_mode && main_gpu == other.main_gpu && use_mmap == other.use_mmap &&
|
|
642
713
|
tensor_split == other.tensor_split;
|
|
643
714
|
}
|
|
644
715
|
|
|
645
716
|
llama_context_params to_llama_cparams() const {
|
|
646
717
|
llama_context_params cparams = llama_context_default_params();
|
|
647
718
|
|
|
648
|
-
cparams.n_ctx
|
|
649
|
-
cparams.n_batch
|
|
650
|
-
cparams.n_ubatch
|
|
651
|
-
cparams.type_k
|
|
652
|
-
cparams.type_v
|
|
719
|
+
cparams.n_ctx = n_prompt + n_gen;
|
|
720
|
+
cparams.n_batch = n_batch;
|
|
721
|
+
cparams.n_ubatch = n_ubatch;
|
|
722
|
+
cparams.type_k = type_k;
|
|
723
|
+
cparams.type_v = type_v;
|
|
653
724
|
cparams.offload_kqv = !no_kv_offload;
|
|
654
|
-
cparams.flash_attn
|
|
655
|
-
cparams.embeddings
|
|
725
|
+
cparams.flash_attn = flash_attn;
|
|
726
|
+
cparams.embeddings = embeddings;
|
|
656
727
|
|
|
657
728
|
return cparams;
|
|
658
729
|
}
|
|
@@ -662,6 +733,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|
|
662
733
|
std::vector<cmd_params_instance> instances;
|
|
663
734
|
|
|
664
735
|
// this ordering minimizes the number of times that each model needs to be reloaded
|
|
736
|
+
// clang-format off
|
|
665
737
|
for (const auto & m : params.model)
|
|
666
738
|
for (const auto & nl : params.n_gpu_layers)
|
|
667
739
|
for (const auto & rpc : params.rpc_servers)
|
|
@@ -767,100 +839,94 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|
|
767
839
|
instances.push_back(instance);
|
|
768
840
|
}
|
|
769
841
|
}
|
|
842
|
+
// clang-format on
|
|
770
843
|
|
|
771
844
|
return instances;
|
|
772
845
|
}
|
|
773
846
|
|
|
774
847
|
struct test {
|
|
775
848
|
static const std::string build_commit;
|
|
776
|
-
static const int
|
|
849
|
+
static const int build_number;
|
|
777
850
|
static const std::string cpu_info;
|
|
778
851
|
static const std::string gpu_info;
|
|
779
|
-
std::string
|
|
780
|
-
std::string
|
|
781
|
-
uint64_t
|
|
782
|
-
uint64_t
|
|
783
|
-
int
|
|
784
|
-
int
|
|
785
|
-
int
|
|
786
|
-
std::string
|
|
787
|
-
bool
|
|
788
|
-
int
|
|
789
|
-
ggml_type
|
|
790
|
-
ggml_type
|
|
791
|
-
int
|
|
792
|
-
llama_split_mode
|
|
793
|
-
int
|
|
794
|
-
bool
|
|
795
|
-
bool
|
|
796
|
-
std::vector<float>
|
|
797
|
-
bool
|
|
798
|
-
bool
|
|
799
|
-
int
|
|
800
|
-
int
|
|
801
|
-
std::string
|
|
802
|
-
std::vector<uint64_t>
|
|
852
|
+
std::string model_filename;
|
|
853
|
+
std::string model_type;
|
|
854
|
+
uint64_t model_size;
|
|
855
|
+
uint64_t model_n_params;
|
|
856
|
+
int n_batch;
|
|
857
|
+
int n_ubatch;
|
|
858
|
+
int n_threads;
|
|
859
|
+
std::string cpu_mask;
|
|
860
|
+
bool cpu_strict;
|
|
861
|
+
int poll;
|
|
862
|
+
ggml_type type_k;
|
|
863
|
+
ggml_type type_v;
|
|
864
|
+
int n_gpu_layers;
|
|
865
|
+
llama_split_mode split_mode;
|
|
866
|
+
int main_gpu;
|
|
867
|
+
bool no_kv_offload;
|
|
868
|
+
bool flash_attn;
|
|
869
|
+
std::vector<float> tensor_split;
|
|
870
|
+
bool use_mmap;
|
|
871
|
+
bool embeddings;
|
|
872
|
+
int n_prompt;
|
|
873
|
+
int n_gen;
|
|
874
|
+
std::string test_time;
|
|
875
|
+
std::vector<uint64_t> samples_ns;
|
|
803
876
|
|
|
804
877
|
test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) {
|
|
805
878
|
model_filename = inst.model;
|
|
806
879
|
char buf[128];
|
|
807
880
|
llama_model_desc(lmodel, buf, sizeof(buf));
|
|
808
|
-
model_type
|
|
809
|
-
model_size
|
|
881
|
+
model_type = buf;
|
|
882
|
+
model_size = llama_model_size(lmodel);
|
|
810
883
|
model_n_params = llama_model_n_params(lmodel);
|
|
811
|
-
n_batch
|
|
812
|
-
n_ubatch
|
|
813
|
-
n_threads
|
|
814
|
-
cpu_mask
|
|
815
|
-
cpu_strict
|
|
816
|
-
poll
|
|
817
|
-
type_k
|
|
818
|
-
type_v
|
|
819
|
-
n_gpu_layers
|
|
820
|
-
split_mode
|
|
821
|
-
main_gpu
|
|
822
|
-
no_kv_offload
|
|
823
|
-
flash_attn
|
|
824
|
-
tensor_split
|
|
825
|
-
use_mmap
|
|
826
|
-
embeddings
|
|
827
|
-
n_prompt
|
|
828
|
-
n_gen
|
|
884
|
+
n_batch = inst.n_batch;
|
|
885
|
+
n_ubatch = inst.n_ubatch;
|
|
886
|
+
n_threads = inst.n_threads;
|
|
887
|
+
cpu_mask = inst.cpu_mask;
|
|
888
|
+
cpu_strict = inst.cpu_strict;
|
|
889
|
+
poll = inst.poll;
|
|
890
|
+
type_k = inst.type_k;
|
|
891
|
+
type_v = inst.type_v;
|
|
892
|
+
n_gpu_layers = inst.n_gpu_layers;
|
|
893
|
+
split_mode = inst.split_mode;
|
|
894
|
+
main_gpu = inst.main_gpu;
|
|
895
|
+
no_kv_offload = inst.no_kv_offload;
|
|
896
|
+
flash_attn = inst.flash_attn;
|
|
897
|
+
tensor_split = inst.tensor_split;
|
|
898
|
+
use_mmap = inst.use_mmap;
|
|
899
|
+
embeddings = inst.embeddings;
|
|
900
|
+
n_prompt = inst.n_prompt;
|
|
901
|
+
n_gen = inst.n_gen;
|
|
829
902
|
// RFC 3339 date-time format
|
|
830
|
-
time_t t
|
|
903
|
+
time_t t = time(NULL);
|
|
831
904
|
std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t));
|
|
832
905
|
test_time = buf;
|
|
833
906
|
|
|
834
907
|
(void) ctx;
|
|
835
908
|
}
|
|
836
909
|
|
|
837
|
-
uint64_t avg_ns() const {
|
|
838
|
-
return ::avg(samples_ns);
|
|
839
|
-
}
|
|
910
|
+
uint64_t avg_ns() const { return ::avg(samples_ns); }
|
|
840
911
|
|
|
841
|
-
uint64_t stdev_ns() const {
|
|
842
|
-
return ::stdev(samples_ns);
|
|
843
|
-
}
|
|
912
|
+
uint64_t stdev_ns() const { return ::stdev(samples_ns); }
|
|
844
913
|
|
|
845
914
|
std::vector<double> get_ts() const {
|
|
846
|
-
int
|
|
915
|
+
int n_tokens = n_prompt + n_gen;
|
|
847
916
|
std::vector<double> ts;
|
|
848
|
-
std::transform(samples_ns.begin(), samples_ns.end(), std::back_inserter(ts),
|
|
917
|
+
std::transform(samples_ns.begin(), samples_ns.end(), std::back_inserter(ts),
|
|
918
|
+
[n_tokens](uint64_t t) { return 1e9 * n_tokens / t; });
|
|
849
919
|
return ts;
|
|
850
920
|
}
|
|
851
921
|
|
|
852
|
-
double avg_ts() const {
|
|
853
|
-
return ::avg(get_ts());
|
|
854
|
-
}
|
|
922
|
+
double avg_ts() const { return ::avg(get_ts()); }
|
|
855
923
|
|
|
856
|
-
double stdev_ts() const {
|
|
857
|
-
return ::stdev(get_ts());
|
|
858
|
-
}
|
|
924
|
+
double stdev_ts() const { return ::stdev(get_ts()); }
|
|
859
925
|
|
|
860
926
|
static std::string get_backend() {
|
|
861
927
|
std::vector<std::string> backends;
|
|
862
928
|
for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
|
|
863
|
-
auto *
|
|
929
|
+
auto * reg = ggml_backend_reg_get(i);
|
|
864
930
|
std::string name = ggml_backend_reg_name(reg);
|
|
865
931
|
if (name != "CPU") {
|
|
866
932
|
backends.push_back(ggml_backend_reg_name(reg));
|
|
@@ -871,36 +937,27 @@ struct test {
|
|
|
871
937
|
|
|
872
938
|
static const std::vector<std::string> & get_fields() {
|
|
873
939
|
static const std::vector<std::string> fields = {
|
|
874
|
-
"build_commit", "build_number",
|
|
875
|
-
"
|
|
876
|
-
"
|
|
877
|
-
"
|
|
878
|
-
"
|
|
879
|
-
"
|
|
880
|
-
"n_gpu_layers", "split_mode",
|
|
881
|
-
"main_gpu", "no_kv_offload", "flash_attn",
|
|
882
|
-
"tensor_split", "use_mmap", "embeddings",
|
|
883
|
-
"n_prompt", "n_gen", "test_time",
|
|
884
|
-
"avg_ns", "stddev_ns",
|
|
885
|
-
"avg_ts", "stddev_ts",
|
|
940
|
+
"build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename",
|
|
941
|
+
"model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads",
|
|
942
|
+
"cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers",
|
|
943
|
+
"split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "use_mmap",
|
|
944
|
+
"embeddings", "n_prompt", "n_gen", "test_time", "avg_ns", "stddev_ns",
|
|
945
|
+
"avg_ts", "stddev_ts",
|
|
886
946
|
};
|
|
887
947
|
return fields;
|
|
888
948
|
}
|
|
889
949
|
|
|
890
|
-
enum field_type {STRING, BOOL, INT, FLOAT};
|
|
950
|
+
enum field_type { STRING, BOOL, INT, FLOAT };
|
|
891
951
|
|
|
892
952
|
static field_type get_field_type(const std::string & field) {
|
|
893
|
-
if (field == "build_number" || field == "n_batch" || field == "n_ubatch" ||
|
|
894
|
-
field == "
|
|
895
|
-
field == "
|
|
896
|
-
field == "
|
|
897
|
-
field == "n_prompt" || field == "n_gen" ||
|
|
898
|
-
field == "avg_ns" || field == "stddev_ns") {
|
|
953
|
+
if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" ||
|
|
954
|
+
field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" ||
|
|
955
|
+
field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "avg_ns" ||
|
|
956
|
+
field == "stddev_ns") {
|
|
899
957
|
return INT;
|
|
900
958
|
}
|
|
901
|
-
if (field == "f16_kv" || field == "no_kv_offload" ||
|
|
902
|
-
field == "
|
|
903
|
-
field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
|
|
959
|
+
if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" ||
|
|
960
|
+
field == "use_mmap" || field == "embeddings") {
|
|
904
961
|
return BOOL;
|
|
905
962
|
}
|
|
906
963
|
if (field == "avg_ts" || field == "stddev_ts") {
|
|
@@ -911,7 +968,7 @@ struct test {
|
|
|
911
968
|
|
|
912
969
|
std::vector<std::string> get_values() const {
|
|
913
970
|
std::string tensor_split_str;
|
|
914
|
-
int
|
|
971
|
+
int max_nonzero = 0;
|
|
915
972
|
for (size_t i = 0; i < llama_max_devices(); i++) {
|
|
916
973
|
if (tensor_split[i] > 0) {
|
|
917
974
|
max_nonzero = i;
|
|
@@ -925,29 +982,47 @@ struct test {
|
|
|
925
982
|
tensor_split_str += "/";
|
|
926
983
|
}
|
|
927
984
|
}
|
|
928
|
-
std::vector<std::string> values = {
|
|
929
|
-
|
|
930
|
-
|
|
931
|
-
|
|
932
|
-
|
|
933
|
-
|
|
934
|
-
|
|
935
|
-
|
|
936
|
-
|
|
937
|
-
|
|
938
|
-
|
|
939
|
-
|
|
940
|
-
|
|
941
|
-
|
|
985
|
+
std::vector<std::string> values = { build_commit,
|
|
986
|
+
std::to_string(build_number),
|
|
987
|
+
cpu_info,
|
|
988
|
+
gpu_info,
|
|
989
|
+
get_backend(),
|
|
990
|
+
model_filename,
|
|
991
|
+
model_type,
|
|
992
|
+
std::to_string(model_size),
|
|
993
|
+
std::to_string(model_n_params),
|
|
994
|
+
std::to_string(n_batch),
|
|
995
|
+
std::to_string(n_ubatch),
|
|
996
|
+
std::to_string(n_threads),
|
|
997
|
+
cpu_mask,
|
|
998
|
+
std::to_string(cpu_strict),
|
|
999
|
+
std::to_string(poll),
|
|
1000
|
+
ggml_type_name(type_k),
|
|
1001
|
+
ggml_type_name(type_v),
|
|
1002
|
+
std::to_string(n_gpu_layers),
|
|
1003
|
+
split_mode_str(split_mode),
|
|
1004
|
+
std::to_string(main_gpu),
|
|
1005
|
+
std::to_string(no_kv_offload),
|
|
1006
|
+
std::to_string(flash_attn),
|
|
1007
|
+
tensor_split_str,
|
|
1008
|
+
std::to_string(use_mmap),
|
|
1009
|
+
std::to_string(embeddings),
|
|
1010
|
+
std::to_string(n_prompt),
|
|
1011
|
+
std::to_string(n_gen),
|
|
1012
|
+
test_time,
|
|
1013
|
+
std::to_string(avg_ns()),
|
|
1014
|
+
std::to_string(stdev_ns()),
|
|
1015
|
+
std::to_string(avg_ts()),
|
|
1016
|
+
std::to_string(stdev_ts()) };
|
|
942
1017
|
return values;
|
|
943
1018
|
}
|
|
944
1019
|
|
|
945
1020
|
std::map<std::string, std::string> get_map() const {
|
|
946
1021
|
std::map<std::string, std::string> map;
|
|
947
|
-
auto
|
|
948
|
-
auto
|
|
949
|
-
std::transform(fields.begin(), fields.end(), values.begin(),
|
|
950
|
-
|
|
1022
|
+
auto fields = get_fields();
|
|
1023
|
+
auto values = get_values();
|
|
1024
|
+
std::transform(fields.begin(), fields.end(), values.begin(), std::inserter(map, map.end()),
|
|
1025
|
+
std::make_pair<const std::string &, const std::string &>);
|
|
951
1026
|
return map;
|
|
952
1027
|
}
|
|
953
1028
|
};
|
|
@@ -961,9 +1036,12 @@ struct printer {
|
|
|
961
1036
|
virtual ~printer() {}
|
|
962
1037
|
|
|
963
1038
|
FILE * fout;
|
|
1039
|
+
|
|
964
1040
|
virtual void print_header(const cmd_params & params) { (void) params; }
|
|
1041
|
+
|
|
965
1042
|
virtual void print_test(const test & t) = 0;
|
|
966
|
-
|
|
1043
|
+
|
|
1044
|
+
virtual void print_footer() {}
|
|
967
1045
|
};
|
|
968
1046
|
|
|
969
1047
|
struct csv_printer : public printer {
|
|
@@ -979,7 +1057,7 @@ struct csv_printer : public printer {
|
|
|
979
1057
|
return escaped;
|
|
980
1058
|
}
|
|
981
1059
|
|
|
982
|
-
void print_header(const cmd_params & params) override
|
|
1060
|
+
void print_header(const cmd_params & params) override {
|
|
983
1061
|
std::vector<std::string> fields = test::get_fields();
|
|
984
1062
|
fprintf(fout, "%s\n", join(fields, ",").c_str());
|
|
985
1063
|
(void) params;
|
|
@@ -992,7 +1070,6 @@ struct csv_printer : public printer {
|
|
|
992
1070
|
}
|
|
993
1071
|
};
|
|
994
1072
|
|
|
995
|
-
|
|
996
1073
|
static std::string escape_json(const std::string & value) {
|
|
997
1074
|
std::string escaped;
|
|
998
1075
|
for (auto c : value) {
|
|
@@ -1000,7 +1077,7 @@ static std::string escape_json(const std::string & value) {
|
|
|
1000
1077
|
escaped += "\\\"";
|
|
1001
1078
|
} else if (c == '\\') {
|
|
1002
1079
|
escaped += "\\\\";
|
|
1003
|
-
} else
|
|
1080
|
+
} else if (c <= 0x1f) {
|
|
1004
1081
|
char buf[8];
|
|
1005
1082
|
snprintf(buf, sizeof(buf), "\\u%04x", c);
|
|
1006
1083
|
escaped += buf;
|
|
@@ -1033,7 +1110,8 @@ struct json_printer : public printer {
|
|
|
1033
1110
|
void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
|
|
1034
1111
|
assert(fields.size() == values.size());
|
|
1035
1112
|
for (size_t i = 0; i < fields.size(); i++) {
|
|
1036
|
-
fprintf(fout, " \"%s\": %s,\n", fields.at(i).c_str(),
|
|
1113
|
+
fprintf(fout, " \"%s\": %s,\n", fields.at(i).c_str(),
|
|
1114
|
+
format_json_value(fields.at(i), values.at(i)).c_str());
|
|
1037
1115
|
}
|
|
1038
1116
|
}
|
|
1039
1117
|
|
|
@@ -1051,12 +1129,9 @@ struct json_printer : public printer {
|
|
|
1051
1129
|
fflush(fout);
|
|
1052
1130
|
}
|
|
1053
1131
|
|
|
1054
|
-
void print_footer() override {
|
|
1055
|
-
fprintf(fout, "\n]\n");
|
|
1056
|
-
}
|
|
1132
|
+
void print_footer() override { fprintf(fout, "\n]\n"); }
|
|
1057
1133
|
};
|
|
1058
1134
|
|
|
1059
|
-
|
|
1060
1135
|
struct jsonl_printer : public printer {
|
|
1061
1136
|
void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
|
|
1062
1137
|
assert(fields.size() == values.size());
|
|
@@ -1116,7 +1191,7 @@ struct markdown_printer : public printer {
|
|
|
1116
1191
|
return 13;
|
|
1117
1192
|
}
|
|
1118
1193
|
|
|
1119
|
-
int width = std::max((int)field.length(), 10);
|
|
1194
|
+
int width = std::max((int) field.length(), 10);
|
|
1120
1195
|
|
|
1121
1196
|
if (test::get_field_type(field) == test::STRING) {
|
|
1122
1197
|
return -width;
|
|
@@ -1230,18 +1305,18 @@ struct markdown_printer : public printer {
|
|
|
1230
1305
|
fprintf(fout, "|");
|
|
1231
1306
|
for (const auto & field : fields) {
|
|
1232
1307
|
std::string value;
|
|
1233
|
-
char
|
|
1308
|
+
char buf[128];
|
|
1234
1309
|
if (field == "model") {
|
|
1235
1310
|
value = t.model_type;
|
|
1236
1311
|
} else if (field == "size") {
|
|
1237
|
-
if (t.model_size < 1024*1024*1024) {
|
|
1312
|
+
if (t.model_size < 1024 * 1024 * 1024) {
|
|
1238
1313
|
snprintf(buf, sizeof(buf), "%.2f MiB", t.model_size / 1024.0 / 1024.0);
|
|
1239
1314
|
} else {
|
|
1240
1315
|
snprintf(buf, sizeof(buf), "%.2f GiB", t.model_size / 1024.0 / 1024.0 / 1024.0);
|
|
1241
1316
|
}
|
|
1242
1317
|
value = buf;
|
|
1243
1318
|
} else if (field == "params") {
|
|
1244
|
-
if (t.model_n_params < 1000*1000*1000) {
|
|
1319
|
+
if (t.model_n_params < 1000 * 1000 * 1000) {
|
|
1245
1320
|
snprintf(buf, sizeof(buf), "%.2f M", t.model_n_params / 1e6);
|
|
1246
1321
|
} else {
|
|
1247
1322
|
snprintf(buf, sizeof(buf), "%.2f B", t.model_n_params / 1e9);
|
|
@@ -1303,7 +1378,8 @@ struct sql_printer : public printer {
|
|
|
1303
1378
|
std::vector<std::string> fields = test::get_fields();
|
|
1304
1379
|
fprintf(fout, "CREATE TABLE IF NOT EXISTS test (\n");
|
|
1305
1380
|
for (size_t i = 0; i < fields.size(); i++) {
|
|
1306
|
-
fprintf(fout, " %s %s%s\n", fields.at(i).c_str(), get_sql_field_type(fields.at(i)).c_str(),
|
|
1381
|
+
fprintf(fout, " %s %s%s\n", fields.at(i).c_str(), get_sql_field_type(fields.at(i)).c_str(),
|
|
1382
|
+
i < fields.size() - 1 ? "," : "");
|
|
1307
1383
|
}
|
|
1308
1384
|
fprintf(fout, ");\n");
|
|
1309
1385
|
fprintf(fout, "\n");
|
|
@@ -1324,8 +1400,8 @@ struct sql_printer : public printer {
|
|
|
1324
1400
|
static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_threads) {
|
|
1325
1401
|
llama_set_n_threads(ctx, n_threads, n_threads);
|
|
1326
1402
|
|
|
1327
|
-
const llama_model * model
|
|
1328
|
-
const int32_t
|
|
1403
|
+
const llama_model * model = llama_get_model(ctx);
|
|
1404
|
+
const int32_t n_vocab = llama_n_vocab(model);
|
|
1329
1405
|
|
|
1330
1406
|
std::vector<llama_token> tokens(n_batch);
|
|
1331
1407
|
|
|
@@ -1333,7 +1409,7 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_th
|
|
|
1333
1409
|
|
|
1334
1410
|
while (n_processed < n_prompt) {
|
|
1335
1411
|
int n_tokens = std::min(n_prompt - n_processed, n_batch);
|
|
1336
|
-
tokens[0]
|
|
1412
|
+
tokens[0] = n_processed == 0 && llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;
|
|
1337
1413
|
for (int i = 1; i < n_tokens; i++) {
|
|
1338
1414
|
tokens[i] = std::rand() % n_vocab;
|
|
1339
1415
|
}
|
|
@@ -1347,8 +1423,8 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_th
|
|
|
1347
1423
|
static void test_gen(llama_context * ctx, int n_gen, int n_threads) {
|
|
1348
1424
|
llama_set_n_threads(ctx, n_threads, n_threads);
|
|
1349
1425
|
|
|
1350
|
-
const llama_model * model
|
|
1351
|
-
const int32_t
|
|
1426
|
+
const llama_model * model = llama_get_model(ctx);
|
|
1427
|
+
const int32_t n_vocab = llama_n_vocab(model);
|
|
1352
1428
|
|
|
1353
1429
|
llama_token token = llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;
|
|
1354
1430
|
|
|
@@ -1401,6 +1477,17 @@ int main(int argc, char ** argv) {
|
|
|
1401
1477
|
|
|
1402
1478
|
cmd_params params = parse_cmd_params(argc, argv);
|
|
1403
1479
|
|
|
1480
|
+
// initialize backends
|
|
1481
|
+
ggml_backend_load_all();
|
|
1482
|
+
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
|
1483
|
+
if (!cpu_dev) {
|
|
1484
|
+
fprintf(stderr, "%s: error: CPU backend is not loaded\n", __func__);
|
|
1485
|
+
return 1;
|
|
1486
|
+
}
|
|
1487
|
+
auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
|
|
1488
|
+
auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_new");
|
|
1489
|
+
auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_free");
|
|
1490
|
+
|
|
1404
1491
|
// initialize llama.cpp
|
|
1405
1492
|
if (!params.verbose) {
|
|
1406
1493
|
llama_log_set(llama_null_log_callback, NULL);
|
|
@@ -1411,7 +1498,7 @@ int main(int argc, char ** argv) {
|
|
|
1411
1498
|
set_process_priority(params.prio);
|
|
1412
1499
|
|
|
1413
1500
|
// initialize printer
|
|
1414
|
-
std::unique_ptr<printer> p
|
|
1501
|
+
std::unique_ptr<printer> p = create_printer(params.output_format);
|
|
1415
1502
|
std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr);
|
|
1416
1503
|
|
|
1417
1504
|
if (p) {
|
|
@@ -1426,15 +1513,15 @@ int main(int argc, char ** argv) {
|
|
|
1426
1513
|
|
|
1427
1514
|
std::vector<cmd_params_instance> params_instances = get_cmd_params_instances(params);
|
|
1428
1515
|
|
|
1429
|
-
llama_model *
|
|
1516
|
+
llama_model * lmodel = nullptr;
|
|
1430
1517
|
const cmd_params_instance * prev_inst = nullptr;
|
|
1431
1518
|
|
|
1432
|
-
int
|
|
1519
|
+
int params_idx = 0;
|
|
1433
1520
|
auto params_count = params_instances.size();
|
|
1434
1521
|
for (const auto & inst : params_instances) {
|
|
1435
|
-
params_idx
|
|
1522
|
+
params_idx++;
|
|
1436
1523
|
if (params.progress) {
|
|
1437
|
-
fprintf(stderr, "llama-bench: benchmark %d/%
|
|
1524
|
+
fprintf(stderr, "llama-bench: benchmark %d/%zu: starting\n", params_idx, params_count);
|
|
1438
1525
|
}
|
|
1439
1526
|
// keep the same model between tests when possible
|
|
1440
1527
|
if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
|
|
@@ -1475,7 +1562,7 @@ int main(int argc, char ** argv) {
|
|
|
1475
1562
|
tpp.poll = t.poll;
|
|
1476
1563
|
tpp.prio = params.prio;
|
|
1477
1564
|
|
|
1478
|
-
struct ggml_threadpool* threadpool =
|
|
1565
|
+
struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
|
|
1479
1566
|
if (!threadpool) {
|
|
1480
1567
|
fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
|
|
1481
1568
|
exit(1);
|
|
@@ -1486,14 +1573,14 @@ int main(int argc, char ** argv) {
|
|
|
1486
1573
|
// warmup run
|
|
1487
1574
|
if (t.n_prompt > 0) {
|
|
1488
1575
|
if (params.progress) {
|
|
1489
|
-
fprintf(stderr, "llama-bench: benchmark %d/%
|
|
1576
|
+
fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup prompt run\n", params_idx, params_count);
|
|
1490
1577
|
}
|
|
1491
1578
|
//test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
|
|
1492
1579
|
test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
|
|
1493
1580
|
}
|
|
1494
1581
|
if (t.n_gen > 0) {
|
|
1495
1582
|
if (params.progress) {
|
|
1496
|
-
fprintf(stderr, "llama-bench: benchmark %d/%
|
|
1583
|
+
fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup generation run\n", params_idx, params_count);
|
|
1497
1584
|
}
|
|
1498
1585
|
test_gen(ctx, 1, t.n_threads);
|
|
1499
1586
|
}
|
|
@@ -1505,13 +1592,15 @@ int main(int argc, char ** argv) {
|
|
|
1505
1592
|
|
|
1506
1593
|
if (t.n_prompt > 0) {
|
|
1507
1594
|
if (params.progress) {
|
|
1508
|
-
fprintf(stderr, "llama-bench: benchmark %d/%
|
|
1595
|
+
fprintf(stderr, "llama-bench: benchmark %d/%zu: prompt run %d/%d\n", params_idx, params_count,
|
|
1596
|
+
i + 1, params.reps);
|
|
1509
1597
|
}
|
|
1510
1598
|
test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
|
|
1511
1599
|
}
|
|
1512
1600
|
if (t.n_gen > 0) {
|
|
1513
1601
|
if (params.progress) {
|
|
1514
|
-
fprintf(stderr, "llama-bench: benchmark %d/%
|
|
1602
|
+
fprintf(stderr, "llama-bench: benchmark %d/%zu: generation run %d/%d\n", params_idx, params_count,
|
|
1603
|
+
i + 1, params.reps);
|
|
1515
1604
|
}
|
|
1516
1605
|
test_gen(ctx, t.n_gen, t.n_threads);
|
|
1517
1606
|
}
|
|
@@ -1534,7 +1623,7 @@ int main(int argc, char ** argv) {
|
|
|
1534
1623
|
|
|
1535
1624
|
llama_free(ctx);
|
|
1536
1625
|
|
|
1537
|
-
|
|
1626
|
+
ggml_threadpool_free_fn(threadpool);
|
|
1538
1627
|
}
|
|
1539
1628
|
|
|
1540
1629
|
llama_free_model(lmodel);
|