@fugood/llama.node 0.3.0 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -10
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +6 -4
- package/src/LlamaCompletionWorker.cpp +6 -6
- package/src/LlamaContext.cpp +7 -9
- package/src/common.hpp +2 -1
- package/src/llama.cpp/.github/workflows/build.yml +98 -24
- package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
- package/src/llama.cpp/.github/workflows/docker.yml +43 -34
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
- package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +7 -0
- package/src/llama.cpp/CMakeLists.txt +20 -8
- package/src/llama.cpp/common/CMakeLists.txt +12 -10
- package/src/llama.cpp/common/arg.cpp +2006 -0
- package/src/llama.cpp/common/arg.h +77 -0
- package/src/llama.cpp/common/common.cpp +496 -1632
- package/src/llama.cpp/common/common.h +161 -63
- package/src/llama.cpp/common/console.cpp +3 -0
- package/src/llama.cpp/common/log.cpp +401 -0
- package/src/llama.cpp/common/log.h +66 -698
- package/src/llama.cpp/common/ngram-cache.cpp +3 -0
- package/src/llama.cpp/common/sampling.cpp +348 -350
- package/src/llama.cpp/common/sampling.h +62 -139
- package/src/llama.cpp/common/stb_image.h +5990 -6398
- package/src/llama.cpp/common/train.cpp +2 -0
- package/src/llama.cpp/docs/build.md +36 -1
- package/src/llama.cpp/examples/CMakeLists.txt +0 -1
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1 -2
- package/src/llama.cpp/examples/batched/batched.cpp +39 -55
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +34 -44
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +15 -15
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
- package/src/llama.cpp/examples/embedding/embedding.cpp +143 -87
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +33 -33
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +36 -35
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +34 -27
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +59 -62
- package/src/llama.cpp/examples/infill/infill.cpp +117 -132
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +265 -58
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +29 -22
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +685 -150
- package/src/llama.cpp/examples/llava/clip.h +11 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +47 -58
- package/src/llama.cpp/examples/llava/llava.cpp +110 -24
- package/src/llama.cpp/examples/llava/llava.h +2 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
- package/src/llama.cpp/examples/llava/requirements.txt +1 -0
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +42 -43
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +10 -8
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +23 -22
- package/src/llama.cpp/examples/lookup/lookup.cpp +40 -43
- package/src/llama.cpp/examples/main/main.cpp +210 -262
- package/src/llama.cpp/examples/parallel/parallel.cpp +49 -49
- package/src/llama.cpp/examples/passkey/passkey.cpp +42 -50
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +187 -200
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -3
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +49 -44
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +24 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +32 -35
- package/src/llama.cpp/examples/server/CMakeLists.txt +3 -5
- package/src/llama.cpp/examples/server/server.cpp +1027 -1073
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
- package/src/llama.cpp/examples/server/utils.hpp +107 -105
- package/src/llama.cpp/examples/simple/simple.cpp +35 -41
- package/src/llama.cpp/examples/speculative/speculative.cpp +129 -103
- package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +25 -27
- package/src/llama.cpp/ggml/CMakeLists.txt +14 -3
- package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-backend.h +145 -60
- package/src/llama.cpp/ggml/include/ggml-blas.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +15 -19
- package/src/llama.cpp/ggml/include/ggml-cuda.h +16 -16
- package/src/llama.cpp/ggml/include/ggml-metal.h +5 -8
- package/src/llama.cpp/ggml/include/ggml-rpc.h +5 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +8 -8
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +7 -7
- package/src/llama.cpp/ggml/include/ggml.h +293 -186
- package/src/llama.cpp/ggml/src/CMakeLists.txt +86 -44
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +2135 -1119
- package/src/llama.cpp/ggml/src/ggml-alloc.c +6 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +152 -70
- package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +606 -286
- package/src/llama.cpp/ggml/src/ggml-blas.cpp +9 -10
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
- package/src/llama.cpp/ggml/src/ggml-cann.cpp +215 -216
- package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu-impl.h +614 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +49 -603
- package/src/llama.cpp/ggml/src/ggml-kompute.cpp +4 -24
- package/src/llama.cpp/ggml/src/ggml-quants.c +972 -92
- package/src/llama.cpp/ggml/src/ggml-quants.h +15 -0
- package/src/llama.cpp/ggml/src/ggml-rpc.cpp +116 -66
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +52 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +16 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +6 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl.cpp +97 -169
- package/src/llama.cpp/ggml/src/ggml-vulkan.cpp +1508 -1124
- package/src/llama.cpp/ggml/src/ggml.c +3001 -1647
- package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +192 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +88 -40
- package/src/llama.cpp/include/llama.h +241 -264
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
- package/src/llama.cpp/src/llama-grammar.cpp +721 -122
- package/src/llama.cpp/src/llama-grammar.h +120 -15
- package/src/llama.cpp/src/llama-impl.h +156 -1
- package/src/llama.cpp/src/llama-sampling.cpp +1375 -303
- package/src/llama.cpp/src/llama-sampling.h +20 -47
- package/src/llama.cpp/src/llama-vocab.cpp +343 -120
- package/src/llama.cpp/src/llama-vocab.h +33 -17
- package/src/llama.cpp/src/llama.cpp +4247 -1525
- package/src/llama.cpp/src/unicode-data.cpp +6 -4
- package/src/llama.cpp/src/unicode-data.h +4 -4
- package/src/llama.cpp/src/unicode.cpp +15 -7
- package/src/llama.cpp/tests/CMakeLists.txt +3 -0
- package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +1592 -289
- package/src/llama.cpp/tests/test-barrier.cpp +93 -0
- package/src/llama.cpp/tests/test-grad0.cpp +187 -70
- package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
- package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +6 -4
- package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
- package/src/llama.cpp/tests/test-log.cpp +39 -0
- package/src/llama.cpp/tests/test-quantize-fns.cpp +6 -0
- package/src/llama.cpp/tests/test-rope.cpp +1 -1
- package/src/llama.cpp/tests/test-sampling.cpp +157 -98
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +55 -35
- package/patches/llama.patch +0 -22
- package/src/llama.cpp/.github/workflows/bench.yml +0 -310
- package/src/llama.cpp/common/grammar-parser.cpp +0 -536
- package/src/llama.cpp/common/grammar-parser.h +0 -29
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
|
@@ -16,6 +16,7 @@
|
|
|
16
16
|
#include <sstream>
|
|
17
17
|
#include <string>
|
|
18
18
|
#include <vector>
|
|
19
|
+
#include <thread>
|
|
19
20
|
|
|
20
21
|
#include "ggml.h"
|
|
21
22
|
#include "llama.h"
|
|
@@ -27,6 +28,14 @@
|
|
|
27
28
|
#include "ggml-cann.h"
|
|
28
29
|
#endif
|
|
29
30
|
|
|
31
|
+
#ifdef _WIN32
|
|
32
|
+
#define WIN32_LEAN_AND_MEAN
|
|
33
|
+
#ifndef NOMINMAX
|
|
34
|
+
# define NOMINMAX
|
|
35
|
+
#endif
|
|
36
|
+
#include <windows.h>
|
|
37
|
+
#endif
|
|
38
|
+
|
|
30
39
|
// utils
|
|
31
40
|
static uint64_t get_time_ns() {
|
|
32
41
|
using clock = std::chrono::high_resolution_clock;
|
|
@@ -96,6 +105,30 @@ static std::string get_cpu_info() {
|
|
|
96
105
|
}
|
|
97
106
|
fclose(f);
|
|
98
107
|
}
|
|
108
|
+
#elif defined(_WIN32)
|
|
109
|
+
HKEY hKey;
|
|
110
|
+
if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
|
|
111
|
+
TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
|
|
112
|
+
0,
|
|
113
|
+
KEY_READ,
|
|
114
|
+
&hKey) != ERROR_SUCCESS) {
|
|
115
|
+
// fail to open registry key
|
|
116
|
+
return "";
|
|
117
|
+
}
|
|
118
|
+
char cpu_brand[256];
|
|
119
|
+
DWORD cpu_brand_size = sizeof(cpu_brand);
|
|
120
|
+
if (RegQueryValueExA(hKey,
|
|
121
|
+
TEXT("ProcessorNameString"),
|
|
122
|
+
NULL,
|
|
123
|
+
NULL,
|
|
124
|
+
(LPBYTE)cpu_brand,
|
|
125
|
+
&cpu_brand_size) == ERROR_SUCCESS) {
|
|
126
|
+
id.assign(cpu_brand, cpu_brand_size);
|
|
127
|
+
if (id.find('\0') != std::string::npos) {
|
|
128
|
+
id.resize(id.find('\0'));
|
|
129
|
+
}
|
|
130
|
+
}
|
|
131
|
+
RegCloseKey(hKey);
|
|
99
132
|
#endif
|
|
100
133
|
// TODO: other platforms
|
|
101
134
|
return id;
|
|
@@ -141,13 +174,14 @@ static std::string get_gpu_info() {
|
|
|
141
174
|
}
|
|
142
175
|
|
|
143
176
|
// command line params
|
|
144
|
-
enum output_formats {NONE, CSV, JSON, MARKDOWN, SQL};
|
|
177
|
+
enum output_formats {NONE, CSV, JSON, JSONL, MARKDOWN, SQL};
|
|
145
178
|
|
|
146
179
|
static const char * output_format_str(output_formats format) {
|
|
147
180
|
switch (format) {
|
|
148
181
|
case NONE: return "none";
|
|
149
182
|
case CSV: return "csv";
|
|
150
183
|
case JSON: return "json";
|
|
184
|
+
case JSONL: return "jsonl";
|
|
151
185
|
case MARKDOWN: return "md";
|
|
152
186
|
case SQL: return "sql";
|
|
153
187
|
default: GGML_ABORT("invalid output format");
|
|
@@ -161,6 +195,8 @@ static bool output_format_from_str(const std::string & s, output_formats & forma
|
|
|
161
195
|
format = CSV;
|
|
162
196
|
} else if (s == "json") {
|
|
163
197
|
format = JSON;
|
|
198
|
+
} else if (s == "jsonl") {
|
|
199
|
+
format = JSONL;
|
|
164
200
|
} else if (s == "md") {
|
|
165
201
|
format = MARKDOWN;
|
|
166
202
|
} else if (s == "sql") {
|
|
@@ -196,6 +232,9 @@ struct cmd_params {
|
|
|
196
232
|
std::vector<ggml_type> type_k;
|
|
197
233
|
std::vector<ggml_type> type_v;
|
|
198
234
|
std::vector<int> n_threads;
|
|
235
|
+
std::vector<std::string> cpu_mask;
|
|
236
|
+
std::vector<bool> cpu_strict;
|
|
237
|
+
std::vector<int> poll;
|
|
199
238
|
std::vector<int> n_gpu_layers;
|
|
200
239
|
std::vector<std::string> rpc_servers;
|
|
201
240
|
std::vector<llama_split_mode> split_mode;
|
|
@@ -207,7 +246,10 @@ struct cmd_params {
|
|
|
207
246
|
std::vector<bool> embeddings;
|
|
208
247
|
ggml_numa_strategy numa;
|
|
209
248
|
int reps;
|
|
249
|
+
ggml_sched_priority prio;
|
|
250
|
+
int delay;
|
|
210
251
|
bool verbose;
|
|
252
|
+
bool progress;
|
|
211
253
|
output_formats output_format;
|
|
212
254
|
output_formats output_format_stderr;
|
|
213
255
|
};
|
|
@@ -222,6 +264,9 @@ static const cmd_params cmd_params_defaults = {
|
|
|
222
264
|
/* type_k */ {GGML_TYPE_F16},
|
|
223
265
|
/* type_v */ {GGML_TYPE_F16},
|
|
224
266
|
/* n_threads */ {cpu_get_num_math()},
|
|
267
|
+
/* cpu_mask */ {"0x0"},
|
|
268
|
+
/* cpu_strict */ {false},
|
|
269
|
+
/* poll */ {50},
|
|
225
270
|
/* n_gpu_layers */ {99},
|
|
226
271
|
/* rpc_servers */ {""},
|
|
227
272
|
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
|
|
@@ -233,7 +278,10 @@ static const cmd_params cmd_params_defaults = {
|
|
|
233
278
|
/* embeddings */ {false},
|
|
234
279
|
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
|
|
235
280
|
/* reps */ 5,
|
|
281
|
+
/* prio */ GGML_SCHED_PRIO_NORMAL,
|
|
282
|
+
/* delay */ 0,
|
|
236
283
|
/* verbose */ false,
|
|
284
|
+
/* progress */ false,
|
|
237
285
|
/* output_format */ MARKDOWN,
|
|
238
286
|
/* output_format_stderr */ NONE,
|
|
239
287
|
};
|
|
@@ -243,29 +291,37 @@ static void print_usage(int /* argc */, char ** argv) {
|
|
|
243
291
|
printf("\n");
|
|
244
292
|
printf("options:\n");
|
|
245
293
|
printf(" -h, --help\n");
|
|
246
|
-
printf(" -m, --model <filename>
|
|
247
|
-
printf(" -p, --n-prompt <n>
|
|
248
|
-
printf(" -n, --n-gen <n>
|
|
249
|
-
printf(" -pg <pp,tg>
|
|
250
|
-
printf(" -b, --batch-size <n>
|
|
251
|
-
printf(" -ub, --ubatch-size <n>
|
|
252
|
-
printf(" -ctk, --cache-type-k <t>
|
|
253
|
-
printf(" -ctv, --cache-type-v <t>
|
|
254
|
-
printf(" -t, --threads <n>
|
|
255
|
-
printf(" -
|
|
256
|
-
printf(" -
|
|
257
|
-
printf("
|
|
258
|
-
printf(" -
|
|
259
|
-
|
|
260
|
-
printf(" -
|
|
261
|
-
|
|
262
|
-
printf(" --
|
|
263
|
-
printf(" -
|
|
264
|
-
printf(" -
|
|
265
|
-
printf(" -
|
|
266
|
-
printf(" -
|
|
267
|
-
printf("
|
|
268
|
-
printf(" -
|
|
294
|
+
printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
|
|
295
|
+
printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
|
|
296
|
+
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
|
|
297
|
+
printf(" -pg <pp,tg> (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
|
|
298
|
+
printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
|
|
299
|
+
printf(" -ub, --ubatch-size <n> (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
|
|
300
|
+
printf(" -ctk, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
|
|
301
|
+
printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
|
|
302
|
+
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
|
|
303
|
+
printf(" -C, --cpu-mask <hex,hex> (default: %s)\n", join(cmd_params_defaults.cpu_mask, ",").c_str());
|
|
304
|
+
printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str());
|
|
305
|
+
printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
|
|
306
|
+
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
|
|
307
|
+
#ifdef GGML_USE_RPC
|
|
308
|
+
printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
|
|
309
|
+
#endif
|
|
310
|
+
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
|
|
311
|
+
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
|
|
312
|
+
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
|
|
313
|
+
printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
|
|
314
|
+
printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
|
|
315
|
+
printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
|
|
316
|
+
printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
|
|
317
|
+
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
|
|
318
|
+
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
|
|
319
|
+
printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.prio);
|
|
320
|
+
printf(" --delay <0...N> (seconds) (default: %d)\n", cmd_params_defaults.delay);
|
|
321
|
+
printf(" -o, --output <csv|json|jsonl|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
|
|
322
|
+
printf(" -oe, --output-err <csv|json|jsonl|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
|
|
323
|
+
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
|
|
324
|
+
printf(" --progress (default: %s)\n", cmd_params_defaults.progress ? "1" : "0");
|
|
269
325
|
printf("\n");
|
|
270
326
|
printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
|
|
271
327
|
}
|
|
@@ -309,6 +365,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|
|
309
365
|
params.output_format_stderr = cmd_params_defaults.output_format_stderr;
|
|
310
366
|
params.reps = cmd_params_defaults.reps;
|
|
311
367
|
params.numa = cmd_params_defaults.numa;
|
|
368
|
+
params.prio = cmd_params_defaults.prio;
|
|
369
|
+
params.delay = cmd_params_defaults.delay;
|
|
370
|
+
params.progress = cmd_params_defaults.progress;
|
|
312
371
|
|
|
313
372
|
for (int i = 1; i < argc; i++) {
|
|
314
373
|
arg = argv[i];
|
|
@@ -380,6 +439,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|
|
380
439
|
}
|
|
381
440
|
types.push_back(gt);
|
|
382
441
|
}
|
|
442
|
+
if (invalid_param) {
|
|
443
|
+
break;
|
|
444
|
+
}
|
|
383
445
|
params.type_k.insert(params.type_k.end(), types.begin(), types.end());
|
|
384
446
|
} else if (arg == "-ctv" || arg == "--cache-type-v") {
|
|
385
447
|
if (++i >= argc) {
|
|
@@ -396,6 +458,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|
|
396
458
|
}
|
|
397
459
|
types.push_back(gt);
|
|
398
460
|
}
|
|
461
|
+
if (invalid_param) {
|
|
462
|
+
break;
|
|
463
|
+
}
|
|
399
464
|
params.type_v.insert(params.type_v.end(), types.begin(), types.end());
|
|
400
465
|
} else if (arg == "-t" || arg == "--threads") {
|
|
401
466
|
if (++i >= argc) {
|
|
@@ -404,6 +469,27 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|
|
404
469
|
}
|
|
405
470
|
auto p = string_split<int>(argv[i], split_delim);
|
|
406
471
|
params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
|
|
472
|
+
} else if (arg == "-C" || arg == "--cpu-mask") {
|
|
473
|
+
if (++i >= argc) {
|
|
474
|
+
invalid_param = true;
|
|
475
|
+
break;
|
|
476
|
+
}
|
|
477
|
+
auto p = string_split<std::string>(argv[i], split_delim);
|
|
478
|
+
params.cpu_mask.insert(params.cpu_mask.end(), p.begin(), p.end());
|
|
479
|
+
} else if (arg == "--cpu-strict") {
|
|
480
|
+
if (++i >= argc) {
|
|
481
|
+
invalid_param = true;
|
|
482
|
+
break;
|
|
483
|
+
}
|
|
484
|
+
auto p = string_split<bool>(argv[i], split_delim);
|
|
485
|
+
params.cpu_strict.insert(params.cpu_strict.end(), p.begin(), p.end());
|
|
486
|
+
} else if (arg == "--poll") {
|
|
487
|
+
if (++i >= argc) {
|
|
488
|
+
invalid_param = true;
|
|
489
|
+
break;
|
|
490
|
+
}
|
|
491
|
+
auto p = string_split<int>(argv[i], split_delim);
|
|
492
|
+
params.poll.insert(params.poll.end(), p.begin(), p.end());
|
|
407
493
|
} else if (arg == "-ngl" || arg == "--n-gpu-layers") {
|
|
408
494
|
if (++i >= argc) {
|
|
409
495
|
invalid_param = true;
|
|
@@ -411,12 +497,14 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|
|
411
497
|
}
|
|
412
498
|
auto p = string_split<int>(argv[i], split_delim);
|
|
413
499
|
params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
|
|
500
|
+
#ifdef GGML_USE_RPC
|
|
414
501
|
} else if (arg == "-rpc" || arg == "--rpc") {
|
|
415
502
|
if (++i >= argc) {
|
|
416
503
|
invalid_param = true;
|
|
417
504
|
break;
|
|
418
505
|
}
|
|
419
506
|
params.rpc_servers.push_back(argv[i]);
|
|
507
|
+
#endif
|
|
420
508
|
} else if (arg == "-sm" || arg == "--split-mode") {
|
|
421
509
|
if (++i >= argc) {
|
|
422
510
|
invalid_param = true;
|
|
@@ -438,6 +526,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|
|
438
526
|
}
|
|
439
527
|
modes.push_back(mode);
|
|
440
528
|
}
|
|
529
|
+
if (invalid_param) {
|
|
530
|
+
break;
|
|
531
|
+
}
|
|
441
532
|
params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end());
|
|
442
533
|
} else if (arg == "-mg" || arg == "--main-gpu") {
|
|
443
534
|
if (++i >= argc) {
|
|
@@ -512,6 +603,18 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|
|
512
603
|
break;
|
|
513
604
|
}
|
|
514
605
|
params.reps = std::stoi(argv[i]);
|
|
606
|
+
} else if (arg == "--prio") {
|
|
607
|
+
if (++i >= argc) {
|
|
608
|
+
invalid_param = true;
|
|
609
|
+
break;
|
|
610
|
+
}
|
|
611
|
+
params.prio = (enum ggml_sched_priority) std::stoi(argv[i]);
|
|
612
|
+
} else if (arg == "--delay") {
|
|
613
|
+
if (++i >= argc) {
|
|
614
|
+
invalid_param = true;
|
|
615
|
+
break;
|
|
616
|
+
}
|
|
617
|
+
params.delay = std::stoi(argv[i]);
|
|
515
618
|
} else if (arg == "-o" || arg == "--output") {
|
|
516
619
|
if (++i >= argc) {
|
|
517
620
|
invalid_param = true;
|
|
@@ -526,6 +629,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|
|
526
629
|
invalid_param = !output_format_from_str(argv[i], params.output_format_stderr);
|
|
527
630
|
} else if (arg == "-v" || arg == "--verbose") {
|
|
528
631
|
params.verbose = true;
|
|
632
|
+
} else if (arg == "--progress") {
|
|
633
|
+
params.progress = true;
|
|
529
634
|
} else {
|
|
530
635
|
invalid_param = true;
|
|
531
636
|
break;
|
|
@@ -556,6 +661,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|
|
556
661
|
if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
|
|
557
662
|
if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; }
|
|
558
663
|
if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
|
|
664
|
+
if (params.cpu_mask.empty()) { params.cpu_mask = cmd_params_defaults.cpu_mask; }
|
|
665
|
+
if (params.cpu_strict.empty()) { params.cpu_strict = cmd_params_defaults.cpu_strict; }
|
|
666
|
+
if (params.poll.empty()) { params.poll = cmd_params_defaults.poll; }
|
|
559
667
|
|
|
560
668
|
return params;
|
|
561
669
|
}
|
|
@@ -569,6 +677,9 @@ struct cmd_params_instance {
|
|
|
569
677
|
ggml_type type_k;
|
|
570
678
|
ggml_type type_v;
|
|
571
679
|
int n_threads;
|
|
680
|
+
std::string cpu_mask;
|
|
681
|
+
bool cpu_strict;
|
|
682
|
+
int poll;
|
|
572
683
|
int n_gpu_layers;
|
|
573
684
|
std::string rpc_servers;
|
|
574
685
|
llama_split_mode split_mode;
|
|
@@ -638,7 +749,10 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|
|
638
749
|
for (const auto & tv : params.type_v)
|
|
639
750
|
for (const auto & nkvo : params.no_kv_offload)
|
|
640
751
|
for (const auto & fa : params.flash_attn)
|
|
641
|
-
for (const auto & nt : params.n_threads)
|
|
752
|
+
for (const auto & nt : params.n_threads)
|
|
753
|
+
for (const auto & cm : params.cpu_mask)
|
|
754
|
+
for (const auto & cs : params.cpu_strict)
|
|
755
|
+
for (const auto & pl : params.poll) {
|
|
642
756
|
for (const auto & n_prompt : params.n_prompt) {
|
|
643
757
|
if (n_prompt == 0) {
|
|
644
758
|
continue;
|
|
@@ -652,6 +766,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|
|
652
766
|
/* .type_k = */ tk,
|
|
653
767
|
/* .type_v = */ tv,
|
|
654
768
|
/* .n_threads = */ nt,
|
|
769
|
+
/* .cpu_mask = */ cm,
|
|
770
|
+
/* .cpu_strict = */ cs,
|
|
771
|
+
/* .poll = */ pl,
|
|
655
772
|
/* .n_gpu_layers = */ nl,
|
|
656
773
|
/* .rpc_servers = */ rpc,
|
|
657
774
|
/* .split_mode = */ sm,
|
|
@@ -678,6 +795,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|
|
678
795
|
/* .type_k = */ tk,
|
|
679
796
|
/* .type_v = */ tv,
|
|
680
797
|
/* .n_threads = */ nt,
|
|
798
|
+
/* .cpu_mask = */ cm,
|
|
799
|
+
/* .cpu_strict = */ cs,
|
|
800
|
+
/* .poll = */ pl,
|
|
681
801
|
/* .n_gpu_layers = */ nl,
|
|
682
802
|
/* .rpc_servers = */ rpc,
|
|
683
803
|
/* .split_mode = */ sm,
|
|
@@ -704,6 +824,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|
|
704
824
|
/* .type_k = */ tk,
|
|
705
825
|
/* .type_v = */ tv,
|
|
706
826
|
/* .n_threads = */ nt,
|
|
827
|
+
/* .cpu_mask = */ cm,
|
|
828
|
+
/* .cpu_strict = */ cs,
|
|
829
|
+
/* .poll = */ pl,
|
|
707
830
|
/* .n_gpu_layers = */ nl,
|
|
708
831
|
/* .rpc_servers = */ rpc,
|
|
709
832
|
/* .split_mode = */ sm,
|
|
@@ -740,6 +863,9 @@ struct test {
|
|
|
740
863
|
int n_batch;
|
|
741
864
|
int n_ubatch;
|
|
742
865
|
int n_threads;
|
|
866
|
+
std::string cpu_mask;
|
|
867
|
+
bool cpu_strict;
|
|
868
|
+
int poll;
|
|
743
869
|
bool has_rpc;
|
|
744
870
|
ggml_type type_k;
|
|
745
871
|
ggml_type type_v;
|
|
@@ -766,6 +892,9 @@ struct test {
|
|
|
766
892
|
n_batch = inst.n_batch;
|
|
767
893
|
n_ubatch = inst.n_ubatch;
|
|
768
894
|
n_threads = inst.n_threads;
|
|
895
|
+
cpu_mask = inst.cpu_mask;
|
|
896
|
+
cpu_strict = inst.cpu_strict;
|
|
897
|
+
poll = inst.poll;
|
|
769
898
|
has_rpc = !inst.rpc_servers.empty();
|
|
770
899
|
type_k = inst.type_k;
|
|
771
900
|
type_v = inst.type_v;
|
|
@@ -843,13 +972,14 @@ struct test {
|
|
|
843
972
|
"cpu_info", "gpu_info",
|
|
844
973
|
"model_filename", "model_type", "model_size", "model_n_params",
|
|
845
974
|
"n_batch", "n_ubatch",
|
|
846
|
-
"n_threads", "
|
|
975
|
+
"n_threads", "cpu_mask", "cpu_strict", "poll",
|
|
976
|
+
"type_k", "type_v",
|
|
847
977
|
"n_gpu_layers", "split_mode",
|
|
848
978
|
"main_gpu", "no_kv_offload", "flash_attn",
|
|
849
979
|
"tensor_split", "use_mmap", "embeddings",
|
|
850
980
|
"n_prompt", "n_gen", "test_time",
|
|
851
981
|
"avg_ns", "stddev_ns",
|
|
852
|
-
"avg_ts", "stddev_ts"
|
|
982
|
+
"avg_ts", "stddev_ts",
|
|
853
983
|
};
|
|
854
984
|
return fields;
|
|
855
985
|
}
|
|
@@ -858,7 +988,7 @@ struct test {
|
|
|
858
988
|
|
|
859
989
|
static field_type get_field_type(const std::string & field) {
|
|
860
990
|
if (field == "build_number" || field == "n_batch" || field == "n_ubatch" ||
|
|
861
|
-
field == "n_threads" ||
|
|
991
|
+
field == "n_threads" || field == "poll" ||
|
|
862
992
|
field == "model_size" || field == "model_n_params" ||
|
|
863
993
|
field == "n_gpu_layers" || field == "main_gpu" ||
|
|
864
994
|
field == "n_prompt" || field == "n_gen" ||
|
|
@@ -867,6 +997,7 @@ struct test {
|
|
|
867
997
|
}
|
|
868
998
|
if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
|
|
869
999
|
field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
|
|
1000
|
+
field == "cpu_strict" ||
|
|
870
1001
|
field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
|
|
871
1002
|
return BOOL;
|
|
872
1003
|
}
|
|
@@ -899,7 +1030,8 @@ struct test {
|
|
|
899
1030
|
cpu_info, gpu_info,
|
|
900
1031
|
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
|
|
901
1032
|
std::to_string(n_batch), std::to_string(n_ubatch),
|
|
902
|
-
std::to_string(n_threads),
|
|
1033
|
+
std::to_string(n_threads), cpu_mask, std::to_string(cpu_strict), std::to_string(poll),
|
|
1034
|
+
ggml_type_name(type_k), ggml_type_name(type_v),
|
|
903
1035
|
std::to_string(n_gpu_layers), split_mode_str(split_mode),
|
|
904
1036
|
std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
|
|
905
1037
|
tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
|
|
@@ -967,37 +1099,38 @@ struct csv_printer : public printer {
|
|
|
967
1099
|
}
|
|
968
1100
|
};
|
|
969
1101
|
|
|
970
|
-
struct json_printer : public printer {
|
|
971
|
-
bool first = true;
|
|
972
1102
|
|
|
973
|
-
|
|
974
|
-
|
|
975
|
-
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
|
|
981
|
-
|
|
982
|
-
|
|
983
|
-
|
|
984
|
-
|
|
985
|
-
|
|
986
|
-
}
|
|
1103
|
+
static std::string escape_json(const std::string & value) {
|
|
1104
|
+
std::string escaped;
|
|
1105
|
+
for (auto c : value) {
|
|
1106
|
+
if (c == '"') {
|
|
1107
|
+
escaped += "\\\"";
|
|
1108
|
+
} else if (c == '\\') {
|
|
1109
|
+
escaped += "\\\\";
|
|
1110
|
+
} else if (c <= 0x1f) {
|
|
1111
|
+
char buf[8];
|
|
1112
|
+
snprintf(buf, sizeof(buf), "\\u%04x", c);
|
|
1113
|
+
escaped += buf;
|
|
1114
|
+
} else {
|
|
1115
|
+
escaped += c;
|
|
987
1116
|
}
|
|
988
|
-
return escaped;
|
|
989
1117
|
}
|
|
1118
|
+
return escaped;
|
|
1119
|
+
}
|
|
990
1120
|
|
|
991
|
-
|
|
992
|
-
|
|
993
|
-
|
|
994
|
-
|
|
995
|
-
|
|
996
|
-
|
|
997
|
-
|
|
998
|
-
|
|
999
|
-
}
|
|
1121
|
+
static std::string format_json_value(const std::string & field, const std::string & value) {
|
|
1122
|
+
switch (test::get_field_type(field)) {
|
|
1123
|
+
case test::STRING:
|
|
1124
|
+
return "\"" + escape_json(value) + "\"";
|
|
1125
|
+
case test::BOOL:
|
|
1126
|
+
return value == "0" ? "false" : "true";
|
|
1127
|
+
default:
|
|
1128
|
+
return value;
|
|
1000
1129
|
}
|
|
1130
|
+
}
|
|
1131
|
+
|
|
1132
|
+
struct json_printer : public printer {
|
|
1133
|
+
bool first = true;
|
|
1001
1134
|
|
|
1002
1135
|
void print_header(const cmd_params & params) override {
|
|
1003
1136
|
fprintf(fout, "[\n");
|
|
@@ -1007,7 +1140,7 @@ struct json_printer : public printer {
|
|
|
1007
1140
|
// Writes one `"name": value,` line to `fout` for each (field, value) pair.
// `fields` and `values` are parallel vectors and must have the same length
// (checked with assert). In the updated line each value is converted with
// format_json_value(field, value) before being printed.
// NOTE(review): the whitespace at the start of the format string may have been
// collapsed in this rendering - confirm against the real file before editing it.
void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
|
|
1008
1141
|
assert(fields.size() == values.size());
|
|
1009
1142
|
for (size_t i = 0; i < fields.size(); i++) {
|
|
1010
|
-
fprintf(fout, " \"%s\": %s,\n", fields.at(i).c_str(),
|
|
1143
|
+
fprintf(fout, " \"%s\": %s,\n", fields.at(i).c_str(), format_json_value(fields.at(i), values.at(i)).c_str());
|
|
1011
1144
|
}
|
|
1012
1145
|
}
|
|
1013
1146
|
|
|
@@ -1030,6 +1163,25 @@ struct json_printer : public printer {
|
|
|
1030
1163
|
}
|
|
1031
1164
|
};
|
|
1032
1165
|
|
|
1166
|
+
|
|
1167
|
+
struct jsonl_printer : public printer {
|
|
1168
|
+
void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
|
|
1169
|
+
assert(fields.size() == values.size());
|
|
1170
|
+
for (size_t i = 0; i < fields.size(); i++) {
|
|
1171
|
+
fprintf(fout, "\"%s\": %s, ", fields.at(i).c_str(), format_json_value(fields.at(i), values.at(i)).c_str());
|
|
1172
|
+
}
|
|
1173
|
+
}
|
|
1174
|
+
|
|
1175
|
+
void print_test(const test & t) override {
|
|
1176
|
+
fprintf(fout, "{");
|
|
1177
|
+
print_fields(test::get_fields(), t.get_values());
|
|
1178
|
+
fprintf(fout, "\"samples_ns\": [ %s ],", join(t.samples_ns, ", ").c_str());
|
|
1179
|
+
fprintf(fout, "\"samples_ts\": [ %s ]", join(t.get_ts(), ", ").c_str());
|
|
1180
|
+
fprintf(fout, "}\n");
|
|
1181
|
+
fflush(fout);
|
|
1182
|
+
}
|
|
1183
|
+
};
|
|
1184
|
+
|
|
1033
1185
|
struct markdown_printer : public printer {
|
|
1034
1186
|
std::vector<std::string> fields;
|
|
1035
1187
|
|
|
@@ -1038,7 +1190,7 @@ struct markdown_printer : public printer {
|
|
|
1038
1190
|
return -30;
|
|
1039
1191
|
}
|
|
1040
1192
|
if (field == "t/s") {
|
|
1041
|
-
return
|
|
1193
|
+
return 20;
|
|
1042
1194
|
}
|
|
1043
1195
|
if (field == "size" || field == "params") {
|
|
1044
1196
|
return 10;
|
|
@@ -1120,6 +1272,15 @@ struct markdown_printer : public printer {
|
|
|
1120
1272
|
if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
|
|
1121
1273
|
fields.emplace_back("n_threads");
|
|
1122
1274
|
}
|
|
1275
|
+
if (params.cpu_mask.size() > 1 || params.cpu_mask != cmd_params_defaults.cpu_mask) {
|
|
1276
|
+
fields.emplace_back("cpu_mask");
|
|
1277
|
+
}
|
|
1278
|
+
if (params.cpu_strict.size() > 1 || params.cpu_strict != cmd_params_defaults.cpu_strict) {
|
|
1279
|
+
fields.emplace_back("cpu_strict");
|
|
1280
|
+
}
|
|
1281
|
+
if (params.poll.size() > 1 || params.poll != cmd_params_defaults.poll) {
|
|
1282
|
+
fields.emplace_back("poll");
|
|
1283
|
+
}
|
|
1123
1284
|
if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
|
|
1124
1285
|
fields.emplace_back("n_batch");
|
|
1125
1286
|
}
|
|
@@ -1321,6 +1482,8 @@ static std::unique_ptr<printer> create_printer(output_formats format) {
|
|
|
1321
1482
|
return std::unique_ptr<printer>(new csv_printer());
|
|
1322
1483
|
case JSON:
|
|
1323
1484
|
return std::unique_ptr<printer>(new json_printer());
|
|
1485
|
+
case JSONL:
|
|
1486
|
+
return std::unique_ptr<printer>(new jsonl_printer());
|
|
1324
1487
|
case MARKDOWN:
|
|
1325
1488
|
return std::unique_ptr<printer>(new markdown_printer());
|
|
1326
1489
|
case SQL:
|
|
@@ -1354,6 +1517,8 @@ int main(int argc, char ** argv) {
|
|
|
1354
1517
|
llama_backend_init();
|
|
1355
1518
|
llama_numa_init(params.numa);
|
|
1356
1519
|
|
|
1520
|
+
set_process_priority(params.prio);
|
|
1521
|
+
|
|
1357
1522
|
// initialize printer
|
|
1358
1523
|
std::unique_ptr<printer> p = create_printer(params.output_format);
|
|
1359
1524
|
std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr);
|
|
@@ -1373,7 +1538,13 @@ int main(int argc, char ** argv) {
|
|
|
1373
1538
|
llama_model * lmodel = nullptr;
|
|
1374
1539
|
const cmd_params_instance * prev_inst = nullptr;
|
|
1375
1540
|
|
|
1541
|
+
int params_idx = 0;
|
|
1542
|
+
auto params_count = params_instances.size();
|
|
1376
1543
|
for (const auto & inst : params_instances) {
|
|
1544
|
+
params_idx ++;
|
|
1545
|
+
if (params.progress) {
|
|
1546
|
+
fprintf(stderr, "llama-bench: benchmark %d/%ld: starting\n", params_idx, params_count);
|
|
1547
|
+
}
|
|
1377
1548
|
// keep the same model between tests when possible
|
|
1378
1549
|
if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
|
|
1379
1550
|
if (lmodel) {
|
|
@@ -1399,12 +1570,40 @@ int main(int argc, char ** argv) {
|
|
|
1399
1570
|
|
|
1400
1571
|
llama_kv_cache_clear(ctx);
|
|
1401
1572
|
|
|
1573
|
+
// cool off before the test
|
|
1574
|
+
if (params.delay) {
|
|
1575
|
+
std::this_thread::sleep_for(std::chrono::seconds(params.delay));
|
|
1576
|
+
}
|
|
1577
|
+
|
|
1578
|
+
struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads);
|
|
1579
|
+
if (!parse_cpu_mask(t.cpu_mask, tpp.cpumask)) {
|
|
1580
|
+
fprintf(stderr, "%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str());
|
|
1581
|
+
exit(1);
|
|
1582
|
+
}
|
|
1583
|
+
tpp.strict_cpu = t.cpu_strict;
|
|
1584
|
+
tpp.poll = t.poll;
|
|
1585
|
+
tpp.prio = params.prio;
|
|
1586
|
+
|
|
1587
|
+
struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
|
|
1588
|
+
if (!threadpool) {
|
|
1589
|
+
fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
|
|
1590
|
+
exit(1);
|
|
1591
|
+
}
|
|
1592
|
+
|
|
1593
|
+
llama_attach_threadpool(ctx, threadpool, NULL);
|
|
1594
|
+
|
|
1402
1595
|
// warmup run
|
|
1403
1596
|
if (t.n_prompt > 0) {
|
|
1597
|
+
if (params.progress) {
|
|
1598
|
+
fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup prompt run\n", params_idx, params_count);
|
|
1599
|
+
}
|
|
1404
1600
|
//test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
|
|
1405
1601
|
test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
|
|
1406
1602
|
}
|
|
1407
1603
|
if (t.n_gen > 0) {
|
|
1604
|
+
if (params.progress) {
|
|
1605
|
+
fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup generation run\n", params_idx, params_count);
|
|
1606
|
+
}
|
|
1408
1607
|
test_gen(ctx, 1, 0, t.n_threads);
|
|
1409
1608
|
}
|
|
1410
1609
|
|
|
@@ -1414,9 +1613,15 @@ int main(int argc, char ** argv) {
|
|
|
1414
1613
|
uint64_t t_start = get_time_ns();
|
|
1415
1614
|
|
|
1416
1615
|
if (t.n_prompt > 0) {
|
|
1616
|
+
if (params.progress) {
|
|
1617
|
+
fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count, i + 1, params.reps);
|
|
1618
|
+
}
|
|
1417
1619
|
test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
|
|
1418
1620
|
}
|
|
1419
1621
|
if (t.n_gen > 0) {
|
|
1622
|
+
if (params.progress) {
|
|
1623
|
+
fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count, i + 1, params.reps);
|
|
1624
|
+
}
|
|
1420
1625
|
test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads);
|
|
1421
1626
|
}
|
|
1422
1627
|
|
|
@@ -1434,9 +1639,11 @@ int main(int argc, char ** argv) {
|
|
|
1434
1639
|
fflush(p_err->fout);
|
|
1435
1640
|
}
|
|
1436
1641
|
|
|
1437
|
-
|
|
1642
|
+
llama_perf_context_print(ctx);
|
|
1438
1643
|
|
|
1439
1644
|
llama_free(ctx);
|
|
1645
|
+
|
|
1646
|
+
ggml_threadpool_free(threadpool);
|
|
1440
1647
|
}
|
|
1441
1648
|
|
|
1442
1649
|
llama_free_model(lmodel);
|