@fugood/llama.node 0.3.1 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +0 -9
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +4 -2
- package/src/LlamaCompletionWorker.cpp +6 -6
- package/src/LlamaContext.cpp +7 -9
- package/src/common.hpp +2 -1
- package/src/llama.cpp/.github/workflows/build.yml +98 -24
- package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
- package/src/llama.cpp/.github/workflows/docker.yml +43 -34
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
- package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +7 -0
- package/src/llama.cpp/CMakeLists.txt +20 -8
- package/src/llama.cpp/common/CMakeLists.txt +12 -10
- package/src/llama.cpp/common/arg.cpp +2006 -0
- package/src/llama.cpp/common/arg.h +77 -0
- package/src/llama.cpp/common/common.cpp +496 -1632
- package/src/llama.cpp/common/common.h +161 -63
- package/src/llama.cpp/common/console.cpp +3 -0
- package/src/llama.cpp/common/log.cpp +401 -0
- package/src/llama.cpp/common/log.h +66 -698
- package/src/llama.cpp/common/ngram-cache.cpp +3 -0
- package/src/llama.cpp/common/sampling.cpp +348 -350
- package/src/llama.cpp/common/sampling.h +62 -139
- package/src/llama.cpp/common/stb_image.h +5990 -6398
- package/src/llama.cpp/common/train.cpp +2 -0
- package/src/llama.cpp/docs/build.md +36 -1
- package/src/llama.cpp/examples/CMakeLists.txt +0 -1
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1 -2
- package/src/llama.cpp/examples/batched/batched.cpp +39 -55
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +34 -44
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +15 -15
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
- package/src/llama.cpp/examples/embedding/embedding.cpp +143 -87
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +33 -33
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +36 -35
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +34 -27
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +59 -62
- package/src/llama.cpp/examples/infill/infill.cpp +117 -132
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +265 -58
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +29 -22
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +685 -150
- package/src/llama.cpp/examples/llava/clip.h +11 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +47 -58
- package/src/llama.cpp/examples/llava/llava.cpp +110 -24
- package/src/llama.cpp/examples/llava/llava.h +2 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
- package/src/llama.cpp/examples/llava/requirements.txt +1 -0
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +42 -43
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +10 -8
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +23 -22
- package/src/llama.cpp/examples/lookup/lookup.cpp +40 -43
- package/src/llama.cpp/examples/main/main.cpp +210 -262
- package/src/llama.cpp/examples/parallel/parallel.cpp +49 -49
- package/src/llama.cpp/examples/passkey/passkey.cpp +42 -50
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +187 -200
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -3
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +49 -44
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +24 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +32 -35
- package/src/llama.cpp/examples/server/CMakeLists.txt +3 -5
- package/src/llama.cpp/examples/server/server.cpp +1027 -1073
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
- package/src/llama.cpp/examples/server/utils.hpp +107 -105
- package/src/llama.cpp/examples/simple/simple.cpp +35 -41
- package/src/llama.cpp/examples/speculative/speculative.cpp +129 -103
- package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +25 -27
- package/src/llama.cpp/ggml/CMakeLists.txt +14 -3
- package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-backend.h +145 -60
- package/src/llama.cpp/ggml/include/ggml-blas.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +15 -19
- package/src/llama.cpp/ggml/include/ggml-cuda.h +16 -16
- package/src/llama.cpp/ggml/include/ggml-metal.h +5 -8
- package/src/llama.cpp/ggml/include/ggml-rpc.h +5 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +8 -8
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +7 -7
- package/src/llama.cpp/ggml/include/ggml.h +293 -186
- package/src/llama.cpp/ggml/src/CMakeLists.txt +86 -44
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +2135 -1119
- package/src/llama.cpp/ggml/src/ggml-alloc.c +6 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +152 -70
- package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +606 -286
- package/src/llama.cpp/ggml/src/ggml-blas.cpp +9 -10
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
- package/src/llama.cpp/ggml/src/ggml-cann.cpp +215 -216
- package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu-impl.h +614 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +49 -603
- package/src/llama.cpp/ggml/src/ggml-kompute.cpp +4 -24
- package/src/llama.cpp/ggml/src/ggml-quants.c +972 -92
- package/src/llama.cpp/ggml/src/ggml-quants.h +15 -0
- package/src/llama.cpp/ggml/src/ggml-rpc.cpp +116 -66
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +52 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +16 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +6 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl.cpp +97 -169
- package/src/llama.cpp/ggml/src/ggml-vulkan.cpp +1508 -1124
- package/src/llama.cpp/ggml/src/ggml.c +3001 -1647
- package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +192 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +88 -40
- package/src/llama.cpp/include/llama.h +241 -264
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
- package/src/llama.cpp/src/llama-grammar.cpp +721 -122
- package/src/llama.cpp/src/llama-grammar.h +120 -15
- package/src/llama.cpp/src/llama-impl.h +156 -1
- package/src/llama.cpp/src/llama-sampling.cpp +1375 -303
- package/src/llama.cpp/src/llama-sampling.h +20 -47
- package/src/llama.cpp/src/llama-vocab.cpp +343 -120
- package/src/llama.cpp/src/llama-vocab.h +33 -17
- package/src/llama.cpp/src/llama.cpp +4247 -1525
- package/src/llama.cpp/src/unicode-data.cpp +6 -4
- package/src/llama.cpp/src/unicode-data.h +4 -4
- package/src/llama.cpp/src/unicode.cpp +15 -7
- package/src/llama.cpp/tests/CMakeLists.txt +3 -0
- package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +1592 -289
- package/src/llama.cpp/tests/test-barrier.cpp +93 -0
- package/src/llama.cpp/tests/test-grad0.cpp +187 -70
- package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
- package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +6 -4
- package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
- package/src/llama.cpp/tests/test-log.cpp +39 -0
- package/src/llama.cpp/tests/test-quantize-fns.cpp +6 -0
- package/src/llama.cpp/tests/test-rope.cpp +1 -1
- package/src/llama.cpp/tests/test-sampling.cpp +157 -98
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +55 -35
- package/patches/llama.patch +0 -22
- package/src/llama.cpp/.github/workflows/bench.yml +0 -310
- package/src/llama.cpp/common/grammar-parser.cpp +0 -536
- package/src/llama.cpp/common/grammar-parser.h +0 -29
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
package/src/llama.cpp/docs/build.md

@@ -178,7 +178,11 @@ For Jetson user, if you have Jetson Orin, you can try this: [Offical Support](ht
 cmake --build build --config Release
 ```

-The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used.
+The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used.
+
+The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted. In Windows this setting is available in the NVIDIA control panel as `System Memory Fallback`.
+
+The following compilation options are also available to tweak performance:

 | Option | Legal values | Default | Description |
 |-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
@@ -348,6 +352,37 @@ cmake --build build --config Release
 # ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
 ```

+### CANN
+This provides NPU acceleration using the AI cores of your Ascend NPU. And [CANN](https://www.hiascend.com/en/software/cann) is a hierarchical APIs to help you to quickly build AI applications and service based on Ascend NPU.
+
+For more information about Ascend NPU in [Ascend Community](https://www.hiascend.com/en/).
+
+Make sure to have the CANN toolkit installed. You can download it from here: [CANN Toolkit](https://www.hiascend.com/developer/download/community/result?module=cann)
+
+Go to `llama.cpp` directory and build using CMake.
+```bash
+cmake -B build -DGGML_CANN=on -DCMAKE_BUILD_TYPE=release
+cmake --build build --config release
+```
+
+You can test with:
+
+`./build/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32`
+
+If the fllowing info is output on screen, you are using `llama.cpp by CANN backend`:
+```bash
+llm_load_tensors: CANN buffer size = 13313.00 MiB
+llama_new_context_with_model: CANN compute buffer size = 1260.81 MiB
+```
+
+For detailed info, such as model/device supports, CANN install, please refer to [llama.cpp for CANN](./backend/CANN.md).
+
 ### Android

 To read documentation for how to build on Android, [click here](./android.md)
+
+### Arm CPU optimized mulmat kernels
+
+Llama.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats.
+
+To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_LLAMAFILE=OFF` (`cmake`).
package/src/llama.cpp/examples/baby-llama/baby-llama.cpp

@@ -1,7 +1,6 @@
 #include "ggml.h"
 #include "train.h"

-#include <vector>
 #include <cassert>
 #include <cstdlib>
 #include <cstring>
@@ -19,7 +18,7 @@ constexpr float rms_norm_eps = 5e-6f;
 #endif

 static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);

     if (plan.work_size > 0) {
         buf.resize(plan.work_size);
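In the hunk above, `ggml_graph_plan` gains a third threadpool argument in the bundled ggml; passing `nullptr` keeps the previous behavior. Below is a minimal sketch of how such a plan-and-compute helper is typically completed; the `compute_graph` name and the trailing `ggml_graph_compute` call are illustrative and not part of this diff.

```cpp
#include "ggml.h"

#include <cstdint>
#include <vector>

// Illustrative helper: plan a graph with the updated three-argument
// ggml_graph_plan (nullptr = no explicit threadpool), then execute it.
static void compute_graph(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, /*threadpool*/ nullptr);

    if (plan.work_size > 0) {
        buf.resize(plan.work_size);
        plan.work_data = buf.data();
    }

    ggml_graph_compute(graph, &plan);
}
```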
package/src/llama.cpp/examples/batched/batched.cpp

@@ -1,18 +1,17 @@
+#include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"

 #include <algorithm>
-#include <cmath>
 #include <cstdio>
 #include <string>
 #include <vector>

-static void print_usage(int
-
-
-
-    LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
-    LOG_TEE("\n");
+static void print_usage(int, char ** argv) {
+    LOG("\nexample usage:\n");
+    LOG("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
+    LOG("\n");
 }

 int main(int argc, char ** argv) {
@@ -21,11 +20,11 @@ int main(int argc, char ** argv) {
     params.prompt = "Hello my name is";
     params.n_predict = 32;

-    if (!gpt_params_parse(argc, argv, params)) {
-        print_usage(argc, argv, params);
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
         return 1;
     }

+    gpt_init();

     // number of parallel batches
     int n_parallel = params.n_parallel;
@@ -45,7 +44,7 @@ int main(int argc, char ** argv) {
     llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);

     if (model == NULL) {
-
+        LOG_ERR("%s: error: unable to load model\n" , __func__);
         return 1;
     }

@@ -65,32 +64,39 @@ int main(int argc, char ** argv) {

     llama_context * ctx = llama_new_context_with_model(model, ctx_params);

+    auto sparams = llama_sampler_chain_default_params();
+
+    llama_sampler * smpl = llama_sampler_chain_init(sparams);
+
+    llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sparams.top_k));
+    llama_sampler_chain_add(smpl, llama_sampler_init_top_p(params.sparams.top_p, params.sparams.min_keep));
+    llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sparams.temp));
+    llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sparams.seed));
+
     if (ctx == NULL) {
-
+        LOG_ERR("%s: error: failed to create the llama_context\n" , __func__);
         return 1;
     }

     const int n_ctx = llama_n_ctx(ctx);

-
+    LOG_INF("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);

     // make sure the KV cache is big enough to hold all the prompt and generated tokens
     if (n_kv_req > n_ctx) {
-
-
+        LOG_ERR("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req);
+        LOG_ERR("%s: either reduce n_parallel or increase n_ctx\n", __func__);
         return 1;
     }

     // print the prompt token-by-token

-
+    LOG("\n");

     for (auto id : tokens_list) {
-
+        LOG("%s", llama_token_to_piece(ctx, id).c_str());
     }

-    fflush(stderr);
-
     // create a llama_batch
     // we use this object to submit token data for decoding
     llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t) n_parallel), 0, n_parallel);
@@ -108,7 +114,7 @@ int main(int argc, char ** argv) {

     if (llama_model_has_encoder(model)) {
         if (llama_encode(ctx, batch)) {
-
+            LOG_ERR("%s : failed to eval\n", __func__);
             return 1;
         }

@@ -125,7 +131,7 @@ int main(int argc, char ** argv) {
     batch.logits[batch.n_tokens - 1] = true;

     if (llama_decode(ctx, batch) != 0) {
-
+        LOG_ERR("%s: llama_decode() failed\n", __func__);
         return 1;
     }

@@ -136,7 +142,7 @@ int main(int argc, char ** argv) {
     //}

     if (n_parallel > 1) {
-
+        LOG("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
     }

     // main loop
@@ -164,36 +170,14 @@ int main(int argc, char ** argv) {
                 continue;
             }

-
-            auto * logits = llama_get_logits_ith(ctx, i_batch[i]);
-
-            std::vector<llama_token_data> candidates;
-            candidates.reserve(n_vocab);
-
-            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-                candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
-            }
-
-            llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-
-            const int top_k = 40;
-            const float top_p = 0.9f;
-            const float temp = 0.4f;
-
-            llama_sample_top_k(ctx, &candidates_p, top_k, 1);
-            llama_sample_top_p(ctx, &candidates_p, top_p, 1);
-            llama_sample_temp (ctx, &candidates_p, temp);
-
-            const llama_token new_token_id = llama_sample_token(ctx, &candidates_p);
-
-            //const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
+            const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]);

             // is it an end of generation? -> mark the stream as finished
             if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
                 i_batch[i] = -1;
-
+                LOG("\n");
                 if (n_parallel > 1) {
-
+                    LOG_INF("%s: stream %d finished at n_cur = %d", __func__, i, n_cur);
                 }

                 continue;
@@ -201,8 +185,7 @@ int main(int argc, char ** argv) {

             // if there is only one stream, we print immediately to stdout
             if (n_parallel == 1) {
-
-                fflush(stdout);
+                LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str());
             }

             streams[i] += llama_token_to_piece(ctx, new_token_id);
@@ -224,32 +207,33 @@ int main(int argc, char ** argv) {

         // evaluate the current batch with the transformer model
         if (llama_decode(ctx, batch)) {
-
+            LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
             return 1;
         }
     }

-    LOG_TEE("\n");
-
     if (n_parallel > 1) {
-
+        LOG("\n");

         for (int32_t i = 0; i < n_parallel; ++i) {
-
+            LOG("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str());
         }
     }

     const auto t_main_end = ggml_time_us();

-
+    LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
             __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));

-
+    LOG("\n");
+    llama_perf_sampler_print(smpl);
+    llama_perf_context_print(ctx);

     fprintf(stderr, "\n");

     llama_batch_free(batch);

+    llama_sampler_free(smpl);
     llama_free(ctx);
     llama_free_model(model);

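The largest change in the batched example is the move from hand-rolled top-k/top-p/temperature sampling over raw logits to the new `llama_sampler` chain API. The sketch below condenses the calls that appear in the diff above into a hypothetical standalone helper; the parameter values are placeholders supplied by the caller rather than anything taken from the package.

```cpp
#include "llama.h"

#include <cstddef>
#include <cstdint>

// Hypothetical helper mirroring the sampler-chain setup added to examples/batched:
// top-k -> top-p -> temperature -> final distribution sampling.
static llama_sampler * make_sampler_chain(int32_t top_k, float top_p, size_t min_keep, float temp, uint32_t seed) {
    auto sparams = llama_sampler_chain_default_params();

    llama_sampler * smpl = llama_sampler_chain_init(sparams);

    llama_sampler_chain_add(smpl, llama_sampler_init_top_k(top_k));
    llama_sampler_chain_add(smpl, llama_sampler_init_top_p(top_p, min_keep));
    llama_sampler_chain_add(smpl, llama_sampler_init_temp (temp));
    llama_sampler_chain_add(smpl, llama_sampler_init_dist (seed));

    return smpl;
}

// Usage inside the decode loop, as in the diff above:
//     const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]);
// and once generation is finished:
//     llama_perf_sampler_print(smpl);
//     llama_sampler_free(smpl);
```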
package/src/llama.cpp/examples/batched-bench/batched-bench.cpp

@@ -1,49 +1,28 @@
+#include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"

 #include <algorithm>
-#include <cmath>
 #include <cstdio>
 #include <string>
 #include <vector>

-
-
-
-
-    char * q = p;
-
-    while (*p) {
-        if (*p == ',') {
-            *p = '\0';
-            ret.push_back(std::atoi(q));
-            q = p + 1;
-        }
-
-        ++p;
-    }
-
-    ret.push_back(std::atoi(q));
-
-    return ret;
-}
-
-static void print_usage(int argc, char ** argv, const gpt_params & params) {
-    gpt_params_print_usage(argc, argv, params);
-
-    LOG_TEE("\nexample usage:\n");
-    LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
-    LOG_TEE("\n");
+static void print_usage(int, char ** argv) {
+    LOG("\nexample usage:\n");
+    LOG("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
+    LOG("\n");
 }

 int main(int argc, char ** argv) {
     gpt_params params;

-    if (!gpt_params_parse(argc, argv, params)) {
-        print_usage(argc, argv, params);
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) {
         return 1;
     }

+    gpt_init();
+
     int is_pp_shared = params.is_pp_shared;

     std::vector<int> n_pp = params.n_pp;
@@ -69,7 +48,7 @@ int main(int argc, char ** argv) {
     llama_context_params ctx_params = llama_context_params_from_gpt_params(params);

     // ensure enough sequences are available
-    ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end());
+    ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end());

     llama_context * ctx = llama_new_context_with_model(model, ctx_params);

@@ -100,7 +79,7 @@ int main(int argc, char ** argv) {

         const int ret = llama_decode(ctx, batch_view);
         if (ret != 0) {
-
+            LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
             return false;
         }

@@ -117,17 +96,18 @@ int main(int argc, char ** argv) {
         }

         if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
-
+            LOG_ERR("%s: llama_decode() failed\n", __func__);
             return 1;
         }
     }

-
-
-
-
-
-
+    if (!params.batched_bench_output_jsonl) {
+        LOG("\n");
+        LOG("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
+        LOG("\n");
+        LOG("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
+        LOG("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
+    }

     for ( int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
         for ( int i_tg = 0; i_tg < (int) n_tg.size(); ++i_tg) {
@@ -156,7 +136,7 @@ int main(int argc, char ** argv) {
                 llama_kv_cache_clear(ctx);

                 if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
-
+                    LOG_ERR("%s: llama_decode() failed\n", __func__);
                     return 1;
                 }

@@ -178,7 +158,7 @@ int main(int argc, char ** argv) {
                 }

                 if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
-
+                    LOG_ERR("%s: llama_decode() failed\n", __func__);
                     return 1;
                 }
             }
@@ -195,12 +175,22 @@ int main(int argc, char ** argv) {
             const float speed_tg = pl*tg / t_tg;
             const float speed = n_kv / t;

-
+            if(params.batched_bench_output_jsonl) {
+                LOG(
+                    "{\"n_kv_max\": %d, \"n_batch\": %d, \"n_ubatch\": %d, \"flash_attn\": %d, \"is_pp_shared\": %d, \"n_gpu_layers\": %d, \"n_threads\": %u, \"n_threads_batch\": %u, "
+                    "\"pp\": %d, \"tg\": %d, \"pl\": %d, \"n_kv\": %d, \"t_pp\": %f, \"speed_pp\": %f, \"t_tg\": %f, \"speed_tg\": %f, \"t\": %f, \"speed\": %f}\n",
+                    n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch,
+                    pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed
+                );
+            } else {
+                LOG("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
+            }
         }
     }

-
+    LOG("\n");
+    llama_perf_context_print(ctx);

     llama_batch_free(batch);

@@ -209,7 +199,7 @@ int main(int argc, char ** argv) {

     llama_backend_free();

-
+    LOG("\n\n");

     return 0;
 }