@fugood/llama.node 0.3.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (187)
  1. package/CMakeLists.txt +0 -9
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +4 -2
  17. package/src/LlamaCompletionWorker.cpp +6 -6
  18. package/src/LlamaContext.cpp +7 -9
  19. package/src/common.hpp +2 -1
  20. package/src/llama.cpp/.github/workflows/build.yml +98 -24
  21. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  22. package/src/llama.cpp/.github/workflows/docker.yml +43 -34
  23. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  24. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  25. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  26. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  27. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  28. package/src/llama.cpp/CMakeLists.txt +20 -8
  29. package/src/llama.cpp/common/CMakeLists.txt +12 -10
  30. package/src/llama.cpp/common/arg.cpp +2006 -0
  31. package/src/llama.cpp/common/arg.h +77 -0
  32. package/src/llama.cpp/common/common.cpp +496 -1632
  33. package/src/llama.cpp/common/common.h +161 -63
  34. package/src/llama.cpp/common/console.cpp +3 -0
  35. package/src/llama.cpp/common/log.cpp +401 -0
  36. package/src/llama.cpp/common/log.h +66 -698
  37. package/src/llama.cpp/common/ngram-cache.cpp +3 -0
  38. package/src/llama.cpp/common/sampling.cpp +348 -350
  39. package/src/llama.cpp/common/sampling.h +62 -139
  40. package/src/llama.cpp/common/stb_image.h +5990 -6398
  41. package/src/llama.cpp/common/train.cpp +2 -0
  42. package/src/llama.cpp/docs/build.md +36 -1
  43. package/src/llama.cpp/examples/CMakeLists.txt +0 -1
  44. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1 -2
  45. package/src/llama.cpp/examples/batched/batched.cpp +39 -55
  46. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +34 -44
  47. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  48. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +15 -15
  49. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  50. package/src/llama.cpp/examples/embedding/embedding.cpp +143 -87
  51. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +33 -33
  52. package/src/llama.cpp/examples/export-lora/export-lora.cpp +36 -35
  53. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  54. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +5 -0
  55. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  56. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  57. package/src/llama.cpp/examples/gritlm/gritlm.cpp +34 -27
  58. package/src/llama.cpp/examples/imatrix/imatrix.cpp +59 -62
  59. package/src/llama.cpp/examples/infill/infill.cpp +117 -132
  60. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +265 -58
  61. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +29 -22
  62. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  63. package/src/llama.cpp/examples/llava/clip.cpp +685 -150
  64. package/src/llama.cpp/examples/llava/clip.h +11 -2
  65. package/src/llama.cpp/examples/llava/llava-cli.cpp +47 -58
  66. package/src/llama.cpp/examples/llava/llava.cpp +110 -24
  67. package/src/llama.cpp/examples/llava/llava.h +2 -3
  68. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  69. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  70. package/src/llama.cpp/examples/lookahead/lookahead.cpp +42 -43
  71. package/src/llama.cpp/examples/lookup/lookup-create.cpp +10 -8
  72. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +23 -22
  73. package/src/llama.cpp/examples/lookup/lookup.cpp +40 -43
  74. package/src/llama.cpp/examples/main/main.cpp +210 -262
  75. package/src/llama.cpp/examples/parallel/parallel.cpp +49 -49
  76. package/src/llama.cpp/examples/passkey/passkey.cpp +42 -50
  77. package/src/llama.cpp/examples/perplexity/perplexity.cpp +187 -200
  78. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  80. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -3
  81. package/src/llama.cpp/examples/retrieval/retrieval.cpp +49 -44
  82. package/src/llama.cpp/examples/rpc/rpc-server.cpp +24 -1
  83. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +32 -35
  84. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -5
  85. package/src/llama.cpp/examples/server/server.cpp +1027 -1073
  86. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  87. package/src/llama.cpp/examples/server/utils.hpp +107 -105
  88. package/src/llama.cpp/examples/simple/simple.cpp +35 -41
  89. package/src/llama.cpp/examples/speculative/speculative.cpp +129 -103
  90. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  91. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  92. package/src/llama.cpp/examples/tokenize/tokenize.cpp +25 -27
  93. package/src/llama.cpp/ggml/CMakeLists.txt +14 -3
  94. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  95. package/src/llama.cpp/ggml/include/ggml-backend.h +145 -60
  96. package/src/llama.cpp/ggml/include/ggml-blas.h +3 -3
  97. package/src/llama.cpp/ggml/include/ggml-cann.h +15 -19
  98. package/src/llama.cpp/ggml/include/ggml-cuda.h +16 -16
  99. package/src/llama.cpp/ggml/include/ggml-metal.h +5 -8
  100. package/src/llama.cpp/ggml/include/ggml-rpc.h +5 -5
  101. package/src/llama.cpp/ggml/include/ggml-sycl.h +8 -8
  102. package/src/llama.cpp/ggml/include/ggml-vulkan.h +7 -7
  103. package/src/llama.cpp/ggml/include/ggml.h +293 -186
  104. package/src/llama.cpp/ggml/src/CMakeLists.txt +86 -44
  105. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2135 -1119
  106. package/src/llama.cpp/ggml/src/ggml-alloc.c +6 -0
  107. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +152 -70
  108. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +606 -286
  109. package/src/llama.cpp/ggml/src/ggml-blas.cpp +9 -10
  110. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  111. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  112. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  113. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  114. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  115. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  116. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  117. package/src/llama.cpp/ggml/src/ggml-cann.cpp +215 -216
  118. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  119. package/src/llama.cpp/ggml/src/ggml-cpu-impl.h +614 -0
  120. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  121. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  122. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  123. package/src/llama.cpp/ggml/src/ggml-impl.h +49 -603
  124. package/src/llama.cpp/ggml/src/ggml-kompute.cpp +4 -24
  125. package/src/llama.cpp/ggml/src/ggml-quants.c +972 -92
  126. package/src/llama.cpp/ggml/src/ggml-quants.h +15 -0
  127. package/src/llama.cpp/ggml/src/ggml-rpc.cpp +116 -66
  128. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  129. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +11 -0
  130. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +52 -0
  131. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  132. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  133. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  134. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  135. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  136. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  137. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +16 -3
  138. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  140. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  141. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1 -1
  142. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +6 -3
  143. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
  144. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  145. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  146. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  147. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +97 -169
  148. package/src/llama.cpp/ggml/src/ggml-vulkan.cpp +1508 -1124
  149. package/src/llama.cpp/ggml/src/ggml.c +3001 -1647
  150. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +192 -0
  151. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +2 -0
  152. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +88 -40
  153. package/src/llama.cpp/include/llama.h +241 -264
  154. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  155. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  156. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  157. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  158. package/src/llama.cpp/src/llama-grammar.h +120 -15
  159. package/src/llama.cpp/src/llama-impl.h +156 -1
  160. package/src/llama.cpp/src/llama-sampling.cpp +1375 -303
  161. package/src/llama.cpp/src/llama-sampling.h +20 -47
  162. package/src/llama.cpp/src/llama-vocab.cpp +343 -120
  163. package/src/llama.cpp/src/llama-vocab.h +33 -17
  164. package/src/llama.cpp/src/llama.cpp +4247 -1525
  165. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  166. package/src/llama.cpp/src/unicode-data.h +4 -4
  167. package/src/llama.cpp/src/unicode.cpp +15 -7
  168. package/src/llama.cpp/tests/CMakeLists.txt +3 -0
  169. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  170. package/src/llama.cpp/tests/test-backend-ops.cpp +1592 -289
  171. package/src/llama.cpp/tests/test-barrier.cpp +93 -0
  172. package/src/llama.cpp/tests/test-grad0.cpp +187 -70
  173. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  174. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  175. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +6 -4
  176. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  177. package/src/llama.cpp/tests/test-log.cpp +39 -0
  178. package/src/llama.cpp/tests/test-quantize-fns.cpp +6 -0
  179. package/src/llama.cpp/tests/test-rope.cpp +1 -1
  180. package/src/llama.cpp/tests/test-sampling.cpp +157 -98
  181. package/src/llama.cpp/tests/test-tokenizer-0.cpp +55 -35
  182. package/patches/llama.patch +0 -22
  183. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  184. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  185. package/src/llama.cpp/common/grammar-parser.h +0 -29
  186. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  187. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
package/src/llama.cpp/common/train.cpp
@@ -1,9 +1,11 @@
  #include "train.h"
  #include "common.h"

+ #include <algorithm>
  #include <random>
  #include <sstream>
  #include <functional>
+ #include <cstring>

  struct random_normal_distribution {
  std::mt19937 gen;
package/src/llama.cpp/docs/build.md
@@ -178,7 +178,11 @@ For Jetson user, if you have Jetson Orin, you can try this: [Offical Support](ht
  cmake --build build --config Release
  ```

- The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
+ The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used.
+
+ The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted. In Windows this setting is available in the NVIDIA control panel as `System Memory Fallback`.
+
+ The following compilation options are also available to tweak performance:

  | Option | Legal values | Default | Description |
  |-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
@@ -348,6 +352,37 @@ cmake --build build --config Release
  # ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
  ```

+ ### CANN
+ This provides NPU acceleration using the AI cores of your Ascend NPU. And [CANN](https://www.hiascend.com/en/software/cann) is a hierarchical APIs to help you to quickly build AI applications and service based on Ascend NPU.
+
+ For more information about Ascend NPU in [Ascend Community](https://www.hiascend.com/en/).
+
+ Make sure to have the CANN toolkit installed. You can download it from here: [CANN Toolkit](https://www.hiascend.com/developer/download/community/result?module=cann)
+
+ Go to `llama.cpp` directory and build using CMake.
+ ```bash
+ cmake -B build -DGGML_CANN=on -DCMAKE_BUILD_TYPE=release
+ cmake --build build --config release
+ ```
+
+ You can test with:
+
+ `./build/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32`
+
+ If the fllowing info is output on screen, you are using `llama.cpp by CANN backend`:
+ ```bash
+ llm_load_tensors: CANN buffer size = 13313.00 MiB
+ llama_new_context_with_model: CANN compute buffer size = 1260.81 MiB
+ ```
+
+ For detailed info, such as model/device supports, CANN install, please refer to [llama.cpp for CANN](./backend/CANN.md).
+
  ### Android

  To read documentation for how to build on Android, [click here](./android.md)
+
+ ### Arm CPU optimized mulmat kernels
+
+ Llama.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats.
+
+ To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_LLAMAFILE=OFF` (`cmake`).
package/src/llama.cpp/examples/CMakeLists.txt
@@ -16,7 +16,6 @@ else()
  add_subdirectory(baby-llama)
  add_subdirectory(batched-bench)
  add_subdirectory(batched)
- add_subdirectory(benchmark)
  add_subdirectory(convert-llama2c-to-ggml)
  add_subdirectory(embedding)
  add_subdirectory(eval-callback)
package/src/llama.cpp/examples/baby-llama/baby-llama.cpp
@@ -1,7 +1,6 @@
  #include "ggml.h"
  #include "train.h"

- #include <vector>
  #include <cassert>
  #include <cstdlib>
  #include <cstring>
@@ -19,7 +18,7 @@ constexpr float rms_norm_eps = 5e-6f;
  #endif

  static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
- struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+ struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);

  if (plan.work_size > 0) {
  buf.resize(plan.work_size);
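
The second hunk tracks an upstream signature change: `ggml_graph_plan()` now takes a third argument (a threadpool handle), and passing `nullptr` preserves the previous behavior. A minimal sketch of the full helper for context; the `work_data` assignment and the `ggml_graph_compute()` call are not part of the hunk and are assumed from the usual ggml pattern:

```cpp
#include "ggml.h"

#include <cstdint>
#include <vector>

// Plan the graph for n_threads with no explicit threadpool (the new third
// argument), grow the scratch buffer to the planned size, then compute.
static void graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, /*threadpool =*/ nullptr);

    if (plan.work_size > 0) {
        buf.resize(plan.work_size);
        plan.work_data = buf.data();
    }

    ggml_graph_compute(graph, &plan);
}
```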
package/src/llama.cpp/examples/batched/batched.cpp
@@ -1,18 +1,17 @@
+ #include "arg.h"
  #include "common.h"
+ #include "log.h"
  #include "llama.h"

  #include <algorithm>
- #include <cmath>
  #include <cstdio>
  #include <string>
  #include <vector>

- static void print_usage(int argc, char ** argv, const gpt_params & params) {
- gpt_params_print_usage(argc, argv, params);
-
- LOG_TEE("\nexample usage:\n");
- LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
- LOG_TEE("\n");
+ static void print_usage(int, char ** argv) {
+ LOG("\nexample usage:\n");
+ LOG("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
+ LOG("\n");
  }

  int main(int argc, char ** argv) {
@@ -21,11 +20,11 @@ int main(int argc, char ** argv) {
  params.prompt = "Hello my name is";
  params.n_predict = 32;

- if (!gpt_params_parse(argc, argv, params)) {
- print_usage(argc, argv, params);
+ if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
  return 1;
  }

+ gpt_init();

  // number of parallel batches
  int n_parallel = params.n_parallel;
@@ -45,7 +44,7 @@ int main(int argc, char ** argv) {
  llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);

  if (model == NULL) {
- fprintf(stderr , "%s: error: unable to load model\n" , __func__);
+ LOG_ERR("%s: error: unable to load model\n" , __func__);
  return 1;
  }

@@ -65,32 +64,39 @@ int main(int argc, char ** argv) {

  llama_context * ctx = llama_new_context_with_model(model, ctx_params);

+ auto sparams = llama_sampler_chain_default_params();
+
+ llama_sampler * smpl = llama_sampler_chain_init(sparams);
+
+ llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sparams.top_k));
+ llama_sampler_chain_add(smpl, llama_sampler_init_top_p(params.sparams.top_p, params.sparams.min_keep));
+ llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sparams.temp));
+ llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sparams.seed));
+
  if (ctx == NULL) {
- fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+ LOG_ERR("%s: error: failed to create the llama_context\n" , __func__);
  return 1;
  }

  const int n_ctx = llama_n_ctx(ctx);

- LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
+ LOG_INF("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);

  // make sure the KV cache is big enough to hold all the prompt and generated tokens
  if (n_kv_req > n_ctx) {
- LOG_TEE("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req);
- LOG_TEE("%s: either reduce n_parallel or increase n_ctx\n", __func__);
+ LOG_ERR("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req);
+ LOG_ERR("%s: either reduce n_parallel or increase n_ctx\n", __func__);
  return 1;
  }

  // print the prompt token-by-token

- fprintf(stderr, "\n");
+ LOG("\n");

  for (auto id : tokens_list) {
- fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
+ LOG("%s", llama_token_to_piece(ctx, id).c_str());
  }

- fflush(stderr);
-
  // create a llama_batch
  // we use this object to submit token data for decoding
  llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t) n_parallel), 0, n_parallel);
@@ -108,7 +114,7 @@ int main(int argc, char ** argv) {

  if (llama_model_has_encoder(model)) {
  if (llama_encode(ctx, batch)) {
- LOG_TEE("%s : failed to eval\n", __func__);
+ LOG_ERR("%s : failed to eval\n", __func__);
  return 1;
  }

@@ -125,7 +131,7 @@ int main(int argc, char ** argv) {
  batch.logits[batch.n_tokens - 1] = true;

  if (llama_decode(ctx, batch) != 0) {
- LOG_TEE("%s: llama_decode() failed\n", __func__);
+ LOG_ERR("%s: llama_decode() failed\n", __func__);
  return 1;
  }

@@ -136,7 +142,7 @@ int main(int argc, char ** argv) {
  //}

  if (n_parallel > 1) {
- LOG_TEE("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
+ LOG("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
  }

  // main loop
@@ -164,36 +170,14 @@ int main(int argc, char ** argv) {
  continue;
  }

- auto n_vocab = llama_n_vocab(model);
- auto * logits = llama_get_logits_ith(ctx, i_batch[i]);
-
- std::vector<llama_token_data> candidates;
- candidates.reserve(n_vocab);
-
- for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
- candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
- }
-
- llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-
- const int top_k = 40;
- const float top_p = 0.9f;
- const float temp = 0.4f;
-
- llama_sample_top_k(ctx, &candidates_p, top_k, 1);
- llama_sample_top_p(ctx, &candidates_p, top_p, 1);
- llama_sample_temp (ctx, &candidates_p, temp);
-
- const llama_token new_token_id = llama_sample_token(ctx, &candidates_p);
-
- //const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
+ const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]);

  // is it an end of generation? -> mark the stream as finished
  if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
  i_batch[i] = -1;
- LOG_TEE("\n");
+ LOG("\n");
  if (n_parallel > 1) {
- LOG_TEE("%s: stream %d finished at n_cur = %d", __func__, i, n_cur);
+ LOG_INF("%s: stream %d finished at n_cur = %d", __func__, i, n_cur);
  }

  continue;
@@ -201,8 +185,7 @@ int main(int argc, char ** argv) {

  // if there is only one stream, we print immediately to stdout
  if (n_parallel == 1) {
- LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
- fflush(stdout);
+ LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str());
  }

  streams[i] += llama_token_to_piece(ctx, new_token_id);
@@ -224,32 +207,33 @@ int main(int argc, char ** argv) {

  // evaluate the current batch with the transformer model
  if (llama_decode(ctx, batch)) {
- fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
+ LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
  return 1;
  }
  }

- LOG_TEE("\n");
-
  if (n_parallel > 1) {
- LOG_TEE("\n");
+ LOG("\n");

  for (int32_t i = 0; i < n_parallel; ++i) {
- LOG_TEE("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str());
+ LOG("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str());
  }
  }

  const auto t_main_end = ggml_time_us();

- LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
+ LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
  __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));

- llama_print_timings(ctx);
+ LOG("\n");
+ llama_perf_sampler_print(smpl);
+ llama_perf_context_print(ctx);

  fprintf(stderr, "\n");

  llama_batch_free(batch);

+ llama_sampler_free(smpl);
  llama_free(ctx);
  llama_free_model(model);

package/src/llama.cpp/examples/batched-bench/batched-bench.cpp
@@ -1,49 +1,28 @@
+ #include "arg.h"
  #include "common.h"
+ #include "log.h"
  #include "llama.h"

  #include <algorithm>
- #include <cmath>
  #include <cstdio>
  #include <string>
  #include <vector>

- // mutates the input string
- static std::vector<int> parse_list(char * p) {
- std::vector<int> ret;
-
- char * q = p;
-
- while (*p) {
- if (*p == ',') {
- *p = '\0';
- ret.push_back(std::atoi(q));
- q = p + 1;
- }
-
- ++p;
- }
-
- ret.push_back(std::atoi(q));
-
- return ret;
- }
-
- static void print_usage(int argc, char ** argv, const gpt_params & params) {
- gpt_params_print_usage(argc, argv, params);
-
- LOG_TEE("\nexample usage:\n");
- LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
- LOG_TEE("\n");
+ static void print_usage(int, char ** argv) {
+ LOG("\nexample usage:\n");
+ LOG("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
+ LOG("\n");
  }

  int main(int argc, char ** argv) {
  gpt_params params;

- if (!gpt_params_parse(argc, argv, params)) {
- print_usage(argc, argv, params);
+ if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) {
  return 1;
  }

+ gpt_init();
+
  int is_pp_shared = params.is_pp_shared;

  std::vector<int> n_pp = params.n_pp;
@@ -69,7 +48,7 @@ int main(int argc, char ** argv) {
  llama_context_params ctx_params = llama_context_params_from_gpt_params(params);

  // ensure enough sequences are available
- ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end());
+ ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end());

  llama_context * ctx = llama_new_context_with_model(model, ctx_params);

@@ -100,7 +79,7 @@ int main(int argc, char ** argv) {

  const int ret = llama_decode(ctx, batch_view);
  if (ret != 0) {
- LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
+ LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
  return false;
  }

@@ -117,17 +96,18 @@ int main(int argc, char ** argv) {
  }

  if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
- LOG_TEE("%s: llama_decode() failed\n", __func__);
+ LOG_ERR("%s: llama_decode() failed\n", __func__);
  return 1;
  }
  }

- LOG_TEE("\n");
- LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
- LOG_TEE("\n");
-
- LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
- LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
+ if (!params.batched_bench_output_jsonl) {
+ LOG("\n");
+ LOG("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
+ LOG("\n");
+ LOG("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
+ LOG("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
+ }

  for ( int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
  for ( int i_tg = 0; i_tg < (int) n_tg.size(); ++i_tg) {
@@ -156,7 +136,7 @@ int main(int argc, char ** argv) {
  llama_kv_cache_clear(ctx);

  if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
- LOG_TEE("%s: llama_decode() failed\n", __func__);
+ LOG_ERR("%s: llama_decode() failed\n", __func__);
  return 1;
  }

@@ -178,7 +158,7 @@ int main(int argc, char ** argv) {
  }

  if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
- LOG_TEE("%s: llama_decode() failed\n", __func__);
+ LOG_ERR("%s: llama_decode() failed\n", __func__);
  return 1;
  }
  }
@@ -195,12 +175,22 @@ int main(int argc, char ** argv) {
  const float speed_tg = pl*tg / t_tg;
  const float speed = n_kv / t;

- LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
+ if(params.batched_bench_output_jsonl) {
+ LOG(
+ "{\"n_kv_max\": %d, \"n_batch\": %d, \"n_ubatch\": %d, \"flash_attn\": %d, \"is_pp_shared\": %d, \"n_gpu_layers\": %d, \"n_threads\": %u, \"n_threads_batch\": %u, "
+ "\"pp\": %d, \"tg\": %d, \"pl\": %d, \"n_kv\": %d, \"t_pp\": %f, \"speed_pp\": %f, \"t_tg\": %f, \"speed_tg\": %f, \"t\": %f, \"speed\": %f}\n",
+ n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch,
+ pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed
+ );
+ } else {
+ LOG("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
+ }
  }
  }
  }

- llama_print_timings(ctx);
+ LOG("\n");
+ llama_perf_context_print(ctx);

  llama_batch_free(batch);

@@ -209,7 +199,7 @@ int main(int argc, char ** argv) {

  llama_backend_free();

- fprintf(stderr, "\n\n");
+ LOG("\n\n");

  return 0;
  }
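
Both examples converge on the same startup pattern: `gpt_params_parse()` now takes an example id (`LLAMA_EXAMPLE_COMMON`, `LLAMA_EXAMPLE_BENCH`, ...) plus an optional usage callback, `gpt_init()` is called once after parsing, and the `LOG` / `LOG_INF` / `LOG_ERR` macros from the new `common/log.h` replace `LOG_TEE` and raw `fprintf(stderr, ...)`. A condensed sketch of that pattern, assuming the `arg.h`, `common.h`, and `log.h` headers bundled under `package/src/llama.cpp/common` (the usage text and the final log line are illustrative):

```cpp
#include "arg.h"     // gpt_params_parse, LLAMA_EXAMPLE_*
#include "common.h"  // gpt_params, gpt_init
#include "log.h"     // LOG, LOG_INF, LOG_ERR

static void print_usage(int, char ** argv) {
    LOG("\nexample usage:\n");
    LOG("\n    %s -m model.gguf\n", argv[0]);
    LOG("\n");
}

int main(int argc, char ** argv) {
    gpt_params params;

    // the LLAMA_EXAMPLE_* id selects which common options apply to this tool;
    // print_usage supplies the example-specific usage text
    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
        return 1;
    }

    // one-time setup of the common logging facility
    gpt_init();

    LOG_INF("%s: model = %s\n", __func__, params.model.c_str());
    return 0;
}
```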