@fugood/llama.node 0.3.1 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (252)
  1. package/CMakeLists.txt +1 -8
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +4 -2
  17. package/src/DetokenizeWorker.cpp +1 -1
  18. package/src/EmbeddingWorker.cpp +2 -2
  19. package/src/LlamaCompletionWorker.cpp +10 -10
  20. package/src/LlamaCompletionWorker.h +2 -2
  21. package/src/LlamaContext.cpp +14 -17
  22. package/src/TokenizeWorker.cpp +1 -1
  23. package/src/common.hpp +5 -4
  24. package/src/llama.cpp/.github/workflows/build.yml +137 -29
  25. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  26. package/src/llama.cpp/.github/workflows/docker.yml +46 -34
  27. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  28. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  29. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  30. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  31. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  32. package/src/llama.cpp/CMakeLists.txt +26 -11
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/common/CMakeLists.txt +10 -10
  35. package/src/llama.cpp/common/arg.cpp +2041 -0
  36. package/src/llama.cpp/common/arg.h +77 -0
  37. package/src/llama.cpp/common/common.cpp +523 -1861
  38. package/src/llama.cpp/common/common.h +234 -106
  39. package/src/llama.cpp/common/console.cpp +3 -0
  40. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  41. package/src/llama.cpp/common/log.cpp +401 -0
  42. package/src/llama.cpp/common/log.h +66 -698
  43. package/src/llama.cpp/common/ngram-cache.cpp +39 -36
  44. package/src/llama.cpp/common/ngram-cache.h +19 -19
  45. package/src/llama.cpp/common/sampling.cpp +356 -350
  46. package/src/llama.cpp/common/sampling.h +62 -139
  47. package/src/llama.cpp/common/stb_image.h +5990 -6398
  48. package/src/llama.cpp/docs/build.md +72 -17
  49. package/src/llama.cpp/examples/CMakeLists.txt +1 -2
  50. package/src/llama.cpp/examples/batched/batched.cpp +49 -65
  51. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
  52. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  53. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
  54. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  55. package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
  56. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
  57. package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
  58. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  59. package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
  60. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  61. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  62. package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
  63. package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
  64. package/src/llama.cpp/examples/infill/infill.cpp +131 -192
  65. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
  66. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  67. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
  68. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  69. package/src/llama.cpp/examples/llava/clip.cpp +686 -150
  70. package/src/llama.cpp/examples/llava/clip.h +11 -2
  71. package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
  72. package/src/llama.cpp/examples/llava/llava.cpp +146 -26
  73. package/src/llama.cpp/examples/llava/llava.h +2 -3
  74. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  75. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  76. package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
  77. package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
  78. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  79. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
  80. package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
  81. package/src/llama.cpp/examples/main/main.cpp +216 -313
  82. package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
  83. package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
  84. package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
  85. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  86. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  87. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
  88. package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
  89. package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
  90. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
  91. package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
  92. package/src/llama.cpp/examples/server/server.cpp +1347 -1531
  93. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  94. package/src/llama.cpp/examples/server/utils.hpp +396 -107
  95. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  96. package/src/llama.cpp/examples/simple/simple.cpp +132 -106
  97. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  98. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
  99. package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
  100. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  101. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  102. package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
  103. package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
  104. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  105. package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
  106. package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
  107. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  108. package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
  109. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  110. package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
  111. package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
  112. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  113. package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
  114. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  115. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  116. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  117. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  118. package/src/llama.cpp/ggml/include/ggml.h +272 -505
  119. package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
  120. package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
  121. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
  122. package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
  123. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  124. package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
  125. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  126. package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
  127. package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
  128. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
  129. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
  130. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
  131. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
  132. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
  133. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
  134. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  135. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  136. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  137. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  138. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
  139. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  140. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  141. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  142. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  143. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
  144. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
  145. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
  146. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
  147. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
  148. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  149. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
  150. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
  151. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
  152. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
  153. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  154. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  155. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  156. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
  157. package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
  158. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
  159. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
  160. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
  161. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
  162. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
  163. package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
  164. package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
  165. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
  166. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
  167. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
  168. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
  169. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
  170. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
  171. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
  172. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
  173. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  174. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  175. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  176. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  177. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  178. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  179. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
  180. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
  181. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  182. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  183. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
  184. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  185. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
  187. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
  188. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  192. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  195. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  197. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  198. package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
  199. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
  200. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
  201. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
  202. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
  203. package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
  204. package/src/llama.cpp/include/llama.h +296 -285
  205. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  206. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  207. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  208. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  209. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  210. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  211. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  212. package/src/llama.cpp/src/llama-grammar.h +120 -15
  213. package/src/llama.cpp/src/llama-impl.h +156 -1
  214. package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
  215. package/src/llama.cpp/src/llama-sampling.h +39 -47
  216. package/src/llama.cpp/src/llama-vocab.cpp +390 -127
  217. package/src/llama.cpp/src/llama-vocab.h +60 -20
  218. package/src/llama.cpp/src/llama.cpp +6215 -3263
  219. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  220. package/src/llama.cpp/src/unicode-data.h +4 -4
  221. package/src/llama.cpp/src/unicode.cpp +15 -7
  222. package/src/llama.cpp/tests/CMakeLists.txt +4 -2
  223. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  224. package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
  225. package/src/llama.cpp/tests/test-barrier.cpp +94 -0
  226. package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
  227. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  228. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  229. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
  230. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  231. package/src/llama.cpp/tests/test-log.cpp +39 -0
  232. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  233. package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
  234. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  235. package/src/llama.cpp/tests/test-rope.cpp +2 -1
  236. package/src/llama.cpp/tests/test-sampling.cpp +226 -142
  237. package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
  238. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  239. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  240. package/patches/llama.patch +0 -22
  241. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  242. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  243. package/src/llama.cpp/common/grammar-parser.h +0 -29
  244. package/src/llama.cpp/common/train.cpp +0 -1513
  245. package/src/llama.cpp/common/train.h +0 -233
  246. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
  247. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  248. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
  249. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
  250. package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
  251. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  252. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
package/src/llama.cpp/examples/simple/simple.cpp
@@ -1,175 +1,201 @@
-#include "common.h"
 #include "llama.h"
-
-#include <cmath>
 #include <cstdio>
+#include <cstring>
 #include <string>
 #include <vector>

-static void print_usage(int argc, char ** argv, const gpt_params & params) {
-    gpt_params_print_usage(argc, argv, params);
-
-    LOG_TEE("\nexample usage:\n");
-    LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]);
-    LOG_TEE("\n");
+static void print_usage(int, char ** argv) {
+    printf("\nexample usage:\n");
+    printf("\n %s -m model.gguf [-n n_predict] [-ngl n_gpu_layers] [prompt]\n", argv[0]);
+    printf("\n");
 }

 int main(int argc, char ** argv) {
-    gpt_params params;
-
-    params.prompt = "Hello my name is";
-    params.n_predict = 32;
-
-    if (!gpt_params_parse(argc, argv, params)) {
-        print_usage(argc, argv, params);
-        return 1;
+    // path to the model gguf file
+    std::string model_path;
+    // prompt to generate text from
+    std::string prompt = "Hello my name is";
+    // number of layers to offload to the GPU
+    int ngl = 99;
+    // number of tokens to predict
+    int n_predict = 32;
+
+    // parse command line arguments
+
+    {
+        int i = 1;
+        for (; i < argc; i++) {
+            if (strcmp(argv[i], "-m") == 0) {
+                if (i + 1 < argc) {
+                    model_path = argv[++i];
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else if (strcmp(argv[i], "-n") == 0) {
+                if (i + 1 < argc) {
+                    try {
+                        n_predict = std::stoi(argv[++i]);
+                    } catch (...) {
+                        print_usage(argc, argv);
+                        return 1;
+                    }
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else if (strcmp(argv[i], "-ngl") == 0) {
+                if (i + 1 < argc) {
+                    try {
+                        ngl = std::stoi(argv[++i]);
+                    } catch (...) {
+                        print_usage(argc, argv);
+                        return 1;
+                    }
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else {
+                // prompt starts here
+                break;
+            }
+        }
+        if (model_path.empty()) {
+            print_usage(argc, argv);
+            return 1;
+        }
+        if (i < argc) {
+            prompt = argv[i++];
+            for (; i < argc; i++) {
+                prompt += " ";
+                prompt += argv[i];
+            }
+        }
     }

-    // total length of the sequence including the prompt
-    const int n_predict = params.n_predict;
-
-    // init LLM
-
-    llama_backend_init();
-    llama_numa_init(params.numa);
-
     // initialize the model

-    llama_model_params model_params = llama_model_params_from_gpt_params(params);
+    llama_model_params model_params = llama_model_default_params();
+    model_params.n_gpu_layers = ngl;

-    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
+    llama_model * model = llama_load_model_from_file(model_path.c_str(), model_params);

     if (model == NULL) {
         fprintf(stderr , "%s: error: unable to load model\n" , __func__);
         return 1;
     }

-    // initialize the context
-
-    llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
+    // tokenize the prompt

-    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+    // find the number of tokens in the prompt
+    const int n_prompt = -llama_tokenize(model, prompt.c_str(), prompt.size(), NULL, 0, true, true);

-    if (ctx == NULL) {
-        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+    // allocate space for the tokens and tokenize the prompt
+    std::vector<llama_token> prompt_tokens(n_prompt);
+    if (llama_tokenize(model, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) {
+        fprintf(stderr, "%s: error: failed to tokenize the prompt\n", __func__);
         return 1;
     }

-    // tokenize the prompt
-
-    std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize(ctx, params.prompt, true);
+    // initialize the context

-    const int n_ctx = llama_n_ctx(ctx);
-    const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size());
+    llama_context_params ctx_params = llama_context_default_params();
+    // n_ctx is the context size
+    ctx_params.n_ctx = n_prompt + n_predict - 1;
+    // n_batch is the maximum number of tokens that can be processed in a single call to llama_decode
+    ctx_params.n_batch = n_prompt;
+    // enable performance counters
+    ctx_params.no_perf = false;

-    LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req);
+    llama_context * ctx = llama_new_context_with_model(model, ctx_params);

-    // make sure the KV cache is big enough to hold all the prompt and generated tokens
-    if (n_kv_req > n_ctx) {
-        LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
-        LOG_TEE("%s: either reduce n_predict or increase n_ctx\n", __func__);
+    if (ctx == NULL) {
+        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
         return 1;
     }

-    // print the prompt token-by-token
-
-    fprintf(stderr, "\n");
-
-    for (auto id : tokens_list) {
-        fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
-    }
+    // initialize the sampler

-    fflush(stderr);
+    auto sparams = llama_sampler_chain_default_params();
+    sparams.no_perf = false;
+    llama_sampler * smpl = llama_sampler_chain_init(sparams);

-    // create a llama_batch with size 512
-    // we use this object to submit token data for decoding
+    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());

-    llama_batch batch = llama_batch_init(512, 0, 1);
+    // print the prompt token-by-token

-    // evaluate the initial prompt
-    for (size_t i = 0; i < tokens_list.size(); i++) {
-        llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
+    for (auto id : prompt_tokens) {
+        char buf[128];
+        int n = llama_token_to_piece(model, id, buf, sizeof(buf), 0, true);
+        if (n < 0) {
+            fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);
+            return 1;
+        }
+        std::string s(buf, n);
+        printf("%s", s.c_str());
     }

-    // llama_decode will output logits only for the last token of the prompt
-    batch.logits[batch.n_tokens - 1] = true;
+    // prepare a batch for the prompt

-    if (llama_decode(ctx, batch) != 0) {
-        LOG_TEE("%s: llama_decode() failed\n", __func__);
-        return 1;
-    }
+    llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());

     // main loop

-    int n_cur = batch.n_tokens;
+    const auto t_main_start = ggml_time_us();
     int n_decode = 0;
+    llama_token new_token_id;

-    const auto t_main_start = ggml_time_us();
+    for (int n_pos = 0; n_pos + batch.n_tokens < n_prompt + n_predict; ) {
+        // evaluate the current batch with the transformer model
+        if (llama_decode(ctx, batch)) {
+            fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
+            return 1;
+        }
+
+        n_pos += batch.n_tokens;

-    while (n_cur <= n_predict) {
         // sample the next token
         {
-            auto n_vocab = llama_n_vocab(model);
-            auto * logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);
-
-            std::vector<llama_token_data> candidates;
-            candidates.reserve(n_vocab);
-
-            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-                candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
-            }
-
-            llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-
-            // sample the most likely token
-            const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
+            new_token_id = llama_sampler_sample(smpl, ctx, -1);

             // is it an end of generation?
-            if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
-                LOG_TEE("\n");
-
+            if (llama_token_is_eog(model, new_token_id)) {
                 break;
             }

-            LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
+            char buf[128];
+            int n = llama_token_to_piece(model, new_token_id, buf, sizeof(buf), 0, true);
+            if (n < 0) {
+                fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);
+                return 1;
+            }
+            std::string s(buf, n);
+            printf("%s", s.c_str());
             fflush(stdout);

-            // prepare the next batch
-            llama_batch_clear(batch);
-
-            // push this new token for next evaluation
-            llama_batch_add(batch, new_token_id, n_cur, { 0 }, true);
+            // prepare the next batch with the sampled token
+            batch = llama_batch_get_one(&new_token_id, 1);

             n_decode += 1;
         }
-
-        n_cur += 1;
-
-        // evaluate the current batch with the transformer model
-        if (llama_decode(ctx, batch)) {
-            fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
-            return 1;
-        }
     }

-    LOG_TEE("\n");
+    printf("\n");

     const auto t_main_end = ggml_time_us();

-    LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
+    fprintf(stderr, "%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
             __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));

-    llama_print_timings(ctx);
-
+    fprintf(stderr, "\n");
+    llama_perf_sampler_print(smpl);
+    llama_perf_context_print(ctx);
     fprintf(stderr, "\n");

-    llama_batch_free(batch);
-
+    llama_sampler_free(smpl);
     llama_free(ctx);
     llama_free_model(model);

-    llama_backend_free();
-
     return 0;
 }
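
The hunk above is the rewrite of the simple example: the gpt_params/common.h plumbing is gone, sampling goes through a llama_sampler chain instead of a hand-built llama_token_data_array with llama_sample_token_greedy, and batches come from llama_batch_get_one rather than llama_batch_init/llama_batch_add. Below is a condensed C++ sketch of the new decode loop using only calls that appear in the diff; the model path and token counts are placeholders and not part of the package.

#include "llama.h"
#include <cstdio>
#include <string>
#include <vector>

int main() {
    const std::string model_path = "model.gguf";        // placeholder path, supply your own
    const std::string prompt     = "Hello my name is";
    const int n_predict = 32;

    llama_model * model = llama_load_model_from_file(model_path.c_str(), llama_model_default_params());
    if (model == NULL) {
        return 1;
    }

    // measure the prompt first (negative return = required token count), then tokenize
    const int n_prompt = -llama_tokenize(model, prompt.c_str(), prompt.size(), NULL, 0, true, true);
    std::vector<llama_token> prompt_tokens(n_prompt);
    llama_tokenize(model, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true);

    llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx   = n_prompt + n_predict;
    cparams.n_batch = n_prompt;
    llama_context * ctx = llama_new_context_with_model(model, cparams);
    if (ctx == NULL) {
        return 1;
    }

    // sampling is a chain object now, replacing the manual logits -> candidates -> greedy pick
    llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());

    llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
    llama_token new_token_id;
    for (int n_pos = 0; n_pos + batch.n_tokens < n_prompt + n_predict; ) {
        if (llama_decode(ctx, batch)) {
            break;
        }
        n_pos += batch.n_tokens;

        new_token_id = llama_sampler_sample(smpl, ctx, -1);
        if (llama_token_is_eog(model, new_token_id)) {
            break;
        }

        char buf[128];
        const int n = llama_token_to_piece(model, new_token_id, buf, sizeof(buf), 0, true);
        if (n >= 0) {
            printf("%.*s", n, buf);
        }

        // feed back only the sampled token; positions are tracked by the context
        batch = llama_batch_get_one(&new_token_id, 1);
    }
    printf("\n");

    llama_sampler_free(smpl);
    llama_free(ctx);
    llama_free_model(model);
    return 0;
}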
package/src/llama.cpp/examples/simple-chat/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET llama-simple-chat)
+add_executable(${TARGET} simple-chat.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
package/src/llama.cpp/examples/simple-chat/simple-chat.cpp
@@ -0,0 +1,197 @@
+#include "llama.h"
+#include <cstdio>
+#include <cstring>
+#include <iostream>
+#include <string>
+#include <vector>
+
+static void print_usage(int, char ** argv) {
+    printf("\nexample usage:\n");
+    printf("\n %s -m model.gguf [-c context_size] [-ngl n_gpu_layers]\n", argv[0]);
+    printf("\n");
+}
+
+int main(int argc, char ** argv) {
+    std::string model_path;
+    int ngl = 99;
+    int n_ctx = 2048;
+
+    // parse command line arguments
+    for (int i = 1; i < argc; i++) {
+        try {
+            if (strcmp(argv[i], "-m") == 0) {
+                if (i + 1 < argc) {
+                    model_path = argv[++i];
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else if (strcmp(argv[i], "-c") == 0) {
+                if (i + 1 < argc) {
+                    n_ctx = std::stoi(argv[++i]);
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else if (strcmp(argv[i], "-ngl") == 0) {
+                if (i + 1 < argc) {
+                    ngl = std::stoi(argv[++i]);
+                } else {
+                    print_usage(argc, argv);
+                    return 1;
+                }
+            } else {
+                print_usage(argc, argv);
+                return 1;
+            }
+        } catch (std::exception & e) {
+            fprintf(stderr, "error: %s\n", e.what());
+            print_usage(argc, argv);
+            return 1;
+        }
+    }
+    if (model_path.empty()) {
+        print_usage(argc, argv);
+        return 1;
+    }
+
+    // only print errors
+    llama_log_set([](enum ggml_log_level level, const char * text, void * /* user_data */) {
+        if (level >= GGML_LOG_LEVEL_ERROR) {
+            fprintf(stderr, "%s", text);
+        }
+    }, nullptr);
+
+    // initialize the model
+    llama_model_params model_params = llama_model_default_params();
+    model_params.n_gpu_layers = ngl;
+
+    llama_model * model = llama_load_model_from_file(model_path.c_str(), model_params);
+    if (!model) {
+        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
+        return 1;
+    }
+
+    // initialize the context
+    llama_context_params ctx_params = llama_context_default_params();
+    ctx_params.n_ctx = n_ctx;
+    ctx_params.n_batch = n_ctx;
+
+    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+    if (!ctx) {
+        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+        return 1;
+    }
+
+    // initialize the sampler
+    llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params());
+    llama_sampler_chain_add(smpl, llama_sampler_init_min_p(0.05f, 1));
+    llama_sampler_chain_add(smpl, llama_sampler_init_temp(0.8f));
+    llama_sampler_chain_add(smpl, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
+
+    // helper function to evaluate a prompt and generate a response
+    auto generate = [&](const std::string & prompt) {
+        std::string response;
+
+        // tokenize the prompt
+        const int n_prompt_tokens = -llama_tokenize(model, prompt.c_str(), prompt.size(), NULL, 0, true, true);
+        std::vector<llama_token> prompt_tokens(n_prompt_tokens);
+        if (llama_tokenize(model, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), llama_get_kv_cache_used_cells(ctx) == 0, true) < 0) {
+            GGML_ABORT("failed to tokenize the prompt\n");
+        }
+
+        // prepare a batch for the prompt
+        llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
+        llama_token new_token_id;
+        while (true) {
+            // check if we have enough space in the context to evaluate this batch
+            int n_ctx = llama_n_ctx(ctx);
+            int n_ctx_used = llama_get_kv_cache_used_cells(ctx);
+            if (n_ctx_used + batch.n_tokens > n_ctx) {
+                printf("\033[0m\n");
+                fprintf(stderr, "context size exceeded\n");
+                exit(0);
+            }
+
+            if (llama_decode(ctx, batch)) {
+                GGML_ABORT("failed to decode\n");
+            }
+
+            // sample the next token
+            new_token_id = llama_sampler_sample(smpl, ctx, -1);
+
+            // is it an end of generation?
+            if (llama_token_is_eog(model, new_token_id)) {
+                break;
+            }
+
+            // convert the token to a string, print it and add it to the response
+            char buf[256];
+            int n = llama_token_to_piece(model, new_token_id, buf, sizeof(buf), 0, true);
+            if (n < 0) {
+                GGML_ABORT("failed to convert token to piece\n");
+            }
+            std::string piece(buf, n);
+            printf("%s", piece.c_str());
+            fflush(stdout);
+            response += piece;
+
+            // prepare the next batch with the sampled token
+            batch = llama_batch_get_one(&new_token_id, 1);
+        }
+
+        return response;
+    };
+
+    std::vector<llama_chat_message> messages;
+    std::vector<char> formatted(llama_n_ctx(ctx));
+    int prev_len = 0;
+    while (true) {
+        // get user input
+        printf("\033[32m> \033[0m");
+        std::string user;
+        std::getline(std::cin, user);
+
+        if (user.empty()) {
+            break;
+        }
+
+        // add the user input to the message list and format it
+        messages.push_back({"user", strdup(user.c_str())});
+        int new_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), true, formatted.data(), formatted.size());
+        if (new_len > (int)formatted.size()) {
+            formatted.resize(new_len);
+            new_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), true, formatted.data(), formatted.size());
+        }
+        if (new_len < 0) {
+            fprintf(stderr, "failed to apply the chat template\n");
+            return 1;
+        }
+
+        // remove previous messages to obtain the prompt to generate the response
+        std::string prompt(formatted.begin() + prev_len, formatted.begin() + new_len);
+
+        // generate a response
+        printf("\033[33m");
+        std::string response = generate(prompt);
+        printf("\n\033[0m");
+
+        // add the response to the messages
+        messages.push_back({"assistant", strdup(response.c_str())});
+        prev_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), false, nullptr, 0);
+        if (prev_len < 0) {
+            fprintf(stderr, "failed to apply the chat template\n");
+            return 1;
+        }
+    }
+
+    // free resources
+    for (auto & msg : messages) {
+        free(const_cast<char *>(msg.content));
+    }
+    llama_sampler_free(smpl);
+    llama_free(ctx);
+    llama_free_model(model);
+
+    return 0;
+}
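
The subtle part of the new chat example is how it avoids re-evaluating the whole conversation each turn: llama_chat_apply_template is called with add_assistant = true to format the full transcript, only the slice between prev_len and new_len is decoded, and after the reply it is called again with add_assistant = false to update prev_len. Below is a minimal sketch of that bookkeeping, assuming a loaded llama_model *; the helper names next_prompt_fragment and commit_turn are hypothetical and not part of the diff.

#include "llama.h"
#include <cstdio>
#include <string>
#include <vector>

// Hypothetical helper: format the whole conversation with the model's built-in
// chat template (add_assistant = true) and return only the tail that has not been
// evaluated yet. prev_len is the formatted length already fed to llama_decode.
static std::string next_prompt_fragment(const llama_model * model,
                                        std::vector<llama_chat_message> & messages,
                                        std::vector<char> & formatted,
                                        int prev_len) {
    int new_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(),
                                            true, formatted.data(), formatted.size());
    if (new_len > (int) formatted.size()) {
        // the buffer was too small, grow it and format again
        formatted.resize(new_len);
        new_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(),
                                            true, formatted.data(), formatted.size());
    }
    if (new_len < 0) {
        fprintf(stderr, "failed to apply the chat template\n");
        return "";
    }
    // only the part of the transcript that is new since the last turn
    return std::string(formatted.begin() + prev_len, formatted.begin() + new_len);
}

// Hypothetical helper: after the assistant reply has been appended to `messages`,
// record how much of the transcript now counts as already evaluated
// (add_assistant = false, no output buffer, so only the length is returned).
static int commit_turn(const llama_model * model,
                       const std::vector<llama_chat_message> & messages) {
    return llama_chat_apply_template(model, nullptr, messages.data(), messages.size(),
                                     false, nullptr, 0);
}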