@fugood/llama.node 0.3.1 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (252)
  1. package/CMakeLists.txt +1 -8
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +4 -2
  17. package/src/DetokenizeWorker.cpp +1 -1
  18. package/src/EmbeddingWorker.cpp +2 -2
  19. package/src/LlamaCompletionWorker.cpp +10 -10
  20. package/src/LlamaCompletionWorker.h +2 -2
  21. package/src/LlamaContext.cpp +14 -17
  22. package/src/TokenizeWorker.cpp +1 -1
  23. package/src/common.hpp +5 -4
  24. package/src/llama.cpp/.github/workflows/build.yml +137 -29
  25. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  26. package/src/llama.cpp/.github/workflows/docker.yml +46 -34
  27. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  28. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  29. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  30. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  31. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  32. package/src/llama.cpp/CMakeLists.txt +26 -11
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/common/CMakeLists.txt +10 -10
  35. package/src/llama.cpp/common/arg.cpp +2041 -0
  36. package/src/llama.cpp/common/arg.h +77 -0
  37. package/src/llama.cpp/common/common.cpp +523 -1861
  38. package/src/llama.cpp/common/common.h +234 -106
  39. package/src/llama.cpp/common/console.cpp +3 -0
  40. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  41. package/src/llama.cpp/common/log.cpp +401 -0
  42. package/src/llama.cpp/common/log.h +66 -698
  43. package/src/llama.cpp/common/ngram-cache.cpp +39 -36
  44. package/src/llama.cpp/common/ngram-cache.h +19 -19
  45. package/src/llama.cpp/common/sampling.cpp +356 -350
  46. package/src/llama.cpp/common/sampling.h +62 -139
  47. package/src/llama.cpp/common/stb_image.h +5990 -6398
  48. package/src/llama.cpp/docs/build.md +72 -17
  49. package/src/llama.cpp/examples/CMakeLists.txt +1 -2
  50. package/src/llama.cpp/examples/batched/batched.cpp +49 -65
  51. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
  52. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  53. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
  54. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  55. package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
  56. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
  57. package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
  58. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  59. package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
  60. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  61. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  62. package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
  63. package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
  64. package/src/llama.cpp/examples/infill/infill.cpp +131 -192
  65. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
  66. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  67. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
  68. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  69. package/src/llama.cpp/examples/llava/clip.cpp +686 -150
  70. package/src/llama.cpp/examples/llava/clip.h +11 -2
  71. package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
  72. package/src/llama.cpp/examples/llava/llava.cpp +146 -26
  73. package/src/llama.cpp/examples/llava/llava.h +2 -3
  74. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  75. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  76. package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
  77. package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
  78. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  79. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
  80. package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
  81. package/src/llama.cpp/examples/main/main.cpp +216 -313
  82. package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
  83. package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
  84. package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
  85. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  86. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  87. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
  88. package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
  89. package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
  90. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
  91. package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
  92. package/src/llama.cpp/examples/server/server.cpp +1347 -1531
  93. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  94. package/src/llama.cpp/examples/server/utils.hpp +396 -107
  95. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  96. package/src/llama.cpp/examples/simple/simple.cpp +132 -106
  97. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  98. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
  99. package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
  100. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  101. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  102. package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
  103. package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
  104. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  105. package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
  106. package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
  107. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  108. package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
  109. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  110. package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
  111. package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
  112. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  113. package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
  114. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  115. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  116. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  117. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  118. package/src/llama.cpp/ggml/include/ggml.h +272 -505
  119. package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
  120. package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
  121. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
  122. package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
  123. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  124. package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
  125. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  126. package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
  127. package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
  128. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
  129. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
  130. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
  131. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
  132. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
  133. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
  134. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  135. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  136. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  137. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  138. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
  139. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  140. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  141. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  142. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  143. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
  144. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
  145. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
  146. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
  147. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
  148. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  149. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
  150. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
  151. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
  152. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
  153. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  154. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  155. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  156. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
  157. package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
  158. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
  159. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
  160. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
  161. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
  162. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
  163. package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
  164. package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
  165. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
  166. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
  167. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
  168. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
  169. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
  170. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
  171. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
  172. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
  173. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  174. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  175. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  176. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  177. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  178. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  179. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
  180. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
  181. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  182. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  183. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
  184. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  185. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
  187. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
  188. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  192. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  195. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  197. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  198. package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
  199. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
  200. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
  201. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
  202. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
  203. package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
  204. package/src/llama.cpp/include/llama.h +296 -285
  205. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  206. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  207. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  208. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  209. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  210. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  211. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  212. package/src/llama.cpp/src/llama-grammar.h +120 -15
  213. package/src/llama.cpp/src/llama-impl.h +156 -1
  214. package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
  215. package/src/llama.cpp/src/llama-sampling.h +39 -47
  216. package/src/llama.cpp/src/llama-vocab.cpp +390 -127
  217. package/src/llama.cpp/src/llama-vocab.h +60 -20
  218. package/src/llama.cpp/src/llama.cpp +6215 -3263
  219. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  220. package/src/llama.cpp/src/unicode-data.h +4 -4
  221. package/src/llama.cpp/src/unicode.cpp +15 -7
  222. package/src/llama.cpp/tests/CMakeLists.txt +4 -2
  223. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  224. package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
  225. package/src/llama.cpp/tests/test-barrier.cpp +94 -0
  226. package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
  227. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  228. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  229. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
  230. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  231. package/src/llama.cpp/tests/test-log.cpp +39 -0
  232. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  233. package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
  234. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  235. package/src/llama.cpp/tests/test-rope.cpp +2 -1
  236. package/src/llama.cpp/tests/test-sampling.cpp +226 -142
  237. package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
  238. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  239. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  240. package/patches/llama.patch +0 -22
  241. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  242. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  243. package/src/llama.cpp/common/grammar-parser.h +0 -29
  244. package/src/llama.cpp/common/train.cpp +0 -1513
  245. package/src/llama.cpp/common/train.h +0 -233
  246. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
  247. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  248. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
  249. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
  250. package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
  251. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  252. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
@@ -1,11 +1,11 @@
+ #include "arg.h"
  #include "common.h"
-
  #include "console.h"
+ #include "log.h"
+ #include "sampling.h"
  #include "llama.h"

  #include <cassert>
- #include <cinttypes>
- #include <cmath>
  #include <cstdio>
  #include <cstring>
  #include <ctime>
@@ -33,13 +33,23 @@

  static llama_context ** g_ctx;
  static llama_model ** g_model;
- static gpt_params * g_params;
+ static common_sampler ** g_smpl;
+ static common_params * g_params;
  static std::vector<llama_token> * g_input_tokens;
  static std::ostringstream * g_output_ss;
  static std::vector<llama_token> * g_output_tokens;
  static bool is_interacting = false;
  static bool need_insert_eot = false;

+ static void print_usage(int argc, char ** argv) {
+ (void) argc;
+
+ LOG("\nexample usage:\n");
+ LOG("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]);
+ LOG("\n chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]);
+ LOG("\n");
+ }
+
  static bool file_exists(const std::string & path) {
  std::ifstream f(path.c_str());
  return f.good();
@@ -52,50 +62,6 @@ static bool file_is_empty(const std::string & path) {
  return f.tellg() == 0;
  }

- static void write_logfile(
- const llama_context * ctx, const gpt_params & params, const llama_model * model,
- const std::vector<llama_token> & input_tokens, const std::string & output,
- const std::vector<llama_token> & output_tokens
- ) {
- if (params.logdir.empty()) {
- return;
- }
-
- const std::string timestamp = string_get_sortable_timestamp();
-
- const bool success = fs_create_directory_with_parents(params.logdir);
- if (!success) {
- fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
- __func__, params.logdir.c_str());
- return;
- }
-
- const std::string logfile_path = params.logdir + timestamp + ".yml";
- FILE * logfile = fopen(logfile_path.c_str(), "w");
-
- if (logfile == NULL) {
- fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
- return;
- }
-
- fprintf(logfile, "binary: main\n");
- char model_desc[128];
- llama_model_desc(model, model_desc, sizeof(model_desc));
- yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);
-
- fprintf(logfile, "\n");
- fprintf(logfile, "######################\n");
- fprintf(logfile, "# Generation Results #\n");
- fprintf(logfile, "######################\n");
- fprintf(logfile, "\n");
-
- yaml_dump_string_multiline(logfile, "output", output.c_str());
- yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
-
- llama_dump_timing_info_yaml(logfile, ctx);
- fclose(logfile);
- }
-
  #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
  static void sigint_handler(int signo) {
  if (signo == SIGINT) {
@@ -104,50 +70,37 @@ static void sigint_handler(int signo) {
  need_insert_eot = true;
  } else {
  console::cleanup();
- printf("\n");
- llama_print_timings(*g_ctx);
- write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
+ LOG("\n");
+ common_perf_print(*g_ctx, *g_smpl);
+
+ // make sure all logs are flushed
+ LOG("Interrupted by user\n");
+ common_log_pause(common_log_main());
+
  _exit(130);
  }
  }
  }
  #endif

- static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
- (void) level;
- (void) user_data;
- LOG_TEE("%s", text);
- }
-
- static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, std::string role, std::string content) {
- llama_chat_msg new_msg{role, content};
- auto formatted = llama_chat_format_single(
- model, g_params->chat_template, chat_msgs, new_msg, role == "user");
+ static std::string chat_add_and_format(struct llama_model * model, std::vector<common_chat_msg> & chat_msgs, const std::string & role, const std::string & content) {
+ common_chat_msg new_msg{role, content};
+ auto formatted = common_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user");
  chat_msgs.push_back({role, content});
- LOG("formatted: %s\n", formatted.c_str());
+ LOG_DBG("formatted: '%s'\n", formatted.c_str());
  return formatted;
  }

  int main(int argc, char ** argv) {
- gpt_params params;
+ common_params params;
  g_params = &params;
-
- if (!gpt_params_parse(argc, argv, params)) {
- gpt_params_print_usage(argc, argv, params);
+ if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
  return 1;
  }

- llama_sampling_params & sparams = params.sparams;
-
- #ifndef LOG_DISABLE_LOGS
- log_set_target(log_filename_generator("main", "log"));
- LOG_TEE("Log start\n");
- log_dump_cmdline(argc, argv);
- llama_log_set(llama_log_callback_logTee, nullptr);
- #endif // LOG_DISABLE_LOGS
+ common_init();

- // TODO: Dump params ?
- //LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity));
+ auto & sparams = params.sparams;

  // save choice to use color for later
  // (note for later: this is a slightly awkward choice)
@@ -155,120 +108,141 @@ int main(int argc, char ** argv) {
  atexit([]() { console::cleanup(); });

  if (params.logits_all) {
- printf("\n************\n");
- printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
- printf("************\n\n");
+ LOG_ERR("************\n");
+ LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
+ LOG_ERR("************\n\n");

  return 0;
  }

  if (params.embedding) {
- printf("\n************\n");
- printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
- printf("************\n\n");
+ LOG_ERR("************\n");
+ LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
+ LOG_ERR("************\n\n");

  return 0;
  }

  if (params.n_ctx != 0 && params.n_ctx < 8) {
- LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
+ LOG_WRN("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
  params.n_ctx = 8;
  }

  if (params.rope_freq_base != 0.0) {
- LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
+ LOG_WRN("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
  }

  if (params.rope_freq_scale != 0.0) {
- LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
- }
-
- LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
- LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
-
- if (params.seed == LLAMA_DEFAULT_SEED) {
- params.seed = time(NULL);
+ LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
  }

- LOG_TEE("%s: seed = %u\n", __func__, params.seed);
-
- std::mt19937 rng(params.seed);
+ LOG_INF("%s: llama backend init\n", __func__);

- LOG("%s: llama backend init\n", __func__);
  llama_backend_init();
  llama_numa_init(params.numa);

- llama_model * model;
- llama_context * ctx;
- llama_context * ctx_guidance = NULL;
- std::vector<llama_chat_msg> chat_msgs;
+ llama_model * model = nullptr;
+ llama_context * ctx = nullptr;
+ common_sampler * smpl = nullptr;
+
+ std::vector<common_chat_msg> chat_msgs;
+
  g_model = &model;
  g_ctx = &ctx;
+ g_smpl = &smpl;

  // load the model and apply lora adapter, if any
- LOG("%s: load the model and apply lora adapter, if any\n", __func__);
- std::tie(model, ctx) = llama_init_from_gpt_params(params);
- if (sparams.cfg_scale > 1.f) {
- struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
- ctx_guidance = llama_new_context_with_model(model, lparams);
- }
+ LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
+ common_init_result llama_init = common_init_from_params(params);
+
+ model = llama_init.model;
+ ctx = llama_init.context;

  if (model == NULL) {
- LOG_TEE("%s: error: unable to load model\n", __func__);
+ LOG_ERR("%s: error: unable to load model\n", __func__);
+ return 1;
+ }
+
+ LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
+
+ struct ggml_threadpool_params tpp_batch =
+ ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
+ struct ggml_threadpool_params tpp =
+ ggml_threadpool_params_from_cpu_params(params.cpuparams);
+
+ set_process_priority(params.cpuparams.priority);
+
+ struct ggml_threadpool * threadpool_batch = NULL;
+ if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
+ threadpool_batch = ggml_threadpool_new(&tpp_batch);
+ if (!threadpool_batch) {
+ LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
+ return 1;
+ }
+
+ // Start the non-batch threadpool in the paused state
+ tpp.paused = true;
+ }
+
+ struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
+ if (!threadpool) {
+ LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
  return 1;
  }

+ llama_attach_threadpool(ctx, threadpool, threadpool_batch);
+
  const int n_ctx_train = llama_n_ctx_train(model);
  const int n_ctx = llama_n_ctx(ctx);
- LOG("n_ctx: %d\n", n_ctx);

  if (n_ctx > n_ctx_train) {
- LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
- __func__, n_ctx_train, n_ctx);
+ LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
  }

  // print chat template example in conversation mode
  if (params.conversation) {
  if (params.enable_chat_template) {
- LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str());
+ LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(model, params.chat_template).c_str());
  } else {
- LOG_TEE("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
+ LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
  }
  }

  // print system information
  {
- LOG_TEE("\n");
- LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
+ LOG_INF("\n");
+ LOG_INF("%s\n", common_params_get_system_info(params).c_str());
+ LOG_INF("\n");
  }

  std::string path_session = params.path_prompt_cache;
  std::vector<llama_token> session_tokens;

  if (!path_session.empty()) {
- LOG_TEE("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
+ LOG_INF("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
  if (!file_exists(path_session)) {
- LOG_TEE("%s: session file does not exist, will create.\n", __func__);
+ LOG_INF("%s: session file does not exist, will create.\n", __func__);
  } else if (file_is_empty(path_session)) {
- LOG_TEE("%s: The session file is empty. A new session will be initialized.\n", __func__);
+ LOG_INF("%s: The session file is empty. A new session will be initialized.\n", __func__);
  } else {
  // The file exists and is not empty
  session_tokens.resize(n_ctx);
  size_t n_token_count_out = 0;
  if (!llama_state_load_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
- LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
+ LOG_ERR("%s: failed to load session file '%s'\n", __func__, path_session.c_str());
  return 1;
  }
  session_tokens.resize(n_token_count_out);
- LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
+ LOG_INF("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
  }
  }

- const bool add_bos = llama_should_add_bos_token(model);
+ const bool add_bos = llama_add_bos_token(model);
  if (!llama_model_has_encoder(model)) {
- GGML_ASSERT(llama_add_eos_token(model) != 1);
+ GGML_ASSERT(!llama_add_eos_token(model));
  }
- LOG("add_bos: %d\n", add_bos);
+
+ LOG_DBG("n_ctx: %d, add_bos: %d\n", n_ctx, add_bos);

  std::vector<llama_token> embd_inp;

@@ -277,49 +251,31 @@ int main(int argc, char ** argv) {
  ? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
  : params.prompt;
  if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
- LOG("tokenize the prompt\n");
- embd_inp = ::llama_tokenize(ctx, prompt, true, true);
+ LOG_DBG("tokenize the prompt\n");
+ embd_inp = common_tokenize(ctx, prompt, true, true);
  } else {
- LOG("use session tokens\n");
+ LOG_DBG("use session tokens\n");
  embd_inp = session_tokens;
  }

- LOG("prompt: \"%s\"\n", log_tostr(prompt));
- LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
+ LOG_DBG("prompt: \"%s\"\n", prompt.c_str());
+ LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());
  }

  // Should not run without any tokens
  if (embd_inp.empty()) {
  if (add_bos) {
  embd_inp.push_back(llama_token_bos(model));
- LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
+ LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
  } else {
- LOG_TEE("error: input is empty\n");
+ LOG_ERR("input is empty\n");
  return -1;
  }
  }

  // Tokenize negative prompt
- std::vector<llama_token> guidance_inp;
- int guidance_offset = 0;
- int original_prompt_len = 0;
- if (ctx_guidance) {
- LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
-
- guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true);
- LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
-
- std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true, true);
- LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
-
- original_prompt_len = original_inp.size();
- guidance_offset = (int)guidance_inp.size() - original_prompt_len;
- LOG("original_prompt_len: %s", log_tostr(original_prompt_len));
- LOG("guidance_offset: %s", log_tostr(guidance_offset));
- }
-
  if ((int) embd_inp.size() > n_ctx - 4) {
- LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
+ LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
  return 1;
  }

@@ -333,29 +289,28 @@ int main(int argc, char ** argv) {
  n_matching_session_tokens++;
  }
  if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) {
- LOG_TEE("%s: using full prompt from session file\n", __func__);
+ LOG_INF("%s: using full prompt from session file\n", __func__);
  } else if (n_matching_session_tokens >= embd_inp.size()) {
- LOG_TEE("%s: session file has exact match for prompt!\n", __func__);
+ LOG_INF("%s: session file has exact match for prompt!\n", __func__);
  } else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
- LOG_TEE("%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
- __func__, n_matching_session_tokens, embd_inp.size());
+ LOG_WRN("%s: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
+ __func__, n_matching_session_tokens, embd_inp.size());
  } else {
- LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n",
- __func__, n_matching_session_tokens, embd_inp.size());
+ LOG_INF("%s: session file matches %zu / %zu tokens of prompt\n",
+ __func__, n_matching_session_tokens, embd_inp.size());
  }

  // remove any "future" tokens that we might have inherited from the previous session
  llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1);
  }

- LOGLN(
- "recalculate the cached logits (check): embd_inp.empty() %s, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu, embd_inp.size() %zu",
- log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size(), embd_inp.size());
+ LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n",
+ embd_inp.size(), n_matching_session_tokens, embd_inp.size(), session_tokens.size());

  // if we will use the cache for the full prompt without reaching the end of the cache, force
  // reevaluation of the last token to recalculate the cached logits
  if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) {
- LOGLN("recalculate the cached logits (do): session_tokens.resize( %zu )", embd_inp.size() - 1);
+ LOG_DBG("recalculate the cached logits (do): session_tokens.resize( %zu )\n", embd_inp.size() - 1);

  session_tokens.resize(embd_inp.size() - 1);
  }
@@ -377,30 +332,20 @@ int main(int argc, char ** argv) {
  }

  if (params.verbose_prompt) {
- LOG_TEE("\n");
- LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
- LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
+ LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
+ LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
  for (int i = 0; i < (int) embd_inp.size(); i++) {
- LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
- }
-
- if (ctx_guidance) {
- LOG_TEE("\n");
- LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
- LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
- for (int i = 0; i < (int) guidance_inp.size(); i++) {
- LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
- }
+ LOG_INF("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str());
  }

  if (params.n_keep > add_bos) {
- LOG_TEE("%s: static prompt based on n_keep: '", __func__);
+ LOG_INF("%s: static prompt based on n_keep: '", __func__);
  for (int i = 0; i < params.n_keep; i++) {
- LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
+ LOG_CNT("%s", common_token_to_piece(ctx, embd_inp[i]).c_str());
  }
- LOG_TEE("'\n");
+ LOG_CNT("'\n");
  }
- LOG_TEE("\n");
+ LOG_INF("\n");
  }

  // ctrl+C handling
@@ -420,47 +365,56 @@ int main(int argc, char ** argv) {
  }

  if (params.interactive) {
- LOG_TEE("%s: interactive mode on.\n", __func__);
+ LOG_INF("%s: interactive mode on.\n", __func__);

  if (!params.antiprompt.empty()) {
  for (const auto & antiprompt : params.antiprompt) {
- LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str());
+ LOG_INF("Reverse prompt: '%s'\n", antiprompt.c_str());
  if (params.verbose_prompt) {
- auto tmp = ::llama_tokenize(ctx, antiprompt, false, true);
+ auto tmp = common_tokenize(ctx, antiprompt, false, true);
  for (int i = 0; i < (int) tmp.size(); i++) {
- LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+ LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str());
  }
  }
  }
  }

  if (params.input_prefix_bos) {
- LOG_TEE("Input prefix with BOS\n");
+ LOG_INF("Input prefix with BOS\n");
  }

  if (!params.input_prefix.empty()) {
- LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
+ LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str());
  if (params.verbose_prompt) {
- auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true);
+ auto tmp = common_tokenize(ctx, params.input_prefix, true, true);
  for (int i = 0; i < (int) tmp.size(); i++) {
- LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+ LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str());
  }
  }
  }

  if (!params.input_suffix.empty()) {
- LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
+ LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
  if (params.verbose_prompt) {
- auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true);
+ auto tmp = common_tokenize(ctx, params.input_suffix, false, true);
  for (int i = 0; i < (int) tmp.size(); i++) {
- LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+ LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str());
  }
  }
  }
  }
- LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
- LOG_TEE("sampling order: \n%s\n", llama_sampling_order_print(sparams).c_str());
- LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
+
+ smpl = common_sampler_init(model, sparams);
+ if (!smpl) {
+ LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
+ return 1;
+ }
+
+ LOG_INF("sampler seed: %u\n", common_sampler_get_seed(smpl));
+ LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
+ LOG_INF("sampler chain: %s\n", common_sampler_print(smpl).c_str());
+
+ LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);

  // group-attention state
  // number of grouped KV tokens so far (used only if params.grp_attn_n > 1)
@@ -474,9 +428,9 @@ int main(int argc, char ** argv) {
  GGML_ASSERT(ga_w % ga_n == 0 && "grp_attn_w must be a multiple of grp_attn_n"); // NOLINT
  //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w"); // NOLINT
  //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT
- LOG_TEE("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w);
+ LOG_INF("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w);
  }
- LOG_TEE("\n\n");
+ LOG_INF("\n");

  if (params.interactive) {
  const char * control_message;
@@ -488,11 +442,11 @@ int main(int argc, char ** argv) {
  " - To return control without starting a new line, end your input with '/'.\n"
  " - If you want to submit another line, end your input with '\\'.\n";
  }
- LOG_TEE("== Running in interactive mode. ==\n");
+ LOG_INF("== Running in interactive mode. ==\n");
  #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
- LOG_TEE( " - Press Ctrl+C to interject at any time.\n");
+ LOG_INF( " - Press Ctrl+C to interject at any time.\n");
  #endif
- LOG_TEE( "%s\n", control_message);
+ LOG_INF( "%s\n", control_message);

  is_interacting = params.interactive_first;
  }
@@ -506,7 +460,6 @@ int main(int argc, char ** argv) {
  int n_remain = params.n_predict;
  int n_consumed = 0;
  int n_session_consumed = 0;
- int n_past_guidance = 0;

  std::vector<int> input_tokens; g_input_tokens = &input_tokens;
  std::vector<int> output_tokens; g_output_tokens = &output_tokens;
@@ -518,28 +471,21 @@ int main(int argc, char ** argv) {
  display = params.display_prompt;

  std::vector<llama_token> embd;
- std::vector<llama_token> embd_guidance;

  // tokenized antiprompts
  std::vector<std::vector<llama_token>> antiprompt_ids;

  antiprompt_ids.reserve(params.antiprompt.size());
  for (const std::string & antiprompt : params.antiprompt) {
- antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true));
- }
-
- struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
- if (!ctx_sampling) {
- fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
- exit(1);
+ antiprompt_ids.emplace_back(::common_tokenize(ctx, antiprompt, false, true));
  }

  if (llama_model_has_encoder(model)) {
  int enc_input_size = embd_inp.size();
  llama_token * enc_input_buf = embd_inp.data();

- if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) {
- LOG_TEE("%s : failed to eval\n", __func__);
+ if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size))) {
+ LOG_ERR("%s : failed to eval\n", __func__);
  return 1;
  }

@@ -565,9 +511,8 @@ int main(int argc, char ** argv) {
  embd.resize(max_embd_size);

  console::set_display(console::error);
- printf("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
+ LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
  console::set_display(console::reset);
- fflush(stdout);
  }

  if (ga_n == 1) {
@@ -575,16 +520,22 @@ int main(int argc, char ** argv) {
  // if we run out of context:
  // - take the n_keep first tokens from the original prompt (via n_past)
  // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
- if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) >= n_ctx) {
+
+ if (n_past + (int) embd.size() >= n_ctx) {
+ if (!params.ctx_shift){
+ LOG_DBG("\n\n%s: context full and context shift is disabled => stopping\n", __func__);
+ break;
+ }
+
  if (params.n_predict == -2) {
- LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
+ LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
  break;
  }

  const int n_left = n_past - params.n_keep;
  const int n_discard = n_left/2;

- LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
+ LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
  n_past, n_left, n_ctx, params.n_keep, n_discard);

  llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
@@ -592,15 +543,11 @@ int main(int argc, char ** argv) {

  n_past -= n_discard;

- if (ctx_guidance) {
- n_past_guidance -= n_discard;
- }
-
- LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
+ LOG_DBG("after swap: n_past = %d\n", n_past);

- LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
+ LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());

- LOG("clear session path\n");
+ LOG_DBG("clear session path\n");
  path_session.clear();
  }
  } else {
@@ -610,10 +557,10 @@ int main(int argc, char ** argv) {
  const int bd = (ga_w/ga_n)*(ga_n - 1);
  const int dd = (ga_w/ga_n) - ib*bd - ga_w;

- LOG("\n");
- LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd);
- LOG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
- LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);
+ LOG_DBG("\n");
+ LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd);
+ LOG_DBG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
+ LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);

  llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib*bd);
  llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
@@ -623,7 +570,7 @@ int main(int argc, char ** argv) {

  ga_i += ga_w/ga_n;

- LOG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i);
+ LOG_DBG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i);
  }
  }

@@ -649,65 +596,25 @@ int main(int argc, char ** argv) {
  }
  }

- // evaluate tokens in batches
- // embd is typically prepared beforehand to fit within a batch, but not always
- if (ctx_guidance) {
- int input_size = 0;
- llama_token * input_buf = NULL;
-
- if (n_past_guidance < (int) guidance_inp.size()) {
- // Guidance context should have the same data with these modifications:
- //
- // * Replace the initial prompt
- // * Shift everything by guidance_offset
- embd_guidance = guidance_inp;
- if (embd.begin() + original_prompt_len < embd.end()) {
- embd_guidance.insert(
- embd_guidance.end(),
- embd.begin() + original_prompt_len,
- embd.end()
- );
- }
-
- input_buf = embd_guidance.data();
- input_size = embd_guidance.size();
-
- LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str());
- } else {
- input_buf = embd.data();
- input_size = embd.size();
- }
-
- for (int i = 0; i < input_size; i += params.n_batch) {
- int n_eval = std::min(input_size - i, params.n_batch);
- if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) {
- LOG_TEE("%s : failed to eval\n", __func__);
- return 1;
- }
-
- n_past_guidance += n_eval;
- }
- }
-
  for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
  int n_eval = (int) embd.size() - i;
  if (n_eval > params.n_batch) {
  n_eval = params.n_batch;
  }

- LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
+ LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());

- if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
- LOG_TEE("%s : failed to eval\n", __func__);
+ if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) {
+ LOG_ERR("%s : failed to eval\n", __func__);
  return 1;
  }

  n_past += n_eval;

- LOG("n_past = %d\n", n_past);
+ LOG_DBG("n_past = %d\n", n_past);
  // Display total tokens alongside total time
  if (params.n_print > 0 && n_past % params.n_print == 0) {
- LOG_TEE("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx);
+ LOG_DBG("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx);
  }
  }

@@ -718,7 +625,6 @@ int main(int argc, char ** argv) {
  }

  embd.clear();
- embd_guidance.clear();

  if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
  // optionally save the session on first sample (for faster prompt loading next time)
@@ -726,14 +632,14 @@ int main(int argc, char ** argv) {
  need_to_save_session = false;
  llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());

- LOG("saved session to %s\n", path_session.c_str());
+ LOG_DBG("saved session to %s\n", path_session.c_str());
  }

- const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
+ const llama_token id = common_sampler_sample(smpl, ctx, -1);

- llama_sampling_accept(ctx_sampling, ctx, id, /* apply_grammar= */ true);
+ common_sampler_accept(smpl, id, /* accept_grammar= */ true);

- LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
+ // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());

  embd.push_back(id);

@@ -743,16 +649,16 @@ int main(int argc, char ** argv) {
  // decrement remaining sampling budget
  --n_remain;

- LOG("n_remain: %d\n", n_remain);
+ LOG_DBG("n_remain: %d\n", n_remain);
  } else {
  // some user input remains from prompt or interaction, forward it to processing
- LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
+ LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
  while ((int) embd_inp.size() > n_consumed) {
  embd.push_back(embd_inp[n_consumed]);

  // push the prompt in the sampling context in order to apply repetition penalties later
  // for the prompt, we don't apply grammar rules
- llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], /* apply_grammar= */ false);
+ common_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false);

  ++n_consumed;
  if ((int) embd.size() >= params.n_batch) {
@@ -764,10 +670,10 @@ int main(int argc, char ** argv) {
  // display text
  if (input_echo && display) {
  for (auto id : embd) {
- const std::string token_str = llama_token_to_piece(ctx, id, params.special);
+ const std::string token_str = common_token_to_piece(ctx, id, params.special);

  // Console/Stream Output
- fprintf(stdout, "%s", token_str.c_str());
+ LOG("%s", token_str.c_str());

  // Record Displayed Tokens To Log
  // Note: Generated tokens are created one by one hence this check
@@ -779,8 +685,6 @@ int main(int argc, char ** argv) {
  output_tokens.push_back(id);
  output_ss << token_str;
  }
-
- fflush(stdout);
  }
  }

@@ -795,7 +699,7 @@ int main(int argc, char ** argv) {
  // check for reverse prompt in the last n_prev tokens
  if (!params.antiprompt.empty()) {
  const int n_prev = 32;
- const std::string last_output = llama_sampling_prev_str(ctx_sampling, ctx, n_prev);
+ const std::string last_output = common_sampler_prev_str(smpl, ctx, n_prev);

  is_antiprompt = false;
  // Check if each of the reverse prompts appears at the end of the output.
@@ -817,7 +721,7 @@ int main(int argc, char ** argv) {
  }

  // check for reverse prompt using special tokens
- llama_token last_token = llama_sampling_last(ctx_sampling);
+ llama_token last_token = common_sampler_last(smpl);
  for (std::vector<llama_token> ids : antiprompt_ids) {
  if (ids.size() == 1 && last_token == ids[0]) {
  if (params.interactive) {
@@ -829,18 +733,18 @@ int main(int argc, char ** argv) {
  }

  if (is_antiprompt) {
- LOG("found antiprompt: %s\n", last_output.c_str());
+ LOG_DBG("found antiprompt: %s\n", last_output.c_str());
  }
  }

  // deal with end of generation tokens in interactive mode
- if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) {
- LOG("found an EOG token\n");
+ if (llama_token_is_eog(model, common_sampler_last(smpl))) {
+ LOG_DBG("found an EOG token\n");

  if (params.interactive) {
  if (!params.antiprompt.empty()) {
  // tokenize and inject first reverse prompt
- const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false, true);
+ const auto first_antiprompt = common_tokenize(ctx, params.antiprompt.front(), false, true);
  embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
  is_antiprompt = true;
  }
@@ -849,32 +753,32 @@ int main(int argc, char ** argv) {
  chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str());
  }
  is_interacting = true;
- printf("\n");
+ LOG("\n");
  }
  }

  // if current token is not EOG, we add it to current assistant message
  if (params.conversation) {
- auto id = llama_sampling_last(ctx_sampling);
- assistant_ss << llama_token_to_piece(ctx, id, false);
+ const auto id = common_sampler_last(smpl);
+ assistant_ss << common_token_to_piece(ctx, id, false);
  }

  if (n_past > 0 && is_interacting) {
- LOG("waiting for user input\n");
+ LOG_DBG("waiting for user input\n");

  if (params.conversation) {
- printf("\n> ");
+ LOG("\n> ");
  }

  if (params.input_prefix_bos) {
- LOG("adding input prefix BOS token\n");
+ LOG_DBG("adding input prefix BOS token\n");
  embd_inp.push_back(llama_token_bos(model));
  }

  std::string buffer;
  if (!params.input_prefix.empty() && !params.conversation) {
- LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
- printf("%s", params.input_prefix.c_str());
+ LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
+ LOG("%s", params.input_prefix.c_str());
  }

  // color user input only
@@ -897,11 +801,11 @@ int main(int argc, char ** argv) {
  if (buffer.length() > 1) {
  // append input suffix if any
  if (!params.input_suffix.empty() && !params.conversation) {
- LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
- printf("%s", params.input_suffix.c_str());
+ LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
+ LOG("%s", params.input_suffix.c_str());
  }

- LOG("buffer: '%s'\n", buffer.c_str());
+ LOG_DBG("buffer: '%s'\n", buffer.c_str());

  const size_t original_size = embd_inp.size();

@@ -914,11 +818,11 @@ int main(int argc, char ** argv) {
  ? chat_add_and_format(model, chat_msgs, "user", std::move(buffer))
  : std::move(buffer);
  // TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
- const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
- const auto line_inp = ::llama_tokenize(ctx, user_inp, false, format_chat);
- const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
+ const auto line_pfx = common_tokenize(ctx, params.input_prefix, false, true);
+ const auto line_inp = common_tokenize(ctx, user_inp, false, format_chat);
+ const auto line_sfx = common_tokenize(ctx, params.input_suffix, false, true);

- LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
+ LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());

  // if user stop generation mid-way, we must add EOT to finish model's last response
  if (need_insert_eot && format_chat) {
@@ -934,16 +838,16 @@ int main(int argc, char ** argv) {
  for (size_t i = original_size; i < embd_inp.size(); ++i) {
  const llama_token token = embd_inp[i];
  output_tokens.push_back(token);
- output_ss << llama_token_to_piece(ctx, token);
+ output_ss << common_token_to_piece(ctx, token);
  }

  // reset assistant message
  assistant_ss.str("");

  n_remain -= line_inp.size();
- LOG("n_remain: %d\n", n_remain);
+ LOG_DBG("n_remain: %d\n", n_remain);
  } else {
- LOG("empty line, passing control back\n");
+ LOG_DBG("empty line, passing control back\n");
  }

  input_echo = false; // do not echo this again
@@ -951,7 +855,7 @@ int main(int argc, char ** argv) {

  if (n_past > 0) {
  if (is_interacting) {
- llama_sampling_reset(ctx_sampling);
+ common_sampler_reset(smpl);
  }
  is_interacting = false;
  }
@@ -959,7 +863,7 @@ int main(int argc, char ** argv) {

  // end of generation
  if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.interactive)) {
- LOG_TEE(" [end of text]\n");
+ LOG(" [end of text]\n");
  break;
  }

@@ -972,23 +876,22 @@ int main(int argc, char ** argv) {
  }

  if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) {
- LOG_TEE("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
+ LOG("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
  llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
  }

- llama_print_timings(ctx);
- write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
+ LOG("\n\n");
+ common_perf_print(ctx, smpl);
+
+ common_sampler_free(smpl);

- if (ctx_guidance) { llama_free(ctx_guidance); }
  llama_free(ctx);
  llama_free_model(model);

- llama_sampling_free(ctx_sampling);
  llama_backend_free();

- #ifndef LOG_DISABLE_LOGS
- LOG_TEE("Log end\n");
- #endif // LOG_DISABLE_LOGS
+ ggml_threadpool_free(threadpool);
+ ggml_threadpool_free(threadpool_batch);

  return 0;
  }
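Note: the expanded hunks above appear to come from entry 81 of the file list, package/src/llama.cpp/examples/main/main.cpp, which the bundled llama.cpp update migrates from the old gpt_params / llama_sampling_* / LOG_TEE helpers (and the removed classifier-free-guidance and YAML-logfile paths) to the new common_params / common_sampler_* / LOG_* API. The C++ sketch below is not part of the package; it only strings together calls that already appear in the hunks, assumes the prompt fits in a single batch and that -n was set, and omits interactive mode, session caching, self-extend and the threadpool setup.

#include "arg.h"
#include "common.h"
#include "log.h"
#include "sampling.h"
#include "llama.h"

#include <vector>

static void print_usage(int, char ** argv) {
    LOG("\nexample usage:\n");
    LOG("\n  %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n\n", argv[0]);
}

int main(int argc, char ** argv) {
    common_params params;
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
        return 1;
    }
    common_init();   // sets up the common logging/backend helpers

    llama_backend_init();
    llama_numa_init(params.numa);

    // load model + context in one call (replaces llama_init_from_gpt_params)
    common_init_result llama_init = common_init_from_params(params);
    llama_model   * model = llama_init.model;
    llama_context * ctx   = llama_init.context;
    if (model == NULL) {
        LOG_ERR("%s: unable to load model\n", __func__);
        return 1;
    }

    // sampler chain (replaces llama_sampling_init / llama_sampling_sample)
    common_sampler * smpl = common_sampler_init(model, params.sparams);

    // evaluate the prompt in a single batch (assumption: it fits in n_batch)
    std::vector<llama_token> embd_inp = common_tokenize(ctx, params.prompt, true, true);
    if (llama_decode(ctx, llama_batch_get_one(embd_inp.data(), (int32_t) embd_inp.size()))) {
        LOG_ERR("%s: failed to eval\n", __func__);
        return 1;
    }

    // generate up to n_predict tokens (assumes -n was given on the command line)
    for (int i = 0; i < params.n_predict; i++) {
        llama_token id = common_sampler_sample(smpl, ctx, -1);
        common_sampler_accept(smpl, id, /* accept_grammar= */ true);
        if (llama_token_is_eog(model, id)) {
            break;
        }
        LOG("%s", common_token_to_piece(ctx, id).c_str());
        if (llama_decode(ctx, llama_batch_get_one(&id, 1))) {
            LOG_ERR("%s: failed to eval\n", __func__);
            return 1;
        }
    }

    common_perf_print(ctx, smpl);   // replaces llama_print_timings
    common_sampler_free(smpl);
    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}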