@fugood/llama.node 0.3.1 → 0.3.3

This diff compares the contents of publicly available package versions as released to their public registry, and is provided for informational purposes only.
Files changed (252)
  1. package/CMakeLists.txt +1 -8
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +4 -2
  17. package/src/DetokenizeWorker.cpp +1 -1
  18. package/src/EmbeddingWorker.cpp +2 -2
  19. package/src/LlamaCompletionWorker.cpp +10 -10
  20. package/src/LlamaCompletionWorker.h +2 -2
  21. package/src/LlamaContext.cpp +14 -17
  22. package/src/TokenizeWorker.cpp +1 -1
  23. package/src/common.hpp +5 -4
  24. package/src/llama.cpp/.github/workflows/build.yml +137 -29
  25. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  26. package/src/llama.cpp/.github/workflows/docker.yml +46 -34
  27. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  28. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  29. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  30. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  31. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  32. package/src/llama.cpp/CMakeLists.txt +26 -11
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/common/CMakeLists.txt +10 -10
  35. package/src/llama.cpp/common/arg.cpp +2041 -0
  36. package/src/llama.cpp/common/arg.h +77 -0
  37. package/src/llama.cpp/common/common.cpp +523 -1861
  38. package/src/llama.cpp/common/common.h +234 -106
  39. package/src/llama.cpp/common/console.cpp +3 -0
  40. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  41. package/src/llama.cpp/common/log.cpp +401 -0
  42. package/src/llama.cpp/common/log.h +66 -698
  43. package/src/llama.cpp/common/ngram-cache.cpp +39 -36
  44. package/src/llama.cpp/common/ngram-cache.h +19 -19
  45. package/src/llama.cpp/common/sampling.cpp +356 -350
  46. package/src/llama.cpp/common/sampling.h +62 -139
  47. package/src/llama.cpp/common/stb_image.h +5990 -6398
  48. package/src/llama.cpp/docs/build.md +72 -17
  49. package/src/llama.cpp/examples/CMakeLists.txt +1 -2
  50. package/src/llama.cpp/examples/batched/batched.cpp +49 -65
  51. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
  52. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  53. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
  54. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  55. package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
  56. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
  57. package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
  58. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  59. package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
  60. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  61. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  62. package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
  63. package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
  64. package/src/llama.cpp/examples/infill/infill.cpp +131 -192
  65. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
  66. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  67. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
  68. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  69. package/src/llama.cpp/examples/llava/clip.cpp +686 -150
  70. package/src/llama.cpp/examples/llava/clip.h +11 -2
  71. package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
  72. package/src/llama.cpp/examples/llava/llava.cpp +146 -26
  73. package/src/llama.cpp/examples/llava/llava.h +2 -3
  74. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  75. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  76. package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
  77. package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
  78. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  79. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
  80. package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
  81. package/src/llama.cpp/examples/main/main.cpp +216 -313
  82. package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
  83. package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
  84. package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
  85. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  86. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  87. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
  88. package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
  89. package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
  90. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
  91. package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
  92. package/src/llama.cpp/examples/server/server.cpp +1347 -1531
  93. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  94. package/src/llama.cpp/examples/server/utils.hpp +396 -107
  95. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  96. package/src/llama.cpp/examples/simple/simple.cpp +132 -106
  97. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  98. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
  99. package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
  100. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  101. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  102. package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
  103. package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
  104. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  105. package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
  106. package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
  107. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  108. package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
  109. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  110. package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
  111. package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
  112. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  113. package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
  114. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  115. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  116. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  117. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  118. package/src/llama.cpp/ggml/include/ggml.h +272 -505
  119. package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
  120. package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
  121. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
  122. package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
  123. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  124. package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
  125. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  126. package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
  127. package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
  128. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
  129. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
  130. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
  131. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
  132. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
  133. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
  134. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  135. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  136. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  137. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  138. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
  139. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  140. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  141. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  142. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  143. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
  144. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
  145. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
  146. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
  147. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
  148. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  149. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
  150. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
  151. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
  152. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
  153. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  154. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  155. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  156. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
  157. package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
  158. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
  159. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
  160. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
  161. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
  162. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
  163. package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
  164. package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
  165. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
  166. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
  167. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
  168. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
  169. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
  170. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
  171. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
  172. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
  173. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  174. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  175. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  176. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  177. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  178. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  179. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
  180. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
  181. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  182. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  183. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
  184. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  185. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
  187. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
  188. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  192. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  195. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  197. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  198. package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
  199. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
  200. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
  201. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
  202. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
  203. package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
  204. package/src/llama.cpp/include/llama.h +296 -285
  205. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  206. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  207. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  208. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  209. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  210. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  211. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  212. package/src/llama.cpp/src/llama-grammar.h +120 -15
  213. package/src/llama.cpp/src/llama-impl.h +156 -1
  214. package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
  215. package/src/llama.cpp/src/llama-sampling.h +39 -47
  216. package/src/llama.cpp/src/llama-vocab.cpp +390 -127
  217. package/src/llama.cpp/src/llama-vocab.h +60 -20
  218. package/src/llama.cpp/src/llama.cpp +6215 -3263
  219. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  220. package/src/llama.cpp/src/unicode-data.h +4 -4
  221. package/src/llama.cpp/src/unicode.cpp +15 -7
  222. package/src/llama.cpp/tests/CMakeLists.txt +4 -2
  223. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  224. package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
  225. package/src/llama.cpp/tests/test-barrier.cpp +94 -0
  226. package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
  227. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  228. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  229. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
  230. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  231. package/src/llama.cpp/tests/test-log.cpp +39 -0
  232. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  233. package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
  234. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  235. package/src/llama.cpp/tests/test-rope.cpp +2 -1
  236. package/src/llama.cpp/tests/test-sampling.cpp +226 -142
  237. package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
  238. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  239. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  240. package/patches/llama.patch +0 -22
  241. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  242. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  243. package/src/llama.cpp/common/grammar-parser.h +0 -29
  244. package/src/llama.cpp/common/train.cpp +0 -1513
  245. package/src/llama.cpp/common/train.h +0 -233
  246. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
  247. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  248. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
  249. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
  250. package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
  251. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  252. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
@@ -1,8 +1,9 @@
+ #include "arg.h"
  #include "common.h"
-
  #include "console.h"
+ #include "sampling.h"
+ #include "log.h"
  #include "llama.h"
- #include "grammar-parser.h"

  #include <cassert>
  #include <cinttypes>
@@ -34,57 +35,14 @@

  static llama_context ** g_ctx;
  static llama_model ** g_model;
- static gpt_params * g_params;
+ static common_sampler ** g_smpl;
+ static common_params * g_params;
  static std::vector<llama_token> * g_input_tokens;
  static std::ostringstream * g_output_ss;
  static std::vector<llama_token> * g_output_tokens;

  static bool is_interacting = false;

- static void write_logfile(
- const llama_context * ctx, const gpt_params & params, const llama_model * model,
- const std::vector<llama_token> & input_tokens, const std::string & output,
- const std::vector<llama_token> & output_tokens
- ) {
- if (params.logdir.empty()) {
- return;
- }
-
- const std::string timestamp = string_get_sortable_timestamp();
-
- const bool success = fs_create_directory_with_parents(params.logdir);
- if (!success) {
- fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
- __func__, params.logdir.c_str());
- return;
- }
-
- const std::string logfile_path = params.logdir + timestamp + ".yml";
- FILE * logfile = fopen(logfile_path.c_str(), "w");
-
- if (logfile == NULL) {
- fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
- return;
- }
-
- fprintf(logfile, "binary: infill\n");
- char model_desc[128];
- llama_model_desc(model, model_desc, sizeof(model_desc));
- yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);
-
- fprintf(logfile, "\n");
- fprintf(logfile, "######################\n");
- fprintf(logfile, "# Generation Results #\n");
- fprintf(logfile, "######################\n");
- fprintf(logfile, "\n");
-
- yaml_dump_string_multiline(logfile, "output", output.c_str());
- yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
-
- llama_dump_timing_info_yaml(logfile, ctx);
- fclose(logfile);
- }
-
  #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
  static void sigint_handler(int signo) {
  if (signo == SIGINT) {
@@ -92,9 +50,13 @@ static void sigint_handler(int signo) {
  is_interacting = true;
  } else {
  console::cleanup();
- printf("\n");
- llama_print_timings(*g_ctx);
- write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
+ LOG("\n");
+ common_perf_print(*g_ctx, *g_smpl);
+
+ // make sure all logs are flushed
+ LOG("Interrupted by user\n");
+ common_log_pause(common_log_main());
+
  _exit(130);
  }
  }
@@ -102,118 +64,107 @@ static void sigint_handler(int signo) {
  #endif

  int main(int argc, char ** argv) {
- gpt_params params;
- llama_sampling_params & sparams = params.sparams;
+ common_params params;
  g_params = &params;

- if (!gpt_params_parse(argc, argv, params)) {
- gpt_params_print_usage(argc, argv, params);
+ if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) {
  return 1;
  }

- #ifndef LOG_DISABLE_LOGS
- log_set_target(log_filename_generator("infill", "log"));
- LOG_TEE("Log start\n");
- log_dump_cmdline(argc, argv);
- #endif // LOG_DISABLE_LOGS
+ common_init();
+
+ auto & sparams = params.sparams;

  console::init(params.simple_io, params.use_color);
  atexit([]() { console::cleanup(); });

  if (params.logits_all) {
- printf("\n************\n");
- printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
- printf("************\n\n");
+ LOG_ERR("\n************\n");
+ LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
+ LOG_ERR("************\n\n");

  return 0;
  }

  if (params.embedding) {
- printf("\n************\n");
- printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
- printf("************\n\n");
+ LOG_ERR("\n************\n");
+ LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
+ LOG_ERR("************\n\n");

  return 0;
  }

  if (params.n_ctx != 0 && params.n_ctx < 8) {
- LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
+ LOG_WRN("%s: minimum context size is 8, using minimum size.\n", __func__);
  params.n_ctx = 8;
  }
+
  if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
- printf("\n************\n");
- printf("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
- printf("************\n\n");
+ LOG_ERR("\n************\n");
+ LOG_ERR("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
+ LOG_ERR("************\n\n");

  return 0;
  }

  if (params.rope_freq_base != 0.0) {
- LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
+ LOG_WRN("%s: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
  }

  if (params.rope_freq_scale != 0.0) {
- LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
- }
-
- LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
- LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
-
- if (params.seed == LLAMA_DEFAULT_SEED) {
- params.seed = time(NULL);
+ LOG_WRN("%s: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
  }

- LOG_TEE("%s: seed = %u\n", __func__, params.seed);
-
- std::mt19937 rng(params.seed);
-
- LOG("%s: llama backend init\n", __func__);
+ LOG_INF("%s: llama backend init\n", __func__);
  llama_backend_init();
  llama_numa_init(params.numa);

- llama_model * model;
- llama_context * ctx;
+ llama_model * model = nullptr;
+ llama_context * ctx = nullptr;
+ common_sampler * smpl = nullptr;

  g_model = &model;
  g_ctx = &ctx;
+ g_smpl = &smpl;

  // load the model and apply lora adapter, if any
- LOG("%s: load the model and apply lora adapter, if any\n", __func__);
- std::tie(model, ctx) = llama_init_from_gpt_params(params);
+ LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
+ common_init_result llama_init = common_init_from_params(params);
+
+ model = llama_init.model;
+ ctx = llama_init.context;

  if (model == NULL) {
- LOG_TEE("%s: error: unable to load model\n", __func__);
+ LOG_ERR("%s: unable to load model\n", __func__);
  return 1;
  }

  const int n_ctx_train = llama_n_ctx_train(model);
  const int n_ctx = llama_n_ctx(ctx);
- LOG("n_ctx: %d\n", n_ctx);
+ LOG_DBG("n_ctx: %d\n", n_ctx);

  if (n_ctx > n_ctx_train) {
- LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
- __func__, n_ctx_train, n_ctx);
+ LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
  }

  // print system information
  {
- LOG_TEE("\n");
- LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
+ LOG_INF("\n");
+ LOG_INF("%s\n", common_params_get_system_info(params).c_str());
  }
- const bool add_bos = llama_should_add_bos_token(model);
- GGML_ASSERT(llama_add_eos_token(model) != 1);
- LOG("add_bos: %d\n", add_bos);
+ const bool add_bos = llama_add_bos_token(model);
+ GGML_ASSERT(!llama_add_eos_token(model));

  std::vector<llama_token> embd_inp;
  std::vector<llama_token> embd_end;
- std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
- std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
+ std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
+ std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);

- GGML_ASSERT(llama_token_prefix(model) >= 0);
- GGML_ASSERT(llama_token_suffix(model) >= 0);
+ GGML_ASSERT(llama_token_fim_pre(model) >= 0);
+ GGML_ASSERT(llama_token_fim_suf(model) >= 0);

- inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
- inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
+ inp_pfx.insert(inp_pfx.begin(), llama_token_fim_pre(model));
+ inp_sfx.insert(inp_sfx.begin(), llama_token_fim_suf(model));

  embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
  embd_end = params.spm_infill ? inp_pfx : inp_sfx;
@@ -222,23 +173,24 @@ int main(int argc, char ** argv) {
  }
  embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());

- const llama_token middle_token = llama_token_middle(model);
+ const llama_token middle_token = llama_token_fim_mid(model);
  if (middle_token >= 0) {
  embd_inp.push_back(middle_token);
  }

- LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix));
- LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix));
- LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
+ LOG_DBG("add_bos: %d\n", add_bos);
+ LOG_DBG("prefix: \"%s\"\n", params.input_prefix.c_str());
+ LOG_DBG("suffix: \"%s\"\n", params.input_suffix.c_str());
+ LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());

  // Should not run without any tokens
  if (embd_inp.empty()) {
  embd_inp.push_back(llama_token_bos(model));
- LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
+ LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
  }

  if ((int) embd_inp.size() > n_ctx - 4) {
- LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
+ LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
  return 1;
  }

@@ -247,9 +199,8 @@ int main(int argc, char ** argv) {
  params.n_keep = (int)embd_inp.size();
  }

- LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
- LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
-
+ LOG_INF("inp_pfx: %s\n", string_from(ctx, inp_pfx).c_str());
+ LOG_INF("inp_sfx: %s\n", string_from(ctx, inp_sfx).c_str());

  // enable interactive mode if interactive start is specified
  if (params.interactive_first) {
@@ -257,21 +208,21 @@ int main(int argc, char ** argv) {
  }

  if (params.verbose_prompt) {
- LOG_TEE("\n");
- LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
- LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
+ LOG_INF("\n");
+ LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
+ LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
  for (int i = 0; i < (int) embd_inp.size(); i++) {
- LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
+ LOG_INF("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str());
  }

  if (params.n_keep > 0) {
- LOG_TEE("%s: static prompt based on n_keep: '", __func__);
+ LOG_INF("%s: static prompt based on n_keep: '", __func__);
  for (int i = 0; i < params.n_keep; i++) {
- LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
+ LOG_CNT("%s", common_token_to_piece(ctx, embd_inp[i]).c_str());
  }
- LOG_TEE("'\n");
+ LOG_CNT("'\n");
  }
- LOG_TEE("\n");
+ LOG_INF("\n");
  }

  if (params.interactive) {
@@ -288,30 +239,30 @@ int main(int argc, char ** argv) {
  SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
  #endif

- LOG_TEE("%s: interactive mode on.\n", __func__);
+ LOG_INF("%s: interactive mode on.\n", __func__);

  if (params.input_prefix_bos) {
- LOG_TEE("Input prefix with BOS\n");
+ LOG_INF("Input prefix with BOS\n");
  }

  if (!params.input_prefix.empty()) {
- LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
+ LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str());
  }

  if (!params.input_suffix.empty()) {
- LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
+ LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
  }
  }
- LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
- LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
- LOG_TEE("\n\n");
-
- LOG_TEE("\n##### Infill mode #####\n\n");
- if (params.infill) {
- printf("\n************\n");
- printf("no need to specify '--infill', always running infill\n");
- printf("************\n\n");
- }
+ smpl = common_sampler_init(model, sparams);
+
+ LOG_INF("sampler seed: %u\n", common_sampler_get_seed(smpl));
+ LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
+ LOG_INF("sampler chain: %s\n", common_sampler_print(smpl).c_str());
+
+ LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
+
+ LOG_INF("\n");
+ LOG_INF("\n##### Infill mode #####\n\n");
  if (params.interactive) {
  const char *control_message;
  if (params.multiline_input) {
@@ -322,11 +273,11 @@ int main(int argc, char ** argv) {
  " - To return control without starting a new line, end your input with '/'.\n"
  " - If you want to submit another line, end your input with '\\'.\n";
  }
- LOG_TEE("== Running in interactive mode. ==\n");
+ LOG_INF("== Running in interactive mode. ==\n");
  #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
- LOG_TEE( " - Press Ctrl+C to interject at any time.\n");
+ LOG_INF( " - Press Ctrl+C to interject at any time.\n");
  #endif
- LOG_TEE( "%s\n", control_message);
+ LOG_INF( "%s\n", control_message);

  is_interacting = params.interactive_first;
  }
@@ -346,8 +297,6 @@ int main(int argc, char ** argv) {

  std::vector<llama_token> embd;

- struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
-
  while (n_remain != 0 || params.interactive) {
  // predict
  if (!embd.empty()) {
@@ -361,9 +310,8 @@ int main(int argc, char ** argv) {
  embd.resize(max_embd_size);

  console::set_display(console::error);
- printf("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
+ LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
  console::set_display(console::reset);
- fflush(stdout);
  }

  // infinite text generation via context swapping
@@ -372,14 +320,14 @@ int main(int argc, char ** argv) {
  // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
  if (n_past + (int) embd.size() > n_ctx) {
  if (params.n_predict == -2) {
- LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
+ LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
  break;
  }

  const int n_left = n_past - params.n_keep - 1;
  const int n_discard = n_left/2;

- LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
+ LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
  n_past, n_left, n_ctx, params.n_keep, n_discard);

  llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
@@ -387,9 +335,9 @@ int main(int argc, char ** argv) {

  n_past -= n_discard;

- LOG("after swap: n_past = %d\n", n_past);
+ LOG_DBG("after swap: n_past = %d\n", n_past);

- LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
+ LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());

  }

@@ -401,16 +349,16 @@ int main(int argc, char ** argv) {
  n_eval = params.n_batch;
  }

- LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
+ LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());

- if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
- LOG_TEE("%s : failed to eval\n", __func__);
+ if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) {
+ LOG_ERR("%s : failed to eval\n", __func__);
  return 1;
  }

  n_past += n_eval;

- LOG("n_past = %d\n", n_past);
+ LOG_DBG("n_past = %d\n", n_past);
  }

  }
@@ -418,11 +366,11 @@ int main(int argc, char ** argv) {
  embd.clear();

  if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
- const llama_token id = llama_sampling_sample(ctx_sampling, ctx, nullptr);
+ const llama_token id = common_sampler_sample(smpl, ctx, -1);

- llama_sampling_accept(ctx_sampling, ctx, id, true);
+ common_sampler_accept(smpl, id, true);

- LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
+ // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());

  embd.push_back(id);

@@ -432,16 +380,16 @@ int main(int argc, char ** argv) {
  // decrement remaining sampling budget
  --n_remain;

- LOG("n_remain: %d\n", n_remain);
+ LOG_DBG("n_remain: %d\n", n_remain);
  } else {
  // some user input remains from prompt or interaction, forward it to processing
- LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
+ LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
  while ((int) embd_inp.size() > n_consumed) {
  embd.push_back(embd_inp[n_consumed]);

  // push the prompt in the sampling context in order to apply repetition penalties later
  // for the prompt, we don't apply grammar rules
- llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false);
+ common_sampler_accept(smpl, embd_inp[n_consumed], false);

  ++n_consumed;
  if ((int) embd.size() >= params.n_batch) {
@@ -453,8 +401,8 @@ int main(int argc, char ** argv) {
  // display text
  if (input_echo) {
  for (auto id : embd) {
- const std::string token_str = llama_token_to_piece(ctx, id);
- printf("%s", token_str.c_str());
+ const std::string token_str = common_token_to_piece(ctx, id);
+ LOG("%s", token_str.c_str());

  if (embd.size() > 1) {
  input_tokens.push_back(id);
@@ -463,7 +411,6 @@ int main(int argc, char ** argv) {
  output_ss << token_str;
  }
  }
- fflush(stdout);
  }
  // reset color to default if we there is no pending user input
  if (input_echo && (int) embd_inp.size() == n_consumed) {
@@ -473,13 +420,12 @@ int main(int argc, char ** argv) {
  // if not currently processing queued inputs;
  if ((int) embd_inp.size() <= n_consumed) {
  // deal with eot token in infill mode
- if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){
+ if ((common_sampler_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){
  if (is_interacting && !params.interactive_first) {
  // print an eot token
- printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
+ LOG("%s", common_token_to_piece(ctx, llama_token_eot(model)).c_str());
  }
- fflush(stdout);
- printf("\n");
+ LOG("\n");
  console::set_display(console::user_input);
  std::string buffer;
  std::string line;
@@ -514,11 +460,11 @@ int main(int argc, char ** argv) {
  }

  // tokenize new prefix and suffix
- std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
- std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
+ std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
+ std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);

- inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
- inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
+ inp_pfx.insert(inp_pfx.begin(), llama_token_fim_pre(model));
+ inp_sfx.insert(inp_sfx.begin(), llama_token_fim_suf(model));

  embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
  embd_end = params.spm_infill ? inp_pfx : inp_sfx;
@@ -535,35 +481,33 @@ int main(int argc, char ** argv) {
  n_remain = params.n_predict;
  n_past = 0;
  n_consumed = 0;
- // LOG_TEE("took new input\n");
  is_interacting = false;
  }
  // deal with end of generation tokens in interactive mode
- else if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) {
- LOG("found EOS token\n");
+ else if (llama_token_is_eog(model, common_sampler_last(smpl))) {
+ LOG_DBG("found EOS token\n");

  if (params.interactive) {

  is_interacting = true;
- printf("\n");
+ LOG("\n");
  console::set_display(console::user_input);
- fflush(stdout);
  }
  }

  if (n_past > 0 && is_interacting && !params.interactive) {
- LOG("waiting for user input\n");
+ LOG_DBG("waiting for user input\n");

  if (params.input_prefix_bos) {
- LOG("adding input prefix BOS token\n");
+ LOG_DBG("adding input prefix BOS token\n");
  embd_inp.push_back(llama_token_bos(model));
  }

  std::string buffer;
  if (!params.input_prefix.empty()) {
- LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
+ LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
  buffer += params.input_prefix;
- printf("%s", buffer.c_str());
+ LOG("%s", buffer.c_str());
  }

  std::string line;
@@ -581,30 +525,30 @@ int main(int argc, char ** argv) {
  if (buffer.length() > 1) {
  // append input suffix if any
  if (!params.input_suffix.empty()) {
- LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
+ LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
  buffer += params.input_suffix;
- printf("%s", params.input_suffix.c_str());
+ LOG("%s", params.input_suffix.c_str());
  }

- LOG("buffer: '%s'\n", buffer.c_str());
+ LOG_DBG("buffer: '%s'\n", buffer.c_str());

  const size_t original_size = embd_inp.size();

- const auto line_inp = ::llama_tokenize(ctx, buffer, false);
- LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
+ const auto line_inp = common_tokenize(ctx, buffer, false);
+ LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());

  embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());

  for (size_t i = original_size; i < embd_inp.size(); ++i) {
  const llama_token token = embd_inp[i];
  output_tokens.push_back(token);
- output_ss << llama_token_to_piece(ctx, token);
+ output_ss << common_token_to_piece(ctx, token);
  }

  n_remain -= line_inp.size();
- LOG("n_remain: %d\n", n_remain);
+ LOG_DBG("n_remain: %d\n", n_remain);
  } else {
- LOG("empty line, passing control back\n");
+ LOG_DBG("empty line, passing control back\n");
  }

  input_echo = false; // do not echo this again
@@ -612,7 +556,7 @@ int main(int argc, char ** argv) {

  if (n_past > 0) {
  if (is_interacting) {
- llama_sampling_reset(ctx_sampling);
+ common_sampler_reset(smpl);
  }
  is_interacting = false;
  }
@@ -631,22 +575,17 @@ int main(int argc, char ** argv) {
  }
  }
  if (!params.interactive && n_remain <= 0) {
- printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
- fflush(stdout);
+ LOG("%s", common_token_to_piece(ctx, llama_token_eot(model)).c_str());
  }

- llama_print_timings(ctx);
- write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
+ LOG("\n");
+ common_perf_print(ctx, smpl);

  llama_free(ctx);
  llama_free_model(model);

- llama_sampling_free(ctx_sampling);
+ common_sampler_free(smpl);
  llama_backend_free();

- #ifndef LOG_DISABLE_LOGS
- LOG_TEE("Log end\n");
- #endif // LOG_DISABLE_LOGS
-
  return 0;
  }
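
The diff above (apparently the infill example, given the removed write_logfile wrote "binary: infill") illustrates the API migration carried by this update of the bundled llama.cpp: gpt_params becomes common_params, llama_sampling_context becomes common_sampler, llama_tokenize and llama_token_to_piece become common_tokenize and common_token_to_piece, and printf/LOG_TEE output is replaced by the LOG/LOG_INF/LOG_WRN/LOG_ERR/LOG_DBG macros from common/log.h. The sketch below is not code from the package; it only strings together calls that appear in this diff into a minimal, non-interactive completion loop under stated simplifications (single-batch prompt decode, no context swapping, no infill prefix/suffix handling).

// Minimal sketch of the new common_* call sequence, assuming the llama.cpp
// revision vendored in 0.3.3; simplified for illustration only.
#include "arg.h"
#include "common.h"
#include "sampling.h"
#include "log.h"
#include "llama.h"

#include <vector>

int main(int argc, char ** argv) {
    common_params params;
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) {
        return 1;
    }
    common_init();

    llama_backend_init();
    llama_numa_init(params.numa);

    // one result struct now carries both the model and the context
    common_init_result llama_init = common_init_from_params(params);
    llama_model   * model = llama_init.model;
    llama_context * ctx   = llama_init.context;
    if (model == NULL) {
        LOG_ERR("%s: unable to load model\n", __func__);
        return 1;
    }

    // the sampler object replaces the old llama_sampling_context
    common_sampler * smpl = common_sampler_init(model, params.sparams);
    LOG_INF("sampler seed: %u\n", common_sampler_get_seed(smpl));

    // BOS + prompt tokens (prompt assumed to fit in one batch for this sketch)
    std::vector<llama_token> embd;
    embd.push_back(llama_token_bos(model));
    const auto prompt_tokens = common_tokenize(ctx, params.prompt, false);
    embd.insert(embd.end(), prompt_tokens.begin(), prompt_tokens.end());

    for (int i = 0; i < params.n_predict; ++i) {
        // note: llama_batch_get_one no longer takes n_past/seq_id arguments
        if (llama_decode(ctx, llama_batch_get_one(embd.data(), (int32_t) embd.size()))) {
            LOG_ERR("%s : failed to eval\n", __func__);
            return 1;
        }
        const llama_token id = common_sampler_sample(smpl, ctx, -1);
        common_sampler_accept(smpl, id, true);
        if (llama_token_is_eog(model, id)) {
            break;
        }
        LOG("%s", common_token_to_piece(ctx, id).c_str());
        embd = { id };  // subsequent passes decode only the newly sampled token
    }

    LOG("\n");
    common_perf_print(ctx, smpl);

    common_sampler_free(smpl);
    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();

    return 0;
}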