@fugood/llama.node 0.3.1 → 0.3.3

This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
Files changed (252)
  1. package/CMakeLists.txt +1 -8
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +4 -2
  17. package/src/DetokenizeWorker.cpp +1 -1
  18. package/src/EmbeddingWorker.cpp +2 -2
  19. package/src/LlamaCompletionWorker.cpp +10 -10
  20. package/src/LlamaCompletionWorker.h +2 -2
  21. package/src/LlamaContext.cpp +14 -17
  22. package/src/TokenizeWorker.cpp +1 -1
  23. package/src/common.hpp +5 -4
  24. package/src/llama.cpp/.github/workflows/build.yml +137 -29
  25. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  26. package/src/llama.cpp/.github/workflows/docker.yml +46 -34
  27. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  28. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  29. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  30. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  31. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  32. package/src/llama.cpp/CMakeLists.txt +26 -11
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/common/CMakeLists.txt +10 -10
  35. package/src/llama.cpp/common/arg.cpp +2041 -0
  36. package/src/llama.cpp/common/arg.h +77 -0
  37. package/src/llama.cpp/common/common.cpp +523 -1861
  38. package/src/llama.cpp/common/common.h +234 -106
  39. package/src/llama.cpp/common/console.cpp +3 -0
  40. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  41. package/src/llama.cpp/common/log.cpp +401 -0
  42. package/src/llama.cpp/common/log.h +66 -698
  43. package/src/llama.cpp/common/ngram-cache.cpp +39 -36
  44. package/src/llama.cpp/common/ngram-cache.h +19 -19
  45. package/src/llama.cpp/common/sampling.cpp +356 -350
  46. package/src/llama.cpp/common/sampling.h +62 -139
  47. package/src/llama.cpp/common/stb_image.h +5990 -6398
  48. package/src/llama.cpp/docs/build.md +72 -17
  49. package/src/llama.cpp/examples/CMakeLists.txt +1 -2
  50. package/src/llama.cpp/examples/batched/batched.cpp +49 -65
  51. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
  52. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  53. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
  54. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  55. package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
  56. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
  57. package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
  58. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  59. package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
  60. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  61. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  62. package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
  63. package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
  64. package/src/llama.cpp/examples/infill/infill.cpp +131 -192
  65. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
  66. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  67. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
  68. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  69. package/src/llama.cpp/examples/llava/clip.cpp +686 -150
  70. package/src/llama.cpp/examples/llava/clip.h +11 -2
  71. package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
  72. package/src/llama.cpp/examples/llava/llava.cpp +146 -26
  73. package/src/llama.cpp/examples/llava/llava.h +2 -3
  74. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  75. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  76. package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
  77. package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
  78. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  79. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
  80. package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
  81. package/src/llama.cpp/examples/main/main.cpp +216 -313
  82. package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
  83. package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
  84. package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
  85. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  86. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  87. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
  88. package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
  89. package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
  90. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
  91. package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
  92. package/src/llama.cpp/examples/server/server.cpp +1347 -1531
  93. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  94. package/src/llama.cpp/examples/server/utils.hpp +396 -107
  95. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  96. package/src/llama.cpp/examples/simple/simple.cpp +132 -106
  97. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  98. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
  99. package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
  100. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  101. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  102. package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
  103. package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
  104. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  105. package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
  106. package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
  107. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  108. package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
  109. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  110. package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
  111. package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
  112. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  113. package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
  114. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  115. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  116. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  117. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  118. package/src/llama.cpp/ggml/include/ggml.h +272 -505
  119. package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
  120. package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
  121. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
  122. package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
  123. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  124. package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
  125. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  126. package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
  127. package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
  128. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
  129. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
  130. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
  131. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
  132. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
  133. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
  134. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  135. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  136. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  137. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  138. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
  139. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  140. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  141. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  142. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  143. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
  144. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
  145. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
  146. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
  147. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
  148. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  149. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
  150. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
  151. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
  152. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
  153. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  154. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  155. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  156. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
  157. package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
  158. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
  159. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
  160. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
  161. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
  162. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
  163. package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
  164. package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
  165. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
  166. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
  167. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
  168. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
  169. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
  170. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
  171. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
  172. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
  173. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  174. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  175. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  176. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  177. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  178. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  179. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
  180. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
  181. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  182. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  183. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
  184. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  185. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
  187. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
  188. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  192. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  195. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  197. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  198. package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
  199. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
  200. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
  201. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
  202. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
  203. package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
  204. package/src/llama.cpp/include/llama.h +296 -285
  205. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  206. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  207. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  208. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  209. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  210. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  211. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  212. package/src/llama.cpp/src/llama-grammar.h +120 -15
  213. package/src/llama.cpp/src/llama-impl.h +156 -1
  214. package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
  215. package/src/llama.cpp/src/llama-sampling.h +39 -47
  216. package/src/llama.cpp/src/llama-vocab.cpp +390 -127
  217. package/src/llama.cpp/src/llama-vocab.h +60 -20
  218. package/src/llama.cpp/src/llama.cpp +6215 -3263
  219. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  220. package/src/llama.cpp/src/unicode-data.h +4 -4
  221. package/src/llama.cpp/src/unicode.cpp +15 -7
  222. package/src/llama.cpp/tests/CMakeLists.txt +4 -2
  223. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  224. package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
  225. package/src/llama.cpp/tests/test-barrier.cpp +94 -0
  226. package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
  227. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  228. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  229. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
  230. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  231. package/src/llama.cpp/tests/test-log.cpp +39 -0
  232. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  233. package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
  234. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  235. package/src/llama.cpp/tests/test-rope.cpp +2 -1
  236. package/src/llama.cpp/tests/test-sampling.cpp +226 -142
  237. package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
  238. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  239. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  240. package/patches/llama.patch +0 -22
  241. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  242. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  243. package/src/llama.cpp/common/grammar-parser.h +0 -29
  244. package/src/llama.cpp/common/train.cpp +0 -1513
  245. package/src/llama.cpp/common/train.h +0 -233
  246. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
  247. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  248. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
  249. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
  250. package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
  251. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  252. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
package/src/llama.cpp/examples/lookup/lookup.cpp

@@ -1,72 +1,68 @@
+ #include "arg.h"
  #include "ggml.h"
- #include "llama.h"
  #include "common.h"
  #include "ngram-cache.h"
+ #include "sampling.h"
+ #include "log.h"
+ #include "llama.h"

- #include <cmath>
  #include <cstdint>
  #include <cstdio>
  #include <fstream>
  #include <string>
  #include <vector>
- #include <unordered_map>

  int main(int argc, char ** argv){
-     gpt_params params;
+     common_params params;

-     if (!gpt_params_parse(argc, argv, params)) {
-         gpt_params_print_usage(argc, argv, params);
+     if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
          return 1;
      }

+     common_init();
+
      // max. number of additional tokens to draft if match is found
      const int n_draft = params.n_draft;

      const bool dump_kv_cache = params.dump_kv_cache;

- #ifndef LOG_DISABLE_LOGS
-     log_set_target(log_filename_generator("lookup", "log"));
-     LOG_TEE("Log start\n");
-     log_dump_cmdline(argc, argv);
- #endif // LOG_DISABLE_LOGS
-
      // init llama.cpp
      llama_backend_init();
      llama_numa_init(params.numa);

-     llama_model * model = NULL;
-     llama_context * ctx = NULL;
-
      // load the model
-     std::tie(model, ctx) = llama_init_from_gpt_params(params);
+     common_init_result llama_init = common_init_from_params(params);
+
+     llama_model * model = llama_init.model;
+     llama_context * ctx = llama_init.context;

      // tokenize the prompt
      std::vector<llama_token> inp;
-     inp = ::llama_tokenize(ctx, params.prompt, true, true);
+     inp = common_tokenize(ctx, params.prompt, true, true);

-     llama_ngram_cache ngram_cache_context;
-     llama_ngram_cache ngram_cache_dynamic;
-     llama_ngram_cache ngram_cache_static;
+     common_ngram_cache ngram_cache_context;
+     common_ngram_cache ngram_cache_dynamic;
+     common_ngram_cache ngram_cache_static;
      int64_t t_draft_flat_us = 0;
      int64_t t_draft_us = 0;

      {
          // Fill up context ngram cache with tokens from user input:
          const int64_t t_start_draft_us = ggml_time_us();
-         llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, inp.size(), false);
+         common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, inp.size(), false);

          if (!params.lookup_cache_static.empty()) {
              try {
-                 ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
+                 ngram_cache_static = common_ngram_cache_load(params.lookup_cache_static);
              } catch (std::ifstream::failure const &) {
-                 fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
+                 LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
                  exit(1);
              }
          }

          if (!params.lookup_cache_dynamic.empty()) {
              try {
-                 ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic);
+                 ngram_cache_dynamic = common_ngram_cache_load(params.lookup_cache_dynamic);
              } catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
          }

@@ -77,14 +73,14 @@ int main(int argc, char ** argv){
      const int max_tokens_list_size = max_context_size - 4;

      if ((int) inp.size() > max_tokens_list_size) {
-         fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
+         LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
          return 1;
      }

-     fprintf(stderr, "\n\n");
+     LOG("\n\n");

      for (auto id : inp) {
-         fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
+         LOG("%s", common_token_to_piece(ctx, id).c_str());
      }

      fflush(stderr);
@@ -93,8 +89,8 @@ int main(int argc, char ** argv){

      const auto t_enc_start = ggml_time_us();

-     llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1, 0, 0));
-     llama_decode(ctx, llama_batch_get_one(&inp.back(), 1, n_input - 1, 0));
+     llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1));
+     llama_decode(ctx, llama_batch_get_one(&inp.back(), 1));

      const auto t_enc_end = ggml_time_us();

@@ -106,7 +102,7 @@ int main(int argc, char ** argv){

      bool has_eos = false;

-     struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
+     struct common_sampler * smpl = common_sampler_init(model, params.sparams);

      std::vector<llama_token> draft;

@@ -121,23 +117,23 @@ int main(int argc, char ** argv){
          // debug
          if (dump_kv_cache) {
              llama_kv_cache_view_update(ctx, &kvc_view);
-             llama_kv_cache_dump_view_seqs(kvc_view, 40);
+             common_kv_cache_dump_view_seqs(kvc_view, 40);
          }

          // print current draft sequence
-         LOG("drafted %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, draft).c_str());
+         LOG_DBG("drafted %s\n", string_from(ctx, draft).c_str());

          int i_dft = 0;
          while (true) {
              // sample from the target model
-             llama_token id = llama_sampling_sample(ctx_sampling, ctx, NULL, i_dft);
+             llama_token id = common_sampler_sample(smpl, ctx, i_dft);

-             llama_sampling_accept(ctx_sampling, ctx, id, true);
+             common_sampler_accept(smpl, id, true);

-             const std::string token_str = llama_token_to_piece(ctx, id);
+             const std::string token_str = common_token_to_piece(ctx, id);

              if (!params.use_color) {
-                 printf("%s", token_str.c_str());
+                 LOG("%s", token_str.c_str());
              }

              if (llama_token_is_eog(model, id)) {
@@ -148,7 +144,7 @@ int main(int argc, char ** argv){

              // check if the target token matches the draft
              if (i_dft < (int) draft.size() && id == draft[i_dft]) {
-                 LOG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
+                 LOG_DBG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
                  ++n_accept;
                  ++n_past;
                  ++i_dft;
@@ -156,25 +152,25 @@ int main(int argc, char ** argv){
                  {
                      // Update context ngram cache with the newly accepted token:
                      const int64_t t_start_draft_us = ggml_time_us();
-                     llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
+                     common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
                      t_draft_us += ggml_time_us() - t_start_draft_us;
                  }

                  if (params.use_color) {
                      // color accepted draft token
-                     printf("\033[34m%s\033[0m", token_str.c_str());
+                     LOG("\033[34m%s\033[0m", token_str.c_str());
                      fflush(stdout);
                  }
                  continue;
              }

              if (params.use_color) {
-                 printf("%s", token_str.c_str());
+                 LOG("%s", token_str.c_str());
              }
              fflush(stdout);


-             LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
+             LOG_DBG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());

              draft.clear();
              draft.push_back(id);
@@ -182,7 +178,7 @@ int main(int argc, char ** argv){
              {
                  // Update context ngram cache with the newly accepted token:
                  const int64_t t_start_draft_us = ggml_time_us();
-                 llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
+                 common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
                  t_draft_us += ggml_time_us() - t_start_draft_us;
              }
              break;
@@ -196,18 +192,18 @@ int main(int argc, char ** argv){
          // clean the cache of draft tokens that weren't accepted
          llama_kv_cache_seq_rm(ctx, 0, n_past, -1);

-         llama_batch_clear(batch_tgt);
-         llama_batch_add(batch_tgt, draft[0], n_past, { 0 }, true);
+         common_batch_clear(batch_tgt);
+         common_batch_add(batch_tgt, draft[0], n_past, { 0 }, true);

          // Draft already contains a single token sampled from the model:
          GGML_ASSERT(draft.size() == 1);
          GGML_ASSERT(draft[0] == inp.back());
          const int64_t t_start_draft_us = ggml_time_us();

-         llama_ngram_cache_draft(inp, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
+         common_ngram_cache_draft(inp, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);

          for (size_t i = 1; i < draft.size(); ++i) {
-             llama_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true);
+             common_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true);
          }

          t_draft_us += ggml_time_us() - t_start_draft_us;
@@ -222,28 +218,29 @@ int main(int argc, char ** argv){
      auto t_dec_end = ggml_time_us();

      // Update dynamic ngram cache with context ngram cache and save it to disk:
-     llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
-     llama_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic);
+     common_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
+     common_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic);

-     LOG_TEE("\n\n");
+     LOG("\n\n");

-     LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
-     LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
+     LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
+     LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));

-     LOG_TEE("\n");
-     LOG_TEE("n_draft = %d\n", n_draft);
-     LOG_TEE("n_predict = %d\n", n_predict);
-     LOG_TEE("n_drafted = %d\n", n_drafted);
-     LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
-     LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
+     LOG_INF("\n");
+     LOG_INF("n_draft = %d\n", n_draft);
+     LOG_INF("n_predict = %d\n", n_predict);
+     LOG_INF("n_drafted = %d\n", n_drafted);
+     LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
+     LOG_INF("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
              t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
-     LOG_TEE("n_accept = %d\n", n_accept);
-     LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
+     LOG_INF("n_accept = %d\n", n_accept);
+     LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
+
+     LOG_INF("\ntarget:\n\n");
+     common_perf_print(ctx, smpl);

-     LOG_TEE("\ntarget:\n");
-     llama_print_timings(ctx);
+     common_sampler_free(smpl);

-     llama_sampling_free(ctx_sampling);
      llama_batch_free(batch_tgt);

      llama_free(ctx);
@@ -251,7 +248,7 @@ int main(int argc, char ** argv){

      llama_backend_free();

-     fprintf(stderr, "\n\n");
+     LOG("\n\n");

      return 0;
  }
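
The lookup.cpp changes above (and most of the other example and source updates in this release) track the llama.cpp common-API rename: gpt_params becomes common_params, llama_sampling_* becomes common_sampler_*, and fprintf/LOG_TEE logging moves to LOG/LOG_INF/LOG_ERR/LOG_DBG. For orientation, the new initialization and sampling flow looks roughly like the minimal sketch below, assembled only from calls that appear in the hunks above; it omits error handling and the lookup-specific drafting loop and is illustrative rather than a buildable replacement for the example.

#include "arg.h"
#include "common.h"
#include "sampling.h"
#include "log.h"
#include "llama.h"

#include <vector>

int main(int argc, char ** argv) {
    common_params params;

    // argument parsing is now per-example (replaces gpt_params_parse/print_usage)
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
        return 1;
    }
    common_init(); // common runtime/logging setup, replaces the old LOG_DISABLE_LOGS block

    llama_backend_init();
    llama_numa_init(params.numa);

    // model and context are returned together instead of via std::tie
    common_init_result llama_init = common_init_from_params(params);
    llama_model   * model = llama_init.model;
    llama_context * ctx   = llama_init.context;

    // tokenize and decode the prompt (llama_batch_get_one now takes two arguments)
    std::vector<llama_token> inp = common_tokenize(ctx, params.prompt, true, true);
    llama_decode(ctx, llama_batch_get_one(inp.data(), (int32_t) inp.size()));

    // sampling goes through the new common_sampler API
    struct common_sampler * smpl = common_sampler_init(model, params.sparams);
    llama_token id = common_sampler_sample(smpl, ctx, -1); // -1: logits of the last token
    common_sampler_accept(smpl, id, true);
    LOG("%s\n", common_token_to_piece(ctx, id).c_str());

    common_perf_print(ctx, smpl);   // replaces llama_print_timings
    common_sampler_free(smpl);      // replaces llama_sampling_free
    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}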