@fugood/llama.node 0.3.1 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (252)
  1. package/CMakeLists.txt +1 -8
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +4 -2
  17. package/src/DetokenizeWorker.cpp +1 -1
  18. package/src/EmbeddingWorker.cpp +2 -2
  19. package/src/LlamaCompletionWorker.cpp +10 -10
  20. package/src/LlamaCompletionWorker.h +2 -2
  21. package/src/LlamaContext.cpp +14 -17
  22. package/src/TokenizeWorker.cpp +1 -1
  23. package/src/common.hpp +5 -4
  24. package/src/llama.cpp/.github/workflows/build.yml +137 -29
  25. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  26. package/src/llama.cpp/.github/workflows/docker.yml +46 -34
  27. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  28. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  29. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  30. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  31. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  32. package/src/llama.cpp/CMakeLists.txt +26 -11
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/common/CMakeLists.txt +10 -10
  35. package/src/llama.cpp/common/arg.cpp +2041 -0
  36. package/src/llama.cpp/common/arg.h +77 -0
  37. package/src/llama.cpp/common/common.cpp +523 -1861
  38. package/src/llama.cpp/common/common.h +234 -106
  39. package/src/llama.cpp/common/console.cpp +3 -0
  40. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  41. package/src/llama.cpp/common/log.cpp +401 -0
  42. package/src/llama.cpp/common/log.h +66 -698
  43. package/src/llama.cpp/common/ngram-cache.cpp +39 -36
  44. package/src/llama.cpp/common/ngram-cache.h +19 -19
  45. package/src/llama.cpp/common/sampling.cpp +356 -350
  46. package/src/llama.cpp/common/sampling.h +62 -139
  47. package/src/llama.cpp/common/stb_image.h +5990 -6398
  48. package/src/llama.cpp/docs/build.md +72 -17
  49. package/src/llama.cpp/examples/CMakeLists.txt +1 -2
  50. package/src/llama.cpp/examples/batched/batched.cpp +49 -65
  51. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
  52. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  53. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
  54. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  55. package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
  56. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
  57. package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
  58. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  59. package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
  60. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  61. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  62. package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
  63. package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
  64. package/src/llama.cpp/examples/infill/infill.cpp +131 -192
  65. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
  66. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  67. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
  68. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  69. package/src/llama.cpp/examples/llava/clip.cpp +686 -150
  70. package/src/llama.cpp/examples/llava/clip.h +11 -2
  71. package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
  72. package/src/llama.cpp/examples/llava/llava.cpp +146 -26
  73. package/src/llama.cpp/examples/llava/llava.h +2 -3
  74. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  75. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  76. package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
  77. package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
  78. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  79. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
  80. package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
  81. package/src/llama.cpp/examples/main/main.cpp +216 -313
  82. package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
  83. package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
  84. package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
  85. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  86. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  87. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
  88. package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
  89. package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
  90. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
  91. package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
  92. package/src/llama.cpp/examples/server/server.cpp +1347 -1531
  93. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  94. package/src/llama.cpp/examples/server/utils.hpp +396 -107
  95. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  96. package/src/llama.cpp/examples/simple/simple.cpp +132 -106
  97. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  98. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
  99. package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
  100. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  101. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  102. package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
  103. package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
  104. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  105. package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
  106. package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
  107. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  108. package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
  109. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  110. package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
  111. package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
  112. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  113. package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
  114. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  115. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  116. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  117. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  118. package/src/llama.cpp/ggml/include/ggml.h +272 -505
  119. package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
  120. package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
  121. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
  122. package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
  123. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  124. package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
  125. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  126. package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
  127. package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
  128. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
  129. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
  130. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
  131. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
  132. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
  133. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
  134. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  135. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  136. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  137. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  138. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
  139. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  140. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  141. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  142. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  143. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
  144. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
  145. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
  146. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
  147. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
  148. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  149. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
  150. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
  151. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
  152. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
  153. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  154. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  155. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  156. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
  157. package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
  158. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
  159. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
  160. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
  161. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
  162. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
  163. package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
  164. package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
  165. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
  166. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
  167. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
  168. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
  169. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
  170. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
  171. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
  172. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
  173. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  174. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  175. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  176. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  177. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  178. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  179. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
  180. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
  181. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  182. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  183. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
  184. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  185. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
  187. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
  188. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  192. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  195. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  197. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  198. package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
  199. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
  200. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
  201. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
  202. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
  203. package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
  204. package/src/llama.cpp/include/llama.h +296 -285
  205. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  206. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  207. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  208. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  209. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  210. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  211. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  212. package/src/llama.cpp/src/llama-grammar.h +120 -15
  213. package/src/llama.cpp/src/llama-impl.h +156 -1
  214. package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
  215. package/src/llama.cpp/src/llama-sampling.h +39 -47
  216. package/src/llama.cpp/src/llama-vocab.cpp +390 -127
  217. package/src/llama.cpp/src/llama-vocab.h +60 -20
  218. package/src/llama.cpp/src/llama.cpp +6215 -3263
  219. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  220. package/src/llama.cpp/src/unicode-data.h +4 -4
  221. package/src/llama.cpp/src/unicode.cpp +15 -7
  222. package/src/llama.cpp/tests/CMakeLists.txt +4 -2
  223. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  224. package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
  225. package/src/llama.cpp/tests/test-barrier.cpp +94 -0
  226. package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
  227. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  228. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  229. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
  230. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  231. package/src/llama.cpp/tests/test-log.cpp +39 -0
  232. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  233. package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
  234. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  235. package/src/llama.cpp/tests/test-rope.cpp +2 -1
  236. package/src/llama.cpp/tests/test-sampling.cpp +226 -142
  237. package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
  238. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  239. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  240. package/patches/llama.patch +0 -22
  241. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  242. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  243. package/src/llama.cpp/common/grammar-parser.h +0 -29
  244. package/src/llama.cpp/common/train.cpp +0 -1513
  245. package/src/llama.cpp/common/train.h +0 -233
  246. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
  247. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  248. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
  249. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
  250. package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
  251. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  252. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0

package/src/llama.cpp/examples/lookahead/lookahead.cpp

@@ -1,7 +1,9 @@
+ #include "arg.h"
  #include "common.h"
+ #include "sampling.h"
+ #include "log.h"
  #include "llama.h"

- #include <cmath>
  #include <cstdio>
  #include <string>
  #include <vector>
@@ -35,54 +37,49 @@ struct ngram_container {
  };

  int main(int argc, char ** argv) {
- gpt_params params;
+ common_params params;

- if (!gpt_params_parse(argc, argv, params)) {
- gpt_params_print_usage(argc, argv, params);
+ if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
  return 1;
  }

+ common_init();
+
  const int W = 15; // lookahead window
  const int N = 5; // n-gram size
  const int G = 15; // max verification n-grams

  const bool dump_kv_cache = params.dump_kv_cache;

- #ifndef LOG_DISABLE_LOGS
- log_set_target(log_filename_generator("lookahead", "log"));
- LOG_TEE("Log start\n");
- log_dump_cmdline(argc, argv);
- #endif // LOG_DISABLE_LOGS
-
  // init llama.cpp
  llama_backend_init();
  llama_numa_init(params.numa);

- llama_model * model = NULL;
- llama_context * ctx = NULL;
-
  // load the target model
- std::tie(model, ctx) = llama_init_from_gpt_params(params);
+ common_init_result llama_init = common_init_from_params(params);
+
+ llama_model * model = llama_init.model;
+ llama_context * ctx = llama_init.context;

  // Tokenize the prompt
  std::vector<llama_token> inp;
  std::vector<llama_token> all;

- inp = ::llama_tokenize(ctx, params.prompt, true, true);
+ inp = common_tokenize(ctx, params.prompt, true, true);
  all = inp;

  const int max_context_size = llama_n_ctx(ctx);
  const int max_tokens_list_size = max_context_size - 4;

  if ((int) inp.size() > max_tokens_list_size) {
- fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
+ LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
  return 1;
  }

- fprintf(stderr, "\n\n");
+ LOG("\n\n");

  for (auto id : inp) {
- fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
+ LOG("%s", common_token_to_piece(ctx, id).c_str());
  }

  fflush(stderr);
@@ -92,8 +89,8 @@ int main(int argc, char ** argv) {
  const auto t_enc_start = ggml_time_us();

  // eval the prompt
- llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1, 0, 0));
- llama_decode(ctx, llama_batch_get_one(&inp.back(), 1, n_input - 1, 0));
+ llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1));
+ llama_decode(ctx, llama_batch_get_one(&inp.back(), 1));

  for (int s = 1; s < W + G + 1; ++s) {
  llama_kv_cache_seq_cp(ctx, 0, s, -1, -1);
@@ -118,7 +115,7 @@ int main(int argc, char ** argv) {
  llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);

  // target model sampling context
- struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
+ struct common_sampler * smpl = common_sampler_init(model, params.sparams);

  // verification n-grams
  std::vector<ngram_data> ngrams_cur(G);
@@ -159,14 +156,14 @@ int main(int argc, char ** argv) {

  // sample first token
  {
- id = llama_sampling_sample(ctx_sampling, ctx, NULL, 0);
+ id = common_sampler_sample(smpl, ctx, 0);

- llama_sampling_accept(ctx_sampling, ctx, id, true);
+ common_sampler_accept(smpl, id, true);

  {
- const std::string token_str = llama_token_to_piece(ctx, id);
+ const std::string token_str = common_token_to_piece(ctx, id);

- printf("%s", token_str.c_str());
+ LOG("%s", token_str.c_str());
  fflush(stdout);
  }
  }
@@ -175,7 +172,7 @@ int main(int argc, char ** argv) {
  // debug
  if (dump_kv_cache) {
  llama_kv_cache_view_update(ctx, &kvc_view);
- llama_kv_cache_dump_view_seqs(kvc_view, 40);
+ common_kv_cache_dump_view_seqs(kvc_view, 40);
  }

  // build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/
@@ -204,10 +201,10 @@ int main(int argc, char ** argv) {
  // V V V V V V
  // id
  {
- llama_batch_clear(batch);
+ common_batch_clear(batch);

  // current token - first token of the first level
- llama_batch_add(batch, id, n_past, seq_id_all, true);
+ common_batch_add(batch, id, n_past, seq_id_all, true);

  // verification n-grams - queue this before the lookahead tokens for less KV cache fragmentation
  {
@@ -232,7 +229,7 @@ int main(int argc, char ** argv) {
  ngrams_cur[g].tokens [j + 1] = t;
  ngrams_cur[g].i_batch[j + 1] = batch.n_tokens;

- llama_batch_add(batch, t, n_past + j + 1, { W + 1 + g }, true);
+ common_batch_add(batch, t, n_past + j + 1, { W + 1 + g }, true);
  }
  }
  }
@@ -244,19 +241,19 @@ int main(int argc, char ** argv) {
  seq_id_look[j] = i + j + 1;
  }

- llama_batch_add(batch, tokens_j[0][i], n_past + i, seq_id_look, false);
+ common_batch_add(batch, tokens_j[0][i], n_past + i, seq_id_look, false);
  }

  // fill the rest of the levels
  for (int j = 1; j < N - 1; j++) {
  for (int i = 0; i < W; i++) {
- llama_batch_add(batch, tokens_j[j][i], n_past + j + i, { i + 1 }, j == N - 2);
+ common_batch_add(batch, tokens_j[j][i], n_past + j + i, { i + 1 }, j == N - 2);
  }
  }
  }

  if (llama_decode(ctx, batch) != 0) {
- fprintf(stderr, "\n\n%s: error: llama_decode failed - increase KV cache size\n", __func__);
+ LOG_ERR("\n\n%s: llama_decode failed - increase KV cache size\n", __func__);
  return 1;
  }

@@ -284,19 +281,19 @@ int main(int argc, char ** argv) {
  }

  // sample the next token
- id = llama_sampling_sample(ctx_sampling, ctx, NULL, i_batch);
+ id = common_sampler_sample(smpl, ctx, i_batch);

- llama_sampling_accept(ctx_sampling, ctx, id, true);
+ common_sampler_accept(smpl, id, true);

  // print
  {
- const std::string token_str = llama_token_to_piece(ctx, id);
+ const std::string token_str = common_token_to_piece(ctx, id);

  if (v == 0) {
- printf("%s", token_str.c_str());
+ LOG("%s", token_str.c_str());
  } else {
  // print light cyan
- printf("\033[0;96m%s\033[0m", token_str.c_str());
+ LOG("\033[0;96m%s\033[0m", token_str.c_str());
  }
  fflush(stdout);

@@ -330,21 +327,21 @@ int main(int argc, char ** argv) {
  // print known n-grams starting with token id (debug)
  if (0 && v == 0) {
  if (ngrams_observed.cnt[id] > 0) {
- printf("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str());
+ LOG("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], common_token_to_piece(ctx, id).c_str());
  }

  for (int i = 0; i < ngrams_observed.cnt[id]; i++) {
- printf(" - ngram %2d: ", i);
+ LOG(" - ngram %2d: ", i);

  const int idx = id*(N - 1)*G + i*(N - 1);

  for (int j = 0; j < N - 1; j++) {
- const std::string token_str = llama_token_to_piece(ctx, ngrams_observed.tokens[idx + j]);
+ const std::string token_str = common_token_to_piece(ctx, ngrams_observed.tokens[idx + j]);

- printf("%s", token_str.c_str());
+ LOG("%s", token_str.c_str());
  }

- printf("\n");
+ LOG("\n");
  }
  }

@@ -361,7 +358,7 @@ int main(int argc, char ** argv) {
  if (v == 0) {
  // sample from the last level
  for (int i = 0; i < W; i++) {
- tokens_j[N - 2][i] = llama_sampling_sample(ctx_sampling, ctx, NULL, ngrams_cur.size()*(N-1) + W*(N - 2) + i);
+ tokens_j[N - 2][i] = common_sampler_sample(smpl, ctx, ngrams_cur.size()*(N-1) + W*(N - 2) + i);
  }
  } else {
  for (int i = 0; i < W; i++) {
@@ -455,23 +452,25 @@ int main(int argc, char ** argv) {

  auto t_dec_end = ggml_time_us();

- LOG_TEE("\n\n");
+ LOG("\n\n");
+
+ LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
+ LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));

- LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
- LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
+ LOG_INF("\n");
+ LOG_INF("W = %2d\n", W);
+ LOG_INF("N = %2d\n", N);
+ LOG_INF("G = %2d\n", G);
+ LOG_INF("\n");
+ LOG_INF("n_predict = %d\n", n_predict);
+ LOG_INF("n_accept  = %d\n", n_accept);

- LOG_TEE("\n");
- LOG_TEE("W = %2d\n", W);
- LOG_TEE("N = %2d\n", N);
- LOG_TEE("G = %2d\n", G);
- LOG_TEE("\n");
- LOG_TEE("n_predict = %d\n", n_predict);
- LOG_TEE("n_accept  = %d\n", n_accept);
+ LOG_INF("\n");
+ common_perf_print(ctx, smpl);

- llama_print_timings(ctx);
+ common_sampler_free(smpl);

  llama_kv_cache_view_free(&kvc_view);
- llama_sampling_free(ctx_sampling);

  llama_batch_free(batch);

@@ -480,7 +479,7 @@ int main(int argc, char ** argv) {

  llama_backend_free();

- fprintf(stderr, "\n\n");
+ LOG("\n\n");

  return 0;
  }
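
The hunks above show the common-API migration that runs through most of the example changes in this release: gpt_params becomes common_params, llama_init_from_gpt_params becomes common_init_from_params, the llama_sampling_* context is replaced by common_sampler_*, ::llama_tokenize becomes common_tokenize, llama_batch_get_one drops its pos/seq_id arguments, and LOG_TEE/fprintf logging moves to the LOG/LOG_INF/LOG_ERR macros from common/log.h. The following is a hedged sketch of the new calling pattern, stitched together only from calls that appear in this diff; the program structure around them is illustrative, not code that ships in this package.

    // Minimal sketch of the new common_* API, assembled from calls in the diff above.
    #include "arg.h"
    #include "common.h"
    #include "sampling.h"
    #include "log.h"
    #include "llama.h"

    #include <string>
    #include <vector>

    int main(int argc, char ** argv) {
        common_params params;

        // replaces gpt_params_parse() + gpt_params_print_usage()
        if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
            return 1;
        }

        common_init(); // replaces the old LOG_DISABLE_LOGS / log_set_target() setup

        llama_backend_init();
        llama_numa_init(params.numa);

        // replaces std::tie(model, ctx) = llama_init_from_gpt_params(params)
        common_init_result llama_init = common_init_from_params(params);
        llama_model   * model = llama_init.model;
        llama_context * ctx   = llama_init.context;

        std::vector<llama_token> inp = common_tokenize(ctx, params.prompt, true, true);
        const int n_input = (int) inp.size();

        // llama_batch_get_one() no longer takes pos/seq_id arguments
        llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1));
        llama_decode(ctx, llama_batch_get_one(&inp.back(), 1));

        // replaces llama_sampling_init()/llama_sampling_sample()/llama_sampling_accept()
        struct common_sampler * smpl = common_sampler_init(model, params.sparams);

        const llama_token id = common_sampler_sample(smpl, ctx, 0);
        common_sampler_accept(smpl, id, true);
        LOG("%s\n", common_token_to_piece(ctx, id).c_str());

        common_perf_print(ctx, smpl); // replaces llama_print_timings()
        common_sampler_free(smpl);

        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();

        return 0;
    }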

package/src/llama.cpp/examples/lookup/lookup-create.cpp

@@ -1,7 +1,8 @@
- #include "ggml.h"
- #include "llama.h"
+ #include "arg.h"
  #include "common.h"
  #include "ngram-cache.h"
+ #include "ggml.h"
+ #include "llama.h"

  #include <cstdint>
  #include <fstream>
@@ -11,10 +12,9 @@
  #include <vector>

  int main(int argc, char ** argv){
- gpt_params params;
+ common_params params;

- if (!gpt_params_parse(argc, argv, params)) {
- gpt_params_print_usage(argc, argv, params);
+ if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
  return 1;
  }

@@ -22,22 +22,24 @@ int main(int argc, char ** argv){
  llama_backend_init();
  llama_numa_init(params.numa);

- llama_model * model = NULL;
- llama_context * ctx = NULL;
-
  // load the model
- std::tie(model, ctx) = llama_init_from_gpt_params(params);
+ common_init_result llama_init = common_init_from_params(params);
+
+ llama_model * model = llama_init.model;
+ llama_context * ctx = llama_init.context;
  GGML_ASSERT(model != nullptr);

  // tokenize the prompt
  std::vector<llama_token> inp;
- inp = ::llama_tokenize(ctx, params.prompt, true, true);
+ inp = common_tokenize(ctx, params.prompt, true, true);
  fprintf(stderr, "%s: tokenization done\n", __func__);


- llama_ngram_cache ngram_cache;
- llama_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true);
+ common_ngram_cache ngram_cache;
+ common_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true);
  fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str());

- llama_ngram_cache_save(ngram_cache, params.lookup_cache_static);
+ common_ngram_cache_save(ngram_cache, params.lookup_cache_static);
+
+ return 0;
  }

package/src/llama.cpp/examples/lookup/lookup-merge.cpp

@@ -33,15 +33,15 @@ int main(int argc, char ** argv){
  }

  fprintf(stderr, "lookup-merge: loading file %s\n", args[0].c_str());
- llama_ngram_cache ngram_cache_merged = llama_ngram_cache_load(args[0]);
+ common_ngram_cache ngram_cache_merged = common_ngram_cache_load(args[0]);

  for (size_t i = 1; i < args.size()-1; ++i) {
  fprintf(stderr, "lookup-merge: loading file %s\n", args[i].c_str());
- llama_ngram_cache ngram_cache = llama_ngram_cache_load(args[i]);
+ common_ngram_cache ngram_cache = common_ngram_cache_load(args[i]);

- llama_ngram_cache_merge(ngram_cache_merged, ngram_cache);
+ common_ngram_cache_merge(ngram_cache_merged, ngram_cache);
  }

  fprintf(stderr, "lookup-merge: saving file %s\n", args.back().c_str());
- llama_ngram_cache_save(ngram_cache_merged, args.back());
+ common_ngram_cache_save(ngram_cache_merged, args.back());
  }

package/src/llama.cpp/examples/lookup/lookup-stats.cpp

@@ -1,44 +1,45 @@
- #include "ggml.h"
+ #include "arg.h"
  #include "common.h"
- #include "llama.h"
  #include "log.h"
  #include "ngram-cache.h"
+ #include "llama.h"
+ #include "ggml.h"

- #include <cmath>
  #include <cstdint>
  #include <cstdio>
+ #include <cinttypes>
  #include <fstream>
  #include <string>
  #include <vector>
- #include <unordered_map>

  int main(int argc, char ** argv){
- gpt_params params;
+ common_params params;

- if (!gpt_params_parse(argc, argv, params)) {
- gpt_params_print_usage(argc, argv, params);
+ if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
  return 1;
  }

+ common_init();
+
  const int n_draft = params.n_draft;

  // init llama.cpp
  llama_backend_init();
  llama_numa_init(params.numa);

- llama_model * model = NULL;
- llama_context * ctx = NULL;
-
  // load the model
- std::tie(model, ctx) = llama_init_from_gpt_params(params);
+ common_init_result llama_init = common_init_from_params(params);
+
+ llama_model * model = llama_init.model;
+ llama_context * ctx = llama_init.context;

  // tokenize the prompt
  std::vector<llama_token> inp;
- inp = ::llama_tokenize(ctx, params.prompt, true, true);
+ inp = common_tokenize(ctx, params.prompt, true, true);

- llama_ngram_cache ngram_cache_context;
- llama_ngram_cache ngram_cache_dynamic;
- llama_ngram_cache ngram_cache_static;
+ common_ngram_cache ngram_cache_context;
+ common_ngram_cache ngram_cache_dynamic;
+ common_ngram_cache ngram_cache_static;
  int64_t t_draft_flat_us = 0;
  int64_t t_draft_us = 0;

@@ -47,16 +48,16 @@ int main(int argc, char ** argv){

  if (!params.lookup_cache_static.empty()) {
  try {
- ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
+ ngram_cache_static = common_ngram_cache_load(params.lookup_cache_static);
  } catch (std::ifstream::failure const &) {
- fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
+ LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
  exit(1);
  }
  }

  if (!params.lookup_cache_dynamic.empty()) {
  try {
- ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic);
+ ngram_cache_dynamic = common_ngram_cache_load(params.lookup_cache_dynamic);
  } catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
  }

@@ -85,7 +86,7 @@ int main(int argc, char ** argv){

  {
  const int64_t t_start_draft_us = ggml_time_us();
- llama_ngram_cache_draft(pseudo_output, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
+ common_ngram_cache_draft(pseudo_output, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
  t_draft_us += ggml_time_us() - t_start_draft_us;
  }

@@ -104,7 +105,7 @@ int main(int argc, char ** argv){

  {
  const int64_t t_start_draft_us = ggml_time_us();
- llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
+ common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
  t_draft_us += ggml_time_us() - t_start_draft_us;
  }
  }
@@ -114,7 +115,7 @@ int main(int argc, char ** argv){
  pseudo_output.push_back(inp_slice[pseudo_output.size()]);
  {
  const int64_t t_start_draft_us = ggml_time_us();
- llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
+ common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
  t_draft_us += ggml_time_us() - t_start_draft_us;
  }
  }
@@ -128,32 +129,32 @@ int main(int argc, char ** argv){
  const int64_t eta_min = eta_ms / (60*1000);
  const int64_t eta_s = (eta_ms - 60*1000*eta_min) / 1000;

- LOG_TEE("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s);
+ LOG_INF("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s);
  }

  // After each chunk, update the dynamic ngram cache with the context ngram cache:
- llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
+ common_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
  ngram_cache_context.clear();
  }

- LOG_TEE("\n");
+ LOG("\n");

- LOG_TEE("\n");
- LOG_TEE("n_draft      = %d\n", n_draft);
- LOG_TEE("n_predict    = %d\n", n_input - n_input % n_ctx);
- LOG_TEE("n_drafted    = %d\n", n_drafted);
- LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
- LOG_TEE("t_draft      = %.2f ms, %.2f us per token, %.2f tokens per second\n",
+ LOG_INF("\n");
+ LOG_INF("n_draft      = %d\n", n_draft);
+ LOG_INF("n_predict    = %d\n", n_input - n_input % n_ctx);
+ LOG_INF("n_drafted    = %d\n", n_drafted);
+ LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
+ LOG_INF("t_draft      = %.2f ms, %.2f us per token, %.2f tokens per second\n",
  t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
- LOG_TEE("n_accept     = %d\n", n_accept);
- LOG_TEE("accept       = %.3f%%\n", 100.0f * n_accept / n_drafted);
+ LOG_INF("n_accept     = %d\n", n_accept);
+ LOG_INF("accept       = %.3f%%\n", 100.0f * n_accept / n_drafted);

  llama_free(ctx);
  llama_free_model(model);

  llama_backend_free();

- fprintf(stderr, "\n\n");
+ LOG("\n\n");

  return 0;
  }
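
The lookup diffs above also rename the n-gram cache helpers from llama_ngram_cache_* to common_ngram_cache_* with otherwise unchanged signatures. Below is a hedged sketch of the renamed cache API (build, save, load, merge), using only the calls visible in this diff; the helper function name and file paths are illustrative placeholders, not part of the package.

    // Hypothetical helper: build a static n-gram cache from a prompt and fold it
    // into a dynamic cache on disk, using the renamed common_ngram_cache_* API.
    #include "common.h"
    #include "ngram-cache.h"
    #include "llama.h"

    #include <fstream>
    #include <string>
    #include <vector>

    static void update_lookup_caches(llama_context * ctx, const std::string & prompt) {
        std::vector<llama_token> inp = common_tokenize(ctx, prompt, true, true);

        // was: llama_ngram_cache / llama_ngram_cache_update / llama_ngram_cache_save
        common_ngram_cache ngram_cache_static;
        common_ngram_cache_update(ngram_cache_static, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC,
                                  inp, inp.size(), true);

        std::string path_static  = "lookup-cache-static.bin";  // placeholder path
        std::string path_dynamic = "lookup-cache-dynamic.bin"; // placeholder path
        common_ngram_cache_save(ngram_cache_static, path_static);

        common_ngram_cache ngram_cache_dynamic;
        try {
            ngram_cache_dynamic = common_ngram_cache_load(path_dynamic);
        } catch (std::ifstream::failure const &) {
            // no dynamic cache on disk yet - it will simply be created below
        }

        common_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_static);
        common_ngram_cache_save(ngram_cache_dynamic, path_dynamic);
    }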