@fugood/llama.node 0.3.1 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (252)
  1. package/CMakeLists.txt +1 -8
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +4 -2
  17. package/src/DetokenizeWorker.cpp +1 -1
  18. package/src/EmbeddingWorker.cpp +2 -2
  19. package/src/LlamaCompletionWorker.cpp +10 -10
  20. package/src/LlamaCompletionWorker.h +2 -2
  21. package/src/LlamaContext.cpp +14 -17
  22. package/src/TokenizeWorker.cpp +1 -1
  23. package/src/common.hpp +5 -4
  24. package/src/llama.cpp/.github/workflows/build.yml +137 -29
  25. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  26. package/src/llama.cpp/.github/workflows/docker.yml +46 -34
  27. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  28. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  29. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  30. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  31. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  32. package/src/llama.cpp/CMakeLists.txt +26 -11
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/common/CMakeLists.txt +10 -10
  35. package/src/llama.cpp/common/arg.cpp +2041 -0
  36. package/src/llama.cpp/common/arg.h +77 -0
  37. package/src/llama.cpp/common/common.cpp +523 -1861
  38. package/src/llama.cpp/common/common.h +234 -106
  39. package/src/llama.cpp/common/console.cpp +3 -0
  40. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  41. package/src/llama.cpp/common/log.cpp +401 -0
  42. package/src/llama.cpp/common/log.h +66 -698
  43. package/src/llama.cpp/common/ngram-cache.cpp +39 -36
  44. package/src/llama.cpp/common/ngram-cache.h +19 -19
  45. package/src/llama.cpp/common/sampling.cpp +356 -350
  46. package/src/llama.cpp/common/sampling.h +62 -139
  47. package/src/llama.cpp/common/stb_image.h +5990 -6398
  48. package/src/llama.cpp/docs/build.md +72 -17
  49. package/src/llama.cpp/examples/CMakeLists.txt +1 -2
  50. package/src/llama.cpp/examples/batched/batched.cpp +49 -65
  51. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
  52. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  53. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
  54. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  55. package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
  56. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
  57. package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
  58. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  59. package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
  60. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  61. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  62. package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
  63. package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
  64. package/src/llama.cpp/examples/infill/infill.cpp +131 -192
  65. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
  66. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  67. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
  68. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  69. package/src/llama.cpp/examples/llava/clip.cpp +686 -150
  70. package/src/llama.cpp/examples/llava/clip.h +11 -2
  71. package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
  72. package/src/llama.cpp/examples/llava/llava.cpp +146 -26
  73. package/src/llama.cpp/examples/llava/llava.h +2 -3
  74. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  75. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  76. package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
  77. package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
  78. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  79. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
  80. package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
  81. package/src/llama.cpp/examples/main/main.cpp +216 -313
  82. package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
  83. package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
  84. package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
  85. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  86. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  87. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
  88. package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
  89. package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
  90. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
  91. package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
  92. package/src/llama.cpp/examples/server/server.cpp +1347 -1531
  93. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  94. package/src/llama.cpp/examples/server/utils.hpp +396 -107
  95. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  96. package/src/llama.cpp/examples/simple/simple.cpp +132 -106
  97. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  98. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
  99. package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
  100. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  101. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  102. package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
  103. package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
  104. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  105. package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
  106. package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
  107. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  108. package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
  109. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  110. package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
  111. package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
  112. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  113. package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
  114. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  115. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  116. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  117. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  118. package/src/llama.cpp/ggml/include/ggml.h +272 -505
  119. package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
  120. package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
  121. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
  122. package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
  123. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  124. package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
  125. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  126. package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
  127. package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
  128. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
  129. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
  130. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
  131. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
  132. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
  133. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
  134. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  135. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  136. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  137. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  138. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
  139. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  140. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  141. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  142. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  143. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
  144. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
  145. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
  146. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
  147. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
  148. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  149. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
  150. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
  151. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
  152. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
  153. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  154. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  155. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  156. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
  157. package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
  158. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
  159. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
  160. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
  161. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
  162. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
  163. package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
  164. package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
  165. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
  166. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
  167. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
  168. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
  169. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
  170. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
  171. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
  172. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
  173. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  174. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  175. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  176. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  177. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  178. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  179. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
  180. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
  181. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  182. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  183. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
  184. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  185. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
  187. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
  188. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  192. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  195. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  197. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  198. package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
  199. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
  200. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
  201. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
  202. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
  203. package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
  204. package/src/llama.cpp/include/llama.h +296 -285
  205. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  206. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  207. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  208. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  209. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  210. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  211. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  212. package/src/llama.cpp/src/llama-grammar.h +120 -15
  213. package/src/llama.cpp/src/llama-impl.h +156 -1
  214. package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
  215. package/src/llama.cpp/src/llama-sampling.h +39 -47
  216. package/src/llama.cpp/src/llama-vocab.cpp +390 -127
  217. package/src/llama.cpp/src/llama-vocab.h +60 -20
  218. package/src/llama.cpp/src/llama.cpp +6215 -3263
  219. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  220. package/src/llama.cpp/src/unicode-data.h +4 -4
  221. package/src/llama.cpp/src/unicode.cpp +15 -7
  222. package/src/llama.cpp/tests/CMakeLists.txt +4 -2
  223. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  224. package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
  225. package/src/llama.cpp/tests/test-barrier.cpp +94 -0
  226. package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
  227. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  228. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  229. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
  230. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  231. package/src/llama.cpp/tests/test-log.cpp +39 -0
  232. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  233. package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
  234. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  235. package/src/llama.cpp/tests/test-rope.cpp +2 -1
  236. package/src/llama.cpp/tests/test-sampling.cpp +226 -142
  237. package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
  238. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  239. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  240. package/patches/llama.patch +0 -22
  241. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  242. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  243. package/src/llama.cpp/common/grammar-parser.h +0 -29
  244. package/src/llama.cpp/common/train.cpp +0 -1513
  245. package/src/llama.cpp/common/train.h +0 -233
  246. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
  247. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  248. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
  249. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
  250. package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
  251. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  252. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
package/src/llama.cpp/examples/embedding/embedding.cpp
@@ -1,4 +1,6 @@
+#include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"
 
 #include <ctime>
@@ -26,18 +28,29 @@ static std::vector<std::string> split_lines(const std::string & s, const std::st
 static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
     size_t n_tokens = tokens.size();
     for (size_t i = 0; i < n_tokens; i++) {
-        llama_batch_add(batch, tokens[i], i, { seq_id }, true);
+        common_batch_add(batch, tokens[i], i, { seq_id }, true);
     }
 }
 
 static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
+    const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
+    const struct llama_model * model = llama_get_model(ctx);
+
     // clear previous kv_cache values (irrelevant for embeddings)
     llama_kv_cache_clear(ctx);
 
     // run model
-    fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
-    if (llama_decode(ctx, batch) < 0) {
-        fprintf(stderr, "%s : failed to decode\n", __func__);
+    LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
+    if (llama_model_has_encoder(model) && !llama_model_has_decoder(model)) {
+        // encoder-only model
+        if (llama_encode(ctx, batch) < 0) {
+            LOG_ERR("%s : failed to encode\n", __func__);
+        }
+    } else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
+        // decoder-only model
+        if (llama_decode(ctx, batch) < 0) {
+            LOG_ERR("%s : failed to decode\n", __func__);
+        }
     }
 
     for (int i = 0; i < batch.n_tokens; i++) {
@@ -45,47 +58,49 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
             continue;
         }
 
-        // try to get sequence embeddings - supported only when pooling_type is not NONE
-        const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
-        GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");
+        const float * embd = nullptr;
+        int embd_pos = 0;
+
+        if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
+            // try to get token embeddings
+            embd = llama_get_embeddings_ith(ctx, i);
+            embd_pos = i;
+            GGML_ASSERT(embd != NULL && "failed to get token embeddings");
+        } else {
+            // try to get sequence embeddings - supported only when pooling_type is not NONE
+            embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
+            embd_pos = batch.seq_id[i][0];
+            GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");
+        }
 
-        float * out = output + batch.seq_id[i][0] * n_embd;
-        llama_embd_normalize(embd, out, n_embd, embd_norm);
+        float * out = output + embd_pos * n_embd;
+        common_embd_normalize(embd, out, n_embd, embd_norm);
     }
 }
 
 int main(int argc, char ** argv) {
-    gpt_params params;
+    common_params params;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) {
         return 1;
     }
 
+    common_init();
+
     params.embedding = true;
     // For non-causal models, batch size must be equal to ubatch size
     params.n_ubatch = params.n_batch;
 
-    print_build_info();
-
-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
-    }
-
-    fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
-
     llama_backend_init();
    llama_numa_init(params.numa);
 
-    llama_model * model;
-    llama_context * ctx;
-
     // load the model
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    common_init_result llama_init = common_init_from_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
     if (model == NULL) {
-        fprintf(stderr, "%s: error: unable to load model\n", __func__);
+        LOG_ERR("%s: unable to load model\n", __func__);
         return 1;
     }
 
@@ -93,20 +108,21 @@ int main(int argc, char ** argv) {
     const int n_ctx = llama_n_ctx(ctx);
 
     const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
-    if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
-        fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
+
+    if (llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
+        LOG_ERR("%s: computing embeddings in encoder-decoder models is not supported\n", __func__);
         return 1;
     }
 
     if (n_ctx > n_ctx_train) {
-        fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
+        LOG_WRN("%s: warning: model was trained on only %d context tokens (%d specified)\n",
                 __func__, n_ctx_train, n_ctx);
     }
 
     // print system information
     {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
+        LOG_INF("\n");
+        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
     }
 
     // split the prompt into lines
@@ -119,9 +135,9 @@ int main(int argc, char ** argv) {
     // tokenize the prompts and trim
     std::vector<std::vector<int32_t>> inputs;
     for (const auto & prompt : prompts) {
-        auto inp = ::llama_tokenize(ctx, prompt, true, false);
+        auto inp = common_tokenize(ctx, prompt, true, true);
         if (inp.size() > n_batch) {
-            fprintf(stderr, "%s: error: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
+            LOG_ERR("%s: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
                     __func__, (long long int) inp.size(), (long long int) n_batch);
             return 1;
         }
@@ -132,20 +148,20 @@ int main(int argc, char ** argv) {
     // it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
     for (auto & inp : inputs) {
         if (inp.empty() || inp.back() != llama_token_sep(model)) {
-            fprintf(stderr, "%s: warning: last token in the prompt is not SEP\n", __func__);
-            fprintf(stderr, "%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
+            LOG_WRN("%s: last token in the prompt is not SEP\n", __func__);
+            LOG_WRN("%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
         }
     }
 
     // tokenization stats
     if (params.verbose_prompt) {
         for (int i = 0; i < (int) inputs.size(); i++) {
-            fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
-            fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
+            LOG_INF("%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
+            LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
             for (int j = 0; j < (int) inputs[i].size(); j++) {
-                fprintf(stderr, "%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str());
+                LOG("%6d -> '%s'\n", inputs[i][j], common_token_to_piece(ctx, inputs[i][j]).c_str());
             }
-            fprintf(stderr, "\n\n");
+            LOG("\n\n");
         }
     }
 
@@ -153,13 +169,23 @@ int main(int argc, char ** argv) {
     const int n_prompts = prompts.size();
     struct llama_batch batch = llama_batch_init(n_batch, 0, 1);
 
+    // count number of embeddings
+    int n_embd_count = 0;
+    if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
+        for (int k = 0; k < n_prompts; k++) {
+            n_embd_count += inputs[k].size();
+        }
+    } else {
+        n_embd_count = n_prompts;
+    }
+
     // allocate output
     const int n_embd = llama_n_embd(model);
-    std::vector<float> embeddings(n_prompts * n_embd, 0);
+    std::vector<float> embeddings(n_embd_count * n_embd, 0);
     float * emb = embeddings.data();
 
     // break into batches
-    int p = 0; // number of prompts processed already
+    int e = 0; // number of embeddings already stored
     int s = 0; // number of prompts in current batch
     for (int k = 0; k < n_prompts; k++) {
         // clamp to n_batch tokens
@@ -169,11 +195,11 @@ int main(int argc, char ** argv) {
 
         // encode if at capacity
         if (batch.n_tokens + n_toks > n_batch) {
-            float * out = emb + p * n_embd;
+            float * out = emb + e * n_embd;
             batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
-            llama_batch_clear(batch);
-            p += s;
+            e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s;
             s = 0;
+            common_batch_clear(batch);
         }
 
         // add to batch
@@ -182,39 +208,67 @@ int main(int argc, char ** argv) {
     }
 
     // final batch
-    float * out = emb + p * n_embd;
+    float * out = emb + e * n_embd;
     batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
 
     if (params.embd_out.empty()) {
-        // print the first part of the embeddings or for a single prompt, the full embedding
-        fprintf(stdout, "\n");
-        for (int j = 0; j < n_prompts; j++) {
-            fprintf(stdout, "embedding %d: ", j);
-            for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
-                if (params.embd_normalize == 0) {
-                    fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
-                } else {
-                    fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
+        LOG("\n");
+
+        if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
+            for (int j = 0; j < n_embd_count; j++) {
+                LOG("embedding %d: ", j);
+                for (int i = 0; i < std::min(3, n_embd); i++) {
+                    if (params.embd_normalize == 0) {
+                        LOG("%6.0f ", emb[j * n_embd + i]);
+                    } else {
+                        LOG("%9.6f ", emb[j * n_embd + i]);
+                    }
+                }
+                LOG(" ... ");
+                for (int i = n_embd - 3; i < n_embd; i++) {
+                    if (params.embd_normalize == 0) {
+                        LOG("%6.0f ", emb[j * n_embd + i]);
+                    } else {
+                        LOG("%9.6f ", emb[j * n_embd + i]);
+                    }
                 }
+                LOG("\n");
             }
-            fprintf(stdout, "\n");
-        }
-
-        // print cosine similarity matrix
-        if (n_prompts > 1) {
-            fprintf(stdout, "\n");
-            printf("cosine similarity matrix:\n\n");
-            for (int i = 0; i < n_prompts; i++) {
-                fprintf(stdout, "%6.6s ", prompts[i].c_str());
+        } else if (pooling_type == LLAMA_POOLING_TYPE_RANK) {
+            for (int j = 0; j < n_embd_count; j++) {
+                // NOTE: if you change this log - update the tests in ci/run.sh
+                LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd]);
+            }
+        } else {
+            // print the first part of the embeddings or for a single prompt, the full embedding
+            for (int j = 0; j < n_prompts; j++) {
+                LOG("embedding %d: ", j);
+                for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
+                    if (params.embd_normalize == 0) {
+                        LOG("%6.0f ", emb[j * n_embd + i]);
+                    } else {
+                        LOG("%9.6f ", emb[j * n_embd + i]);
+                    }
+                }
+                LOG("\n");
             }
-            fprintf(stdout, "\n");
-            for (int i = 0; i < n_prompts; i++) {
-                for (int j = 0; j < n_prompts; j++) {
-                    float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
-                    fprintf(stdout, "%6.2f ", sim);
+
+            // print cosine similarity matrix
+            if (n_prompts > 1) {
+                LOG("\n");
+                LOG("cosine similarity matrix:\n\n");
+                for (int i = 0; i < n_prompts; i++) {
+                    LOG("%6.6s ", prompts[i].c_str());
+                }
+                LOG("\n");
+                for (int i = 0; i < n_prompts; i++) {
+                    for (int j = 0; j < n_prompts; j++) {
+                        float sim = common_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+                        LOG("%6.2f ", sim);
+                    }
+                    LOG("%1.10s", prompts[i].c_str());
+                    LOG("\n");
                 }
-                fprintf(stdout, "%1.10s", prompts[i].c_str());
-                fprintf(stdout, "\n");
             }
         }
     }
@@ -222,43 +276,45 @@ int main(int argc, char ** argv) {
     if (params.embd_out == "json" || params.embd_out == "json+" || params.embd_out == "array") {
         const bool notArray = params.embd_out != "array";
 
-        fprintf(stdout, notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "[");
+        LOG(notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "[");
         for (int j = 0;;) { // at least one iteration (one prompt)
-            if (notArray) fprintf(stdout, " {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j);
-            fprintf(stdout, "[");
+            if (notArray) LOG(" {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j);
+            LOG("[");
             for (int i = 0;;) { // at least one iteration (n_embd > 0)
-                fprintf(stdout, params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
+                LOG(params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
                 i++;
-                if (i < n_embd) fprintf(stdout, ","); else break;
+                if (i < n_embd) LOG(","); else break;
             }
-            fprintf(stdout, notArray ? "]\n }" : "]");
+            LOG(notArray ? "]\n }" : "]");
             j++;
-            if (j < n_prompts) fprintf(stdout, notArray ? ",\n" : ","); else break;
+            if (j < n_embd_count) LOG(notArray ? ",\n" : ","); else break;
         }
-        fprintf(stdout, notArray ? "\n ]" : "]\n");
+        LOG(notArray ? "\n ]" : "]\n");
 
         if (params.embd_out == "json+" && n_prompts > 1) {
-            fprintf(stdout, ",\n \"cosineSimilarity\": [\n");
-            for (int i = 0;;) { // at least two iteration (n_prompts > 1)
-                fprintf(stdout, " [");
-                for (int j = 0;;) { // at least two iteration (n_prompts > 1)
-                    float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
-                    fprintf(stdout, "%6.2f", sim);
+            LOG(",\n \"cosineSimilarity\": [\n");
+            for (int i = 0;;) { // at least two iteration (n_embd_count > 1)
+                LOG(" [");
+                for (int j = 0;;) { // at least two iteration (n_embd_count > 1)
+                    float sim = common_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+                    LOG("%6.2f", sim);
                     j++;
-                    if (j < n_prompts) fprintf(stdout, ", "); else break;
+                    if (j < n_embd_count) LOG(", "); else break;
                 }
-                fprintf(stdout, " ]");
+                LOG(" ]");
                 i++;
-                if (i < n_prompts) fprintf(stdout, ",\n"); else break;
+                if (i < n_embd_count) LOG(",\n"); else break;
             }
-            fprintf(stdout, "\n ]");
+            LOG("\n ]");
         }
 
-        if (notArray) fprintf(stdout, "\n}\n");
+        if (notArray) LOG("\n}\n");
     }
 
+    LOG("\n");
+    llama_perf_context_print(ctx);
+
     // clean up
-    llama_print_timings(ctx);
     llama_batch_free(batch);
     llama_free(ctx);
     llama_free_model(model);
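
The embedding.cpp hunks above follow the upstream llama.cpp rename of the gpt_* helpers in common/ to common_* and the switch from raw fprintf calls to the LOG/LOG_INF/LOG_ERR macros from the new common/log.h. As a rough, hedged orientation sketch (not code shipped in this package), a minimal consumer of the renamed API would now initialize and tear down roughly as below; every call is taken from the hunks above except llama_backend_free(), which is the long-standing counterpart of llama_backend_init() in llama.h.

#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"

int main(int argc, char ** argv) {
    common_params params;                               // was: gpt_params
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) {
        return 1;                                       // was: gpt_params_parse + gpt_params_print_usage
    }
    common_init();                                      // replaces print_build_info() and manual seeding

    llama_backend_init();
    llama_numa_init(params.numa);

    common_init_result llama_init = common_init_from_params(params);  // was: llama_init_from_gpt_params
    llama_model   * model = llama_init.model;
    llama_context * ctx   = llama_init.context;
    if (model == NULL) {
        LOG_ERR("%s: unable to load model\n", __func__);
        return 1;
    }

    // ... tokenize with common_tokenize(), queue tokens with common_batch_add(),
    // then llama_encode()/llama_decode() as in the diff above ...

    LOG("\n");
    llama_perf_context_print(ctx);                      // was: llama_print_timings
    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}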
package/src/llama.cpp/examples/eval-callback/eval-callback.cpp
@@ -1,11 +1,11 @@
+#include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"
 #include "ggml.h"
 
 #include <cstdio>
-#include <random>
 #include <string>
-#include <tuple>
 #include <vector>
 
 /**
@@ -31,22 +31,22 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
     GGML_ASSERT(n > 0);
     float sum = 0;
     for (int64_t i3 = 0; i3 < ne[3]; i3++) {
-        printf(" [\n");
+        LOG(" [\n");
         for (int64_t i2 = 0; i2 < ne[2]; i2++) {
             if (i2 == n && ne[2] > 2*n) {
-                printf(" ..., \n");
+                LOG(" ..., \n");
                 i2 = ne[2] - n;
             }
-            printf(" [\n");
+            LOG(" [\n");
             for (int64_t i1 = 0; i1 < ne[1]; i1++) {
                 if (i1 == n && ne[1] > 2*n) {
-                    printf(" ..., \n");
+                    LOG(" ..., \n");
                     i1 = ne[1] - n;
                 }
-                printf(" [");
+                LOG(" [");
                 for (int64_t i0 = 0; i0 < ne[0]; i0++) {
                     if (i0 == n && ne[0] > 2*n) {
-                        printf("..., ");
+                        LOG("..., ");
                         i0 = ne[0] - n;
                     }
                     size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
@@ -64,16 +64,16 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
                     } else {
                         GGML_ABORT("fatal error");
                     }
-                    printf("%12.4f", v);
+                    LOG("%12.4f", v);
                     sum += v;
-                    if (i0 < ne[0] - 1) printf(", ");
+                    if (i0 < ne[0] - 1) LOG(", ");
                 }
-                printf("],\n");
+                LOG("],\n");
             }
-            printf(" ],\n");
+            LOG(" ],\n");
         }
-        printf(" ]\n");
-        printf(" sum = %f\n", sum);
+        LOG(" ]\n");
+        LOG(" sum = %f\n", sum);
     }
 }
 
@@ -102,11 +102,11 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
         snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
     }
 
-    printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
-           t->name, ggml_type_name(t->type), ggml_op_desc(t),
-           src0->name, ggml_ne_string(src0).c_str(),
-           src1 ? src1_str : "",
-           ggml_ne_string(t).c_str());
+    LOG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
+        t->name, ggml_type_name(t->type), ggml_op_desc(t),
+        src0->name, ggml_ne_string(src0).c_str(),
+        src1 ? src1_str : "",
+        ggml_ne_string(t).c_str());
 
 
     // copy the data from the GPU memory if needed
@@ -126,13 +126,13 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
     return true;
 }
 
-static bool run(llama_context * ctx, const gpt_params & params) {
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+static bool run(llama_context * ctx, const common_params & params) {
+    const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
 
-    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
+    std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);
 
-    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
-        fprintf(stderr, "%s : failed to eval\n", __func__);
+    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
+        LOG_ERR("%s : failed to eval\n", __func__);
         return false;
     }
 
@@ -142,16 +142,13 @@ static bool run(llama_context * ctx, const gpt_params & params) {
 int main(int argc, char ** argv) {
     callback_data cb_data;
 
-    gpt_params params;
+    common_params params;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
         return 1;
     }
 
-    print_build_info();
-
-    std::mt19937 rng(params.seed);
+    common_init();
 
     llama_backend_init();
     llama_numa_init(params.numa);
@@ -163,18 +160,20 @@ int main(int argc, char ** argv) {
     params.warmup = false;
 
     // init
-    llama_model * model;
-    llama_context * ctx;
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    common_init_result llama_init = common_init_from_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
     if (model == nullptr || ctx == nullptr) {
-        fprintf(stderr, "%s : failed to init\n", __func__);
+        LOG_ERR("%s : failed to init\n", __func__);
         return 1;
     }
 
     // print system information
     {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
+        LOG_INF("\n");
+        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
+        LOG_INF("\n");
     }
 
     bool OK = run(ctx, params);
@@ -182,7 +181,8 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    llama_print_timings(ctx);
+    LOG("\n");
+    llama_perf_context_print(ctx);
 
     llama_free(ctx);
     llama_free_model(model);
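
The eval-callback.cpp hunks show the same rename plus two call-site changes that any embedder of this vendored llama.cpp (including the workers under package/src) has to mirror: llama_batch_get_one() no longer takes a start position or sequence id, and timing output moves from llama_print_timings() to llama_perf_context_print(). A hedged before/after sketch, assuming ctx is a valid llama_context * and tokens is a std::vector<llama_token>:

// before (llama.cpp vendored in 0.3.1): caller supplied start position 0 and sequence id 0
//   llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0));

// after (llama.cpp vendored in 0.3.3): positions and sequence are tracked by the context
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
    LOG_ERR("%s : failed to eval\n", __func__);
    return false;
}

LOG("\n");
llama_perf_context_print(ctx);   // replaces llama_print_timings(ctx)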