@fugood/llama.node 0.3.1 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (252)
  1. package/CMakeLists.txt +1 -8
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +4 -2
  17. package/src/DetokenizeWorker.cpp +1 -1
  18. package/src/EmbeddingWorker.cpp +2 -2
  19. package/src/LlamaCompletionWorker.cpp +10 -10
  20. package/src/LlamaCompletionWorker.h +2 -2
  21. package/src/LlamaContext.cpp +14 -17
  22. package/src/TokenizeWorker.cpp +1 -1
  23. package/src/common.hpp +5 -4
  24. package/src/llama.cpp/.github/workflows/build.yml +137 -29
  25. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  26. package/src/llama.cpp/.github/workflows/docker.yml +46 -34
  27. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  28. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  29. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  30. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  31. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  32. package/src/llama.cpp/CMakeLists.txt +26 -11
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/common/CMakeLists.txt +10 -10
  35. package/src/llama.cpp/common/arg.cpp +2041 -0
  36. package/src/llama.cpp/common/arg.h +77 -0
  37. package/src/llama.cpp/common/common.cpp +523 -1861
  38. package/src/llama.cpp/common/common.h +234 -106
  39. package/src/llama.cpp/common/console.cpp +3 -0
  40. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  41. package/src/llama.cpp/common/log.cpp +401 -0
  42. package/src/llama.cpp/common/log.h +66 -698
  43. package/src/llama.cpp/common/ngram-cache.cpp +39 -36
  44. package/src/llama.cpp/common/ngram-cache.h +19 -19
  45. package/src/llama.cpp/common/sampling.cpp +356 -350
  46. package/src/llama.cpp/common/sampling.h +62 -139
  47. package/src/llama.cpp/common/stb_image.h +5990 -6398
  48. package/src/llama.cpp/docs/build.md +72 -17
  49. package/src/llama.cpp/examples/CMakeLists.txt +1 -2
  50. package/src/llama.cpp/examples/batched/batched.cpp +49 -65
  51. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
  52. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  53. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
  54. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  55. package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
  56. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
  57. package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
  58. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  59. package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
  60. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  61. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  62. package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
  63. package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
  64. package/src/llama.cpp/examples/infill/infill.cpp +131 -192
  65. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
  66. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  67. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
  68. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  69. package/src/llama.cpp/examples/llava/clip.cpp +686 -150
  70. package/src/llama.cpp/examples/llava/clip.h +11 -2
  71. package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
  72. package/src/llama.cpp/examples/llava/llava.cpp +146 -26
  73. package/src/llama.cpp/examples/llava/llava.h +2 -3
  74. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  75. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  76. package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
  77. package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
  78. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  79. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
  80. package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
  81. package/src/llama.cpp/examples/main/main.cpp +216 -313
  82. package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
  83. package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
  84. package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
  85. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  86. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  87. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
  88. package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
  89. package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
  90. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
  91. package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
  92. package/src/llama.cpp/examples/server/server.cpp +1347 -1531
  93. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  94. package/src/llama.cpp/examples/server/utils.hpp +396 -107
  95. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  96. package/src/llama.cpp/examples/simple/simple.cpp +132 -106
  97. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  98. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
  99. package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
  100. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  101. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  102. package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
  103. package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
  104. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  105. package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
  106. package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
  107. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  108. package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
  109. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  110. package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
  111. package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
  112. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  113. package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
  114. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  115. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  116. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  117. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  118. package/src/llama.cpp/ggml/include/ggml.h +272 -505
  119. package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
  120. package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
  121. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
  122. package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
  123. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  124. package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
  125. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  126. package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
  127. package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
  128. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
  129. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
  130. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
  131. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
  132. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
  133. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
  134. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  135. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  136. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  137. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  138. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
  139. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  140. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  141. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  142. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  143. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
  144. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
  145. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
  146. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
  147. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
  148. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  149. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
  150. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
  151. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
  152. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
  153. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  154. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  155. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  156. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
  157. package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
  158. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
  159. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
  160. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
  161. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
  162. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
  163. package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
  164. package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
  165. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
  166. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
  167. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
  168. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
  169. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
  170. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
  171. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
  172. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
  173. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  174. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  175. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  176. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  177. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  178. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  179. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
  180. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
  181. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  182. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  183. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
  184. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  185. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
  187. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
  188. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  192. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  195. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  197. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  198. package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
  199. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
  200. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
  201. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
  202. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
  203. package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
  204. package/src/llama.cpp/include/llama.h +296 -285
  205. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  206. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  207. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  208. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  209. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  210. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  211. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  212. package/src/llama.cpp/src/llama-grammar.h +120 -15
  213. package/src/llama.cpp/src/llama-impl.h +156 -1
  214. package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
  215. package/src/llama.cpp/src/llama-sampling.h +39 -47
  216. package/src/llama.cpp/src/llama-vocab.cpp +390 -127
  217. package/src/llama.cpp/src/llama-vocab.h +60 -20
  218. package/src/llama.cpp/src/llama.cpp +6215 -3263
  219. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  220. package/src/llama.cpp/src/unicode-data.h +4 -4
  221. package/src/llama.cpp/src/unicode.cpp +15 -7
  222. package/src/llama.cpp/tests/CMakeLists.txt +4 -2
  223. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  224. package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
  225. package/src/llama.cpp/tests/test-barrier.cpp +94 -0
  226. package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
  227. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  228. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  229. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
  230. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  231. package/src/llama.cpp/tests/test-log.cpp +39 -0
  232. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  233. package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
  234. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  235. package/src/llama.cpp/tests/test-rope.cpp +2 -1
  236. package/src/llama.cpp/tests/test-sampling.cpp +226 -142
  237. package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
  238. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  239. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  240. package/patches/llama.patch +0 -22
  241. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  242. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  243. package/src/llama.cpp/common/grammar-parser.h +0 -29
  244. package/src/llama.cpp/common/train.cpp +0 -1513
  245. package/src/llama.cpp/common/train.h +0 -233
  246. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
  247. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  248. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
  249. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
  250. package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
  251. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  252. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0

package/src/llama.cpp/examples/quantize/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(TARGET llama-quantize)
 add_executable(${TARGET} quantize.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_include_directories(${TARGET} PRIVATE ../../common)
 target_compile_features(${TARGET} PRIVATE cxx_std_11)

package/src/llama.cpp/examples/quantize/quantize.cpp
@@ -26,6 +26,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", },
     { "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", },
     { "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", },
+    { "TQ1_0", LLAMA_FTYPE_MOSTLY_TQ1_0, " 1.69 bpw ternarization", },
+    { "TQ2_0", LLAMA_FTYPE_MOSTLY_TQ2_0, " 2.06 bpw ternarization", },
     { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", },
     { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", },
     { "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", },
@@ -61,6 +63,16 @@ static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET = "quantize.imatrix
 static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES = "quantize.imatrix.entries_count";
 static const char * const LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS = "quantize.imatrix.chunks_count";

+static bool striequals(const char * a, const char * b) {
+    while (*a && *b) {
+        if (std::tolower(*a) != std::tolower(*b)) {
+            return false;
+        }
+        a++; b++;
+    }
+    return *a == *b;
+}
+
 static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
     std::string ftype_str;

@@ -68,7 +80,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
         ftype_str.push_back(std::toupper(ch));
     }
     for (auto & it : QUANT_OPTIONS) {
-        if (it.name == ftype_str) {
+        if (striequals(it.name.c_str(), ftype_str.c_str())) {
             ftype = it.ftype;
             ftype_str_out = it.name;
             return true;
@@ -91,7 +103,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftyp
 }

 // usage:
-//  ./quantize [--allow-requantize] [--leave-output-tensor] [--pure] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
+//  ./llama-quantize [--allow-requantize] [--leave-output-tensor] [--pure] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
 //
 [[noreturn]]
 static void usage(const char * executable) {
@@ -104,7 +116,7 @@ static void usage(const char * executable) {
     printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
     printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
     printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
-    printf(" --keep-split: will generate quatized model in the same shards as input");
+    printf(" --keep-split: will generate quantized model in the same shards as input\n");
     printf(" --override-kv KEY=TYPE:VALUE\n");
     printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
     printf("Note: --include-weights and --exclude-weights cannot be used together\n");
@@ -223,15 +235,15 @@ static int prepare_imatrix(const std::string & imatrix_file,
 }

 static ggml_type parse_ggml_type(const char * arg) {
-    ggml_type result = GGML_TYPE_COUNT;
-    for (int j = 0; j < GGML_TYPE_COUNT; ++j) {
-        auto type = ggml_type(j);
+    for (int i = 0; i < GGML_TYPE_COUNT; ++i) {
+        auto type = (ggml_type)i;
         const auto * name = ggml_type_name(type);
-        if (name && strcmp(arg, name) == 0) {
-            result = type; break;
+        if (name && striequals(name, arg)) {
+            return type;
         }
     }
-    return result;
+    fprintf(stderr, "%s: invalid ggml_type '%s'\n", __func__, arg);
+    return GGML_TYPE_COUNT;
 }

 int main(int argc, char ** argv) {
@@ -252,12 +264,18 @@ int main(int argc, char ** argv) {
         } else if (strcmp(argv[arg_idx], "--output-tensor-type") == 0) {
             if (arg_idx < argc-1) {
                 params.output_tensor_type = parse_ggml_type(argv[++arg_idx]);
+                if (params.output_tensor_type == GGML_TYPE_COUNT) {
+                    usage(argv[0]);
+                }
             } else {
                 usage(argv[0]);
             }
         } else if (strcmp(argv[arg_idx], "--token-embedding-type") == 0) {
             if (arg_idx < argc-1) {
                 params.token_embedding_type = parse_ggml_type(argv[++arg_idx]);
+                if (params.token_embedding_type == GGML_TYPE_COUNT) {
+                    usage(argv[0]);
+                }
             } else {
                 usage(argv[0]);
             }
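
For reference, a minimal standalone sketch of the case-insensitive type matching introduced above: striequals is copied from the hunk, while the main() driver and the sample strings are illustrative assumptions and not code from the package.

// sketch: case-insensitive matching of quantization type names
#include <cctype>
#include <cstdio>

// copied from the quantize.cpp hunk above
static bool striequals(const char * a, const char * b) {
    while (*a && *b) {
        if (std::tolower(*a) != std::tolower(*b)) {
            return false;
        }
        a++; b++;
    }
    return *a == *b; // both strings must end at the same position
}

int main() {
    // with this change, lowercase names such as "q4_k_m" or "tq1_0" resolve to the
    // canonical entries in QUANT_OPTIONS and to ggml type names in parse_ggml_type()
    std::printf("%d\n", striequals("q4_k_m", "Q4_K_M")); // prints 1
    std::printf("%d\n", striequals("tq1_0", "TQ2_0"));   // prints 0
    return 0;
}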

package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp
@@ -1,7 +1,7 @@
-#define LLAMA_API_INTERNAL
 #include "common.h"
 #include "ggml.h"
 #include "llama.h"
+#include "llama-impl.h"

 #include <algorithm>
 #include <cassert>
@@ -142,7 +142,7 @@ static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
 }

 static void test_roundtrip_on_chunk(
-    const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits_t & qfns, bool use_reference,
+    const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits & qfns, const ggml_type_traits_cpu & qfns_cpu, bool use_reference,
     float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats
 ) {
     if (layer->type == GGML_TYPE_F16) {
@@ -156,7 +156,7 @@ static void test_roundtrip_on_chunk(
     if (use_reference) {
         qfns.from_float_ref(input_scratch, quantized_scratch, chunk_size);
     } else {
-        qfns.from_float(input_scratch, quantized_scratch, chunk_size);
+        qfns_cpu.from_float(input_scratch, quantized_scratch, chunk_size);
     }
     qfns.to_float(quantized_scratch, output_scratch, chunk_size);

@@ -166,7 +166,7 @@ static void test_roundtrip_on_chunk(

 // Run quantization function for a single layer and update error stats
 static void test_roundtrip_on_layer(
-    std::string & name, bool print_layer_stats, const ggml_type_traits_t & qfns, bool use_reference,
+    std::string & name, bool print_layer_stats, const ggml_type_traits & qfns, const ggml_type_traits_cpu & qfns_cpu, bool use_reference,
     const ggml_tensor * layer, std::vector<float> & input_scratch, std::vector<char> & quantized_scratch,
     std::vector<float> & output_scratch, error_stats & total_error, int max_thread = 0
 ) {
@@ -187,13 +187,13 @@ static void test_roundtrip_on_layer(
     int num_chunks = (nelements + chunk_size - 1)/chunk_size;

     if (num_chunks < 2 || max_thread < 2) {
-        test_roundtrip_on_chunk(layer, 0, nelements, qfns, use_reference, input_scratch_ptr, quantized_scratch.data(),
+        test_roundtrip_on_chunk(layer, 0, nelements, qfns, qfns_cpu, use_reference, input_scratch_ptr, quantized_scratch.data(),
                 output_scratch.data(), print_layer_stats ? layer_error : total_error);
     } else {
         auto & stats = print_layer_stats ? layer_error : total_error;
         std::mutex mutex;
         uint64_t counter = 0;
-        auto compute = [&mutex, &counter, &stats, &qfns, nelements, layer, use_reference, input_scratch_ptr,
+        auto compute = [&mutex, &counter, &stats, &qfns, &qfns_cpu, nelements, layer, use_reference, input_scratch_ptr,
                 &quantized_scratch, &output_scratch, chunk_size] () {
             error_stats local_stats {};
             while (true) {
@@ -205,7 +205,7 @@ static void test_roundtrip_on_layer(
                 }
                 lock.unlock();
                 uint64_t chunk = offset + chunk_size < nelements ? chunk_size : nelements - offset;
-                test_roundtrip_on_chunk(layer, offset, chunk, qfns, use_reference, input_scratch_ptr + offset,
+                test_roundtrip_on_chunk(layer, offset, chunk, qfns, qfns_cpu, use_reference, input_scratch_ptr + offset,
                         quantized_scratch.data() + 4*offset, output_scratch.data() + offset, local_stats);
             }
         };
@@ -319,8 +319,7 @@ int main(int argc, char ** argv) {
     }

     auto cparams = llama_context_default_params();
-    cparams.n_ctx = 256;
-    cparams.seed = 1;
+    cparams.n_ctx = 256;

     ctx = llama_new_context_with_model(model, cparams);

@@ -372,8 +371,9 @@ int main(int argc, char ** argv) {
         if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
             continue;
         }
-        ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
-        if (qfns.from_float && qfns.to_float) {
+        const auto * qfns = ggml_get_type_traits(type);
+        const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
+        if (qfns_cpu->from_float && qfns->to_float) {
             if (params.verbose) {
                 printf("testing %s ...\n", ggml_type_name(type));
             }
@@ -394,7 +394,7 @@ int main(int argc, char ** argv) {
             test_roundtrip_on_layer(
                     layer_name,
                     params.per_layer_stats,
-                    qfns,
+                    *qfns, *qfns_cpu,
                     params.reference,
                     kv_tensor.second,
                     input_scratch,
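
A hedged sketch of the new trait lookup used above: dequantization traits now come from ggml_get_type_traits() in ggml.h, while the optimized from_float path comes from ggml_get_type_traits_cpu() in ggml-cpu.h, as the hunks show. The choice of GGML_TYPE_Q8_0 and the scratch-buffer sizing via ggml_row_size() are assumptions made for illustration, not code from the package.

// sketch: round-trip a small buffer through the split generic/CPU type traits
#include "ggml.h"
#include "ggml-cpu.h"

#include <cstdio>
#include <vector>

int main() {
    const enum ggml_type type = GGML_TYPE_Q8_0; // assumed example type, block size 32
    const int64_t n = 256;                      // must be a multiple of the block size

    const ggml_type_traits     * qfns     = ggml_get_type_traits(type);     // to_float, from_float_ref
    const ggml_type_traits_cpu * qfns_cpu = ggml_get_type_traits_cpu(type); // optimized from_float

    std::vector<float> input(n), output(n);
    std::vector<char>  quantized(ggml_row_size(type, n)); // assumed sizing helper

    for (int64_t i = 0; i < n; ++i) {
        input[i] = 0.01f * (float) i;
    }

    // quantize with the CPU traits, dequantize with the generic traits,
    // mirroring test_roundtrip_on_chunk() after this change
    qfns_cpu->from_float(input.data(), quantized.data(), n);
    qfns->to_float(quantized.data(), output.data(), n);

    std::printf("in[10] = %f, roundtrip[10] = %f\n", input[10], output[10]);
    return 0;
}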

package/src/llama.cpp/examples/retrieval/retrieval.cpp
@@ -1,15 +1,16 @@
+#include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"

 #include <algorithm>
 #include <fstream>
+#include <iostream> // TODO: remove me

-static void print_usage(int argc, char ** argv, const gpt_params & params) {
-    gpt_params_print_usage(argc, argv, params);
-
-    LOG_TEE("\nexample usage:\n");
-    LOG_TEE("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]);
-    LOG_TEE("\n");
+static void print_usage(int, char ** argv) {
+    LOG("\nexample usage:\n");
+    LOG("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]);
+    LOG("\n");
 }

 struct chunk {
@@ -18,7 +19,7 @@ struct chunk {
     // original file position
     size_t filepos;
     // original text data
-    std::string textdata = "";
+    std::string textdata;
     // tokenized text data
     std::vector<llama_token> tokens;
     // embedding
@@ -32,14 +33,14 @@ static std::vector<chunk> chunk_file(const std::string & filename, int chunk_siz
     std::ifstream f(filename.c_str());

     if (!f.is_open()) {
-        fprintf(stderr, "Error: could not open file %s\n", filename.c_str());
+        LOG_ERR("could not open file %s\n", filename.c_str());
         return chunks;
     }

     chunk current_chunk;
     char buffer[1024];
     int64_t filepos = 0;
-    std::string current = "";
+    std::string current;
     while (f.read(buffer, 1024)) {
         current += std::string(buffer, f.gcount());
         size_t pos;
@@ -76,7 +77,7 @@ static std::vector<chunk> chunk_file(const std::string & filename, int chunk_siz
 static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
     size_t n_tokens = tokens.size();
     for (size_t i = 0; i < n_tokens; i++) {
-        llama_batch_add(batch, tokens[i], i, { seq_id }, true);
+        common_batch_add(batch, tokens[i], i, { seq_id }, true);
     }
 }

@@ -85,9 +86,9 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
     llama_kv_cache_clear(ctx);

     // run model
-    fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
+    LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
     if (llama_decode(ctx, batch) < 0) {
-        fprintf(stderr, "%s : failed to decode\n", __func__);
+        LOG_ERR("%s : failed to decode\n", __func__);
     }

     for (int i = 0; i < batch.n_tokens; i++) {
@@ -100,42 +101,41 @@
         if (embd == NULL) {
             embd = llama_get_embeddings_ith(ctx, i);
             if (embd == NULL) {
-                fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i);
+                LOG_ERR("%s: failed to get embeddings for token %d\n", __func__, i);
                 continue;
             }
         }

         float * out = output + batch.seq_id[i][0] * n_embd;
-        llama_embd_normalize(embd, out, n_embd);
+        common_embd_normalize(embd, out, n_embd);
     }
 }

 int main(int argc, char ** argv) {
-    gpt_params params;
+    common_params params;

-    if (!gpt_params_parse(argc, argv, params)) {
-        print_usage(argc, argv, params);
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, print_usage)) {
         return 1;
     }

+    common_init();
+
     // For BERT models, batch size must be equal to ubatch size
     params.n_ubatch = params.n_batch;
     params.embedding = true;

     if (params.chunk_size <= 0) {
-        fprintf(stderr, "chunk_size must be positive\n");
+        LOG_ERR("chunk_size must be positive\n");
         return 1;
     }
     if (params.context_files.empty()) {
-        fprintf(stderr, "context_files must be specified\n");
+        LOG_ERR("context_files must be specified\n");
         return 1;
     }

-    print_build_info();
-
-    printf("processing files:\n");
+    LOG_INF("processing files:\n");
     for (auto & context_file : params.context_files) {
-        printf("%s\n", context_file.c_str());
+        LOG_INF("%s\n", context_file.c_str());
     }

     std::vector<chunk> chunks;
@@ -143,18 +143,19 @@ int main(int argc, char ** argv) {
         std::vector<chunk> file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator);
         chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end());
     }
-    printf("Number of chunks: %ld\n", chunks.size());
+    LOG_INF("Number of chunks: %ld\n", chunks.size());

     llama_backend_init();
     llama_numa_init(params.numa);

-    llama_model * model;
-    llama_context * ctx;
-
     // load the model
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    common_init_result llama_init = common_init_from_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
+
     if (model == NULL) {
-        fprintf(stderr, "%s: error: unable to load model\n", __func__);
+        LOG_ERR("%s: unable to load model\n", __func__);
         return 1;
     }

@@ -163,19 +164,19 @@ int main(int argc, char ** argv) {

     const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
     if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
-        fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
+        LOG_ERR("%s: pooling type NONE not supported\n", __func__);
         return 1;
     }

     if (n_ctx > n_ctx_train) {
-        fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
+        LOG_WRN("%s: warning: model was trained on only %d context tokens (%d specified)\n",
                 __func__, n_ctx_train, n_ctx);
     }

     // print system information
     {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
+        LOG_INF("\n");
+        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
     }

     // max batch size
@@ -184,9 +185,9 @@ int main(int argc, char ** argv) {

     // tokenize the prompts and trim
     for (auto & chunk : chunks) {
-        auto inp = ::llama_tokenize(ctx, chunk.textdata, true, false);
+        auto inp = common_tokenize(ctx, chunk.textdata, true, false);
         if (inp.size() > n_batch) {
-            fprintf(stderr, "%s: error: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
+            LOG_ERR("%s: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
                     __func__, (long long int) inp.size(), (long long int) n_batch);
             return 1;
         }
@@ -200,12 +201,12 @@ int main(int argc, char ** argv) {
     // tokenization stats
     if (params.verbose_prompt) {
         for (int i = 0; i < (int) chunks.size(); i++) {
-            fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str());
-            fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size());
+            LOG_INF("%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str());
+            LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size());
             for (int j = 0; j < (int) chunks[i].tokens.size(); j++) {
-                fprintf(stderr, "%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str());
+                LOG_INF("%6d -> '%s'\n", chunks[i].tokens[j], common_token_to_piece(ctx, chunks[i].tokens[j]).c_str());
             }
-            fprintf(stderr, "\n\n");
+            LOG_INF("\n\n");
         }
     }

@@ -231,7 +232,7 @@ int main(int argc, char ** argv) {
         if (batch.n_tokens + n_toks > n_batch) {
             float * out = emb + p * n_embd;
             batch_decode(ctx, batch, out, s, n_embd);
-            llama_batch_clear(batch);
+            common_batch_clear(batch);
             p += s;
             s = 0;
         }
@@ -252,26 +253,27 @@ int main(int argc, char ** argv) {
         chunks[i].tokens.clear();
     }

+    struct llama_batch query_batch = llama_batch_init(n_batch, 0, 1);
+
     // start loop, receive query and return top k similar chunks based on cosine similarity
     std::string query;
     while (true) {
-        printf("Enter query: ");
+        LOG("Enter query: ");
         std::getline(std::cin, query);
-        std::vector<int32_t> query_tokens = llama_tokenize(ctx, query, true);
+        std::vector<int32_t> query_tokens = common_tokenize(ctx, query, true);

-        struct llama_batch query_batch = llama_batch_init(n_batch, 0, 1);
         batch_add_seq(query_batch, query_tokens, 0);

         std::vector<float> query_emb(n_embd, 0);
         batch_decode(ctx, query_batch, query_emb.data(), 1, n_embd);

-        llama_batch_clear(query_batch);
+        common_batch_clear(query_batch);

         // compute cosine similarities
         {
             std::vector<std::pair<int, float>> similarities;
             for (int i = 0; i < n_chunks; i++) {
-                float sim = llama_embd_similarity_cos(chunks[i].embedding.data(), query_emb.data(), n_embd);
+                float sim = common_embd_similarity_cos(chunks[i].embedding.data(), query_emb.data(), n_embd);
                 similarities.push_back(std::make_pair(i, sim));
             }

@@ -280,19 +282,22 @@ int main(int argc, char ** argv) {
                 return a.second > b.second;
             });

-            printf("Top %d similar chunks:\n", params.sparams.top_k);
+            LOG("Top %d similar chunks:\n", params.sparams.top_k);
             for (int i = 0; i < std::min(params.sparams.top_k, (int) chunks.size()); i++) {
-                printf("filename: %s\n", chunks[similarities[i].first].filename.c_str());
-                printf("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos);
-                printf("similarity: %f\n", similarities[i].second);
-                printf("textdata:\n%s\n", chunks[similarities[i].first].textdata.c_str());
-                printf("--------------------\n");
+                LOG("filename: %s\n", chunks[similarities[i].first].filename.c_str());
+                LOG("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos);
+                LOG("similarity: %f\n", similarities[i].second);
+                LOG("textdata:\n%s\n", chunks[similarities[i].first].textdata.c_str());
+                LOG("--------------------\n");
             }
         }
     }

+    LOG("\n");
+    llama_perf_context_print(ctx);
+
     // clean up
-    llama_print_timings(ctx);
+    llama_batch_free(query_batch);
     llama_free(ctx);
     llama_free_model(model);
     llama_backend_free();

package/src/llama.cpp/examples/rpc/rpc-server.cpp
@@ -1,3 +1,5 @@
+#include "ggml-cpu.h"
+
 #ifdef GGML_USE_CUDA
 #include "ggml-cuda.h"
 #endif
@@ -6,6 +8,10 @@
 #include "ggml-metal.h"
 #endif

+#ifdef GGML_USE_VULKAN
+#include "ggml-vulkan.h"
+#endif
+
 #include "ggml-rpc.h"
 #ifdef _WIN32
 #  include <windows.h>
@@ -16,7 +22,7 @@
 #include <stdio.h>

 struct rpc_server_params {
-    std::string host = "0.0.0.0";
+    std::string host = "127.0.0.1";
     int port = 50052;
     size_t backend_mem = 0;
 };
@@ -79,6 +85,12 @@ static ggml_backend_t create_backend() {
     if (!backend) {
         fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
     }
+#elif GGML_USE_VULKAN
+    fprintf(stderr, "%s: using Vulkan backend\n", __func__);
+    backend = ggml_backend_vk_init(0); // init device 0
+    if (!backend) {
+        fprintf(stderr, "%s: ggml_backend_vulkan_init() failed\n", __func__);
+    }
 #endif

     // if there aren't GPU Backends fallback to CPU backend
@@ -92,6 +104,8 @@ static ggml_backend_t create_backend() {
 static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
 #ifdef GGML_USE_CUDA
     ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
+#elif GGML_USE_VULKAN
+    ggml_backend_vk_get_device_memory(0, free_mem, total_mem);
 #else
 #ifdef _WIN32
     MEMORYSTATUSEX status;
@@ -114,6 +128,17 @@ int main(int argc, char * argv[]) {
         fprintf(stderr, "Invalid parameters\n");
         return 1;
     }
+
+    if (params.host != "127.0.0.1") {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
+        fprintf(stderr, "WARNING: Host ('%s') is != '127.0.0.1'\n", params.host.c_str());
+        fprintf(stderr, " Never expose the RPC server to an open network!\n");
+        fprintf(stderr, " This is an experimental feature and is not secure!\n");
+        fprintf(stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
+        fprintf(stderr, "\n");
+    }
+
     ggml_backend_t backend = create_backend();
     if (!backend) {
         fprintf(stderr, "Failed to create backend\n");
@@ -128,7 +153,7 @@ int main(int argc, char * argv[]) {
         get_backend_memory(&free_mem, &total_mem);
     }
     printf("Starting RPC server on %s, backend memory: %zu MB\n", endpoint.c_str(), free_mem / (1024 * 1024));
-    start_rpc_server(backend, endpoint.c_str(), free_mem, total_mem);
+    ggml_backend_rpc_start_server(backend, endpoint.c_str(), free_mem, total_mem);
     ggml_backend_free(backend);
     return 0;
 }