@fugood/llama.node 0.3.16 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (281)
  1. package/CMakeLists.txt +6 -1
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +44 -2
  19. package/lib/index.js +132 -1
  20. package/lib/index.ts +203 -3
  21. package/package.json +2 -1
  22. package/src/EmbeddingWorker.cpp +1 -1
  23. package/src/LlamaCompletionWorker.cpp +374 -19
  24. package/src/LlamaCompletionWorker.h +31 -10
  25. package/src/LlamaContext.cpp +216 -7
  26. package/src/LlamaContext.h +12 -0
  27. package/src/common.hpp +15 -0
  28. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +233 -0
  29. package/src/llama.cpp/.github/workflows/build.yml +89 -767
  30. package/src/llama.cpp/.github/workflows/docker.yml +9 -6
  31. package/src/llama.cpp/.github/workflows/release.yml +716 -0
  32. package/src/llama.cpp/.github/workflows/server.yml +19 -23
  33. package/src/llama.cpp/CMakeLists.txt +11 -1
  34. package/src/llama.cpp/cmake/build-info.cmake +8 -2
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
  36. package/src/llama.cpp/common/CMakeLists.txt +35 -4
  37. package/src/llama.cpp/common/arg.cpp +844 -121
  38. package/src/llama.cpp/common/arg.h +9 -0
  39. package/src/llama.cpp/common/chat.cpp +129 -107
  40. package/src/llama.cpp/common/chat.h +2 -0
  41. package/src/llama.cpp/common/common.cpp +64 -518
  42. package/src/llama.cpp/common/common.h +35 -45
  43. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  44. package/src/llama.cpp/common/llguidance.cpp +31 -47
  45. package/src/llama.cpp/common/minja/chat-template.hpp +23 -11
  46. package/src/llama.cpp/common/minja/minja.hpp +186 -127
  47. package/src/llama.cpp/common/regex-partial.cpp +204 -0
  48. package/src/llama.cpp/common/regex-partial.h +56 -0
  49. package/src/llama.cpp/common/sampling.cpp +60 -50
  50. package/src/llama.cpp/docs/build.md +122 -7
  51. package/src/llama.cpp/examples/CMakeLists.txt +2 -32
  52. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  53. package/src/llama.cpp/examples/embedding/embedding.cpp +9 -12
  54. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  55. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  56. package/src/llama.cpp/examples/parallel/parallel.cpp +89 -15
  57. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  58. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  59. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  60. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  61. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  62. package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
  63. package/src/llama.cpp/examples/training/finetune.cpp +96 -0
  64. package/src/llama.cpp/ggml/CMakeLists.txt +35 -2
  65. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  66. package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
  67. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
  68. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  69. package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
  70. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  71. package/src/llama.cpp/ggml/include/ggml.h +76 -106
  72. package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -8
  73. package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
  74. package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
  75. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  76. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  77. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  78. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  79. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  80. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  81. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  82. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  83. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +66 -33
  84. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  85. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  86. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  87. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  88. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -194
  89. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  90. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1060 -410
  91. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1008 -13533
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +31 -16
  93. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +90 -12
  94. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -13
  95. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +266 -72
  96. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1034 -88
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8796 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  101. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  102. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +252 -0
  103. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  104. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
  105. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  106. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  107. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  108. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  109. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +106 -14
  110. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  111. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -262
  112. package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
  113. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
  114. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +307 -40
  115. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +125 -45
  116. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +10 -8
  117. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +239 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  119. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  120. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +9 -307
  121. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
  122. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
  123. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  124. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +944 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  130. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  131. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +507 -411
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  133. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  134. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
  135. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  136. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  137. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  138. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  140. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  141. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
  142. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +83 -49
  143. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1278 -282
  144. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +32 -0
  145. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +133 -30
  146. package/src/llama.cpp/ggml/src/ggml.c +170 -265
  147. package/src/llama.cpp/ggml/src/gguf.cpp +34 -33
  148. package/src/llama.cpp/include/llama.h +82 -22
  149. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  150. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  151. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  152. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  153. package/src/llama.cpp/requirements/requirements-all.txt +5 -3
  154. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  155. package/src/llama.cpp/scripts/xxd.cmake +1 -1
  156. package/src/llama.cpp/src/CMakeLists.txt +4 -2
  157. package/src/llama.cpp/src/llama-adapter.cpp +43 -1
  158. package/src/llama.cpp/src/llama-arch.cpp +163 -17
  159. package/src/llama.cpp/src/llama-arch.h +16 -0
  160. package/src/llama.cpp/src/llama-batch.cpp +5 -1
  161. package/src/llama.cpp/src/llama-batch.h +2 -1
  162. package/src/llama.cpp/src/llama-chat.cpp +91 -16
  163. package/src/llama.cpp/src/llama-chat.h +7 -2
  164. package/src/llama.cpp/src/llama-context.cpp +479 -575
  165. package/src/llama.cpp/src/llama-context.h +44 -33
  166. package/src/llama.cpp/src/llama-cparams.h +1 -0
  167. package/src/llama.cpp/src/llama-graph.cpp +209 -157
  168. package/src/llama.cpp/src/llama-graph.h +38 -14
  169. package/src/llama.cpp/src/llama-hparams.h +13 -0
  170. package/src/llama.cpp/src/llama-kv-cache.cpp +1604 -543
  171. package/src/llama.cpp/src/llama-kv-cache.h +283 -171
  172. package/src/llama.cpp/src/llama-memory.h +12 -2
  173. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  174. package/src/llama.cpp/src/llama-model-loader.cpp +34 -20
  175. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  176. package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
  177. package/src/llama.cpp/src/llama-model-saver.h +37 -0
  178. package/src/llama.cpp/src/llama-model.cpp +1803 -330
  179. package/src/llama.cpp/src/llama-model.h +21 -2
  180. package/src/llama.cpp/src/llama-quant.cpp +33 -10
  181. package/src/llama.cpp/src/llama-sampling.cpp +25 -7
  182. package/src/llama.cpp/src/llama-vocab.cpp +86 -10
  183. package/src/llama.cpp/src/llama-vocab.h +6 -0
  184. package/src/llama.cpp/src/llama.cpp +15 -1
  185. package/src/llama.cpp/tests/CMakeLists.txt +52 -31
  186. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  187. package/src/llama.cpp/tests/test-backend-ops.cpp +189 -90
  188. package/src/llama.cpp/tests/test-chat-template.cpp +26 -6
  189. package/src/llama.cpp/tests/test-chat.cpp +15 -3
  190. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  191. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  192. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  193. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  194. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  195. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  196. package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
  197. package/src/llama.cpp/tests/test-opt.cpp +33 -21
  198. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  199. package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
  200. package/src/llama.cpp/tests/test-sampling.cpp +1 -1
  201. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  202. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  203. package/src/llama.cpp/tools/CMakeLists.txt +39 -0
  204. package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +3 -3
  205. package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +1 -1
  206. package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +15 -16
  207. package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
  208. package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +623 -274
  209. package/src/llama.cpp/{examples → tools}/main/main.cpp +22 -14
  210. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +47 -0
  211. package/src/llama.cpp/tools/mtmd/clip-impl.h +365 -0
  212. package/src/llama.cpp/tools/mtmd/clip.cpp +3646 -0
  213. package/src/llama.cpp/tools/mtmd/clip.h +99 -0
  214. package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +22 -0
  215. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +370 -0
  216. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
  217. package/src/llama.cpp/tools/mtmd/mtmd.cpp +678 -0
  218. package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
  219. package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +21 -5
  220. package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +53 -3
  221. package/src/llama.cpp/tools/rpc/CMakeLists.txt +4 -0
  222. package/src/llama.cpp/tools/rpc/rpc-server.cpp +322 -0
  223. package/src/llama.cpp/tools/run/CMakeLists.txt +16 -0
  224. package/src/llama.cpp/{examples → tools}/run/run.cpp +30 -30
  225. package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
  226. package/src/llama.cpp/{examples → tools}/server/httplib.h +313 -247
  227. package/src/llama.cpp/{examples → tools}/server/server.cpp +529 -215
  228. package/src/llama.cpp/{examples → tools}/server/utils.hpp +427 -6
  229. package/src/llama.cpp/{examples → tools}/tts/tts.cpp +6 -9
  230. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
  231. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  232. package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
  233. package/src/llama.cpp/examples/infill/infill.cpp +0 -590
  234. package/src/llama.cpp/examples/llava/CMakeLists.txt +0 -66
  235. package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
  236. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
  237. package/src/llama.cpp/examples/llava/clip.cpp +0 -3206
  238. package/src/llama.cpp/examples/llava/clip.h +0 -118
  239. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  240. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  241. package/src/llama.cpp/examples/llava/llava.cpp +0 -574
  242. package/src/llama.cpp/examples/llava/llava.h +0 -49
  243. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  244. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +0 -584
  245. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  246. package/src/llama.cpp/examples/rpc/CMakeLists.txt +0 -2
  247. package/src/llama.cpp/examples/rpc/rpc-server.cpp +0 -171
  248. package/src/llama.cpp/examples/run/CMakeLists.txt +0 -5
  249. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  250. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  251. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  252. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  253. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  254. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  255. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  256. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  257. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  258. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  259. /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
  260. /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
  261. /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
  262. /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
  263. /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
  264. /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
  265. /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
  266. /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
  267. /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
  268. /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
  269. /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
  270. /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
  271. /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
  272. /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
  273. /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
  274. /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
  275. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
  276. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
  277. /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
  278. /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
  279. /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
  280. /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
  281. /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
package/src/llama.cpp/tools/rpc/rpc-server.cpp
@@ -0,0 +1,322 @@
+ #if defined(_MSC_VER)
+ #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
+ #endif
+
+ #include "ggml-rpc.h"
+ #ifdef _WIN32
+ #  define NOMINMAX
+ #  define DIRECTORY_SEPARATOR '\\'
+ #  include <locale>
+ #  include <windows.h>
+ #  include <fcntl.h>
+ #  include <io.h>
+ #else
+ #  define DIRECTORY_SEPARATOR '/'
+ #  include <unistd.h>
+ #  include <sys/stat.h>
+ #endif
+ #include <codecvt>
+ #include <string>
+ #include <stdio.h>
+ #include <vector>
+ #include <filesystem>
+ #include <algorithm>
+ #include <thread>
+
+ namespace fs = std::filesystem;
+
+ // NOTE: this is copied from common.cpp to avoid linking with libcommon
+ // returns true if successful, false otherwise
+ static bool fs_create_directory_with_parents(const std::string & path) {
+ #ifdef _WIN32
+     std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
+     std::wstring wpath = converter.from_bytes(path);
+
+     // if the path already exists, check whether it's a directory
+     const DWORD attributes = GetFileAttributesW(wpath.c_str());
+     if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
+         return true;
+     }
+
+     size_t pos_slash = 0;
+
+     // process path from front to back, procedurally creating directories
+     while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
+         const std::wstring subpath = wpath.substr(0, pos_slash);
+         const wchar_t * test = subpath.c_str();
+
+         const bool success = CreateDirectoryW(test, NULL);
+         if (!success) {
+             const DWORD error = GetLastError();
+
+             // if the path already exists, ensure that it's a directory
+             if (error == ERROR_ALREADY_EXISTS) {
+                 const DWORD attributes = GetFileAttributesW(subpath.c_str());
+                 if (attributes == INVALID_FILE_ATTRIBUTES || !(attributes & FILE_ATTRIBUTE_DIRECTORY)) {
+                     return false;
+                 }
+             } else {
+                 return false;
+             }
+         }
+
+         pos_slash += 1;
+     }
+
+     return true;
+ #else
+     // if the path already exists, check whether it's a directory
+     struct stat info;
+     if (stat(path.c_str(), &info) == 0) {
+         return S_ISDIR(info.st_mode);
+     }
+
+     size_t pos_slash = 1; // skip leading slashes for directory creation
+
+     // process path from front to back, procedurally creating directories
+     while ((pos_slash = path.find('/', pos_slash)) != std::string::npos) {
+         const std::string subpath = path.substr(0, pos_slash);
+         struct stat info;
+
+         // if the path already exists, ensure that it's a directory
+         if (stat(subpath.c_str(), &info) == 0) {
+             if (!S_ISDIR(info.st_mode)) {
+                 return false;
+             }
+         } else {
+             // create parent directories
+             const int ret = mkdir(subpath.c_str(), 0755);
+             if (ret != 0) {
+                 return false;
+             }
+         }
+
+         pos_slash += 1;
+     }
+
+     return true;
+ #endif // _WIN32
+ }
+
+ // NOTE: this is copied from common.cpp to avoid linking with libcommon
+ static std::string fs_get_cache_directory() {
+     std::string cache_directory = "";
+     auto ensure_trailing_slash = [](std::string p) {
+         // Make sure to add trailing slash
+         if (p.back() != DIRECTORY_SEPARATOR) {
+             p += DIRECTORY_SEPARATOR;
+         }
+         return p;
+     };
+     if (getenv("LLAMA_CACHE")) {
+         cache_directory = std::getenv("LLAMA_CACHE");
+     } else {
+ #if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)
+         if (std::getenv("XDG_CACHE_HOME")) {
+             cache_directory = std::getenv("XDG_CACHE_HOME");
+         } else {
+             cache_directory = std::getenv("HOME") + std::string("/.cache/");
+         }
+ #elif defined(__APPLE__)
+         cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
+ #elif defined(_WIN32)
+         cache_directory = std::getenv("LOCALAPPDATA");
+ #else
+ #  error Unknown architecture
+ #endif
+         cache_directory = ensure_trailing_slash(cache_directory);
+         cache_directory += "llama.cpp";
+     }
+     return ensure_trailing_slash(cache_directory);
+ }
+
+ struct rpc_server_params {
+     std::string host = "127.0.0.1";
+     int port = 50052;
+     size_t backend_mem = 0;
+     bool use_cache = false;
+     int n_threads = std::max(1U, std::thread::hardware_concurrency()/2);
+     std::string device;
+ };
+
+ static void print_usage(int /*argc*/, char ** argv, rpc_server_params params) {
+     fprintf(stderr, "Usage: %s [options]\n\n", argv[0]);
+     fprintf(stderr, "options:\n");
+     fprintf(stderr, " -h, --help show this help message and exit\n");
+     fprintf(stderr, " -t, --threads number of threads for the CPU backend (default: %d)\n", params.n_threads);
+     fprintf(stderr, " -d DEV, --device device to use\n");
+     fprintf(stderr, " -H HOST, --host HOST host to bind to (default: %s)\n", params.host.c_str());
+     fprintf(stderr, " -p PORT, --port PORT port to bind to (default: %d)\n", params.port);
+     fprintf(stderr, " -m MEM, --mem MEM backend memory size (in MB)\n");
+     fprintf(stderr, " -c, --cache enable local file cache\n");
+     fprintf(stderr, "\n");
+ }
+
+ static bool rpc_server_params_parse(int argc, char ** argv, rpc_server_params & params) {
+     std::string arg;
+     for (int i = 1; i < argc; i++) {
+         arg = argv[i];
+         if (arg == "-H" || arg == "--host") {
+             if (++i >= argc) {
+                 return false;
+             }
+             params.host = argv[i];
+         } else if (arg == "-t" || arg == "--threads") {
+             if (++i >= argc) {
+                 return false;
+             }
+             params.n_threads = std::stoi(argv[i]);
+             if (params.n_threads <= 0) {
+                 fprintf(stderr, "error: invalid number of threads: %d\n", params.n_threads);
+                 return false;
+             }
+         } else if (arg == "-d" || arg == "--device") {
+             if (++i >= argc) {
+                 return false;
+             }
+             params.device = argv[i];
+             if (ggml_backend_dev_by_name(params.device.c_str()) == nullptr) {
+                 fprintf(stderr, "error: unknown device: %s\n", params.device.c_str());
+                 fprintf(stderr, "available devices:\n");
+                 for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
+                     auto * dev = ggml_backend_dev_get(i);
+                     size_t free, total;
+                     ggml_backend_dev_memory(dev, &free, &total);
+                     printf(" %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
+                 }
+                 return false;
+             }
+         } else if (arg == "-p" || arg == "--port") {
+             if (++i >= argc) {
+                 return false;
+             }
+             params.port = std::stoi(argv[i]);
+             if (params.port <= 0 || params.port > 65535) {
+                 return false;
+             }
+         } else if (arg == "-c" || arg == "--cache") {
+             params.use_cache = true;
+         } else if (arg == "-m" || arg == "--mem") {
+             if (++i >= argc) {
+                 return false;
+             }
+             params.backend_mem = std::stoul(argv[i]) * 1024 * 1024;
+         } else if (arg == "-h" || arg == "--help") {
+             print_usage(argc, argv, params);
+             exit(0);
+         } else {
+             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+             print_usage(argc, argv, params);
+             exit(0);
+         }
+     }
+     return true;
+ }
+
+ static ggml_backend_t create_backend(const rpc_server_params & params) {
+     ggml_backend_t backend = nullptr;
+
+     if (!params.device.empty()) {
+         ggml_backend_dev_t dev = ggml_backend_dev_by_name(params.device.c_str());
+         if (dev) {
+             backend = ggml_backend_dev_init(dev, nullptr);
+             if (!backend) {
+                 fprintf(stderr, "Failed to create backend for device %s\n", params.device.c_str());
+                 return nullptr;
+             }
+         }
+     }
+
+     // try to initialize a GPU backend first
+     if (!backend) {
+         backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr);
+     }
+
+     // if there aren't GPU backends fallback to CPU backend
+     if (!backend) {
+         backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
+     }
+
+     if (backend) {
+         fprintf(stderr, "%s: using %s backend\n", __func__, ggml_backend_name(backend));
+
+         // set the number of threads
+         ggml_backend_dev_t dev = ggml_backend_get_device(backend);
+         ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr;
+         if (reg) {
+             auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
+             if (ggml_backend_set_n_threads_fn) {
+                 ggml_backend_set_n_threads_fn(backend, params.n_threads);
+             }
+         }
+     }
+
+     return backend;
+ }
+
+ static void get_backend_memory(ggml_backend_t backend, size_t * free_mem, size_t * total_mem) {
+     ggml_backend_dev_t dev = ggml_backend_get_device(backend);
+     GGML_ASSERT(dev != nullptr);
+     ggml_backend_dev_memory(dev, free_mem, total_mem);
+ }
+
+ int main(int argc, char * argv[]) {
+     ggml_backend_load_all();
+
+     rpc_server_params params;
+     if (!rpc_server_params_parse(argc, argv, params)) {
+         fprintf(stderr, "Invalid parameters\n");
+         return 1;
+     }
+
+     if (params.host != "127.0.0.1") {
+         fprintf(stderr, "\n");
+         fprintf(stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
+         fprintf(stderr, "WARNING: Host ('%s') is != '127.0.0.1'\n", params.host.c_str());
+         fprintf(stderr, "         Never expose the RPC server to an open network!\n");
+         fprintf(stderr, "         This is an experimental feature and is not secure!\n");
+         fprintf(stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
+         fprintf(stderr, "\n");
+     }
+
+     ggml_backend_t backend = create_backend(params);
+     if (!backend) {
+         fprintf(stderr, "Failed to create backend\n");
+         return 1;
+     }
+     std::string endpoint = params.host + ":" + std::to_string(params.port);
+     size_t free_mem, total_mem;
+     if (params.backend_mem > 0) {
+         free_mem = params.backend_mem;
+         total_mem = params.backend_mem;
+     } else {
+         get_backend_memory(backend, &free_mem, &total_mem);
+     }
+     const char * cache_dir = nullptr;
+     std::string cache_dir_str;
+     if (params.use_cache) {
+         cache_dir_str = fs_get_cache_directory() + "rpc/";
+         if (!fs_create_directory_with_parents(cache_dir_str)) {
+             fprintf(stderr, "Failed to create cache directory: %s\n", cache_dir_str.c_str());
+             return 1;
+         }
+         cache_dir = cache_dir_str.c_str();
+     }
+
+     ggml_backend_reg_t reg = ggml_backend_reg_by_name("RPC");
+     if (!reg) {
+         fprintf(stderr, "Failed to find RPC backend\n");
+         return 1;
+     }
+
+     auto start_server_fn = (decltype(ggml_backend_rpc_start_server)*) ggml_backend_reg_get_proc_address(reg, "ggml_backend_rpc_start_server");
+     if (!start_server_fn) {
+         fprintf(stderr, "Failed to obtain RPC backend start server function\n");
+         return 1;
+     }
+
+     start_server_fn(backend, endpoint.c_str(), cache_dir, free_mem, total_mem);
+
+     ggml_backend_free(backend);
+     return 0;
+ }
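
Not part of the diff: a minimal client-side sketch of how a program can attach to this rpc-server. It assumes a ggml build with the RPC backend enabled and resolves ggml_backend_rpc_add_device through the backend registry, the same proc-address pattern the server above uses for ggml_backend_rpc_start_server; the endpoint string is hypothetical.

// rpc-client-sketch.cpp: attach a remote rpc-server as a ggml device (sketch)
#include "ggml-backend.h"
#include <cstdio>

int main() {
    ggml_backend_load_all();

    ggml_backend_reg_t reg = ggml_backend_reg_by_name("RPC");
    if (!reg) {
        fprintf(stderr, "RPC backend not available\n");
        return 1;
    }

    // same registry lookup the server performs for ggml_backend_rpc_start_server
    typedef ggml_backend_dev_t (*rpc_add_device_fn)(const char * endpoint);
    auto add_device_fn = (rpc_add_device_fn) ggml_backend_reg_get_proc_address(reg, "ggml_backend_rpc_add_device");
    if (!add_device_fn) {
        fprintf(stderr, "ggml_backend_rpc_add_device not found\n");
        return 1;
    }

    // hypothetical endpoint: the host:port the server was started with
    ggml_backend_dev_t dev = add_device_fn("127.0.0.1:50052");
    ggml_backend_t backend = dev ? ggml_backend_dev_init(dev, nullptr) : nullptr;
    if (!backend) {
        fprintf(stderr, "failed to connect to rpc-server\n");
        return 1;
    }

    fprintf(stderr, "connected: %s\n", ggml_backend_name(backend));
    ggml_backend_free(backend);
    return 0;
}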
package/src/llama.cpp/tools/run/CMakeLists.txt
@@ -0,0 +1,16 @@
+ set(TARGET llama-run)
+ add_executable(${TARGET} run.cpp linenoise.cpp/linenoise.cpp)
+
+ # TODO: avoid copying this code block from common/CMakeLists.txt
+ set(LLAMA_RUN_EXTRA_LIBS "")
+ if (LLAMA_CURL)
+     find_package(CURL REQUIRED)
+     target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
+     include_directories(${CURL_INCLUDE_DIRS})
+     find_library(CURL_LIBRARY curl REQUIRED)
+     set(LLAMA_RUN_EXTRA_LIBS ${LLAMA_RUN_EXTRA_LIBS} ${CURL_LIBRARY})
+ endif ()
+
+ install(TARGETS ${TARGET} RUNTIME)
+ target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT} ${LLAMA_RUN_EXTRA_LIBS})
+ target_compile_features(${TARGET} PRIVATE cxx_std_17)
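
As the TODO in the new CMakeLists acknowledges, the LLAMA_CURL block is duplicated from common/CMakeLists.txt; keeping a local LLAMA_RUN_EXTRA_LIBS list attaches the optional CURL dependency only to the llama-run target rather than inheriting it from the common library.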
package/src/llama.cpp/{examples → tools}/run/run.cpp
@@ -38,24 +38,6 @@
  }
  #endif

- GGML_ATTRIBUTE_FORMAT(1, 2)
- static std::string fmt(const char * fmt, ...) {
-     va_list ap;
-     va_list ap2;
-     va_start(ap, fmt);
-     va_copy(ap2, ap);
-     const int size = vsnprintf(NULL, 0, fmt, ap);
-     GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
-     std::string buf;
-     buf.resize(size);
-     const int size2 = vsnprintf(const_cast<char *>(buf.data()), buf.size() + 1, fmt, ap2);
-     GGML_ASSERT(size2 == size);
-     va_end(ap2);
-     va_end(ap);
-
-     return buf;
- }
-
  GGML_ATTRIBUTE_FORMAT(1, 2)
  static int printe(const char * fmt, ...) {
      va_list args;
@@ -285,7 +267,7 @@ class Opt {
      "Commands:\n"
      " model\n"
      " Model is a string with an optional prefix of \n"
-     " huggingface:// (hf://), ollama://, https:// or file://.\n"
+     " huggingface:// (hf://), modelscope:// (ms://), ollama://, https:// or file://.\n"
      " If no protocol is specified and a file exists in the specified\n"
      " path, file:// is assumed, otherwise if a file does not exist in\n"
      " the specified path, ollama:// is assumed. Models that are being\n"
@@ -300,6 +282,9 @@ class Opt {
      " llama-run hf://QuantFactory/SmolLM-135M-GGUF/SmolLM-135M.Q2_K.gguf\n"
      " llama-run "
      "huggingface://bartowski/SmolLM-1.7B-Instruct-v0.2-GGUF/SmolLM-1.7B-Instruct-v0.2-IQ3_M.gguf\n"
+     " llama-run ms://QuantFactory/SmolLM-135M-GGUF/SmolLM-135M.Q2_K.gguf\n"
+     " llama-run "
+     "modelscope://bartowski/SmolLM-1.7B-Instruct-v0.2-GGUF/SmolLM-1.7B-Instruct-v0.2-IQ3_M.gguf\n"
      " llama-run https://example.com/some-file1.gguf\n"
      " llama-run some-file2.gguf\n"
      " llama-run file://some-file3.gguf\n"
@@ -525,11 +510,11 @@ class HttpClient {
          int secs = static_cast<int>(seconds) % 60;

          if (hrs > 0) {
-             return fmt("%dh %02dm %02ds", hrs, mins, secs);
+             return string_format("%dh %02dm %02ds", hrs, mins, secs);
          } else if (mins > 0) {
-             return fmt("%dm %02ds", mins, secs);
+             return string_format("%dm %02ds", mins, secs);
          } else {
-             return fmt("%ds", secs);
+             return string_format("%ds", secs);
          }
      }

@@ -544,7 +529,7 @@ class HttpClient {
              }
          }

-         return fmt("%.2f %s", dbl_size, suffix[i]);
+         return string_format("%.2f %s", dbl_size, suffix[i]);
      }

      static int update_progress(void * ptr, curl_off_t total_to_download, curl_off_t now_downloaded, curl_off_t,
@@ -578,7 +563,9 @@ class HttpClient {
          return (now_downloaded_plus_file_size * 100) / total_to_download;
      }

-     static std::string generate_progress_prefix(curl_off_t percentage) { return fmt("%3ld%% |", static_cast<long int>(percentage)); }
+     static std::string generate_progress_prefix(curl_off_t percentage) {
+         return string_format("%3ld%% |", static_cast<long int>(percentage));
+     }

      static double calculate_speed(curl_off_t now_downloaded, const std::chrono::steady_clock::time_point & start_time) {
          const auto now = std::chrono::steady_clock::now();
@@ -589,9 +576,9 @@ class HttpClient {
      static std::string generate_progress_suffix(curl_off_t now_downloaded_plus_file_size, curl_off_t total_to_download,
                                                  double speed, double estimated_time) {
          const int width = 10;
-         return fmt("%*s/%*s%*s/s%*s", width, human_readable_size(now_downloaded_plus_file_size).c_str(), width,
-                    human_readable_size(total_to_download).c_str(), width, human_readable_size(speed).c_str(), width,
-                    human_readable_time(estimated_time).c_str());
+         return string_format("%*s/%*s%*s/s%*s", width, human_readable_size(now_downloaded_plus_file_size).c_str(),
+                              width, human_readable_size(total_to_download).c_str(), width,
+                              human_readable_size(speed).c_str(), width, human_readable_time(estimated_time).c_str());
      }

      static int calculate_progress_bar_width(const std::string & progress_prefix, const std::string & progress_suffix) {
@@ -705,7 +692,7 @@ class LlamaData {
          return 0;
      }

-     int huggingface_dl(std::string & model, const std::string & bn) {
+     int dl_from_endpoint(std::string & model_endpoint, std::string & model, const std::string & bn) {
          // Find the second occurrence of '/' after protocol string
          size_t pos = model.find('/');
          pos = model.find('/', pos + 1);
@@ -714,7 +701,7 @@ class LlamaData {
          std::string url;

          if (pos == std::string::npos) {
-             auto [model_name, manifest_url] = extract_model_and_tag(model, "https://huggingface.co/v2/");
+             auto [model_name, manifest_url] = extract_model_and_tag(model, model_endpoint + "v2/");
              hfr = model_name;

              nlohmann::json manifest;
@@ -729,11 +716,21 @@ class LlamaData {
              hff = model.substr(pos + 1);
          }

-         url = "https://huggingface.co/" + hfr + "/resolve/main/" + hff;
+         url = model_endpoint + hfr + "/resolve/main/" + hff;

          return download(url, bn, true, headers);
      }

+     int modelscope_dl(std::string & model, const std::string & bn) {
+         std::string model_endpoint = "https://modelscope.cn/models/";
+         return dl_from_endpoint(model_endpoint, model, bn);
+     }
+
+     int huggingface_dl(std::string & model, const std::string & bn) {
+         std::string model_endpoint = get_model_endpoint();
+         return dl_from_endpoint(model_endpoint, model, bn);
+     }
+
      int ollama_dl(std::string & model, const std::string & bn) {
          const std::vector<std::string> headers = { "Accept: application/vnd.docker.distribution.manifest.v2+json" };
          if (model.find('/') == std::string::npos) {
@@ -851,6 +848,9 @@ class LlamaData {
              rm_until_substring(model_, "hf.co/");
              rm_until_substring(model_, "://");
              ret = huggingface_dl(model_, bn);
+         } else if (string_starts_with(model_, "ms://") || string_starts_with(model_, "modelscope://")) {
+             rm_until_substring(model_, "://");
+             ret = modelscope_dl(model_, bn);
          } else if ((string_starts_with(model_, "https://") || string_starts_with(model_, "http://")) &&
                     !string_starts_with(model_, "https://ollama.com/library/")) {
              ret = download(model_, bn, true);
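
The ModelScope support rides on a small refactor: huggingface_dl() and the new modelscope_dl() are now thin wrappers that pass a registry base URL to the shared dl_from_endpoint() helper, and the ms:// and modelscope:// prefixes are stripped and dispatched exactly like hf://. Only the endpoint differs between the two registries.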
package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt
@@ -34,8 +34,9 @@ endforeach()
  add_executable(${TARGET} ${TARGET_SRCS})
  install(TARGETS ${TARGET} RUNTIME)

+ target_include_directories(${TARGET} PRIVATE ../llava)
  target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
- target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
+ target_link_libraries(${TARGET} PRIVATE common mtmd ${CMAKE_THREAD_LIBS_INIT})

  if (LLAMA_SERVER_SSL)
      find_package(OpenSSL REQUIRED)
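
The server build change above is part of the broader reorganization visible in the file list: the llava example tree was removed (items 234-244), its functionality now lives in tools/mtmd (items 210-218), and llama-server now links the new mtmd (multimodal) library.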