@fugood/llama.node 0.3.16 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (281) hide show
  1. package/CMakeLists.txt +6 -1
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +44 -2
  19. package/lib/index.js +132 -1
  20. package/lib/index.ts +203 -3
  21. package/package.json +2 -1
  22. package/src/EmbeddingWorker.cpp +1 -1
  23. package/src/LlamaCompletionWorker.cpp +374 -19
  24. package/src/LlamaCompletionWorker.h +31 -10
  25. package/src/LlamaContext.cpp +216 -7
  26. package/src/LlamaContext.h +12 -0
  27. package/src/common.hpp +15 -0
  28. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +233 -0
  29. package/src/llama.cpp/.github/workflows/build.yml +89 -767
  30. package/src/llama.cpp/.github/workflows/docker.yml +9 -6
  31. package/src/llama.cpp/.github/workflows/release.yml +716 -0
  32. package/src/llama.cpp/.github/workflows/server.yml +19 -23
  33. package/src/llama.cpp/CMakeLists.txt +11 -1
  34. package/src/llama.cpp/cmake/build-info.cmake +8 -2
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
  36. package/src/llama.cpp/common/CMakeLists.txt +35 -4
  37. package/src/llama.cpp/common/arg.cpp +844 -121
  38. package/src/llama.cpp/common/arg.h +9 -0
  39. package/src/llama.cpp/common/chat.cpp +129 -107
  40. package/src/llama.cpp/common/chat.h +2 -0
  41. package/src/llama.cpp/common/common.cpp +64 -518
  42. package/src/llama.cpp/common/common.h +35 -45
  43. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  44. package/src/llama.cpp/common/llguidance.cpp +31 -47
  45. package/src/llama.cpp/common/minja/chat-template.hpp +23 -11
  46. package/src/llama.cpp/common/minja/minja.hpp +186 -127
  47. package/src/llama.cpp/common/regex-partial.cpp +204 -0
  48. package/src/llama.cpp/common/regex-partial.h +56 -0
  49. package/src/llama.cpp/common/sampling.cpp +60 -50
  50. package/src/llama.cpp/docs/build.md +122 -7
  51. package/src/llama.cpp/examples/CMakeLists.txt +2 -32
  52. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  53. package/src/llama.cpp/examples/embedding/embedding.cpp +9 -12
  54. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  55. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  56. package/src/llama.cpp/examples/parallel/parallel.cpp +89 -15
  57. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  58. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  59. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  60. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  61. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  62. package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
  63. package/src/llama.cpp/examples/training/finetune.cpp +96 -0
  64. package/src/llama.cpp/ggml/CMakeLists.txt +35 -2
  65. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  66. package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
  67. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
  68. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  69. package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
  70. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  71. package/src/llama.cpp/ggml/include/ggml.h +76 -106
  72. package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -8
  73. package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
  74. package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
  75. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  76. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  77. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  78. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  79. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  80. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  81. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  82. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  83. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +66 -33
  84. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  85. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  86. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  87. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  88. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -194
  89. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  90. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1060 -410
  91. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1008 -13533
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +31 -16
  93. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +90 -12
  94. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -13
  95. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +266 -72
  96. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1034 -88
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8796 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  101. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  102. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +252 -0
  103. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  104. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
  105. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  106. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  107. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  108. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  109. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +106 -14
  110. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  111. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -262
  112. package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
  113. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
  114. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +307 -40
  115. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +125 -45
  116. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +10 -8
  117. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +239 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  119. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  120. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +9 -307
  121. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
  122. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
  123. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  124. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +944 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  130. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  131. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +507 -411
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  133. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  134. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
  135. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  136. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  137. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  138. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  140. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  141. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
  142. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +83 -49
  143. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1278 -282
  144. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +32 -0
  145. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +133 -30
  146. package/src/llama.cpp/ggml/src/ggml.c +170 -265
  147. package/src/llama.cpp/ggml/src/gguf.cpp +34 -33
  148. package/src/llama.cpp/include/llama.h +82 -22
  149. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  150. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  151. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  152. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  153. package/src/llama.cpp/requirements/requirements-all.txt +5 -3
  154. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  155. package/src/llama.cpp/scripts/xxd.cmake +1 -1
  156. package/src/llama.cpp/src/CMakeLists.txt +4 -2
  157. package/src/llama.cpp/src/llama-adapter.cpp +43 -1
  158. package/src/llama.cpp/src/llama-arch.cpp +163 -17
  159. package/src/llama.cpp/src/llama-arch.h +16 -0
  160. package/src/llama.cpp/src/llama-batch.cpp +5 -1
  161. package/src/llama.cpp/src/llama-batch.h +2 -1
  162. package/src/llama.cpp/src/llama-chat.cpp +91 -16
  163. package/src/llama.cpp/src/llama-chat.h +7 -2
  164. package/src/llama.cpp/src/llama-context.cpp +479 -575
  165. package/src/llama.cpp/src/llama-context.h +44 -33
  166. package/src/llama.cpp/src/llama-cparams.h +1 -0
  167. package/src/llama.cpp/src/llama-graph.cpp +209 -157
  168. package/src/llama.cpp/src/llama-graph.h +38 -14
  169. package/src/llama.cpp/src/llama-hparams.h +13 -0
  170. package/src/llama.cpp/src/llama-kv-cache.cpp +1604 -543
  171. package/src/llama.cpp/src/llama-kv-cache.h +283 -171
  172. package/src/llama.cpp/src/llama-memory.h +12 -2
  173. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  174. package/src/llama.cpp/src/llama-model-loader.cpp +34 -20
  175. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  176. package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
  177. package/src/llama.cpp/src/llama-model-saver.h +37 -0
  178. package/src/llama.cpp/src/llama-model.cpp +1803 -330
  179. package/src/llama.cpp/src/llama-model.h +21 -2
  180. package/src/llama.cpp/src/llama-quant.cpp +33 -10
  181. package/src/llama.cpp/src/llama-sampling.cpp +25 -7
  182. package/src/llama.cpp/src/llama-vocab.cpp +86 -10
  183. package/src/llama.cpp/src/llama-vocab.h +6 -0
  184. package/src/llama.cpp/src/llama.cpp +15 -1
  185. package/src/llama.cpp/tests/CMakeLists.txt +52 -31
  186. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  187. package/src/llama.cpp/tests/test-backend-ops.cpp +189 -90
  188. package/src/llama.cpp/tests/test-chat-template.cpp +26 -6
  189. package/src/llama.cpp/tests/test-chat.cpp +15 -3
  190. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  191. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  192. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  193. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  194. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  195. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  196. package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
  197. package/src/llama.cpp/tests/test-opt.cpp +33 -21
  198. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  199. package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
  200. package/src/llama.cpp/tests/test-sampling.cpp +1 -1
  201. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  202. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  203. package/src/llama.cpp/tools/CMakeLists.txt +39 -0
  204. package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +3 -3
  205. package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +1 -1
  206. package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +15 -16
  207. package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
  208. package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +623 -274
  209. package/src/llama.cpp/{examples → tools}/main/main.cpp +22 -14
  210. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +47 -0
  211. package/src/llama.cpp/tools/mtmd/clip-impl.h +365 -0
  212. package/src/llama.cpp/tools/mtmd/clip.cpp +3646 -0
  213. package/src/llama.cpp/tools/mtmd/clip.h +99 -0
  214. package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +22 -0
  215. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +370 -0
  216. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
  217. package/src/llama.cpp/tools/mtmd/mtmd.cpp +678 -0
  218. package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
  219. package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +21 -5
  220. package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +53 -3
  221. package/src/llama.cpp/tools/rpc/CMakeLists.txt +4 -0
  222. package/src/llama.cpp/tools/rpc/rpc-server.cpp +322 -0
  223. package/src/llama.cpp/tools/run/CMakeLists.txt +16 -0
  224. package/src/llama.cpp/{examples → tools}/run/run.cpp +30 -30
  225. package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
  226. package/src/llama.cpp/{examples → tools}/server/httplib.h +313 -247
  227. package/src/llama.cpp/{examples → tools}/server/server.cpp +529 -215
  228. package/src/llama.cpp/{examples → tools}/server/utils.hpp +427 -6
  229. package/src/llama.cpp/{examples → tools}/tts/tts.cpp +6 -9
  230. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
  231. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  232. package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
  233. package/src/llama.cpp/examples/infill/infill.cpp +0 -590
  234. package/src/llama.cpp/examples/llava/CMakeLists.txt +0 -66
  235. package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
  236. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
  237. package/src/llama.cpp/examples/llava/clip.cpp +0 -3206
  238. package/src/llama.cpp/examples/llava/clip.h +0 -118
  239. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  240. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  241. package/src/llama.cpp/examples/llava/llava.cpp +0 -574
  242. package/src/llama.cpp/examples/llava/llava.h +0 -49
  243. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  244. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +0 -584
  245. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  246. package/src/llama.cpp/examples/rpc/CMakeLists.txt +0 -2
  247. package/src/llama.cpp/examples/rpc/rpc-server.cpp +0 -171
  248. package/src/llama.cpp/examples/run/CMakeLists.txt +0 -5
  249. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  250. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  251. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  252. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  253. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  254. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  255. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  256. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  257. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  258. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  259. /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
  260. /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
  261. /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
  262. /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
  263. /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
  264. /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
  265. /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
  266. /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
  267. /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
  268. /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
  269. /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
  270. /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
  271. /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
  272. /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
  273. /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
  274. /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
  275. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
  276. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
  277. /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
  278. /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
  279. /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
  280. /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
  281. /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
@@ -0,0 +1,204 @@
1
+ #include "regex-partial.h"
2
+ #include "common.h"
3
+ #include <functional>
4
+ #include <optional>
5
+
6
+ common_regex::common_regex(const std::string & pattern) :
7
+ pattern(pattern),
8
+ rx(pattern),
9
+ rx_reversed_partial(regex_to_reversed_partial_regex(pattern)) {}
10
+
11
+ common_regex_match common_regex::search(const std::string & input, size_t pos, bool as_match) const {
12
+ std::smatch match;
13
+ if (pos > input.size()) {
14
+ throw std::runtime_error("Position out of bounds");
15
+ }
16
+ auto start = input.begin() + pos;
17
+ auto found = as_match
18
+ ? std::regex_match(start, input.end(), match, rx)
19
+ : std::regex_search(start, input.end(), match, rx);
20
+ if (found) {
21
+ common_regex_match res;
22
+ res.type = COMMON_REGEX_MATCH_TYPE_FULL;
23
+ for (size_t i = 0; i < match.size(); ++i) {
24
+ auto begin = pos + match.position(i);
25
+ res.groups.emplace_back(begin, begin + match.length(i));
26
+ }
27
+ return res;
28
+ }
29
+ std::match_results<std::string::const_reverse_iterator> srmatch;
30
+ if (std::regex_match(input.rbegin(), input.rend() - pos, srmatch, rx_reversed_partial)) {
31
+ auto group = srmatch[1].str();
32
+ if (group.length() != 0) {
33
+ auto it = srmatch[1].second.base();
34
+ // auto position = static_cast<size_t>(std::distance(input.begin(), it));
35
+ if ((!as_match) || it == input.begin()) {
36
+ common_regex_match res;
37
+ res.type = COMMON_REGEX_MATCH_TYPE_PARTIAL;
38
+ const size_t begin = std::distance(input.begin(), it);
39
+ const size_t end = input.size();
40
+ if (begin == std::string::npos || end == std::string::npos || begin > end) {
41
+ throw std::runtime_error("Invalid range");
42
+ }
43
+ res.groups.push_back({begin, end});
44
+ return res;
45
+ }
46
+ }
47
+ }
48
+ return {};
49
+ }
50
+
51
+ /*
52
+ Transforms a regex pattern to a partial match pattern that operates on a reversed input string to find partial final matches of the original pattern.
53
+
54
+ Ideally we'd like to use boost::match_partial (https://beta.boost.org/doc/libs/1_59_0/libs/regex/doc/html/boost_regex/partial_matches.html)
55
+ to see if a string ends with a partial regex match, but it's not in std::regex yet.
56
+ Instead, we'll transform the regex into a partial match regex operating as a full match on the reverse iterators of the input.
57
+
58
+ - /abcd/ -> (dcba|cba|ba|a).* -> ((?:(?:(?:d)?c)?b)?a).*
59
+ - /a|b/ -> (a|b).*
60
+ - /a*?/ -> error, could match ""
61
+ - /a*b/ -> ((?:b)?a*+).* (final repetitions become eager)
62
+ - /.*?ab/ -> ((?:b)?a).* (merge .*)
63
+ - /a.*?b/ -> ((?:b)?.*?a).* (keep reluctant matches)
64
+ - /a(bc)d/ -> ((?:(?:d)?(?:(?:c)?b))?a).*
65
+ - /a(bc|de)/ -> ((?:(?:(?:e)?d)?|(?:(?:c)?b)?)?a).*
66
+ - /ab{2,4}c/ -> abbb?b?c -> ((?:(?:(?:(?:(?:c)?b)?b)?b?)?b?)?a).*
67
+
68
+ The regex will match a reversed string fully, and the end of the first (and only) capturing group will indicate the reversed start of the original partial pattern
69
+ (i.e. just where the final .* starts in the inverted pattern; all other groups are turned into non-capturing groups, and reluctant quantifiers are ignored)
70
+ */
71
+ std::string regex_to_reversed_partial_regex(const std::string & pattern) {
72
+ auto it = pattern.begin();
73
+ const auto end = pattern.end();
74
+
75
+ std::function<std::string()> process = [&]() {
76
+ std::vector<std::vector<std::string>> alternatives(1);
77
+ std::vector<std::string> * sequence = &alternatives.back();
78
+
79
+ while (it != end) {
80
+ if (*it == '[') {
81
+ auto start = it;
82
+ ++it;
83
+ while (it != end) {
84
+ if ((*it == '\\') && (++it != end)) {
85
+ ++it;
86
+ } else if ((it != end) && (*it == ']')) {
87
+ break;
88
+ } else {
89
+ ++it;
90
+ }
91
+ }
92
+ if (it == end) {
93
+ throw std::runtime_error("Unmatched '[' in pattern");
94
+ }
95
+ ++it;
96
+ sequence->push_back(std::string(start, it));
97
+ } else if (*it == '*' || *it == '?' || *it == '+') {
98
+ if (sequence->empty()) {
99
+ throw std::runtime_error("Quantifier without preceding element");
100
+ }
101
+ sequence->back() += *it;
102
+ auto is_star = *it == '*';
103
+ ++it;
104
+ if (is_star) {
105
+ if (*it == '?') {
106
+ ++it;
107
+ }
108
+ }
109
+ } else if (*it == '{') {
110
+ if (sequence->empty()) {
111
+ throw std::runtime_error("Repetition without preceding element");
112
+ }
113
+ ++it;
114
+ auto start = it;
115
+ while (it != end && *it != '}') {
116
+ ++it;
117
+ }
118
+ if (it == end) {
119
+ throw std::runtime_error("Unmatched '{' in pattern");
120
+ }
121
+ auto parts = string_split(std::string(start, it), ",");
122
+ ++it;
123
+ if (parts.size() > 2) {
124
+ throw std::runtime_error("Invalid repetition range in pattern");
125
+ }
126
+
127
+ auto parseOptInt = [&](const std::string & s, const std::optional<int> & def = std::nullopt) -> std::optional<int> {
128
+ if (s.empty()) {
129
+ return def;
130
+ }
131
+ return std::stoi(s);
132
+ };
133
+ auto min = parseOptInt(parts[0], 0);
134
+ auto max = parts.size() == 1 ? min : parseOptInt(parts[1]);
135
+ if (min && max && *max < *min) {
136
+ throw std::runtime_error("Invalid repetition range in pattern");
137
+ }
138
+ // Brutal but... let's repeat at least min times, then ? for the delta between min & max (or * for unbounded)
139
+ auto part = sequence->back();
140
+ sequence->pop_back();
141
+ for (int i = 0; i < *min; i++) {
142
+ sequence->push_back(part);
143
+ }
144
+ if (max) {
145
+ for (int i = *min; i < *max; i++) {
146
+ sequence->push_back(part + "?");
147
+ }
148
+ } else {
149
+ sequence->push_back(part + "*");
150
+ }
151
+ } else if (*it == '(') {
152
+ ++it;
153
+ if (it != end && *it == '?' && (it + 1 != end) && *(it + 1) == ':') {
154
+ it += 2;
155
+ }
156
+ auto sub = process();
157
+ if (*it != ')') {
158
+ throw std::runtime_error("Unmatched '(' in pattern");
159
+ }
160
+ ++it;
161
+ auto & part = sequence->emplace_back("(?:");
162
+ part += sub;
163
+ part += ")";
164
+ } else if (*it == ')') {
165
+ break;
166
+ } else if (*it == '|') {
167
+ ++it;
168
+ alternatives.emplace_back();
169
+ sequence = &alternatives.back();
170
+ } else if (*it == '\\' && (++it != end)) {
171
+ auto str = std::string("\\") + *it;
172
+ sequence->push_back(str);
173
+ ++it;
174
+ } else if (it != end) {
175
+ sequence->push_back(std::string(1, *it));
176
+ ++it;
177
+ }
178
+ }
179
+
180
+ // /abcd/ -> (dcba|cba|ba|a).* -> ((?:(?:(?:d)?c)?b)?a).*
181
+ // if n(=4) parts, opening n-1(=3) non-capturing groups after the 1 capturing group
182
+ // We'll do the outermost capturing group and final .* in the enclosing function.
183
+ std::vector<std::string> res_alts;
184
+ for (const auto & parts : alternatives) {
185
+ auto & res = res_alts.emplace_back();
186
+ for (size_t i = 0; i < parts.size() - 1; i++) {
187
+ res += "(?:";
188
+ }
189
+ for (auto it = parts.rbegin(); it != parts.rend(); ++it) {
190
+ res += *it;
191
+ if (it != parts.rend() - 1) {
192
+ res += ")?";
193
+ }
194
+ }
195
+ }
196
+ return string_join(res_alts, "|");
197
+ };
198
+ auto res = process();
199
+ if (it != end) {
200
+ throw std::runtime_error("Unmatched '(' in pattern");
201
+ }
202
+
203
+ return "(" + res + ")[\\s\\S]*";
204
+ }
@@ -0,0 +1,56 @@
1
+ #pragma once
2
+
3
+ #include <regex>
4
+ #include <string>
5
+
6
+ enum common_regex_match_type {
7
+ COMMON_REGEX_MATCH_TYPE_NONE,
8
+ COMMON_REGEX_MATCH_TYPE_PARTIAL,
9
+ COMMON_REGEX_MATCH_TYPE_FULL,
10
+ };
11
+
12
+ struct common_string_range {
13
+ size_t begin;
14
+ size_t end;
15
+ common_string_range(size_t begin, size_t end) : begin(begin), end(end) {
16
+ if (begin > end) {
17
+ throw std::runtime_error("Invalid range");
18
+ }
19
+ }
20
+ // prevent default ctor
21
+ common_string_range() = delete;
22
+ bool empty() const {
23
+ return begin == end;
24
+ }
25
+ bool operator==(const common_string_range & other) const {
26
+ return begin == other.begin && end == other.end;
27
+ }
28
+ };
29
+
30
+ struct common_regex_match {
31
+ common_regex_match_type type = COMMON_REGEX_MATCH_TYPE_NONE;
32
+ std::vector<common_string_range> groups;
33
+
34
+ bool operator==(const common_regex_match & other) const {
35
+ return type == other.type && groups == other.groups;
36
+ }
37
+ bool operator!=(const common_regex_match & other) const {
38
+ return !(*this == other);
39
+ }
40
+ };
41
+
42
+ class common_regex {
43
+ std::string pattern;
44
+ std::regex rx;
45
+ std::regex rx_reversed_partial;
46
+
47
+ public:
48
+ explicit common_regex(const std::string & pattern);
49
+
50
+ common_regex_match search(const std::string & input, size_t pos, bool as_match = false) const;
51
+
52
+ const std::string & str() const { return pattern; }
53
+ };
54
+
55
+ // For testing only (pretty print of failures).
56
+ std::string regex_to_reversed_partial_regex(const std::string & pattern);
@@ -1,6 +1,7 @@
1
1
  #include "sampling.h"
2
2
 
3
3
  #include "common.h"
4
+ #include "log.h"
4
5
 
5
6
  #include <cmath>
6
7
  #include <unordered_map>
@@ -208,6 +209,9 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
208
209
  trigger_patterns_c.data(), trigger_patterns_c.size(),
209
210
  trigger_tokens.data(), trigger_tokens.size())
210
211
  : llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
212
+ if (!grmr) {
213
+ return nullptr;
214
+ }
211
215
  }
212
216
 
213
217
  auto * result = new common_sampler {
@@ -226,51 +230,48 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
226
230
  params.logit_bias.data()));
227
231
 
228
232
  if (params.mirostat == 0) {
229
- if (params.top_n_sigma >= 0) {
230
- llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
231
- llama_sampler_chain_add(result->chain, llama_sampler_init_temp (params.temp));
232
- llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
233
- } else {
234
- for (const auto & cnstr : params.samplers) {
235
- switch (cnstr) {
236
- case COMMON_SAMPLER_TYPE_DRY:
237
- {
238
- std::vector<const char *> c_breakers;
239
- c_breakers.reserve(params.dry_sequence_breakers.size());
240
- for (const auto & str : params.dry_sequence_breakers) {
241
- c_breakers.push_back(str.c_str());
242
- }
243
-
244
- llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
233
+ for (const auto & cnstr : params.samplers) {
234
+ switch (cnstr) {
235
+ case COMMON_SAMPLER_TYPE_DRY:
236
+ {
237
+ std::vector<const char *> c_breakers;
238
+ c_breakers.reserve(params.dry_sequence_breakers.size());
239
+ for (const auto & str : params.dry_sequence_breakers) {
240
+ c_breakers.push_back(str.c_str());
245
241
  }
246
- break;
247
- case COMMON_SAMPLER_TYPE_TOP_K:
248
- llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
249
- break;
250
- case COMMON_SAMPLER_TYPE_TOP_P:
251
- llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
252
- break;
253
- case COMMON_SAMPLER_TYPE_MIN_P:
254
- llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
255
- break;
256
- case COMMON_SAMPLER_TYPE_XTC:
257
- llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
258
- break;
259
- case COMMON_SAMPLER_TYPE_TYPICAL_P:
260
- llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
261
- break;
262
- case COMMON_SAMPLER_TYPE_TEMPERATURE:
263
- llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
264
- break;
265
- case COMMON_SAMPLER_TYPE_INFILL:
266
- llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab));
267
- break;
268
- case COMMON_SAMPLER_TYPE_PENALTIES:
269
- llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
270
- break;
271
- default:
272
- GGML_ASSERT(false && "unknown sampler type");
273
- }
242
+
243
+ llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
244
+ }
245
+ break;
246
+ case COMMON_SAMPLER_TYPE_TOP_K:
247
+ llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
248
+ break;
249
+ case COMMON_SAMPLER_TYPE_TOP_P:
250
+ llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
251
+ break;
252
+ case COMMON_SAMPLER_TYPE_TOP_N_SIGMA:
253
+ llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
254
+ break;
255
+ case COMMON_SAMPLER_TYPE_MIN_P:
256
+ llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
257
+ break;
258
+ case COMMON_SAMPLER_TYPE_XTC:
259
+ llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
260
+ break;
261
+ case COMMON_SAMPLER_TYPE_TYPICAL_P:
262
+ llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
263
+ break;
264
+ case COMMON_SAMPLER_TYPE_TEMPERATURE:
265
+ llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
266
+ break;
267
+ case COMMON_SAMPLER_TYPE_INFILL:
268
+ llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab));
269
+ break;
270
+ case COMMON_SAMPLER_TYPE_PENALTIES:
271
+ llama_sampler_chain_add(result->chain, llama_sampler_init_penalties (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
272
+ break;
273
+ default:
274
+ GGML_ASSERT(false && "unknown sampler type");
274
275
  }
275
276
  }
276
277
  llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
@@ -472,6 +473,7 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
472
473
  case COMMON_SAMPLER_TYPE_TOP_K: return 'k';
473
474
  case COMMON_SAMPLER_TYPE_TYPICAL_P: return 'y';
474
475
  case COMMON_SAMPLER_TYPE_TOP_P: return 'p';
476
+ case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return 's';
475
477
  case COMMON_SAMPLER_TYPE_MIN_P: return 'm';
476
478
  case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
477
479
  case COMMON_SAMPLER_TYPE_XTC: return 'x';
@@ -487,6 +489,7 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
487
489
  case COMMON_SAMPLER_TYPE_TOP_K: return "top_k";
488
490
  case COMMON_SAMPLER_TYPE_TYPICAL_P: return "typ_p";
489
491
  case COMMON_SAMPLER_TYPE_TOP_P: return "top_p";
492
+ case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return "top_n_sigma";
490
493
  case COMMON_SAMPLER_TYPE_MIN_P: return "min_p";
491
494
  case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
492
495
  case COMMON_SAMPLER_TYPE_XTC: return "xtc";
@@ -501,6 +504,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
501
504
  { "dry", COMMON_SAMPLER_TYPE_DRY },
502
505
  { "top_k", COMMON_SAMPLER_TYPE_TOP_K },
503
506
  { "top_p", COMMON_SAMPLER_TYPE_TOP_P },
507
+ { "top_n_sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
504
508
  { "typ_p", COMMON_SAMPLER_TYPE_TYPICAL_P },
505
509
  { "min_p", COMMON_SAMPLER_TYPE_MIN_P },
506
510
  { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
@@ -514,6 +518,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
514
518
  std::unordered_map<std::string, common_sampler_type> sampler_alt_name_map {
515
519
  { "top-k", COMMON_SAMPLER_TYPE_TOP_K },
516
520
  { "top-p", COMMON_SAMPLER_TYPE_TOP_P },
521
+ { "top-n-sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
517
522
  { "nucleus", COMMON_SAMPLER_TYPE_TOP_P },
518
523
  { "typical-p", COMMON_SAMPLER_TYPE_TYPICAL_P },
519
524
  { "typical", COMMON_SAMPLER_TYPE_TYPICAL_P },
@@ -530,14 +535,16 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
530
535
  auto sampler = sampler_canonical_name_map.find(name);
531
536
  if (sampler != sampler_canonical_name_map.end()) {
532
537
  samplers.push_back(sampler->second);
533
- } else {
534
- if (allow_alt_names) {
535
- sampler = sampler_alt_name_map.find(name);
536
- if (sampler != sampler_alt_name_map.end()) {
537
- samplers.push_back(sampler->second);
538
- }
538
+ continue;
539
+ }
540
+ if (allow_alt_names) {
541
+ sampler = sampler_alt_name_map.find(name);
542
+ if (sampler != sampler_alt_name_map.end()) {
543
+ samplers.push_back(sampler->second);
544
+ continue;
539
545
  }
540
546
  }
547
+ LOG_WRN("%s: unable to match sampler by name '%s'\n", __func__, name.c_str());
541
548
  }
542
549
 
543
550
  return samplers;
@@ -549,6 +556,7 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
549
556
  { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K), COMMON_SAMPLER_TYPE_TOP_K },
550
557
  { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P), COMMON_SAMPLER_TYPE_TYPICAL_P },
551
558
  { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P), COMMON_SAMPLER_TYPE_TOP_P },
559
+ { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_N_SIGMA), COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
552
560
  { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P), COMMON_SAMPLER_TYPE_MIN_P },
553
561
  { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
554
562
  { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC },
@@ -563,6 +571,8 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
563
571
  const auto sampler = sampler_name_map.find(c);
564
572
  if (sampler != sampler_name_map.end()) {
565
573
  samplers.push_back(sampler->second);
574
+ } else {
575
+ LOG_WRN("%s: unable to match sampler by char '%c'\n", __func__, c);
566
576
  }
567
577
  }
568
578
 
@@ -132,12 +132,14 @@ You may find the official downloads here: [NVIDIA developer site](https://develo
132
132
 
133
133
 
134
134
  #### Compile and run inside a Fedora Toolbox Container
135
- We also have a [guide](./cuda-fedora.md) for setting up CUDA toolkit in a Fedora [toolbox container](https://containertoolbx.org/).
135
+ We also have a [guide](./backend/CUDA-FEDORA.md) for setting up CUDA toolkit in a Fedora [toolbox container](https://containertoolbx.org/).
136
136
 
137
137
  **Recommended for:**
138
-
139
- - ***Particularly*** *convenient* for users of [Atomic Desktops for Fedora](https://fedoraproject.org/atomic-desktops/); such as: [Silverblue](https://fedoraproject.org/atomic-desktops/silverblue/) and [Kinoite](https://fedoraproject.org/atomic-desktops/kinoite/).
140
- - Toolbox is installed by default: [Fedora Workstation](https://fedoraproject.org/workstation/) or [Fedora KDE Plasma Desktop](https://fedoraproject.org/spins/kde).
138
+ - ***Necessary*** for users of [Atomic Desktops for Fedora](https://fedoraproject.org/atomic-desktops/); such as: [Silverblue](https://fedoraproject.org/atomic-desktops/silverblue/) and [Kinoite](https://fedoraproject.org/atomic-desktops/kinoite/).
139
+ - (there are no supported CUDA packages for these systems)
140
+ - ***Necessary*** for users that have a host that is not a: [Supported Nvidia CUDA Release Platform](https://developer.nvidia.com/cuda-downloads).
141
+ - (for example, you may have [Fedora 42 Beta](https://fedoramagazine.org/announcing-fedora-linux-42-beta/) as your host operating system)
142
+ - ***Convenient*** For those running [Fedora Workstation](https://fedoraproject.org/workstation/) or [Fedora KDE Plasma Desktop](https://fedoraproject.org/spins/kde), and want to keep their host system clean.
141
143
  - *Optionally* toolbox packages are available: [Arch Linux](https://archlinux.org/), [Red Hat Enterprise Linux >= 8.5](https://www.redhat.com/en/technologies/linux-platforms/enterprise-linux), or [Ubuntu](https://ubuntu.com/download)
142
144
 
143
145
 
@@ -189,7 +191,7 @@ The following compilation options are also available to tweak performance:
189
191
 
190
192
  | Option | Legal values | Default | Description |
191
193
  |-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
192
- | GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. |
194
+ | GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, CDNA and RDNA3+). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. |
193
195
  | GGML_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models |
194
196
  | GGML_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
195
197
  | GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
@@ -216,6 +218,7 @@ By default, all supported compute capabilities are enabled. To customize this be
216
218
 
217
219
  ```bash
218
220
  cmake -B build -DGGML_MUSA=ON -DMUSA_ARCHITECTURES="21"
221
+ cmake --build build --config Release
219
222
  ```
220
223
 
221
224
  This configuration enables only compute capability `2.1` (MTT S80) during compilation, which can help reduce compilation time.
@@ -256,8 +259,6 @@ You can download it from your Linux distro's package manager or from here: [ROCm
256
259
  cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
257
260
  && cmake --build build --config Release -- -j 16
258
261
  ```
259
- On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
260
- However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
261
262
 
262
263
  To enhance flash attention performance on RDNA3+ or CDNA architectures, you can utilize the rocWMMA library by enabling the `-DGGML_HIP_ROCWMMA_FATTN=ON` option. This requires rocWMMA headers to be installed on the build system.
263
264
 
@@ -293,6 +294,10 @@ You can download it from your Linux distro's package manager or from here: [ROCm
293
294
  The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
294
295
  If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
295
296
 
297
+ ### Unified Memory
298
+
299
+ On Linux it is possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1`. However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
300
+
296
301
  ## Vulkan
297
302
 
298
303
  **Windows**
@@ -433,6 +438,116 @@ llama_new_context_with_model: CANN compute buffer size = 1260.81 MiB
433
438
 
434
439
  For detailed info, such as model/device supports, CANN install, please refer to [llama.cpp for CANN](./backend/CANN.md).
435
440
 
441
+ ## Arm® KleidiAI™
442
+ KleidiAI is a library of optimized microkernels for AI workloads, specifically designed for Arm CPUs. These microkernels enhance performance and can be enabled for use by the CPU backend.
443
+
444
+ To enable KleidiAI, go to the llama.cpp directory and build using CMake
445
+ ```bash
446
+ cmake -B build -DGGML_CPU_KLEIDIAI=ON
447
+ cmake --build build --config Release
448
+ ```
449
+ You can verify that KleidiAI is being used by running
450
+ ```bash
451
+ ./build/bin/llama-cli -m PATH_TO_MODEL -p "What is a car?"
452
+ ```
453
+ If KleidiAI is enabled, the output will contain a line similar to:
454
+ ```
455
+ load_tensors: CPU_KLEIDIAI model buffer size = 3474.00 MiB
456
+ ```
457
+ KleidiAI's microkernels implement optimized tensor operations using Arm CPU features such as dotprod, int8mm and SME. llama.cpp selects the most efficient kernel based on runtime CPU feature detection. However, on platforms that support SME, you must manually enable SME microkernels by setting the environment variable `GGML_KLEIDIAI_SME=1`.
458
+
459
+ Depending on your build target, other higher priority backends may be enabled by default. To ensure the CPU backend is used, you must disable the higher priority backends either at compile time, e.g. -DGGML_METAL=OFF, or during run-time using the command line option `--device none`.
460
+
461
+ ## OpenCL
462
+
463
+ This provides GPU acceleration through OpenCL on recent Adreno GPUs.
464
+ More information about the OpenCL backend can be found in [OPENCL.md](./backend/OPENCL.md).
465
+
466
+ ### Android
467
+
468
+ Assume NDK is available in `$ANDROID_NDK`. First, install OpenCL headers and ICD loader library if not available,
469
+
470
+ ```sh
471
+ mkdir -p ~/dev/llm
472
+ cd ~/dev/llm
473
+
474
+ git clone https://github.com/KhronosGroup/OpenCL-Headers && \
475
+ cd OpenCL-Headers && \
476
+ cp -r CL $ANDROID_NDK/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
477
+
478
+ cd ~/dev/llm
479
+
480
+ git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader && \
481
+ cd OpenCL-ICD-Loader && \
482
+ mkdir build_ndk && cd build_ndk && \
483
+ cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release \
484
+ -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
485
+ -DOPENCL_ICD_LOADER_HEADERS_DIR=$ANDROID_NDK/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include \
486
+ -DANDROID_ABI=arm64-v8a \
487
+ -DANDROID_PLATFORM=24 \
488
+ -DANDROID_STL=c++_shared && \
489
+ ninja && \
490
+ cp libOpenCL.so $ANDROID_NDK/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
491
+ ```
492
+
493
+ Then build llama.cpp with OpenCL enabled,
494
+
495
+ ```sh
496
+ cd ~/dev/llm
497
+
498
+ git clone https://github.com/ggml-org/llama.cpp && \
499
+ cd llama.cpp && \
500
+ mkdir build-android && cd build-android
501
+
502
+ cmake .. -G Ninja \
503
+ -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
504
+ -DANDROID_ABI=arm64-v8a \
505
+ -DANDROID_PLATFORM=android-28 \
506
+ -DBUILD_SHARED_LIBS=OFF \
507
+ -DGGML_OPENCL=ON
508
+
509
+ ninja
510
+ ```
511
+
512
+ ### Windows Arm64
513
+
514
+ First, install OpenCL headers and ICD loader library if not available,
515
+
516
+ ```powershell
517
+ mkdir -p ~/dev/llm
518
+
519
+ cd ~/dev/llm
520
+ git clone https://github.com/KhronosGroup/OpenCL-Headers && cd OpenCL-Headers
521
+ mkdir build && cd build
522
+ cmake .. -G Ninja `
523
+ -DBUILD_TESTING=OFF `
524
+ -DOPENCL_HEADERS_BUILD_TESTING=OFF `
525
+ -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
526
+ -DCMAKE_INSTALL_PREFIX="$HOME/dev/llm/opencl"
527
+ cmake --build . --target install
528
+
529
+ cd ~/dev/llm
530
+ git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader && cd OpenCL-ICD-Loader
531
+ mkdir build && cd build
532
+ cmake .. -G Ninja `
533
+ -DCMAKE_BUILD_TYPE=Release `
534
+ -DCMAKE_PREFIX_PATH="$HOME/dev/llm/opencl" `
535
+ -DCMAKE_INSTALL_PREFIX="$HOME/dev/llm/opencl"
536
+ cmake --build . --target install
537
+ ```
538
+
539
+ Then build llama.cpp with OpenCL enabled,
540
+
541
+ ```powershell
542
+ cmake .. -G Ninja `
543
+ -DCMAKE_TOOLCHAIN_FILE="$HOME/dev/llm/llama.cpp/cmake/arm64-windows-llvm.cmake" `
544
+ -DCMAKE_BUILD_TYPE=Release `
545
+ -DCMAKE_PREFIX_PATH="$HOME/dev/llm/opencl" `
546
+ -DBUILD_SHARED_LIBS=OFF `
547
+ -DGGML_OPENCL=ON
548
+ ninja
549
+ ```
550
+
436
551
  ## Android
437
552
 
438
553
  To read documentation for how to build on Android, [click here](./android.md)