@fugood/llama.node 0.3.16 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (281)
  1. package/CMakeLists.txt +6 -1
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +44 -2
  19. package/lib/index.js +132 -1
  20. package/lib/index.ts +203 -3
  21. package/package.json +2 -1
  22. package/src/EmbeddingWorker.cpp +1 -1
  23. package/src/LlamaCompletionWorker.cpp +374 -19
  24. package/src/LlamaCompletionWorker.h +31 -10
  25. package/src/LlamaContext.cpp +216 -7
  26. package/src/LlamaContext.h +12 -0
  27. package/src/common.hpp +15 -0
  28. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +233 -0
  29. package/src/llama.cpp/.github/workflows/build.yml +89 -767
  30. package/src/llama.cpp/.github/workflows/docker.yml +9 -6
  31. package/src/llama.cpp/.github/workflows/release.yml +716 -0
  32. package/src/llama.cpp/.github/workflows/server.yml +19 -23
  33. package/src/llama.cpp/CMakeLists.txt +11 -1
  34. package/src/llama.cpp/cmake/build-info.cmake +8 -2
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
  36. package/src/llama.cpp/common/CMakeLists.txt +35 -4
  37. package/src/llama.cpp/common/arg.cpp +844 -121
  38. package/src/llama.cpp/common/arg.h +9 -0
  39. package/src/llama.cpp/common/chat.cpp +129 -107
  40. package/src/llama.cpp/common/chat.h +2 -0
  41. package/src/llama.cpp/common/common.cpp +64 -518
  42. package/src/llama.cpp/common/common.h +35 -45
  43. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  44. package/src/llama.cpp/common/llguidance.cpp +31 -47
  45. package/src/llama.cpp/common/minja/chat-template.hpp +23 -11
  46. package/src/llama.cpp/common/minja/minja.hpp +186 -127
  47. package/src/llama.cpp/common/regex-partial.cpp +204 -0
  48. package/src/llama.cpp/common/regex-partial.h +56 -0
  49. package/src/llama.cpp/common/sampling.cpp +60 -50
  50. package/src/llama.cpp/docs/build.md +122 -7
  51. package/src/llama.cpp/examples/CMakeLists.txt +2 -32
  52. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  53. package/src/llama.cpp/examples/embedding/embedding.cpp +9 -12
  54. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  55. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  56. package/src/llama.cpp/examples/parallel/parallel.cpp +89 -15
  57. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  58. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  59. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  60. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  61. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  62. package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
  63. package/src/llama.cpp/examples/training/finetune.cpp +96 -0
  64. package/src/llama.cpp/ggml/CMakeLists.txt +35 -2
  65. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  66. package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
  67. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
  68. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  69. package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
  70. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  71. package/src/llama.cpp/ggml/include/ggml.h +76 -106
  72. package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -8
  73. package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
  74. package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
  75. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  76. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  77. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  78. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  79. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  80. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  81. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  82. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  83. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +66 -33
  84. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  85. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  86. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  87. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  88. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -194
  89. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  90. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1060 -410
  91. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1008 -13533
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +31 -16
  93. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +90 -12
  94. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -13
  95. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +266 -72
  96. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1034 -88
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8796 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  101. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  102. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +252 -0
  103. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  104. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
  105. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  106. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  107. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  108. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  109. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +106 -14
  110. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  111. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -262
  112. package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
  113. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
  114. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +307 -40
  115. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +125 -45
  116. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +10 -8
  117. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +239 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  119. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  120. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +9 -307
  121. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
  122. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
  123. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  124. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +944 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  130. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  131. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +507 -411
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  133. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  134. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
  135. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  136. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  137. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  138. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  140. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  141. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
  142. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +83 -49
  143. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1278 -282
  144. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +32 -0
  145. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +133 -30
  146. package/src/llama.cpp/ggml/src/ggml.c +170 -265
  147. package/src/llama.cpp/ggml/src/gguf.cpp +34 -33
  148. package/src/llama.cpp/include/llama.h +82 -22
  149. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  150. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  151. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  152. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  153. package/src/llama.cpp/requirements/requirements-all.txt +5 -3
  154. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  155. package/src/llama.cpp/scripts/xxd.cmake +1 -1
  156. package/src/llama.cpp/src/CMakeLists.txt +4 -2
  157. package/src/llama.cpp/src/llama-adapter.cpp +43 -1
  158. package/src/llama.cpp/src/llama-arch.cpp +163 -17
  159. package/src/llama.cpp/src/llama-arch.h +16 -0
  160. package/src/llama.cpp/src/llama-batch.cpp +5 -1
  161. package/src/llama.cpp/src/llama-batch.h +2 -1
  162. package/src/llama.cpp/src/llama-chat.cpp +91 -16
  163. package/src/llama.cpp/src/llama-chat.h +7 -2
  164. package/src/llama.cpp/src/llama-context.cpp +479 -575
  165. package/src/llama.cpp/src/llama-context.h +44 -33
  166. package/src/llama.cpp/src/llama-cparams.h +1 -0
  167. package/src/llama.cpp/src/llama-graph.cpp +209 -157
  168. package/src/llama.cpp/src/llama-graph.h +38 -14
  169. package/src/llama.cpp/src/llama-hparams.h +13 -0
  170. package/src/llama.cpp/src/llama-kv-cache.cpp +1604 -543
  171. package/src/llama.cpp/src/llama-kv-cache.h +283 -171
  172. package/src/llama.cpp/src/llama-memory.h +12 -2
  173. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  174. package/src/llama.cpp/src/llama-model-loader.cpp +34 -20
  175. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  176. package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
  177. package/src/llama.cpp/src/llama-model-saver.h +37 -0
  178. package/src/llama.cpp/src/llama-model.cpp +1803 -330
  179. package/src/llama.cpp/src/llama-model.h +21 -2
  180. package/src/llama.cpp/src/llama-quant.cpp +33 -10
  181. package/src/llama.cpp/src/llama-sampling.cpp +25 -7
  182. package/src/llama.cpp/src/llama-vocab.cpp +86 -10
  183. package/src/llama.cpp/src/llama-vocab.h +6 -0
  184. package/src/llama.cpp/src/llama.cpp +15 -1
  185. package/src/llama.cpp/tests/CMakeLists.txt +52 -31
  186. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  187. package/src/llama.cpp/tests/test-backend-ops.cpp +189 -90
  188. package/src/llama.cpp/tests/test-chat-template.cpp +26 -6
  189. package/src/llama.cpp/tests/test-chat.cpp +15 -3
  190. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  191. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  192. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  193. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  194. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  195. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  196. package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
  197. package/src/llama.cpp/tests/test-opt.cpp +33 -21
  198. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  199. package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
  200. package/src/llama.cpp/tests/test-sampling.cpp +1 -1
  201. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  202. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  203. package/src/llama.cpp/tools/CMakeLists.txt +39 -0
  204. package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +3 -3
  205. package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +1 -1
  206. package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +15 -16
  207. package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
  208. package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +623 -274
  209. package/src/llama.cpp/{examples → tools}/main/main.cpp +22 -14
  210. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +47 -0
  211. package/src/llama.cpp/tools/mtmd/clip-impl.h +365 -0
  212. package/src/llama.cpp/tools/mtmd/clip.cpp +3646 -0
  213. package/src/llama.cpp/tools/mtmd/clip.h +99 -0
  214. package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +22 -0
  215. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +370 -0
  216. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
  217. package/src/llama.cpp/tools/mtmd/mtmd.cpp +678 -0
  218. package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
  219. package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +21 -5
  220. package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +53 -3
  221. package/src/llama.cpp/tools/rpc/CMakeLists.txt +4 -0
  222. package/src/llama.cpp/tools/rpc/rpc-server.cpp +322 -0
  223. package/src/llama.cpp/tools/run/CMakeLists.txt +16 -0
  224. package/src/llama.cpp/{examples → tools}/run/run.cpp +30 -30
  225. package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
  226. package/src/llama.cpp/{examples → tools}/server/httplib.h +313 -247
  227. package/src/llama.cpp/{examples → tools}/server/server.cpp +529 -215
  228. package/src/llama.cpp/{examples → tools}/server/utils.hpp +427 -6
  229. package/src/llama.cpp/{examples → tools}/tts/tts.cpp +6 -9
  230. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
  231. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  232. package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
  233. package/src/llama.cpp/examples/infill/infill.cpp +0 -590
  234. package/src/llama.cpp/examples/llava/CMakeLists.txt +0 -66
  235. package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
  236. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
  237. package/src/llama.cpp/examples/llava/clip.cpp +0 -3206
  238. package/src/llama.cpp/examples/llava/clip.h +0 -118
  239. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  240. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  241. package/src/llama.cpp/examples/llava/llava.cpp +0 -574
  242. package/src/llama.cpp/examples/llava/llava.h +0 -49
  243. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  244. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +0 -584
  245. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  246. package/src/llama.cpp/examples/rpc/CMakeLists.txt +0 -2
  247. package/src/llama.cpp/examples/rpc/rpc-server.cpp +0 -171
  248. package/src/llama.cpp/examples/run/CMakeLists.txt +0 -5
  249. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  250. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  251. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  252. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  253. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  254. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  255. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  256. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  257. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  258. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  259. /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
  260. /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
  261. /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
  262. /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
  263. /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
  264. /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
  265. /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
  266. /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
  267. /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
  268. /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
  269. /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
  270. /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
  271. /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
  272. /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
  273. /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
  274. /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
  275. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
  276. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
  277. /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
  278. /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
  279. /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
  280. /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
  281. /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
--- a/package/src/llama.cpp/src/llama-kv-cache.h
+++ b/package/src/llama.cpp/src/llama-kv-cache.h
@@ -2,96 +2,131 @@
 
 #include "llama.h"
 #include "llama-io.h"
+#include "llama-graph.h"
 #include "llama-memory.h"
 
 #include "ggml-cpp.h"
 
-#include <functional>
 #include <set>
 #include <vector>
 
 struct llama_cparams;
 struct llama_hparams;
 struct llama_ubatch;
+struct llama_sbatch;
+struct llama_model;
+struct llama_context;
 
 struct llama_kv_cache : public llama_memory_i {
-    using llama_memory_i::llama_memory_i;
+    virtual ~llama_kv_cache() = default;
 
-    virtual int32_t get_n_tokens() const = 0;
-    virtual uint32_t get_used_cells() const = 0; // TODO: remove, this is too-specific to the unified cache
+    // call if batch processing fails - restores the cache state
+    virtual void restore() = 0;
 
-    virtual bool get_can_shift() const = 0;
+    // call after successful batch processing - clears any pending state
+    virtual void commit() = 0;
+
+    // process any pending defrag/shift/etc. operations
+    // optionally call once before processing a new batch
+    virtual bool update(llama_context & lctx) = 0;
+
+    // schedule a defrag if the fragmentation threshold is exceeded. otherwise, do nothing
+    virtual void defrag_sched(float thold) = 0;
+
+    // simulate full cache, used for allocating worst-case compute buffers
+    virtual void set_full() = 0;
+
+    //
+    // batch processing
+    //
+
+    virtual llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) = 0;
+
+    // different KV caches require different batch splitting strategies
+    virtual llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const = 0;
+
+    // find an empty slot of size "n_tokens" in the cache
+    virtual bool find_slot(const llama_ubatch & batch) = 0;
+
+    // getters
+    virtual int32_t get_n_tokens() const = 0;
+    virtual int32_t get_used_cells() const = 0; // TODO: remove, this is too-specific to the unified cache
+    virtual llama_pos get_pos_max() const = 0;
+    virtual bool get_can_shift() const = 0;
 
     bool get_can_edit() const override { return get_can_shift(); }
+
+    //
+    // state write/read
+    //
+
+    virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const = 0;
+    virtual void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) = 0;
 };
 
-struct llama_kv_cell {
-    llama_pos pos = -1;
-    llama_pos delta = 0;
-    int32_t src = -1; // used by recurrent state models to copy states
-    int32_t tail = -1;
+//
+// llama_kv_cache_guard
+//
 
-    std::set<llama_seq_id> seq_id;
+struct llama_kv_cache_guard {
+    llama_kv_cache_guard(llama_kv_cache * kv) : kv(kv) {}
 
-    bool has_seq_id(const llama_seq_id & id) const {
-        return seq_id.find(id) != seq_id.end();
+    ~llama_kv_cache_guard() {
+        kv->restore();
     }
 
-    bool is_empty() const {
-        return seq_id.empty();
+    void commit() {
+        kv->commit();
     }
 
-    bool is_same_seq(const llama_kv_cell & other) const {
-        return seq_id == other.seq_id;
-    }
+private:
+    llama_kv_cache * kv;
 };
 
-// a structure holds information about the slot found in llama_kv_cache_find_slot
-struct llama_kv_cache_slot_info {
-    std::pair<uint32_t, uint32_t> boundaries; // slot boundaries [begin, end)
-    bool found = false; // the slot was found
-
-    explicit llama_kv_cache_slot_info(bool found_) : found{found_} {}
-    llama_kv_cache_slot_info(uint32_t begin, uint32_t end) : boundaries{begin, end}, found{true} {}
-
-    operator bool() const { return found; }
-};
+//
+// llama_kv_cache_unified
+//
 
-// ring-buffer of cached KV data
-// TODO: pimpl
 // TODO: add notion of max sequences
 class llama_kv_cache_unified : public llama_kv_cache {
 public:
-    // can be used to query data from the model if needed
-    struct callbacks {
-        std::function<ggml_tensor * (uint32_t n_ctx_per_seq, int il)> get_rope_factors;
-    };
+    struct kv_cell {
+        llama_pos pos = -1;
+        llama_pos delta = 0;
 
-    llama_kv_cache_unified(
-            const llama_hparams & hparams,
-            callbacks cbs);
+        std::set<llama_seq_id> seq_id;
+
+        bool has_seq_id(const llama_seq_id & id) const {
+            return seq_id.find(id) != seq_id.end();
+        }
+
+        bool is_empty() const {
+            return seq_id.empty();
+        }
 
-    virtual ~llama_kv_cache_unified() = default;
+        bool is_same_seq(const kv_cell & other) const {
+            return seq_id == other.seq_id;
+        }
+    };
 
-    // TODO: become constructor
-    bool init(
-            const llama_model & model, // TODO: do not reference the model
-            const llama_cparams & cparams,
+    static uint32_t get_padding(const llama_cparams & cparams);
+
+    llama_kv_cache_unified(
+            const llama_model & model,
             ggml_type type_k,
             ggml_type type_v,
+            bool v_trans,
+            bool offload,
             uint32_t kv_size,
-            bool offload);
+            uint32_t padding);
 
-    int32_t get_n_tokens() const override;
-    uint32_t get_used_cells() const override;
+    ~llama_kv_cache_unified() = default;
 
-    size_t total_size() const;
-
-    // TODO: better data structures to reduce the cost of this operation
-    llama_pos pos_max() const;
+    //
+    // llama_memory_i
+    //
 
     void clear() override;
-    void defrag() override;
 
     bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
     void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
@@ -99,77 +134,120 @@ public:
     void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) override;
     void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override;
 
-    llama_pos seq_pos_max(llama_seq_id seq_id) override;
+    llama_pos seq_pos_max(llama_seq_id seq_id) const override;
 
-    bool get_can_shift() const override;
+    //
+    // llama_kv_cache
+    //
 
-    // find an empty slot of size "n_tokens" in the cache
-    // updates the cache head
-    // returns a structure holding information about the slot found
-    // Note: On success, it's important that cache.head points
-    // to the first cell of the slot.
-    llama_kv_cache_slot_info find_slot(const llama_ubatch & batch);
+    void restore() override;
+    void commit() override;
 
-    // TODO: maybe not needed
-    uint32_t get_padding(const llama_cparams & cparams) const;
+    bool update(llama_context & ctx) override;
 
-    // find how many cells are currently in use
-    uint32_t cell_max() const;
+    void defrag_sched(float thold) override;
 
-    size_t size_k_bytes() const;
-    size_t size_v_bytes() const;
-
-    // defrag
-
-    struct {
-        std::vector<uint32_t> ids;
-    } defrag_info;
+    void set_full() override;
 
-    // return true if cells have been moved
-    bool defrag_prepare(int32_t n_max_nodes);
+    llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) override;
 
-    // state save/load
+    llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const override;
 
-    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const;
-    void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1);
+    // updates the cache head
+    // Note: On success, it's important that cache.head points
+    // to the first cell of the slot.
+    bool find_slot(const llama_ubatch & batch) override;
 
-    // members
+    int32_t get_n_tokens() const override;
+    int32_t get_used_cells() const override;
 
-    const llama_hparams & hparams;
+    // TODO: better data structures to reduce the cost of this operation
+    llama_pos get_pos_max() const override;
 
-    callbacks cbs;
+    bool get_can_shift() const override;
 
-    bool has_shift = false;
-    bool do_defrag = false;
+    // state write/load
 
-    // TODO: remove this and implement llama_kv_cache_recurrent instead
-    bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
+    void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
 
-    bool v_trans = true; // the value tensor is transposed
-    bool can_shift = false;
-
-    // Note: The value of head isn't only used to optimize searching
-    // for a free KV slot. llama_decode_impl also uses it, so it
-    // cannot be freely changed after a slot has been allocated.
-    uint32_t head = 0;
-    uint32_t size = 0;
+    uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot())
+    uint32_t size = 0; // total number of cells, shared across all sequences
     uint32_t used = 0; // used cells (i.e. at least one seq_id)
 
     // computed before each graph build
     uint32_t n = 0;
 
-    std::vector<llama_kv_cell> cells;
+    std::vector<kv_cell> cells;
 
     std::vector<ggml_tensor *> k_l; // per layer
     std::vector<ggml_tensor *> v_l;
 
 private:
+    const llama_model & model;
+    const llama_hparams & hparams;
+
+    bool has_shift = false;
+    bool do_defrag = false;
+
+    bool v_trans = true; // the value tensor is transposed
+    bool can_shift = false;
+
+    // required padding
+    uint32_t padding = 1;
+
     ggml_type type_k = GGML_TYPE_F16;
     ggml_type type_v = GGML_TYPE_F16;
 
     std::vector<ggml_context_ptr> ctxs;
     std::vector<ggml_backend_buffer_ptr> bufs;
 
+    // defrag
+    struct {
+        std::vector<uint32_t> ids;
+    } defrag_info;
+
+    // return true if cells have been moved
+    bool defrag_prepare(int32_t n_max_nodes);
+
+    // commit/restore cache
+    struct slot_range {
+        uint32_t c0 = 0; // note: these are cell indices, not sequence positions
+        uint32_t c1 = 0;
+    };
+
+    // pending cell updates that are not yet committed
+    struct {
+        std::vector<slot_range> ranges;
+    } pending;
+
+    // find how many cells are currently in use
+    uint32_t cell_max() const;
+
+    size_t total_size() const;
+
+    size_t size_k_bytes() const;
+    size_t size_v_bytes() const;
+
+    ggml_tensor * build_rope_shift(
+            const llama_cparams & cparams,
+            ggml_context * ctx,
+            ggml_tensor * cur,
+            ggml_tensor * shift,
+            ggml_tensor * factors,
+            float freq_base,
+            float freq_scale) const;
+
+    llm_graph_result_ptr build_graph_shift(
+            const llama_cparams & cparams,
+            ggml_context * ctx,
+            ggml_cgraph * gf) const;
+
+    llm_graph_result_ptr build_graph_defrag(
+            const llama_cparams & cparams,
+            ggml_context * ctx,
+            ggml_cgraph * gf) const;
+
     void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
     void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;
 
@@ -177,106 +255,140 @@ private:
     bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
 };
 
-// TODO: temporary reusing llama_kv_cache_unified -- implement recurrent cache and simplify llama_kv_cache_unified
-//class llama_kv_cache_recurrent : public llama_kv_cache_unified {
-//public:
-//    using llama_kv_cache_unified::llama_kv_cache_unified;
-//};
-
 //
-// kv cache restore
+// llama_kv_cache_recurrent
 //
 
-// saves the kv_cache state for future recovery.
-// used to rollback llama_kv_cache_find_slot changes.
-struct llama_kv_slot_restorer {
-    struct llama_kv_cache_state {
-        uint32_t head = 0;
-        uint32_t n = 0;
-    } old_state;
-
-    // for non-recurrent models only
-    // list of slots to restore
-    std::vector<std::pair<uint32_t, uint32_t>> slot_boundaries;
-
-    bool do_restore = false;
+class llama_kv_cache_recurrent : public llama_kv_cache {
+public:
+    struct kv_cell {
+        llama_pos pos = -1;
+        int32_t src = -1; // used to copy states
+        int32_t tail = -1;
 
-    llama_kv_cache_unified & cache;
+        std::set<llama_seq_id> seq_id;
 
-    explicit llama_kv_slot_restorer(llama_kv_cache_unified & cache) : cache(cache) {
-        old_state.head = cache.head;
-        old_state.n = cache.n;
-    }
+        bool has_seq_id(const llama_seq_id & id) const {
+            return seq_id.find(id) != seq_id.end();
+        }
 
-    // saves a slot information for future restoration
-    void save(const llama_kv_cache_slot_info & slot) {
-        if (slot) {
-            do_restore = true;
-            if (slot.boundaries.first != slot.boundaries.second) {
-                slot_boundaries.push_back(slot.boundaries);
-            }
+        bool is_empty() const {
+            return seq_id.empty();
         }
-    }
 
-    // must be explicitly called to restore the kv_cache state
-    // and rollback changes from all llama_kv_cache_find_slot calls
-    void restore() {
-        if (do_restore) {
-            cache.head = old_state.head;
-            cache.n = old_state.n;
-
-            if (cache.recurrent) { // recurrent models like Mamba or RWKV can't have a state partially erased
-                cache.seq_rm(-1, -1, -1);
-            } else {
-                for (auto & slot : slot_boundaries) {
-                    cache.seq_rm(-1, slot.first, slot.second);
-                }
-            }
+        bool is_same_seq(const kv_cell & other) const {
+            return seq_id == other.seq_id;
         }
-    }
-};
+    };
+
+    llama_kv_cache_recurrent(
+            const llama_model & model,
+            ggml_type type_k,
+            ggml_type type_v,
+            bool offload,
+            uint32_t kv_size);
+
+    ~llama_kv_cache_recurrent() = default;
+
+    //
+    // llama_memory_i
+    //
+
+    void clear() override;
+
+    bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
+    void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
+    void seq_keep(llama_seq_id seq_id) override;
+    void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) override;
+    void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override;
 
-// TODO: maybe become part of the public llama_kv_cache in the future
-int32_t llama_kv_cache_n_tokens(const llama_kv_cache * kv);
+    llama_pos seq_pos_max(llama_seq_id seq_id) const override;
 
-int32_t llama_kv_cache_used_cells(const llama_kv_cache * kv);
+    //
+    // llama_kv_cache
+    //
 
-void llama_kv_cache_clear(llama_kv_cache * kv);
+    void restore() override;
+    void commit() override;
 
-bool llama_kv_cache_seq_rm(
-        llama_kv_cache * kv,
-        llama_seq_id seq_id,
-        llama_pos p0,
-        llama_pos p1);
+    bool update(llama_context & lctx) override;
 
-void llama_kv_cache_seq_cp(
-        llama_kv_cache * kv,
-        llama_seq_id seq_id_src,
-        llama_seq_id seq_id_dst,
-        llama_pos p0,
-        llama_pos p1);
+    void defrag_sched(float thold) override;
 
-void llama_kv_cache_seq_keep(llama_kv_cache * kv, llama_seq_id seq_id);
+    void set_full() override;
 
-void llama_kv_cache_seq_add(
-        llama_kv_cache * kv,
-        llama_seq_id seq_id,
-        llama_pos p0,
-        llama_pos p1,
-        llama_pos delta);
+    llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) override;
 
-void llama_kv_cache_seq_div(
-        llama_kv_cache * kv,
-        llama_seq_id seq_id,
-        llama_pos p0,
-        llama_pos p1,
-        int d);
+    llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const override;
 
-llama_pos llama_kv_cache_seq_pos_max(llama_kv_cache * kv, llama_seq_id seq_id);
+    bool find_slot(const llama_ubatch & batch) override;
+
+    int32_t get_n_tokens() const override;
+    int32_t get_used_cells() const override;
+
+    // TODO: better data structures to reduce the cost of this operation
+    llama_pos get_pos_max() const override;
+
+    bool get_can_shift() const override;
 
-void llama_kv_cache_defrag(llama_kv_cache * kv);
+    // TODO: temporary methods - they are not really const as they do const_cast<>, fix this
+    int32_t s_copy(int i) const;
+    float s_mask(int i) const;
+
+    // state write/load
+
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
+    void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
+
+    uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot())
+    uint32_t size = 0; // total number of cells, shared across all sequences
+    uint32_t used = 0; // used cells (i.e. at least one seq_id)
+
+    // computed before each graph build
+    uint32_t n = 0;
+
+    std::vector<kv_cell> cells;
+
+    std::vector<ggml_tensor *> k_l; // per layer
+    std::vector<ggml_tensor *> v_l;
+
+private:
+    //const llama_model & model;
+    const llama_hparams & hparams;
+
+    // commit/restore cache
+    // TODO: rework for recurrent cache
+    struct slot_range {
+        uint32_t c0 = 0; // note: these are cell indices, not sequence positions
+        uint32_t c1 = 0;
+    };
+
+    // pending cell updates that are not yet committed
+    struct {
+        std::vector<slot_range> ranges;
+    } pending;
+
+    ggml_type type_k = GGML_TYPE_F16;
+    ggml_type type_v = GGML_TYPE_F16;
+
+    std::vector<ggml_context_ptr> ctxs;
+    std::vector<ggml_backend_buffer_ptr> bufs;
+
+    // find how many cells are currently in use
+    uint32_t cell_max() const;
+
+    size_t total_size() const;
+
+    size_t size_k_bytes() const;
+    size_t size_v_bytes() const;
+
+    void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
+    void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;
+
+    bool state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1);
+    bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
+};
 
-bool llama_kv_cache_can_shift(const llama_kv_cache * kv);
 
 //
 // kv cache view
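
Note on the llama-kv-cache.h changes above: the old llama_kv_slot_restorer is replaced by an explicit restore()/commit() pair on the cache plus the RAII llama_kv_cache_guard, and batch splitting (sbatch_init/ubatch_next/find_slot) now lives on the cache interface itself. A minimal caller-side sketch of how these pieces appear intended to fit together, assuming the internal llama.cpp headers are available; decode_one_ubatch and the n_ubatch value are illustrative only, not code shipped in this package:

// Sketch only - not the actual llama_decode implementation.
#include "llama-kv-cache.h"

static bool decode_one_ubatch(llama_kv_cache & kv, llama_context & lctx, const llama_batch & batch) {
    llama_kv_cache_guard guard(&kv);   // destructor always calls kv.restore()

    kv.update(lctx);                   // apply any pending shift/defrag work first

    llama_sbatch sbatch = kv.sbatch_init(batch, /*logits_all =*/ false);
    llama_ubatch ubatch = kv.ubatch_next(sbatch, /*n_ubatch =*/ 512, /*embd_pooled =*/ false);

    if (!kv.find_slot(ubatch)) {
        return false;                  // early return: the guard rolls the cache back
    }

    // ... build and evaluate the graph for this ubatch ...

    guard.commit();                    // success: clears the pending ranges, so the
                                       // guard's restore() becomes a no-op
    return true;
}

Because the guard calls restore() unconditionally in its destructor, commit() must be reached on every successful path; anything committed is kept, anything still pending is rolled back.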
--- a/package/src/llama.cpp/src/llama-memory.h
+++ b/package/src/llama.cpp/src/llama-memory.h
@@ -2,12 +2,22 @@
 
 #include "llama.h"
 
+struct llama_memory_params {
+    // kv cache
+    ggml_type type_k;
+    ggml_type type_v;
+
+    // parameters for other types of memory
+    // ...
+};
+
 // general concept of LLM memory
 // the KV cache is a type of LLM memory, but there can be other types
 class llama_memory_i {
 public:
+    virtual ~llama_memory_i() = default;
+
     virtual void clear() = 0;
-    virtual void defrag() = 0;
 
     virtual bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) = 0;
     virtual void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) = 0;
@@ -15,7 +25,7 @@ public:
     virtual void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) = 0;
     virtual void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) = 0;
 
-    virtual llama_pos seq_pos_max(llama_seq_id seq_id) = 0;
+    virtual llama_pos seq_pos_max(llama_seq_id seq_id) const = 0;
 
     virtual bool get_can_edit() const = 0;
 };
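
Note on the llama-memory.h changes above: llama_memory_i gains a virtual destructor and a companion llama_memory_params struct, seq_pos_max() becomes const, and defrag() is dropped from the interface (the KV caches now schedule it themselves via defrag_sched()). A hypothetical no-op implementation, shown only to illustrate the pure-virtual surface a memory type has to override after this change (llama_memory_null is not part of the package):

// Hypothetical stub for illustration - not shipped in llama.cpp or llama.node.
#include "llama-memory.h"

class llama_memory_null : public llama_memory_i {
public:
    void clear() override {}

    bool seq_rm  (llama_seq_id, llama_pos, llama_pos)               override { return true; }
    void seq_cp  (llama_seq_id, llama_seq_id, llama_pos, llama_pos) override {}
    void seq_keep(llama_seq_id)                                     override {}
    void seq_add (llama_seq_id, llama_pos, llama_pos, llama_pos)    override {}
    void seq_div (llama_seq_id, llama_pos, llama_pos, int)          override {}

    llama_pos seq_pos_max(llama_seq_id) const override { return -1; } // now const

    bool get_can_edit() const override { return false; }
};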
--- a/package/src/llama.cpp/src/llama-mmap.cpp
+++ b/package/src/llama.cpp/src/llama-mmap.cpp
@@ -476,7 +476,7 @@ struct llama_mlock::impl {
 
         char* errmsg = std::strerror(errno);
         bool suggest = (errno == ENOMEM);
-#if defined(TARGET_OS_VISION) || defined(TARGET_OS_TV)
+#if defined(TARGET_OS_VISION) || defined(TARGET_OS_TV) || defined(_AIX)
         // visionOS/tvOS dont't support RLIMIT_MEMLOCK
         // Skip resource limit checks on visionOS/tvOS
         suggest = false;