@fugood/llama.node 0.3.16 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (281)
  1. package/CMakeLists.txt +6 -1
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +44 -2
  19. package/lib/index.js +132 -1
  20. package/lib/index.ts +203 -3
  21. package/package.json +2 -1
  22. package/src/EmbeddingWorker.cpp +1 -1
  23. package/src/LlamaCompletionWorker.cpp +374 -19
  24. package/src/LlamaCompletionWorker.h +31 -10
  25. package/src/LlamaContext.cpp +216 -7
  26. package/src/LlamaContext.h +12 -0
  27. package/src/common.hpp +15 -0
  28. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +233 -0
  29. package/src/llama.cpp/.github/workflows/build.yml +89 -767
  30. package/src/llama.cpp/.github/workflows/docker.yml +9 -6
  31. package/src/llama.cpp/.github/workflows/release.yml +716 -0
  32. package/src/llama.cpp/.github/workflows/server.yml +19 -23
  33. package/src/llama.cpp/CMakeLists.txt +11 -1
  34. package/src/llama.cpp/cmake/build-info.cmake +8 -2
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
  36. package/src/llama.cpp/common/CMakeLists.txt +35 -4
  37. package/src/llama.cpp/common/arg.cpp +844 -121
  38. package/src/llama.cpp/common/arg.h +9 -0
  39. package/src/llama.cpp/common/chat.cpp +129 -107
  40. package/src/llama.cpp/common/chat.h +2 -0
  41. package/src/llama.cpp/common/common.cpp +64 -518
  42. package/src/llama.cpp/common/common.h +35 -45
  43. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  44. package/src/llama.cpp/common/llguidance.cpp +31 -47
  45. package/src/llama.cpp/common/minja/chat-template.hpp +23 -11
  46. package/src/llama.cpp/common/minja/minja.hpp +186 -127
  47. package/src/llama.cpp/common/regex-partial.cpp +204 -0
  48. package/src/llama.cpp/common/regex-partial.h +56 -0
  49. package/src/llama.cpp/common/sampling.cpp +60 -50
  50. package/src/llama.cpp/docs/build.md +122 -7
  51. package/src/llama.cpp/examples/CMakeLists.txt +2 -32
  52. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  53. package/src/llama.cpp/examples/embedding/embedding.cpp +9 -12
  54. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  55. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  56. package/src/llama.cpp/examples/parallel/parallel.cpp +89 -15
  57. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  58. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  59. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  60. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  61. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  62. package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
  63. package/src/llama.cpp/examples/training/finetune.cpp +96 -0
  64. package/src/llama.cpp/ggml/CMakeLists.txt +35 -2
  65. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  66. package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
  67. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
  68. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  69. package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
  70. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  71. package/src/llama.cpp/ggml/include/ggml.h +76 -106
  72. package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -8
  73. package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
  74. package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
  75. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  76. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  77. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  78. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  79. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  80. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  81. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  82. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  83. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +66 -33
  84. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  85. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  86. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  87. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  88. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -194
  89. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  90. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1060 -410
  91. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1008 -13533
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +31 -16
  93. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +90 -12
  94. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -13
  95. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +266 -72
  96. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1034 -88
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8796 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  101. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  102. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +252 -0
  103. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  104. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
  105. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  106. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  107. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  108. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  109. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +106 -14
  110. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  111. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -262
  112. package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
  113. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
  114. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +307 -40
  115. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +125 -45
  116. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +10 -8
  117. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +239 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  119. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  120. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +9 -307
  121. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
  122. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
  123. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  124. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +944 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  130. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  131. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +507 -411
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  133. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  134. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
  135. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  136. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  137. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  138. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  140. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  141. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
  142. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +83 -49
  143. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1278 -282
  144. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +32 -0
  145. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +133 -30
  146. package/src/llama.cpp/ggml/src/ggml.c +170 -265
  147. package/src/llama.cpp/ggml/src/gguf.cpp +34 -33
  148. package/src/llama.cpp/include/llama.h +82 -22
  149. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  150. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  151. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  152. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  153. package/src/llama.cpp/requirements/requirements-all.txt +5 -3
  154. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  155. package/src/llama.cpp/scripts/xxd.cmake +1 -1
  156. package/src/llama.cpp/src/CMakeLists.txt +4 -2
  157. package/src/llama.cpp/src/llama-adapter.cpp +43 -1
  158. package/src/llama.cpp/src/llama-arch.cpp +163 -17
  159. package/src/llama.cpp/src/llama-arch.h +16 -0
  160. package/src/llama.cpp/src/llama-batch.cpp +5 -1
  161. package/src/llama.cpp/src/llama-batch.h +2 -1
  162. package/src/llama.cpp/src/llama-chat.cpp +91 -16
  163. package/src/llama.cpp/src/llama-chat.h +7 -2
  164. package/src/llama.cpp/src/llama-context.cpp +479 -575
  165. package/src/llama.cpp/src/llama-context.h +44 -33
  166. package/src/llama.cpp/src/llama-cparams.h +1 -0
  167. package/src/llama.cpp/src/llama-graph.cpp +209 -157
  168. package/src/llama.cpp/src/llama-graph.h +38 -14
  169. package/src/llama.cpp/src/llama-hparams.h +13 -0
  170. package/src/llama.cpp/src/llama-kv-cache.cpp +1604 -543
  171. package/src/llama.cpp/src/llama-kv-cache.h +283 -171
  172. package/src/llama.cpp/src/llama-memory.h +12 -2
  173. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  174. package/src/llama.cpp/src/llama-model-loader.cpp +34 -20
  175. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  176. package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
  177. package/src/llama.cpp/src/llama-model-saver.h +37 -0
  178. package/src/llama.cpp/src/llama-model.cpp +1803 -330
  179. package/src/llama.cpp/src/llama-model.h +21 -2
  180. package/src/llama.cpp/src/llama-quant.cpp +33 -10
  181. package/src/llama.cpp/src/llama-sampling.cpp +25 -7
  182. package/src/llama.cpp/src/llama-vocab.cpp +86 -10
  183. package/src/llama.cpp/src/llama-vocab.h +6 -0
  184. package/src/llama.cpp/src/llama.cpp +15 -1
  185. package/src/llama.cpp/tests/CMakeLists.txt +52 -31
  186. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  187. package/src/llama.cpp/tests/test-backend-ops.cpp +189 -90
  188. package/src/llama.cpp/tests/test-chat-template.cpp +26 -6
  189. package/src/llama.cpp/tests/test-chat.cpp +15 -3
  190. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  191. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  192. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  193. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  194. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  195. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  196. package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
  197. package/src/llama.cpp/tests/test-opt.cpp +33 -21
  198. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  199. package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
  200. package/src/llama.cpp/tests/test-sampling.cpp +1 -1
  201. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  202. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  203. package/src/llama.cpp/tools/CMakeLists.txt +39 -0
  204. package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +3 -3
  205. package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +1 -1
  206. package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +15 -16
  207. package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
  208. package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +623 -274
  209. package/src/llama.cpp/{examples → tools}/main/main.cpp +22 -14
  210. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +47 -0
  211. package/src/llama.cpp/tools/mtmd/clip-impl.h +365 -0
  212. package/src/llama.cpp/tools/mtmd/clip.cpp +3646 -0
  213. package/src/llama.cpp/tools/mtmd/clip.h +99 -0
  214. package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +22 -0
  215. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +370 -0
  216. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
  217. package/src/llama.cpp/tools/mtmd/mtmd.cpp +678 -0
  218. package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
  219. package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +21 -5
  220. package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +53 -3
  221. package/src/llama.cpp/tools/rpc/CMakeLists.txt +4 -0
  222. package/src/llama.cpp/tools/rpc/rpc-server.cpp +322 -0
  223. package/src/llama.cpp/tools/run/CMakeLists.txt +16 -0
  224. package/src/llama.cpp/{examples → tools}/run/run.cpp +30 -30
  225. package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
  226. package/src/llama.cpp/{examples → tools}/server/httplib.h +313 -247
  227. package/src/llama.cpp/{examples → tools}/server/server.cpp +529 -215
  228. package/src/llama.cpp/{examples → tools}/server/utils.hpp +427 -6
  229. package/src/llama.cpp/{examples → tools}/tts/tts.cpp +6 -9
  230. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
  231. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  232. package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
  233. package/src/llama.cpp/examples/infill/infill.cpp +0 -590
  234. package/src/llama.cpp/examples/llava/CMakeLists.txt +0 -66
  235. package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
  236. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
  237. package/src/llama.cpp/examples/llava/clip.cpp +0 -3206
  238. package/src/llama.cpp/examples/llava/clip.h +0 -118
  239. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  240. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  241. package/src/llama.cpp/examples/llava/llava.cpp +0 -574
  242. package/src/llama.cpp/examples/llava/llava.h +0 -49
  243. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  244. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +0 -584
  245. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  246. package/src/llama.cpp/examples/rpc/CMakeLists.txt +0 -2
  247. package/src/llama.cpp/examples/rpc/rpc-server.cpp +0 -171
  248. package/src/llama.cpp/examples/run/CMakeLists.txt +0 -5
  249. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  250. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  251. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  252. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  253. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  254. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  255. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  256. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  257. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  258. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  259. /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
  260. /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
  261. /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
  262. /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
  263. /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
  264. /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
  265. /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
  266. /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
  267. /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
  268. /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
  269. /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
  270. /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
  271. /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
  272. /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
  273. /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
  274. /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
  275. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
  276. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
  277. /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
  278. /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
  279. /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
  280. /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
  281. /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
package/src/llama.cpp/common/common.h

@@ -6,6 +6,7 @@
 
 #include <set>
 #include <string>
+#include <string_view>
 #include <vector>
 #include <sstream>
 
@@ -66,7 +67,6 @@ enum llama_example {
     LLAMA_EXAMPLE_COMMON,
     LLAMA_EXAMPLE_SPECULATIVE,
     LLAMA_EXAMPLE_MAIN,
-    LLAMA_EXAMPLE_INFILL,
     LLAMA_EXAMPLE_EMBEDDING,
     LLAMA_EXAMPLE_PERPLEXITY,
     LLAMA_EXAMPLE_RETRIEVAL,
@@ -96,6 +96,7 @@ enum common_sampler_type {
     COMMON_SAMPLER_TYPE_XTC = 8,
     COMMON_SAMPLER_TYPE_INFILL = 9,
     COMMON_SAMPLER_TYPE_PENALTIES = 10,
+    COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
 };
 
 // dimensionality reduction methods, used by cvector-generator
@@ -121,10 +122,6 @@ struct common_grammar_trigger {
     common_grammar_trigger_type type;
     std::string value;
     llama_token token = LLAMA_TOKEN_NULL;
-
-    // T can only be nlohmann::ordered_json
-    template <class T> T to_json() const;
-    template <class T> static common_grammar_trigger from_json(const T & in);
 };
 
 // sampling parameters
@@ -165,6 +162,7 @@ struct common_params_sampling {
     std::vector<enum common_sampler_type> samplers = {
         COMMON_SAMPLER_TYPE_PENALTIES,
         COMMON_SAMPLER_TYPE_DRY,
+        COMMON_SAMPLER_TYPE_TOP_N_SIGMA,
         COMMON_SAMPLER_TYPE_TOP_K,
         COMMON_SAMPLER_TYPE_TYPICAL_P,
         COMMON_SAMPLER_TYPE_TOP_P,
@@ -184,6 +182,13 @@ struct common_params_sampling {
     std::string print() const;
 };
 
+struct common_params_model {
+    std::string path    = ""; // model local path      // NOLINT
+    std::string url     = ""; // model url to download // NOLINT
+    std::string hf_repo = ""; // HF repo               // NOLINT
+    std::string hf_file = ""; // HF file               // NOLINT
+};
+
 struct common_params_speculative {
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
 
@@ -197,19 +202,11 @@ struct common_params_speculative {
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
 
-    std::string hf_repo = ""; // HF repo // NOLINT
-    std::string hf_file = ""; // HF file // NOLINT
-
-    std::string model = "";     // draft model for speculative decoding // NOLINT
-    std::string model_url = ""; // model url to download // NOLINT
+    struct common_params_model model;
 };
 
 struct common_params_vocoder {
-    std::string hf_repo = ""; // HF repo // NOLINT
-    std::string hf_file = ""; // HF file // NOLINT
-
-    std::string model = "";     // model path // NOLINT
-    std::string model_url = ""; // model url to download // NOLINT
+    struct common_params_model model;
 
     std::string speaker_file = ""; // speaker file path // NOLINT
 
@@ -267,12 +264,10 @@ struct common_params {
     struct common_params_speculative speculative;
     struct common_params_vocoder vocoder;
 
-    std::string model = ""; // model path // NOLINT
+    struct common_params_model model;
+
     std::string model_alias = ""; // model alias // NOLINT
-    std::string model_url = ""; // model url to download // NOLINT
     std::string hf_token = ""; // HF token // NOLINT
-    std::string hf_repo = ""; // HF repo // NOLINT
-    std::string hf_file = ""; // HF file // NOLINT
     std::string prompt = ""; // NOLINT
     std::string system_prompt = ""; // NOLINT
     std::string prompt_file = ""; // store the external prompt file name // NOLINT
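
Across these hunks the four scattered model-source strings (local path, download URL, HF repo, HF file) are folded into the new common_params_model struct, which the top-level params, speculative decoding, the vocoder, and (further down) the mmproj settings all reuse. A minimal migration sketch against the new layout; the field names come from the diff above, while the function and values are illustrative only:

    #include "common.h"

    // Illustrative only: populate the consolidated model descriptors.
    static void configure(common_params & params) {
        params.model.path    = "models/llama-q4_0.gguf"; // was params.model
        params.model.hf_repo = "user/repo";              // was params.hf_repo
        params.model.hf_file = "model.gguf";             // was params.hf_file

        // the draft model for speculative decoding uses the same struct
        params.speculative.model.path = "models/draft-q4_0.gguf";
    }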
@@ -286,6 +281,7 @@ struct common_params {
     std::vector<std::string> in_files;   // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
+    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
 
     bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
     std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
@@ -329,7 +325,6 @@ struct common_params {
     bool ctx_shift = true; // context shift on inifinite text generation
 
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
-    bool logits_all = false; // return logits for all tokens in the batch
     bool use_mmap = true; // use mmap for faster loads
     bool use_mlock = false; // use mlock to keep model in memory
     bool verbose_prompt = false; // print prompt tokens before generation
@@ -338,6 +333,7 @@ struct common_params {
     bool no_kv_offload = false; // disable KV offloading
     bool warmup = true; // warmup run
     bool check_tensors = false; // validate tensor data
+    bool no_op_offload = false; // globally disable offload host tensor operations to device
 
     bool single_turn = false; // single turn chat conversation
 
@@ -346,8 +342,10 @@ struct common_params {
 
     common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
 
-    // multimodal models (see examples/llava)
-    std::string mmproj = ""; // path to multimodal projector // NOLINT
+    // multimodal models (see tools/mtmd)
+    struct common_params_model mmproj;
+    bool mmproj_use_gpu = true; // use GPU for multimodal model
+    bool no_mmproj = false;     // explicitly disable multimodal model
     std::vector<std::string> image; // path to image file(s)
 
     // embedding
@@ -370,6 +368,7 @@ struct common_params {
     bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;
     common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+    bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
 
     std::vector<std::string> api_keys;
 
@@ -413,13 +412,14 @@ struct common_params {
 
     bool process_output = false; // collect data for the output tensor
     bool compute_ppl = true;     // whether to compute perplexity
+    bool parse_special = false;  // whether to parse special tokens during imatrix tokenization
 
     // cvector-generator params
     int n_pca_batch = 100;
     int n_pca_iterations = 1000;
     dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
-    std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
-    std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
+    std::string cvector_positive_file = "tools/cvector-generator/positive.txt";
+    std::string cvector_negative_file = "tools/cvector-generator/negative.txt";
 
     bool spm_infill = false; // suffix/prefix/middle pattern for infill
 
@@ -505,10 +505,9 @@ static bool string_starts_with(const std::string & str,
     return str.rfind(prefix, 0) == 0;
 }
 
-static bool string_ends_with(const std::string & str,
-                             const std::string & suffix) { // While we wait for C++20's std::string::ends_with...
-    return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
-}
+// While we wait for C++20's std::string::ends_with...
+bool string_ends_with(const std::string_view & str, const std::string_view & suffix);
+size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop);
 
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);
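
string_ends_with moves out of the header into common.cpp as a std::string_view overload, and gains a sibling string_find_partial_stop (added alongside the new regex-partial files in the list above) for streaming stop-sequence handling. A hedged usage sketch; the semantics are assumed from server-side stop-word logic, not spelled out in this header:

    #include "common.h"

    // Assumed semantics: returns the index where a (possibly partial)
    // occurrence of `stop` begins at the tail of `str`, or npos when the
    // tail cannot start a match.
    static std::string safe_to_emit(const std::string & generated) {
        size_t pos = string_find_partial_stop(generated, "<|im_end|>");
        // hold back the possible stop prefix until more tokens decide it
        return pos == std::string::npos ? generated : generated.substr(0, pos);
    }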
@@ -546,26 +545,11 @@ struct llama_model_params common_model_params_to_llama ( common_params
 struct llama_context_params common_context_params_to_llama(const common_params & params);
 struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
 
-struct llama_model * common_load_model_from_url(
-    const std::string & model_url,
-    const std::string & local_path,
-    const std::string & hf_token,
-    const struct llama_model_params & params);
-
-struct llama_model * common_load_model_from_hf(
-    const std::string & repo,
-    const std::string & remote_path,
-    const std::string & local_path,
-    const std::string & hf_token,
-    const struct llama_model_params & params);
-
-std::pair<std::string, std::string> common_get_hf_file(
-    const std::string & hf_repo_with_tag,
-    const std::string & hf_token);
-
 // clear LoRA adapters from context, then apply new list of adapters
 void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
 
+std::string get_model_endpoint();
+
 //
 // Batch utils
 //
@@ -683,3 +667,9 @@ const char * const LLM_KV_SPLIT_COUNT = "split.count";
 const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
 
 }
+
+//
+// training utils
+//
+
+ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);
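
The new training-utils declaration pairs with the added examples/training/finetune.cpp and the reworked ggml-opt interface earlier in the file list. A hedged sketch of building an optimizer dataset from a tokenized corpus; the stride and corpus are placeholders, and finetune.cpp is the authoritative driver:

    #include "common.h"

    // Sketch: turn a text corpus into an optimizer dataset for finetuning.
    static ggml_opt_dataset_t build_dataset(llama_context * lctx, const std::string & corpus) {
        std::vector<llama_token> tokens = common_tokenize(lctx, corpus, /*add_special=*/true);
        return common_opt_dataset_init(lctx, tokens, /*stride=*/512); // stride value is illustrative
    }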
package/src/llama.cpp/common/json-schema-to-grammar.cpp

@@ -16,6 +16,9 @@ using json = nlohmann::ordered_json;
 static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
     auto has_max = max_items != std::numeric_limits<int>::max();
 
+    if (max_items == 0) {
+        return "";
+    }
     if (min_items == 0 && max_items == 1) {
         return item_rule + "?";
     }
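
The added guard short-circuits repetitions whose upper bound is zero, so a schema that forbids items now yields an empty rule instead of a zero-width repetition. A small driver to observe the effect; json_schema_to_grammar is the public entry point declared next to this helper, and the schema is just an example:

    #include "json-schema-to-grammar.h"

    #include <cstdio>
    #include <json.hpp> // vendored nlohmann header, matching the includes above

    int main() {
        auto schema = nlohmann::ordered_json::parse(R"({
            "type": "array",
            "items": { "type": "string" },
            "maxItems": 0
        })");
        std::printf("%s\n", json_schema_to_grammar(schema).c_str());
    }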
package/src/llama.cpp/common/llguidance.cpp

@@ -11,25 +11,24 @@ struct llama_sampler_llg {
     std::string grammar_kind;
     std::string grammar_data;
     LlgTokenizer * tokenizer;
-    LlgConstraint * grammar;
-    LlgMaskResult llg_res;
-    bool has_llg_res;
+    LlgMatcher * grammar;
 };
 
-static LlgConstraint * llama_sampler_llg_new(LlgTokenizer * tokenizer, const char * grammar_kind,
-                                             const char * grammar_data) {
+static LlgMatcher * llama_sampler_llg_new(LlgTokenizer * tokenizer, const char * grammar_kind,
+                                          const char * grammar_data) {
     LlgConstraintInit cinit;
     llg_constraint_init_set_defaults(&cinit, tokenizer);
     const char * log_level = getenv("LLGUIDANCE_LOG_LEVEL");
     if (log_level && *log_level) {
         cinit.log_stderr_level = atoi(log_level);
     }
-    auto c = llg_new_constraint_any(&cinit, grammar_kind, grammar_data);
-    if (llg_get_error(c)) {
-        LOG_ERR("llg error: %s\n", llg_get_error(c));
-        llg_free_constraint(c);
+    auto c = llg_new_matcher(&cinit, grammar_kind, grammar_data);
+    if (llg_matcher_get_error(c)) {
+        LOG_ERR("llg error: %s\n", llg_matcher_get_error(c));
+        llg_free_matcher(c);
         return nullptr;
     }
+
     return c;
 }
 
@@ -40,39 +39,29 @@ static const char * llama_sampler_llg_name(const llama_sampler * /*smpl*/) {
 static void llama_sampler_llg_accept_impl(llama_sampler * smpl, llama_token token) {
     auto * ctx = (llama_sampler_llg *) smpl->ctx;
     if (ctx->grammar) {
-        LlgCommitResult res;
-        llg_commit_token(ctx->grammar, token, &res);
-        ctx->has_llg_res = false;
+        llg_matcher_consume_token(ctx->grammar, token);
     }
 }
 
 static void llama_sampler_llg_apply(llama_sampler * smpl, llama_token_data_array * cur_p) {
     auto * ctx = (llama_sampler_llg *) smpl->ctx;
     if (ctx->grammar) {
-        if (!ctx->has_llg_res) {
-            if (llg_compute_mask(ctx->grammar, &ctx->llg_res) == 0) {
-                ctx->has_llg_res = true;
+        const uint32_t * mask = llg_matcher_get_mask(ctx->grammar);
+        if (mask == nullptr) {
+            if (llg_matcher_compute_mask(ctx->grammar) == 0) {
+                mask = llg_matcher_get_mask(ctx->grammar);
             } else {
-                LOG_ERR("llg error: %s\n", llg_get_error(ctx->grammar));
-                llg_free_constraint(ctx->grammar);
+                LOG_ERR("llg error: %s\n", llg_matcher_get_error(ctx->grammar));
+                llg_free_matcher(ctx->grammar);
                 ctx->grammar = nullptr;
+                return;
             }
         }
-        if (ctx->has_llg_res) {
-            if (ctx->llg_res.is_stop) {
-                for (size_t i = 0; i < cur_p->size; ++i) {
-                    if (!llama_vocab_is_eog(ctx->vocab, cur_p->data[i].id)) {
-                        cur_p->data[i].logit = -INFINITY;
-                    }
-                }
-            } else {
-                const uint32_t * mask = ctx->llg_res.sample_mask;
-                for (size_t i = 0; i < cur_p->size; ++i) {
-                    auto token = cur_p->data[i].id;
-                    if ((mask[token / 32] & (1 << (token % 32))) == 0) {
-                        cur_p->data[i].logit = -INFINITY;
-                    }
-                }
+
+        for (size_t i = 0; i < cur_p->size; ++i) {
+            auto token = cur_p->data[i].id;
+            if ((mask[token / 32] & (1 << (token % 32))) == 0) {
+                cur_p->data[i].logit = -INFINITY;
             }
         }
     }
@@ -80,14 +69,9 @@ static void llama_sampler_llg_apply(llama_sampler * smpl, llama_token_data_array
 
 static void llama_sampler_llg_reset(llama_sampler * smpl) {
     auto * ctx = (llama_sampler_llg *) smpl->ctx;
-    if (!ctx->grammar) {
-        return;
+    if (ctx->grammar) {
+        llg_matcher_reset(ctx->grammar);
     }
-
-    auto * grammar_new = llama_sampler_llg_new(ctx->tokenizer, ctx->grammar_kind.c_str(), ctx->grammar_data.c_str());
-    llg_free_constraint(ctx->grammar);
-    ctx->grammar = grammar_new;
-    ctx->has_llg_res = false;
 }
 
 static llama_sampler * llama_sampler_llg_clone(const llama_sampler * smpl) {
@@ -102,7 +86,7 @@ static llama_sampler * llama_sampler_llg_clone(const llama_sampler * smpl) {
         if (ctx->grammar) {
             result_ctx->grammar_kind = ctx->grammar_kind;
             result_ctx->grammar_data = ctx->grammar_data;
-            result_ctx->grammar = llg_clone_constraint(ctx->grammar);
+            result_ctx->grammar = llg_clone_matcher(ctx->grammar);
             result_ctx->tokenizer = llg_clone_tokenizer(ctx->tokenizer);
         }
     }
@@ -114,7 +98,7 @@ static void llama_sampler_llg_free(llama_sampler * smpl) {
     const auto * ctx = (llama_sampler_llg *) smpl->ctx;
 
     if (ctx->grammar) {
-        llg_free_constraint(ctx->grammar);
+        llg_free_matcher(ctx->grammar);
         llg_free_tokenizer(ctx->tokenizer);
     }
 
@@ -205,6 +189,7 @@ static LlgTokenizer * llama_sampler_llg_new_tokenizer(const llama_vocab * vocab)
         /* .tokenize_fn = */ llama_sampler_llg_tokenize_fn,
         /* .use_approximate_greedy_tokenize_fn = */ false,
         /* .tokenize_user_data = */ vocab,
+        /* .slices = */ nullptr,
     };
 
     char error_buffer[1024];
@@ -239,9 +224,11 @@ llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab, const char * g
            /* .grammar_data = */ grammar_data,
            /* .tokenizer = */ tokenizer,
            /* .grammar = */ llama_sampler_llg_new(tokenizer, grammar_kind, grammar_data),
-           /* .llg_res = */ {},
-           /* .has_llg_res = */ false,
        };
+        if (ctx->grammar) {
+            GGML_ASSERT(((size_t) llama_vocab_n_tokens(vocab) + 31) / 32 * 4 ==
+                        llg_matcher_get_mask_byte_size(ctx->grammar));
+        }
     } else {
         *ctx = {
             /* .vocab = */ vocab,
@@ -249,15 +236,12 @@ llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab, const char * g
            /* .grammar_data = */ {},
            /* .tokenizer = */ nullptr,
            /* .grammar = */ nullptr,
-           /* .llg_res = */ {},
-           /* .has_llg_res = */ false,
        };
    }
 
    return llama_sampler_init(
        /* .iface = */ &llama_sampler_llg_i,
-       /* .ctx = */ ctx
-   );
+       /* .ctx = */ ctx);
 }
 
 #else
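
The rewrite swaps llguidance's LlgConstraint/LlgMaskResult pair for the stateful LlgMatcher API, but the mask layout is unchanged: one bit per vocabulary token, packed into 32-bit words, which is exactly what the new GGML_ASSERT against llg_matcher_get_mask_byte_size verifies. The bit test from the sampling loop, restated as a helper for clarity (not part of the patch):

    #include <cstddef>
    #include <cstdint>

    // True when `token` survives the grammar mask: bit (token % 32) of
    // word (token / 32), the same test used in llama_sampler_llg_apply.
    static inline bool llg_mask_allows(const uint32_t * mask, size_t token) {
        return (mask[token / 32] & (1u << (token % 32))) != 0;
    }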
package/src/llama.cpp/common/minja/chat-template.hpp

@@ -9,10 +9,21 @@
 #pragma once
 
 #include "minja.hpp"
-#include <json.hpp>
+
+#include <chrono>
+#include <cstddef>
+#include <cstdio>
+#include <ctime>
+#include <exception>
+#include <iomanip>
+#include <memory>
+#include <sstream>
+#include <stdexcept>
 #include <string>
 #include <vector>
 
+#include <json.hpp>
+
 using json = nlohmann::ordered_json;
 
 namespace minja {
@@ -384,8 +395,8 @@ class chat_template {
 
         for (const auto & message_ : adjusted_messages) {
             auto message = message_;
-            if (!message.contains("role") || !message.contains("content")) {
-                throw std::runtime_error("message must have 'role' and 'content' fields: " + message.dump());
+            if (!message.contains("role") || (!message.contains("content") && !message.contains("tool_calls"))) {
+                throw std::runtime_error("message must have 'role' and one of 'content' or 'tool_calls' fields: " + message.dump());
             }
             std::string role = message.at("role");
 
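With the relaxed check, an assistant turn that carries only tool_calls (the shape OpenAI-compatible clients send after a tool call) is no longer rejected for lacking content. For illustration, a message like the following now validates, using the file's json = nlohmann::ordered_json alias; the values are made up:

    // assistant message with tool_calls and no "content" key: previously
    // this threw "message must have 'role' and 'content' fields"
    auto message = json {
        {"role", "assistant"},
        {"tool_calls", json::array({ json {
            {"type", "function"},
            {"function", json {
                {"name", "get_weather"},
                {"arguments", "{\"location\": \"Paris\"}"},
            }},
        }})},
    };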
@@ -406,7 +417,6 @@ class chat_template {
                 }
             }
             if (polyfill_tool_calls) {
-                auto content = message.at("content");
                 auto tool_calls = json::array();
                 for (const auto & tool_call : message.at("tool_calls")) {
                     if (tool_call.at("type") != "function") {
@@ -425,8 +435,11 @@ class chat_template {
                 auto obj = json {
                     {"tool_calls", tool_calls},
                 };
-                if (!content.is_null() && content != "") {
-                    obj["content"] = content;
+                if (message.contains("content")) {
+                    auto content = message.at("content");
+                    if (!content.is_null() && !content.empty()) {
+                        obj["content"] = content;
+                    }
                 }
                 message["content"] = obj.dump(2);
                 message.erase("tool_calls");
@@ -435,13 +448,12 @@ class chat_template {
             if (polyfill_tool_responses && role == "tool") {
                 message["role"] = "user";
                 auto obj = json {
-                    {"tool_response", {
-                        {"content", message.at("content")},
-                    }},
+                    {"tool_response", json::object()},
                 };
                 if (message.contains("name")) {
-                    obj["tool_response"]["name"] = message.at("name");
+                    obj["tool_response"]["tool"] = message.at("name");
                 }
+                obj["tool_response"]["content"] = message.at("content");
                 if (message.contains("tool_call_id")) {
                     obj["tool_response"]["tool_call_id"] = message.at("tool_call_id");
                 }
@@ -510,7 +522,7 @@ class chat_template {
     static nlohmann::ordered_json add_system(const nlohmann::ordered_json & messages, const std::string & system_prompt) {
         json messages_with_system = messages;
 
-        if (messages_with_system.size() > 0 && messages_with_system[0].at("role") == "system") {
+        if (!messages_with_system.empty() && messages_with_system[0].at("role") == "system") {
             std::string existing_system = messages_with_system.at(0).at("content");
             messages_with_system[0] = json {
                 {"role", "system"},