@fugood/llama.node 0.3.16 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (281)
  1. package/CMakeLists.txt +6 -1
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +44 -2
  19. package/lib/index.js +132 -1
  20. package/lib/index.ts +203 -3
  21. package/package.json +2 -1
  22. package/src/EmbeddingWorker.cpp +1 -1
  23. package/src/LlamaCompletionWorker.cpp +374 -19
  24. package/src/LlamaCompletionWorker.h +31 -10
  25. package/src/LlamaContext.cpp +216 -7
  26. package/src/LlamaContext.h +12 -0
  27. package/src/common.hpp +15 -0
  28. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +233 -0
  29. package/src/llama.cpp/.github/workflows/build.yml +89 -767
  30. package/src/llama.cpp/.github/workflows/docker.yml +9 -6
  31. package/src/llama.cpp/.github/workflows/release.yml +716 -0
  32. package/src/llama.cpp/.github/workflows/server.yml +19 -23
  33. package/src/llama.cpp/CMakeLists.txt +11 -1
  34. package/src/llama.cpp/cmake/build-info.cmake +8 -2
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
  36. package/src/llama.cpp/common/CMakeLists.txt +35 -4
  37. package/src/llama.cpp/common/arg.cpp +844 -121
  38. package/src/llama.cpp/common/arg.h +9 -0
  39. package/src/llama.cpp/common/chat.cpp +129 -107
  40. package/src/llama.cpp/common/chat.h +2 -0
  41. package/src/llama.cpp/common/common.cpp +64 -518
  42. package/src/llama.cpp/common/common.h +35 -45
  43. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  44. package/src/llama.cpp/common/llguidance.cpp +31 -47
  45. package/src/llama.cpp/common/minja/chat-template.hpp +23 -11
  46. package/src/llama.cpp/common/minja/minja.hpp +186 -127
  47. package/src/llama.cpp/common/regex-partial.cpp +204 -0
  48. package/src/llama.cpp/common/regex-partial.h +56 -0
  49. package/src/llama.cpp/common/sampling.cpp +60 -50
  50. package/src/llama.cpp/docs/build.md +122 -7
  51. package/src/llama.cpp/examples/CMakeLists.txt +2 -32
  52. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  53. package/src/llama.cpp/examples/embedding/embedding.cpp +9 -12
  54. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  55. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  56. package/src/llama.cpp/examples/parallel/parallel.cpp +89 -15
  57. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  58. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  59. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  60. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  61. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  62. package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
  63. package/src/llama.cpp/examples/training/finetune.cpp +96 -0
  64. package/src/llama.cpp/ggml/CMakeLists.txt +35 -2
  65. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  66. package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
  67. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
  68. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  69. package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
  70. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  71. package/src/llama.cpp/ggml/include/ggml.h +76 -106
  72. package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -8
  73. package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
  74. package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
  75. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  76. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  77. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  78. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  79. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  80. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  81. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  82. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  83. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +66 -33
  84. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  85. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  86. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  87. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  88. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -194
  89. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  90. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1060 -410
  91. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1008 -13533
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +31 -16
  93. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +90 -12
  94. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -13
  95. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +266 -72
  96. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1034 -88
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8796 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  101. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  102. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +252 -0
  103. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  104. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
  105. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  106. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  107. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  108. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  109. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +106 -14
  110. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  111. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -262
  112. package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
  113. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
  114. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +307 -40
  115. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +125 -45
  116. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +10 -8
  117. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +239 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  119. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  120. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +9 -307
  121. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
  122. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
  123. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  124. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +944 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  130. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  131. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +507 -411
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  133. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  134. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
  135. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  136. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  137. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  138. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  140. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  141. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
  142. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +83 -49
  143. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1278 -282
  144. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +32 -0
  145. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +133 -30
  146. package/src/llama.cpp/ggml/src/ggml.c +170 -265
  147. package/src/llama.cpp/ggml/src/gguf.cpp +34 -33
  148. package/src/llama.cpp/include/llama.h +82 -22
  149. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  150. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  151. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  152. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  153. package/src/llama.cpp/requirements/requirements-all.txt +5 -3
  154. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  155. package/src/llama.cpp/scripts/xxd.cmake +1 -1
  156. package/src/llama.cpp/src/CMakeLists.txt +4 -2
  157. package/src/llama.cpp/src/llama-adapter.cpp +43 -1
  158. package/src/llama.cpp/src/llama-arch.cpp +163 -17
  159. package/src/llama.cpp/src/llama-arch.h +16 -0
  160. package/src/llama.cpp/src/llama-batch.cpp +5 -1
  161. package/src/llama.cpp/src/llama-batch.h +2 -1
  162. package/src/llama.cpp/src/llama-chat.cpp +91 -16
  163. package/src/llama.cpp/src/llama-chat.h +7 -2
  164. package/src/llama.cpp/src/llama-context.cpp +479 -575
  165. package/src/llama.cpp/src/llama-context.h +44 -33
  166. package/src/llama.cpp/src/llama-cparams.h +1 -0
  167. package/src/llama.cpp/src/llama-graph.cpp +209 -157
  168. package/src/llama.cpp/src/llama-graph.h +38 -14
  169. package/src/llama.cpp/src/llama-hparams.h +13 -0
  170. package/src/llama.cpp/src/llama-kv-cache.cpp +1604 -543
  171. package/src/llama.cpp/src/llama-kv-cache.h +283 -171
  172. package/src/llama.cpp/src/llama-memory.h +12 -2
  173. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  174. package/src/llama.cpp/src/llama-model-loader.cpp +34 -20
  175. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  176. package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
  177. package/src/llama.cpp/src/llama-model-saver.h +37 -0
  178. package/src/llama.cpp/src/llama-model.cpp +1803 -330
  179. package/src/llama.cpp/src/llama-model.h +21 -2
  180. package/src/llama.cpp/src/llama-quant.cpp +33 -10
  181. package/src/llama.cpp/src/llama-sampling.cpp +25 -7
  182. package/src/llama.cpp/src/llama-vocab.cpp +86 -10
  183. package/src/llama.cpp/src/llama-vocab.h +6 -0
  184. package/src/llama.cpp/src/llama.cpp +15 -1
  185. package/src/llama.cpp/tests/CMakeLists.txt +52 -31
  186. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  187. package/src/llama.cpp/tests/test-backend-ops.cpp +189 -90
  188. package/src/llama.cpp/tests/test-chat-template.cpp +26 -6
  189. package/src/llama.cpp/tests/test-chat.cpp +15 -3
  190. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  191. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  192. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  193. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  194. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  195. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  196. package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
  197. package/src/llama.cpp/tests/test-opt.cpp +33 -21
  198. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  199. package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
  200. package/src/llama.cpp/tests/test-sampling.cpp +1 -1
  201. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  202. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  203. package/src/llama.cpp/tools/CMakeLists.txt +39 -0
  204. package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +3 -3
  205. package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +1 -1
  206. package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +15 -16
  207. package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
  208. package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +623 -274
  209. package/src/llama.cpp/{examples → tools}/main/main.cpp +22 -14
  210. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +47 -0
  211. package/src/llama.cpp/tools/mtmd/clip-impl.h +365 -0
  212. package/src/llama.cpp/tools/mtmd/clip.cpp +3646 -0
  213. package/src/llama.cpp/tools/mtmd/clip.h +99 -0
  214. package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +22 -0
  215. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +370 -0
  216. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
  217. package/src/llama.cpp/tools/mtmd/mtmd.cpp +678 -0
  218. package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
  219. package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +21 -5
  220. package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +53 -3
  221. package/src/llama.cpp/tools/rpc/CMakeLists.txt +4 -0
  222. package/src/llama.cpp/tools/rpc/rpc-server.cpp +322 -0
  223. package/src/llama.cpp/tools/run/CMakeLists.txt +16 -0
  224. package/src/llama.cpp/{examples → tools}/run/run.cpp +30 -30
  225. package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
  226. package/src/llama.cpp/{examples → tools}/server/httplib.h +313 -247
  227. package/src/llama.cpp/{examples → tools}/server/server.cpp +529 -215
  228. package/src/llama.cpp/{examples → tools}/server/utils.hpp +427 -6
  229. package/src/llama.cpp/{examples → tools}/tts/tts.cpp +6 -9
  230. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
  231. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  232. package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
  233. package/src/llama.cpp/examples/infill/infill.cpp +0 -590
  234. package/src/llama.cpp/examples/llava/CMakeLists.txt +0 -66
  235. package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
  236. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
  237. package/src/llama.cpp/examples/llava/clip.cpp +0 -3206
  238. package/src/llama.cpp/examples/llava/clip.h +0 -118
  239. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  240. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  241. package/src/llama.cpp/examples/llava/llava.cpp +0 -574
  242. package/src/llama.cpp/examples/llava/llava.h +0 -49
  243. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  244. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +0 -584
  245. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  246. package/src/llama.cpp/examples/rpc/CMakeLists.txt +0 -2
  247. package/src/llama.cpp/examples/rpc/rpc-server.cpp +0 -171
  248. package/src/llama.cpp/examples/run/CMakeLists.txt +0 -5
  249. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  250. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  251. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  252. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  253. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  254. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  255. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  256. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  257. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  258. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  259. /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
  260. /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
  261. /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
  262. /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
  263. /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
  264. /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
  265. /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
  266. /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
  267. /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
  268. /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
  269. /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
  270. /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
  271. /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
  272. /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
  273. /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
  274. /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
  275. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
  276. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
  277. /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
  278. /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
  279. /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
  280. /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
  281. /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
@@ -299,10 +299,10 @@ bool gguf_read_emplace_helper(const struct gguf_reader & gr, std::vector<struct
  return false;
  }
  } catch (std::length_error &) {
- fprintf(stderr, "%s: encountered length_error while reading value for key '%s'\n", __func__, key.c_str());
+ GGML_LOG_ERROR("%s: encountered length_error while reading value for key '%s'\n", __func__, key.c_str());
  return false;
  } catch (std::bad_alloc &) {
- fprintf(stderr, "%s: encountered bad_alloc error while reading value for key '%s'\n", __func__, key.c_str());
+ GGML_LOG_ERROR("%s: encountered bad_alloc error while reading value for key '%s'\n", __func__, key.c_str());
  return false;
  }
  kv.emplace_back(key, value);
@@ -328,14 +328,14 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
  ok = ok && gr.read(magic, 4);

  if (!ok) {
- fprintf(stderr, "%s: failed to read magic\n", __func__);
+ GGML_LOG_ERROR("%s: failed to read magic\n", __func__);
  gguf_free(ctx);
  return nullptr;
  }

  for (uint32_t i = 0; i < magic.size(); i++) {
  if (magic[i] != GGUF_MAGIC[i]) {
- fprintf(stderr, "%s: invalid magic characters: '%c%c%c%c', expected 'GGUF'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
+ GGML_LOG_ERROR("%s: invalid magic characters: '%c%c%c%c', expected 'GGUF'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
  gguf_free(ctx);
  return nullptr;
  }
@@ -348,11 +348,11 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par

  if (ok && gr.read(ctx->version)) {
  if (ctx->version == 1) {
- fprintf(stderr, "%s: GGUFv1 is no longer supported, please use a more up-to-date version\n", __func__);
+ GGML_LOG_ERROR("%s: GGUFv1 is no longer supported, please use a more up-to-date version\n", __func__);
  ok = false;
  }
  if (ctx->version > GGUF_VERSION) {
- fprintf(stderr, "%s: this GGUF file is version %" PRIu32 " but this software only supports up to version %d\n",
+ GGML_LOG_ERROR("%s: this GGUF file is version %" PRIu32 " but this software only supports up to version %d\n",
  __func__, ctx->version, GGUF_VERSION);
  ok = false;
  }
@@ -363,7 +363,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
  if (ok && gr.read(n_tensors)) {
  static_assert(sizeof(size_t) <= 8 && sizeof(gguf_tensor_info) >= 2, "int64_t insufficient for indexing");
  if (n_tensors < 0 || n_tensors > int64_t(SIZE_MAX/sizeof(gguf_tensor_info))) {
- fprintf(stderr, "%s: number of tensors is %" PRIi64 " but must be in [0, %zu]\n",
+ GGML_LOG_ERROR("%s: number of tensors is %" PRIi64 " but must be in [0, %zu]\n",
  __func__, n_tensors, SIZE_MAX/sizeof(gguf_tensor_info));
  ok = false;
  }
@@ -374,7 +374,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
  if (ok && gr.read(n_kv)) {
  static_assert(sizeof(size_t) <= 8 && sizeof(gguf_tensor_info) >= 2, "int64_t insufficient for indexing");
  if (n_kv < 0 || n_kv > int64_t(SIZE_MAX/sizeof(gguf_kv))) {
- fprintf(stderr, "%s: number of key value pairs is %" PRIi64 " but must be in [0, %zu]\n",
+ GGML_LOG_ERROR("%s: number of key value pairs is %" PRIi64 " but must be in [0, %zu]\n",
  __func__, n_kv, SIZE_MAX/sizeof(gguf_kv));
  ok = false;
  }
@@ -383,7 +383,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
  }

  if (!ok) {
- fprintf(stderr, "%s: failed to read header\n", __func__);
+ GGML_LOG_ERROR("%s: failed to read header\n", __func__);
  gguf_free(ctx);
  return nullptr;
  }
@@ -399,15 +399,15 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
  try {
  ok = ok && gr.read(key);
  } catch (std::length_error &) {
- fprintf(stderr, "%s: encountered length_error while reading key %" PRIi64 "\n", __func__, i);
+ GGML_LOG_ERROR("%s: encountered length_error while reading key %" PRIi64 "\n", __func__, i);
  ok = false;
  } catch (std::bad_alloc &) {
- fprintf(stderr, "%s: encountered bad_alloc error while reading key %" PRIi64 "\n", __func__, i);
+ GGML_LOG_ERROR("%s: encountered bad_alloc error while reading key %" PRIi64 "\n", __func__, i);
  ok = false;
  }
  for (size_t j = 0; ok && j < ctx->kv.size(); ++j) {
  if (key == ctx->kv[j].key) {
- fprintf(stderr, "%s: duplicate key '%s' for tensors %zu and %" PRIi64 " \n", __func__, key.c_str(), j, i);
+ GGML_LOG_ERROR("%s: duplicate key '%s' for tensors %zu and %" PRIi64 " \n", __func__, key.c_str(), j, i);
  ok = false;
  }
  }
@@ -441,14 +441,14 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
  case GGUF_TYPE_ARRAY:
  default:
  {
- fprintf(stderr, "%s: key '%s' has invalid GGUF type %d\n", __func__, key.c_str(), type);
+ GGML_LOG_ERROR("%s: key '%s' has invalid GGUF type %d\n", __func__, key.c_str(), type);
  ok = false;
  } break;
  }
  }

  if (!ok) {
- fprintf(stderr, "%s: failed to read key-value pairs\n", __func__);
+ GGML_LOG_ERROR("%s: failed to read key-value pairs\n", __func__);
  gguf_free(ctx);
  return nullptr;
  }
@@ -458,7 +458,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
  ctx->alignment = alignment_idx == -1 ? GGUF_DEFAULT_ALIGNMENT : gguf_get_val_u32(ctx, alignment_idx);

  if (ctx->alignment == 0 || (ctx->alignment & (ctx->alignment - 1)) != 0) {
- fprintf(stderr, "%s: alignment %zu is not a power of 2\n", __func__, ctx->alignment);
+ GGML_LOG_ERROR("%s: alignment %zu is not a power of 2\n", __func__, ctx->alignment);
  gguf_free(ctx);
  return nullptr;
  }
@@ -474,14 +474,14 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
  try {
  ok = ok && gr.read(name);
  } catch (std::length_error &) {
- fprintf(stderr, "%s: encountered length_error while reading tensor name %" PRIi64 "\n", __func__, i);
+ GGML_LOG_ERROR("%s: encountered length_error while reading tensor name %" PRIi64 "\n", __func__, i);
  ok = false;
  } catch (std::bad_alloc &) {
- fprintf(stderr, "%s: encountered bad_alloc error while reading tensor name %" PRIi64 "\n", __func__, i);
+ GGML_LOG_ERROR("%s: encountered bad_alloc error while reading tensor name %" PRIi64 "\n", __func__, i);
  ok = false;
  }
  if (name.length() >= GGML_MAX_NAME) {
- fprintf(stderr, "%s: tensor name %" PRIi64 " is too long: %zu >= %d\n", __func__, i, name.length(), GGML_MAX_NAME);
+ GGML_LOG_ERROR("%s: tensor name %" PRIi64 " is too long: %zu >= %d\n", __func__, i, name.length(), GGML_MAX_NAME);
  ok = false;
  break;
  }
@@ -490,7 +490,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
  // make sure there are no duplicate tensor names
  for (int64_t j = 0; ok && j < i; ++j) {
  if (strcmp(info.t.name, ctx->info[j].t.name) == 0) {
- fprintf(stderr, "%s: duplicate tensor name '%s' for tensors %" PRIi64 " and %" PRIi64 "\n", __func__, info.t.name, j, i);
+ GGML_LOG_ERROR("%s: duplicate tensor name '%s' for tensors %" PRIi64 " and %" PRIi64 "\n", __func__, info.t.name, j, i);
  ok = false;
  break;
  }
@@ -505,7 +505,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
  uint32_t n_dims = -1;
  ok = ok && gr.read(n_dims);
  if (n_dims > GGML_MAX_DIMS) {
- fprintf(stderr, "%s: tensor '%s' has invalid number of dimensions: %" PRIu32 " > %" PRIu32 "\n",
+ GGML_LOG_ERROR("%s: tensor '%s' has invalid number of dimensions: %" PRIu32 " > %" PRIu32 "\n",
  __func__, info.t.name, n_dims, GGML_MAX_DIMS);
  ok = false;
  break;
@@ -518,7 +518,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par

  // check that all ne are non-negative
  if (info.t.ne[j] < 0) {
- fprintf(stderr, "%s: tensor '%s' dimension %" PRIu32 " has invalid number of elements: %" PRIi64 " < 0\n",
+ GGML_LOG_ERROR("%s: tensor '%s' dimension %" PRIu32 " has invalid number of elements: %" PRIi64 " < 0\n",
  __func__, info.t.name, j, info.t.ne[j]);
  ok = false;
  break;
@@ -530,7 +530,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
  (INT64_MAX/info.t.ne[2] <= info.t.ne[0]*info.t.ne[1]) ||
  (INT64_MAX/info.t.ne[3] <= info.t.ne[0]*info.t.ne[1]*info.t.ne[2]))) {

- fprintf(stderr, "%s: total number of elements in tensor '%s' with shape "
+ GGML_LOG_ERROR("%s: total number of elements in tensor '%s' with shape "
  "(%" PRIi64 ", %" PRIi64 ", %" PRIi64 ", %" PRIi64 ") is >= %" PRIi64 "\n",
  __func__, info.t.name, info.t.ne[0], info.t.ne[1], info.t.ne[2], info.t.ne[3], INT64_MAX);
  ok = false;
@@ -547,7 +547,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par

  // check that tensor type is within defined range
  if (info.t.type < 0 || info.t.type >= GGML_TYPE_COUNT) {
- fprintf(stderr, "%s: tensor '%s' has invalid ggml type %d (%s)\n",
+ GGML_LOG_ERROR("%s: tensor '%s' has invalid ggml type %d (%s)\n",
  __func__, info.t.name, info.t.type, ggml_type_name(info.t.type));
  ok = false;
  break;
@@ -557,7 +557,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par

  // check that row size is divisible by block size
  if (blck_size == 0 || info.t.ne[0] % blck_size != 0) {
- fprintf(stderr, "%s: tensor '%s' of type %d (%s) has %" PRId64 " elements per row, "
+ GGML_LOG_ERROR("%s: tensor '%s' of type %d (%s) has %" PRId64 " elements per row, "
  "not a multiple of block size (%" PRId64 ")\n",
  __func__, info.t.name, (int) info.t.type, ggml_type_name(info.t.type), info.t.ne[0], blck_size);
  ok = false;
@@ -582,7 +582,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
  }

  if (!ok) {
- fprintf(stderr, "%s: failed to read tensor info\n", __func__);
+ GGML_LOG_ERROR("%s: failed to read tensor info\n", __func__);
  gguf_free(ctx);
  return nullptr;
  }
@@ -590,7 +590,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par

  // we require the data section to be aligned, so take into account any padding
  if (fseek(file, GGML_PAD(ftell(file), ctx->alignment), SEEK_SET) != 0) {
- fprintf(stderr, "%s: failed to seek to beginning of data section\n", __func__);
+ GGML_LOG_ERROR("%s: failed to seek to beginning of data section\n", __func__);
  gguf_free(ctx);
  return nullptr;
  }
@@ -604,9 +604,9 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
  for (size_t i = 0; i < ctx->info.size(); ++i) {
  const gguf_tensor_info & ti = ctx->info[i];
  if (ti.offset != ctx->size) {
- fprintf(stderr, "%s: tensor '%s' has offset %" PRIu64 ", expected %zu\n",
+ GGML_LOG_ERROR("%s: tensor '%s' has offset %" PRIu64 ", expected %zu\n",
  __func__, ti.t.name, ti.offset, ctx->size);
- fprintf(stderr, "%s: failed to read tensor data\n", __func__);
+ GGML_LOG_ERROR("%s: failed to read tensor data\n", __func__);
  gguf_free(ctx);
  return nullptr;
  }
@@ -634,7 +634,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par

  *params.ctx = ggml_init(pdata);
  if (*params.ctx == nullptr) {
- fprintf(stderr, "%s: failed to initialize ggml context for storing tensors\n", __func__);
+ GGML_LOG_ERROR("%s: failed to initialize ggml context for storing tensors\n", __func__);
  gguf_free(ctx);
  return nullptr;
  }
@@ -656,7 +656,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
  ok = ok && gr.read(data->data, ctx->size);

  if (!ok) {
- fprintf(stderr, "%s: failed to read tensor data binary blob\n", __func__);
+ GGML_LOG_ERROR("%s: failed to read tensor data binary blob\n", __func__);
  ggml_free(ctx_data);
  *params.ctx = nullptr;
  gguf_free(ctx);
@@ -689,7 +689,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
  }

  if (!ok) {
- fprintf(stderr, "%s: failed to create tensors\n", __func__);
+ GGML_LOG_ERROR("%s: failed to create tensors\n", __func__);
  ggml_free(ctx_data);
  *params.ctx = nullptr;
  gguf_free(ctx);
@@ -706,7 +706,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
  FILE * file = ggml_fopen(fname, "rb");

  if (!file) {
- fprintf(stderr, "%s: failed to open GGUF file '%s'\n", __func__, fname);
+ GGML_LOG_ERROR("%s: failed to open GGUF file '%s'\n", __func__, fname);
  return nullptr;
  }

@@ -932,6 +932,7 @@ static void gguf_check_reserved_keys(const std::string & key, const T val) {
  if constexpr (std::is_same<T, uint32_t>::value) {
  GGML_ASSERT(val > 0 && (val & (val - 1)) == 0 && GGUF_KEY_GENERAL_ALIGNMENT " must be power of 2");
  } else {
+ GGML_UNUSED(val);
  GGML_ABORT(GGUF_KEY_GENERAL_ALIGNMENT " must be type u32");
  }
  }
@@ -1304,7 +1305,7 @@ bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, boo
  FILE * file = ggml_fopen(fname, "wb");

  if (!file) {
- fprintf(stderr, "%s: failed to open file '%s' for writing GGUF data\n", __func__, fname);
+ GGML_LOG_ERROR("%s: failed to open file '%s' for writing GGUF data\n", __func__, fname);
  return false;
  }

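The gguf.cpp hunks above are one mechanical change: every fprintf(stderr, ...) on the GGUF reader/writer error paths now goes through GGML_LOG_ERROR, so these failures flow through ggml's pluggable log callback instead of being hard-wired to stderr. A minimal sketch of capturing them, assuming only the existing ggml_log_set API from ggml.h; the my_log_sink name is illustrative:

// Route GGUF loader errors into application code instead of stderr.
#include "ggml.h"
#include <stdio.h>

static void my_log_sink(enum ggml_log_level level, const char * text, void * user_data) {
    (void) user_data;
    if (level == GGML_LOG_LEVEL_ERROR) {
        fprintf(stderr, "[gguf/ggml] %s", text); // or append to an in-app log buffer
    }
}

int main(void) {
    ggml_log_set(my_log_sink, NULL); // register before calling gguf_init_from_file()
    // struct gguf_init_params params = { /* no_alloc = */ false, /* ctx = */ NULL };
    // gguf_init_from_file("model.gguf", params); // failures now reach my_log_sink
    return 0;
}

Registering the callback once up front means errors from gguf_init_from_file() and gguf_write_to_file() land in the same sink as the rest of ggml's logging.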
@@ -4,6 +4,7 @@
  #include "ggml.h"
  #include "ggml-cpu.h"
  #include "ggml-backend.h"
+ #include "ggml-opt.h"

  #include <stddef.h>
  #include <stdint.h>
@@ -107,6 +108,12 @@ extern "C" {
  LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
  LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
  LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
+ LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30,
+ LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
+ LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
+ LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
+ LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
+ LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
  };

  enum llama_rope_type {
@@ -277,10 +284,18 @@ extern "C" {
  };
  };

+ struct llama_model_tensor_buft_override {
+ const char * pattern;
+ ggml_backend_buffer_type_t buft;
+ };
+
  struct llama_model_params {
  // NULL-terminated list of devices to use for offloading (if NULL, all available devices are used)
  ggml_backend_dev_t * devices;

+ // NULL-terminated list of buffer types to use for tensors that match a pattern
+ const struct llama_model_tensor_buft_override * tensor_buft_overrides;
+
  int32_t n_gpu_layers; // number of layers to store in VRAM
  enum llama_split_mode split_mode; // how to split the model across multiple GPUs

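The new llama_model_tensor_buft_override hook lets a caller pin tensors whose names match a pattern to a specific backend buffer type, e.g. keeping mixture-of-experts FFN weights in host memory while the rest of the model is offloaded. A minimal sketch under the assumption that the pattern is matched against tensor names (upstream exposes the same mechanism through a CLI override flag); the expert-FFN regex is illustrative:

// Pin expert FFN weights to host (CPU) memory; llama.h already pulls in
// ggml-backend.h, which declares ggml_backend_cpu_buffer_type().
#include "llama.h"

static struct llama_model_tensor_buft_override overrides[] = {
    { "\\.ffn_.*_exps\\.", NULL }, // buft filled in at runtime below
    { NULL, NULL },                // list is NULL-terminated, as documented above
};

int main(void) {
    overrides[0].buft = ggml_backend_cpu_buffer_type();

    struct llama_model_params mparams = llama_model_default_params();
    mparams.tensor_buft_overrides = overrides;
    mparams.n_gpu_layers = 99; // everything else may still be offloaded
    // struct llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    return 0;
}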
@@ -330,7 +345,7 @@ extern "C" {
  float yarn_beta_fast; // YaRN low correction dim
  float yarn_beta_slow; // YaRN high correction dim
  uint32_t yarn_orig_ctx; // YaRN original context size
- float defrag_thold; // defragment the KV cache if holes/size > thold, < 0 disabled (default)
+ float defrag_thold; // defragment the KV cache if holes/size > thold, <= 0 disabled (default)

  ggml_backend_sched_eval_callback cb_eval;
  void * cb_eval_user_data;
@@ -338,34 +353,34 @@ extern "C" {
  enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
  enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]

- // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
- // TODO: move at the end of the struct
- bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
- bool embeddings; // if true, extract embeddings (together with logits)
- bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
- bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
- bool no_perf; // whether to measure performance timings
-
  // Abort callback
  // if it returns true, execution of llama_decode() will be aborted
  // currently works only with CPU execution
  ggml_abort_callback abort_callback;
  void * abort_callback_data;
+
+ // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
+ bool embeddings; // if true, extract embeddings (together with logits)
+ bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
+ bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
+ bool no_perf; // whether to measure performance timings
+ bool op_offload; // whether to offload host tensor operations to device
  };

  // model quantization parameters
  typedef struct llama_model_quantize_params {
- int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
- enum llama_ftype ftype; // quantize to this llama_ftype
- enum ggml_type output_tensor_type; // output tensor type
- enum ggml_type token_embedding_type; // token embeddings tensor type
- bool allow_requantize; // allow quantizing non-f32/f16 tensors
- bool quantize_output_tensor; // quantize output.weight
- bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
- bool pure; // quantize all tensors to the default type
- bool keep_split; // quantize to the same number of shards
- void * imatrix; // pointer to importance matrix data
- void * kv_overrides; // pointer to vector containing overrides
+ int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+ enum llama_ftype ftype; // quantize to this llama_ftype
+ enum ggml_type output_tensor_type; // output tensor type
+ enum ggml_type token_embedding_type; // token embeddings tensor type
+ bool allow_requantize; // allow quantizing non-f32/f16 tensors
+ bool quantize_output_tensor; // quantize output.weight
+ bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+ bool pure; // quantize all tensors to the default type
+ bool keep_split; // quantize to the same number of shards
+ void * imatrix; // pointer to importance matrix data
+ void * kv_overrides; // pointer to vector containing overrides
+ void * tensor_types; // pointer to vector containing tensor types
  } llama_model_quantize_params;

  typedef struct llama_logit_bias {
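llama_model_quantize_params gains a tensor_types pointer next to kv_overrides, giving quantization per-tensor type overrides. A minimal sketch of programmatic quantization that starts from the defaults and leaves the new field unset; the file names are illustrative:

// Quantize an f16 GGUF to Q4_K_M using the default parameters.
#include "llama.h"

int main(void) {
    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype   = LLAMA_FTYPE_MOSTLY_Q4_K_M;
    qparams.nthread = 8; // <= 0 falls back to std::thread::hardware_concurrency()
    // qparams.tensor_types stays NULL: no per-tensor type overrides (new field)
    uint32_t rc = llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &qparams);
    return rc == 0 ? 0 : 1;
}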
@@ -431,6 +446,10 @@ extern "C" {
  size_t n_paths,
  struct llama_model_params params);

+ LLAMA_API void llama_model_save_to_file(
+ const struct llama_model * model,
+ const char * path_model);
+
  DEPRECATED(LLAMA_API void llama_free_model(struct llama_model * model),
  "use llama_model_free instead");

@@ -910,14 +929,19 @@ extern "C" {
  // Frees a batch of tokens allocated with llama_batch_init()
  LLAMA_API void llama_batch_free(struct llama_batch batch);

- // Processes a batch of tokens with the ecoder part of the encoder-decoder model.
- // Stores the encoder output internally for later use by the decoder cross-attention layers.
+ // Process a batch of tokens.
+ // In contrast to llama_decode() - this call does not use KV cache.
+ // For encode-decoder contexts, processes the batch using the encoder.
+ // Can store the encoder output internally for later use by the decoder's cross-attention layers.
  // 0 - success
  // < 0 - error. the KV cache state is restored to the state before this call
  LLAMA_API int32_t llama_encode(
  struct llama_context * ctx,
  struct llama_batch batch);

+ // Process a batch of tokens.
+ // Requires KV cache.
+ // For encode-decoder contexts, processes the batch using the decoder.
  // Positive return values does not mean a fatal error, but rather a warning.
  // 0 - success
  // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
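The reworded comments make the split explicit: llama_encode() does not use the KV cache (the encoder pass for encoder-decoder models), while llama_decode() requires it. A minimal sketch of the resulting call order; batch contents and error handling are pared down, and the helper name is illustrative:

// Sketch: one encode/decode round trip for an encoder-decoder model.
#include "llama.h"

static int encode_then_decode(struct llama_context * ctx, const struct llama_model * model,
                              struct llama_batch prompt_batch) {
    // Encoder pass over the prompt - per the comment above, no KV cache involved.
    if (llama_encode(ctx, prompt_batch) < 0) {
        return -1; // error: KV cache state was restored to what it was before the call
    }

    // Decoder side starts from the model's decoder start token and uses the KV cache.
    llama_token dec_start = llama_model_decoder_start_token(model);
    struct llama_batch dec_batch = llama_batch_get_one(&dec_start, 1);

    int32_t ret = llama_decode(ctx, dec_batch);
    if (ret == 1) {
        // warning, not fatal: no KV slot found - shrink the batch or enlarge the context
    }
    return ret;
}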
@@ -1218,6 +1242,7 @@ extern "C" {
  "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)");

  /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+ /// Setting k <= 0 makes this a noop
  LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);

  /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
@@ -1264,6 +1289,10 @@ extern "C" {
  float tau,
  float eta);

+ /// @details Intializes a GBNF grammar, see grammars/README.md for details.
+ /// @param vocab The vocabulary that this grammar will be used with.
+ /// @param grammar_str The production rules for the grammar, encoded as a string. Returns an empty grammar if empty. Returns NULL if parsing of grammar_str fails.
+ /// @param grammar_root The name of the start symbol for the grammar.
  LLAMA_API struct llama_sampler * llama_sampler_init_grammar(
  const struct llama_vocab * vocab,
  const char * grammar_str,
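The new doc comments pin down llama_sampler_init_grammar's edge cases: an empty grammar_str yields an empty grammar, and a grammar_str that fails to parse yields NULL. A minimal sketch that builds a sampler chain around a toy GBNF rule; the grammar text and helper name are illustrative:

// Sketch: constrain generation to "yes" or "no" with a toy GBNF grammar.
#include "llama.h"
#include <stddef.h>

static struct llama_sampler * make_yes_no_sampler(const struct llama_vocab * vocab) {
    const char * grammar_str = "root ::= \"yes\" | \"no\"";
    struct llama_sampler * grammar = llama_sampler_init_grammar(vocab, grammar_str, "root");
    if (grammar == NULL) {
        return NULL; // grammar_str failed to parse, as documented above
    }
    struct llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(chain, grammar);                     // grammar filter first
    llama_sampler_chain_add(chain, llama_sampler_init_greedy()); // then pick a token
    return chain;
}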
@@ -1409,6 +1438,37 @@ extern "C" {
  LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
  LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain);

+ //
+ // training
+ //
+
+ // function that returns whether or not a given tensor contains trainable parameters
+ typedef bool (*llama_opt_param_filter)(const struct ggml_tensor * tensor, void * userdata);
+
+ // always returns true
+ LLAMA_API bool llama_opt_param_filter_all(const struct ggml_tensor * tensor, void * userdata);
+
+ struct llama_opt_params {
+ uint32_t n_ctx_train; // assumed context size post training, use context size specified in llama_context if 0
+
+ llama_opt_param_filter param_filter; // callback for determining which tensors contain trainable parameters
+ void * param_filter_ud; // userdata for determining which tensors contain trainable parameters
+
+ ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
+ void * get_opt_pars_ud; // userdata for calculating optimizer parameters
+ };
+
+ LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params);
+
+ LLAMA_API void llama_opt_epoch(
+ struct llama_context * lctx,
+ ggml_opt_dataset_t dataset,
+ ggml_opt_result_t result_train,
+ ggml_opt_result_t result_eval,
+ int64_t idata_split,
+ ggml_opt_epoch_callback callback_train,
+ ggml_opt_epoch_callback callback_eval);
+
  #ifdef __cplusplus
  }
  #endif
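Together with llama_model_save_to_file() added earlier in this header, the training block above is enough for a small in-process finetuning loop (this diff also adds examples/training/finetune.cpp, which this sketch loosely follows). A minimal sketch assuming the caller has already prepared the context, model, a ggml_opt_dataset_t, and a train/eval split index; the ggml-opt helpers used here come from the ggml-opt.h include added at the top of llama.h:

// Sketch: one finetuning epoch, then save the updated weights.
#include "llama.h"

static void finetune_one_epoch(struct llama_context * lctx, struct llama_model * model,
                               ggml_opt_dataset_t dataset, int64_t idata_split) {
    struct llama_opt_params opt_params = {
        /* .n_ctx_train     = */ 0,                          // 0: use the llama_context size
        /* .param_filter    = */ llama_opt_param_filter_all, // mark every tensor trainable
        /* .param_filter_ud = */ NULL,
        /* .get_opt_pars    = */ ggml_opt_get_default_optimizer_params,
        /* .get_opt_pars_ud = */ NULL,
    };
    llama_opt_init(lctx, model, opt_params);

    ggml_opt_result_t result_train = ggml_opt_result_init();
    ggml_opt_result_t result_eval  = ggml_opt_result_init();

    // idata_split separates the training part of the dataset from the evaluation part.
    llama_opt_epoch(lctx, dataset, result_train, result_eval, idata_split,
                    ggml_opt_epoch_callback_progress_bar,
                    ggml_opt_epoch_callback_progress_bar);

    ggml_opt_result_free(result_train);
    ggml_opt_result_free(result_eval);

    llama_model_save_to_file(model, "model-finetuned.gguf"); // illustrative output path
}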
@@ -0,0 +1,112 @@
+ ied 4 ½ months
+ __ggml_vocab_test__
+ Führer
+ __ggml_vocab_test__
+
+ __ggml_vocab_test__
+
+ __ggml_vocab_test__
+
+ __ggml_vocab_test__
+
+ __ggml_vocab_test__
+
+ __ggml_vocab_test__
+
+
+ __ggml_vocab_test__
+
+
+
+ __ggml_vocab_test__
+
+
+
+
+ __ggml_vocab_test__
+
+
+ __ggml_vocab_test__
+ Hello world
+ __ggml_vocab_test__
+ Hello world
+ __ggml_vocab_test__
+ Hello World
+ __ggml_vocab_test__
+ Hello World
+ __ggml_vocab_test__
+ Hello World!
+ __ggml_vocab_test__
+ Hello, world!
+ __ggml_vocab_test__
+ Hello, world!
+ __ggml_vocab_test__
+ this is 🦙.cpp
+ __ggml_vocab_test__
+ w048 7tuijk dsdfhu
+ __ggml_vocab_test__
+ нещо на Български
+ __ggml_vocab_test__
+ កាន់តែពិសេសអាចខលចេញ
+ __ggml_vocab_test__
+ 🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
+ __ggml_vocab_test__
+ Hello
+ __ggml_vocab_test__
+ Hello
+ __ggml_vocab_test__
+ Hello
+ __ggml_vocab_test__
+ Hello
+ __ggml_vocab_test__
+ Hello
+ __ggml_vocab_test__
+ Hello
+ Hello
+ __ggml_vocab_test__
+ (
+ __ggml_vocab_test__
+
+ =
+ __ggml_vocab_test__
+ ' era
+ __ggml_vocab_test__
+ Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
+ __ggml_vocab_test__
+ !!!!!!
+ __ggml_vocab_test__
+ 3
+ __ggml_vocab_test__
+ 33
+ __ggml_vocab_test__
+ 333
+ __ggml_vocab_test__
+ 3333
+ __ggml_vocab_test__
+ 33333
+ __ggml_vocab_test__
+ 333333
+ __ggml_vocab_test__
+ 3333333
+ __ggml_vocab_test__
+ 33333333
+ __ggml_vocab_test__
+ 333333333
+ __ggml_vocab_test__
+ Cửa Việt
+ __ggml_vocab_test__
+ discards
+ __ggml_vocab_test__
+
+
+
+
+
+
+
+
+
+
+
+ 🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
+ __ggml_vocab_test__
@@ -0,0 +1,46 @@
1
+ 1190 220 32 220 18215 7112
2
+ 50 16800 258
3
+
4
+ 220
5
+ 256
6
+ 277
7
+ 197
8
+ 198
9
+ 368
10
+ 2946
11
+ 3271
12
+ 19873 3817
13
+ 39715 3817
14
+ 19873 7353
15
+ 39715 7353
16
+ 39715 7353 13
17
+ 19873 24 3817 13
18
+ 39715 24 3817 13
19
+ 544 373 9522 112 247 26 36315
20
+ 99 39923 220 35 9607 21498 21470 3679 9433
21
+ 1595 7653 633 79829 34051 1636
22
+ 8755 102595 115960 21125 148305 96819 102816 39048 14105 22528 160234
23
+ 114590 222 330 14879 21 51358 127 12817 93293 117 24204 330 68239 881 120327 170428 21 89101 330 7384 88230 511 947 1492 3742 7233 21
24
+ 19873
25
+ 39715
26
+ 220 39715
27
+ 256 39715
28
+ 277 39715
29
+ 277 39715 198 277 39715
30
+ 330
31
+ 198 319
32
+ 19 7359
33
+ 19873 24 386 87799 13 2403 583 650 51358 223 1663 155736 1522 42056 7544 13336 28785 29 4412 20645
34
+ 17931 4959
35
+ 31
36
+ 1922
37
+ 12325
38
+ 12325 31
39
+ 12325 1922
40
+ 12325 12325
41
+ 12325 12325 31
42
+ 12325 12325 1922
43
+ 12325 12325 12325
44
+ 47 19811 12077
45
+ 3260 3579
46
+ 198 7283 51499 191231 20192 3271 3322 9287 2143 17860 114590 222 330 14879 21 51358 127 12817 93293 117 24204 330 68239 881 120327 170428 21 89101 9522 112 247 172394 247 220 31 220 1922 220 12325 220 12325 31 220 12325 1922 220 12325 12325 220 12325 12325 31 220 12325 12325 1922 220 31 26 31 220 31 396 31 220 31 1043 31 117131 102595 115960 21125 148305 96819 102816 80883 223 1663 155736 1522 42056 7544 13336 28785 29 4412 20645 79745 150278 117079 633 79829 34051 1636 25611 41990 109428 1488 91054 24072 17931 4959 29795 9296 16517 1806 481 96 1386 36633 1609 24 481 1109 650 5074 43 481 57 702 5074 27088 2170 536 24 481 48 650 1933 1696 30262 43 1665 19 32818 262 27236 56