@fugood/llama.node 0.3.16 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (281)
  1. package/CMakeLists.txt +6 -1
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +44 -2
  19. package/lib/index.js +132 -1
  20. package/lib/index.ts +203 -3
  21. package/package.json +2 -1
  22. package/src/EmbeddingWorker.cpp +1 -1
  23. package/src/LlamaCompletionWorker.cpp +374 -19
  24. package/src/LlamaCompletionWorker.h +31 -10
  25. package/src/LlamaContext.cpp +216 -7
  26. package/src/LlamaContext.h +12 -0
  27. package/src/common.hpp +15 -0
  28. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +233 -0
  29. package/src/llama.cpp/.github/workflows/build.yml +89 -767
  30. package/src/llama.cpp/.github/workflows/docker.yml +9 -6
  31. package/src/llama.cpp/.github/workflows/release.yml +716 -0
  32. package/src/llama.cpp/.github/workflows/server.yml +19 -23
  33. package/src/llama.cpp/CMakeLists.txt +11 -1
  34. package/src/llama.cpp/cmake/build-info.cmake +8 -2
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
  36. package/src/llama.cpp/common/CMakeLists.txt +35 -4
  37. package/src/llama.cpp/common/arg.cpp +844 -121
  38. package/src/llama.cpp/common/arg.h +9 -0
  39. package/src/llama.cpp/common/chat.cpp +129 -107
  40. package/src/llama.cpp/common/chat.h +2 -0
  41. package/src/llama.cpp/common/common.cpp +64 -518
  42. package/src/llama.cpp/common/common.h +35 -45
  43. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  44. package/src/llama.cpp/common/llguidance.cpp +31 -47
  45. package/src/llama.cpp/common/minja/chat-template.hpp +23 -11
  46. package/src/llama.cpp/common/minja/minja.hpp +186 -127
  47. package/src/llama.cpp/common/regex-partial.cpp +204 -0
  48. package/src/llama.cpp/common/regex-partial.h +56 -0
  49. package/src/llama.cpp/common/sampling.cpp +60 -50
  50. package/src/llama.cpp/docs/build.md +122 -7
  51. package/src/llama.cpp/examples/CMakeLists.txt +2 -32
  52. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  53. package/src/llama.cpp/examples/embedding/embedding.cpp +9 -12
  54. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  55. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  56. package/src/llama.cpp/examples/parallel/parallel.cpp +89 -15
  57. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  58. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  59. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  60. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  61. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  62. package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
  63. package/src/llama.cpp/examples/training/finetune.cpp +96 -0
  64. package/src/llama.cpp/ggml/CMakeLists.txt +35 -2
  65. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  66. package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
  67. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
  68. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  69. package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
  70. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  71. package/src/llama.cpp/ggml/include/ggml.h +76 -106
  72. package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -8
  73. package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
  74. package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
  75. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  76. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  77. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  78. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  79. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  80. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  81. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  82. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  83. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +66 -33
  84. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  85. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  86. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  87. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  88. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -194
  89. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  90. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1060 -410
  91. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1008 -13533
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +31 -16
  93. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +90 -12
  94. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -13
  95. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +266 -72
  96. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1034 -88
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8796 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  101. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  102. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +252 -0
  103. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  104. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
  105. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  106. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  107. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  108. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  109. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +106 -14
  110. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  111. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -262
  112. package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
  113. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
  114. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +307 -40
  115. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +125 -45
  116. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +10 -8
  117. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +239 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  119. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  120. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +9 -307
  121. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
  122. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
  123. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  124. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +944 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  130. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  131. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +507 -411
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  133. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  134. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
  135. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  136. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  137. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  138. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  140. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  141. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
  142. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +83 -49
  143. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1278 -282
  144. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +32 -0
  145. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +133 -30
  146. package/src/llama.cpp/ggml/src/ggml.c +170 -265
  147. package/src/llama.cpp/ggml/src/gguf.cpp +34 -33
  148. package/src/llama.cpp/include/llama.h +82 -22
  149. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  150. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  151. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  152. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  153. package/src/llama.cpp/requirements/requirements-all.txt +5 -3
  154. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  155. package/src/llama.cpp/scripts/xxd.cmake +1 -1
  156. package/src/llama.cpp/src/CMakeLists.txt +4 -2
  157. package/src/llama.cpp/src/llama-adapter.cpp +43 -1
  158. package/src/llama.cpp/src/llama-arch.cpp +163 -17
  159. package/src/llama.cpp/src/llama-arch.h +16 -0
  160. package/src/llama.cpp/src/llama-batch.cpp +5 -1
  161. package/src/llama.cpp/src/llama-batch.h +2 -1
  162. package/src/llama.cpp/src/llama-chat.cpp +91 -16
  163. package/src/llama.cpp/src/llama-chat.h +7 -2
  164. package/src/llama.cpp/src/llama-context.cpp +479 -575
  165. package/src/llama.cpp/src/llama-context.h +44 -33
  166. package/src/llama.cpp/src/llama-cparams.h +1 -0
  167. package/src/llama.cpp/src/llama-graph.cpp +209 -157
  168. package/src/llama.cpp/src/llama-graph.h +38 -14
  169. package/src/llama.cpp/src/llama-hparams.h +13 -0
  170. package/src/llama.cpp/src/llama-kv-cache.cpp +1604 -543
  171. package/src/llama.cpp/src/llama-kv-cache.h +283 -171
  172. package/src/llama.cpp/src/llama-memory.h +12 -2
  173. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  174. package/src/llama.cpp/src/llama-model-loader.cpp +34 -20
  175. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  176. package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
  177. package/src/llama.cpp/src/llama-model-saver.h +37 -0
  178. package/src/llama.cpp/src/llama-model.cpp +1803 -330
  179. package/src/llama.cpp/src/llama-model.h +21 -2
  180. package/src/llama.cpp/src/llama-quant.cpp +33 -10
  181. package/src/llama.cpp/src/llama-sampling.cpp +25 -7
  182. package/src/llama.cpp/src/llama-vocab.cpp +86 -10
  183. package/src/llama.cpp/src/llama-vocab.h +6 -0
  184. package/src/llama.cpp/src/llama.cpp +15 -1
  185. package/src/llama.cpp/tests/CMakeLists.txt +52 -31
  186. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  187. package/src/llama.cpp/tests/test-backend-ops.cpp +189 -90
  188. package/src/llama.cpp/tests/test-chat-template.cpp +26 -6
  189. package/src/llama.cpp/tests/test-chat.cpp +15 -3
  190. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  191. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  192. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  193. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  194. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  195. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  196. package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
  197. package/src/llama.cpp/tests/test-opt.cpp +33 -21
  198. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  199. package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
  200. package/src/llama.cpp/tests/test-sampling.cpp +1 -1
  201. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  202. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  203. package/src/llama.cpp/tools/CMakeLists.txt +39 -0
  204. package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +3 -3
  205. package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +1 -1
  206. package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +15 -16
  207. package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
  208. package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +623 -274
  209. package/src/llama.cpp/{examples → tools}/main/main.cpp +22 -14
  210. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +47 -0
  211. package/src/llama.cpp/tools/mtmd/clip-impl.h +365 -0
  212. package/src/llama.cpp/tools/mtmd/clip.cpp +3646 -0
  213. package/src/llama.cpp/tools/mtmd/clip.h +99 -0
  214. package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +22 -0
  215. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +370 -0
  216. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
  217. package/src/llama.cpp/tools/mtmd/mtmd.cpp +678 -0
  218. package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
  219. package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +21 -5
  220. package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +53 -3
  221. package/src/llama.cpp/tools/rpc/CMakeLists.txt +4 -0
  222. package/src/llama.cpp/tools/rpc/rpc-server.cpp +322 -0
  223. package/src/llama.cpp/tools/run/CMakeLists.txt +16 -0
  224. package/src/llama.cpp/{examples → tools}/run/run.cpp +30 -30
  225. package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
  226. package/src/llama.cpp/{examples → tools}/server/httplib.h +313 -247
  227. package/src/llama.cpp/{examples → tools}/server/server.cpp +529 -215
  228. package/src/llama.cpp/{examples → tools}/server/utils.hpp +427 -6
  229. package/src/llama.cpp/{examples → tools}/tts/tts.cpp +6 -9
  230. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
  231. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  232. package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
  233. package/src/llama.cpp/examples/infill/infill.cpp +0 -590
  234. package/src/llama.cpp/examples/llava/CMakeLists.txt +0 -66
  235. package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
  236. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
  237. package/src/llama.cpp/examples/llava/clip.cpp +0 -3206
  238. package/src/llama.cpp/examples/llava/clip.h +0 -118
  239. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  240. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  241. package/src/llama.cpp/examples/llava/llava.cpp +0 -574
  242. package/src/llama.cpp/examples/llava/llava.h +0 -49
  243. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  244. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +0 -584
  245. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  246. package/src/llama.cpp/examples/rpc/CMakeLists.txt +0 -2
  247. package/src/llama.cpp/examples/rpc/rpc-server.cpp +0 -171
  248. package/src/llama.cpp/examples/run/CMakeLists.txt +0 -5
  249. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  250. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  251. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  252. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  253. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  254. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  255. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  256. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  257. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  258. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  259. /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
  260. /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
  261. /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
  262. /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
  263. /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
  264. /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
  265. /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
  266. /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
  267. /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
  268. /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
  269. /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
  270. /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
  271. /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
  272. /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
  273. /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
  274. /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
  275. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
  276. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
  277. /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
  278. /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
  279. /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
  280. /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
  281. /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
package/src/llama.cpp/src/llama-model.h
@@ -36,14 +36,18 @@ enum llm_type {
  LLM_TYPE_335M,
  LLM_TYPE_410M,
  LLM_TYPE_450M,
+ LLM_TYPE_475M,
  LLM_TYPE_770M,
  LLM_TYPE_780M,
  LLM_TYPE_0_5B,
+ LLM_TYPE_0_6B,
  LLM_TYPE_1B,
  LLM_TYPE_1_3B,
  LLM_TYPE_1_4B,
  LLM_TYPE_1_5B,
  LLM_TYPE_1_6B,
+ LLM_TYPE_1_7B,
+ LLM_TYPE_1_8B,
  LLM_TYPE_2B,
  LLM_TYPE_2_8B,
  LLM_TYPE_2_9B,
@@ -61,6 +65,7 @@ enum llm_type {
  LLM_TYPE_15B,
  LLM_TYPE_16B,
  LLM_TYPE_20B,
+ LLM_TYPE_27B,
  LLM_TYPE_30B,
  LLM_TYPE_32B,
  LLM_TYPE_34B,
@@ -69,7 +74,9 @@ enum llm_type {
  LLM_TYPE_65B,
  LLM_TYPE_70B,
  LLM_TYPE_236B,
+ LLM_TYPE_290B,
  LLM_TYPE_314B,
+ LLM_TYPE_405B,
  LLM_TYPE_671B,
  LLM_TYPE_SMALL,
  LLM_TYPE_MEDIUM,
@@ -83,9 +90,14 @@ enum llm_type {
  LLM_TYPE_16x3_8B,
  LLM_TYPE_10B_128x3_66B,
  LLM_TYPE_57B_A14B,
- LLM_TYPE_27B,
+ LLM_TYPE_17B_16E, // llama4 Scout
+ LLM_TYPE_17B_128E, // llama4 Maverick
+ LLM_TYPE_30B_A3B,
+ LLM_TYPE_235B_A22B,
  };

+ std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type);
+
  struct llama_layer_posnet {
  // resnet
  struct ggml_tensor * norm1 = nullptr;
@@ -167,6 +179,8 @@ struct llama_layer {
  struct ggml_tensor * wq_b = nullptr;
  struct ggml_tensor * wkv_a_mqa = nullptr;
  struct ggml_tensor * wkv_b = nullptr;
+ struct ggml_tensor * wk_b = nullptr;
+ struct ggml_tensor * wv_b = nullptr;
  struct ggml_tensor * wq_cross = nullptr;
  struct ggml_tensor * wk_cross = nullptr;
  struct ggml_tensor * wv_cross = nullptr;
@@ -380,10 +394,15 @@ struct llama_model {

  ggml_backend_buffer_type_t select_buft(int il) const;

+ bool has_tensor_overrides() const;
+
  const struct ggml_tensor * get_tensor(const char * name) const;

+ ggml_tensor * get_rope_factors(uint32_t n_ctx_per_seq, int il) const;
+
+ // note: can mutate `cparams`
  // TODO: move this to new llm_arch_model_i interface
- llama_memory_i * create_memory() const; // TODO: params
+ llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const;

  // TODO: move this to new llm_arch_model_i interface
  llm_graph_result_ptr build_graph(
package/src/llama.cpp/src/llama-quant.cpp
@@ -10,9 +10,16 @@
  #include <cinttypes>
  #include <fstream>
  #include <mutex>
+ #include <regex>
  #include <thread>
  #include <unordered_map>

+ // Quantization types. Changes to this struct must be replicated in quantize.cpp
+ struct tensor_quantization {
+ std::string name;
+ ggml_type quant = GGML_TYPE_COUNT;
+ };
+
  static void zeros(std::ofstream & file, size_t n) {
  char zero = 0;
  for (size_t i = 0; i < n; ++i) {
@@ -48,7 +55,7 @@ struct quantize_state_impl {
  };

  static void llama_tensor_dequantize_impl(
- struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
+ ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
  const size_t nelements, const int nthread
  ) {
  if (output.size() < nelements) {
@@ -512,7 +519,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
  nthread = std::thread::hardware_concurrency();
  }

- // mmap consistently increases speed Linux, and also increases speed on Windows with
+ // mmap consistently increases speed on Linux, and also increases speed on Windows with
  // hot cache. It may cause a slowdown on macOS, possibly related to free memory.
  #if defined(__linux__) || defined(_WIN32)
  constexpr bool use_mmap = true;
@@ -522,12 +529,12 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::

  llama_model_kv_override * kv_overrides = nullptr;
  if (params->kv_overrides) {
- auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
+ auto * v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
  kv_overrides = v->data();
  }

  std::vector<std::string> splits = {};
- llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides);
+ llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides, nullptr);
  ml.init_mappings(false); // no prefetching

  llama_model model(llama_model_default_params());
@@ -536,7 +543,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
  model.load_hparams(ml);
  model.load_stats (ml);

- struct quantize_state_impl qs(model, params);
+ quantize_state_impl qs(model, params);

  if (params->only_copy) {
  ftype = ml.ftype;
@@ -661,7 +668,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
  // populate the original tensors so we get an initial meta data
  for (const auto * it : tensors) {
  uint16_t i_split = params->keep_split ? it->idx : 0;
- struct ggml_tensor * tensor = it->tensor;
+ ggml_tensor * tensor = it->tensor;
  if (!ctx_outs[i_split]) {
  ctx_outs[i_split].reset(gguf_init_empty());
  }
@@ -710,7 +717,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
  new_ofstream(0);
  for (const auto * it : tensors) {
  const auto & weight = *it;
- struct ggml_tensor * tensor = weight.tensor;
+ ggml_tensor * tensor = weight.tensor;
  if (weight.idx != cur_split && params->keep_split) {
  close_ofstream();
  new_ofstream(weight.idx);
@@ -776,7 +783,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
  // do not quantize relative position bias (T5)
  quantize &= name.find("attn_rel_b.weight") == std::string::npos;

- enum ggml_type new_type;
+ ggml_type new_type;
  void * new_data;
  size_t new_size;

@@ -786,7 +793,22 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
  // get more optimal quantization type based on the tensor shape, layer, etc.
  if (!params->pure && ggml_is_quantized(default_type)) {
  new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
+ // unless the user specifies a type
+ if (params->tensor_types) {
+ const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
+ const std::string tensor_name(tensor->name);
+ for (const auto & [tname, qtype] : tensor_types) {
+ if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
+ if (qtype != new_type) {
+ LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type));
+ new_type = qtype;
+ break; // if two or more types are specified for the tensor, first match wins
+ }
+ }
+ }
+ }
  }
+
  if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
  new_type = params->token_embedding_type;
  }
@@ -910,8 +932,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
  // interface implementation
  //

- struct llama_model_quantize_params llama_model_quantize_default_params() {
- struct llama_model_quantize_params result = {
+ llama_model_quantize_params llama_model_quantize_default_params() {
+ llama_model_quantize_params result = {
  /*.nthread =*/ 0,
  /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
  /*.output_tensor_type =*/ GGML_TYPE_COUNT,
@@ -923,6 +945,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
  /*.keep_split =*/ false,
  /*.imatrix =*/ nullptr,
  /*.kv_overrides =*/ nullptr,
+ /*.tensor_type =*/ nullptr,
  };

  return result;
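
Editor's note: the tensor_types override added above matches each tensor name against a user-supplied regex and forces the listed ggml type, with the first matching entry winning. Below is a minimal, hypothetical sketch of driving it through the public quantize API; the override vector layout mirrors the tensor_quantization struct from the hunk, and the field names are taken from this diff, so the installed header may differ slightly.

    // Sketch only: force tensors whose name matches "ffn_down" to Q6_K while the
    // rest follow the normal Q4_K_M heuristics.
    #include "llama.h"

    #include <string>
    #include <vector>

    struct tensor_quantization {        // must mirror the struct used by llama-quant.cpp
        std::string name;
        ggml_type   quant = GGML_TYPE_COUNT;
    };

    int main() {
        std::vector<tensor_quantization> overrides = {
            { "ffn_down", GGML_TYPE_Q6_K },  // regex pattern, first match wins
        };

        llama_model_quantize_params qparams = llama_model_quantize_default_params();
        qparams.ftype        = LLAMA_FTYPE_MOSTLY_Q4_K_M;
        qparams.tensor_types = &overrides;   // opaque pointer, cast back inside llama-quant.cpp

        // input/output paths are placeholders
        return llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &qparams);
    }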
package/src/llama.cpp/src/llama-sampling.cpp
@@ -232,7 +232,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
  // }

  if (k <= 0) {
- k = cur_p->size;
+ return;
  }

  k = std::min(k, (int) cur_p->size);
@@ -298,6 +298,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
  }
  cur_p->sorted = true;
  }
+
  cur_p->size = k;
  }

@@ -1477,6 +1478,7 @@ static struct llama_sampler * llama_sampler_grammar_clone(const struct llama_sam
  const auto * ctx = (const llama_sampler_grammar *) smpl->ctx;

  auto * result = llama_sampler_init_grammar_impl(ctx->vocab, nullptr, nullptr, false, nullptr, 0, nullptr, 0, nullptr, 0);
+ GGML_ASSERT(result);

  // copy the state
  {
@@ -1548,6 +1550,10 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
  /* .grammar_root = */ grammar_root,
  /* .grammar = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, trigger_patterns, num_trigger_patterns, trigger_tokens, num_trigger_tokens),
  };
+ if (!ctx->grammar) {
+ delete ctx;
+ return nullptr;
+ }
  } else {
  *ctx = {
  /* .vocab = */ vocab,
@@ -1744,23 +1750,35 @@ static const char * llama_sampler_top_n_sigma_name(const struct llama_sampler *
  static void llama_sampler_top_n_sigma_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
  const auto * ctx = (llama_sampler_top_n_sigma *) smpl->ctx;

+ if (ctx->n <= 0.0f || cur_p->size <= 1) {
+ return;
+ }
+
  // find max logit and calculate mean
  float max = cur_p->data[0].logit;
  float logits_sum = 0;
+ size_t valid_count = 0;
  for (size_t i = 0; i < cur_p->size; ++i) {
- if (cur_p->data[i].logit > max) {
- max = cur_p->data[i].logit;
+ // Only count non-negative infinity values
+ if (cur_p->data[i].logit != -INFINITY) {
+ if (cur_p->data[i].logit > max) {
+ max = cur_p->data[i].logit;
+ }
+ logits_sum += cur_p->data[i].logit;
+ valid_count++;
  }
- logits_sum += cur_p->data[i].logit;
  }
- float mean = logits_sum/cur_p->size;
+ float mean = valid_count > 0 ? logits_sum/valid_count : 0;

  // calculate standard deviation
  float acc = 0;
  for (size_t i = 0; i < cur_p->size; ++i) {
- acc += pow(cur_p->data[i].logit - mean, 2);
+ // Skip -infinity in std calculation
+ if (cur_p->data[i].logit != -INFINITY) {
+ acc += pow(cur_p->data[i].logit - mean, 2);
+ }
  }
- float std = sqrt(acc/cur_p->size);
+ float std = valid_count > 0 ? sqrt(acc/valid_count) : 0;

  //apply mask
  for (size_t i = 0; i < cur_p->size; ++i) {
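
Editor's note: two behavioral changes above are worth calling out: top-k with k <= 0 is now a no-op instead of expanding to the full candidate list, and top-n-sigma ignores -INFINITY logits when computing the mean and standard deviation (returning early for n <= 0 or a single candidate). A minimal sketch of attaching the top-n-sigma filter through the public sampler-chain API follows; it covers chain construction only, with context and token handling omitted.

    #include "llama.h"

    // Build a sampler chain: top-n-sigma filter followed by random sampling.
    llama_sampler * make_top_n_sigma_chain(float n, uint32_t seed) {
        llama_sampler_chain_params sparams = llama_sampler_chain_default_params();
        llama_sampler * chain = llama_sampler_chain_init(sparams);

        llama_sampler_chain_add(chain, llama_sampler_init_top_n_sigma(n)); // e.g. n = 1.0f
        llama_sampler_chain_add(chain, llama_sampler_init_dist(seed));     // final pick

        return chain; // release with llama_sampler_free() when done
    }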
package/src/llama.cpp/src/llama-vocab.cpp
@@ -1,5 +1,7 @@
  #include "llama-vocab.h"

+ #include "ggml.h"
+ #include "gguf.h"
  #include "llama-impl.h"
  #include "llama-model-loader.h"

@@ -342,6 +344,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
  case LLAMA_VOCAB_PRE_TYPE_MPT:
  case LLAMA_VOCAB_PRE_TYPE_OLMO:
  case LLAMA_VOCAB_PRE_TYPE_JAIS:
+ case LLAMA_VOCAB_PRE_TYPE_TRILLION:
  regex_exprs = {
  "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
  };
@@ -400,6 +403,27 @@ struct llm_tokenizer_bpe : llm_tokenizer {
  "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
  };
  break;
+ case LLAMA_VOCAB_PRE_TYPE_SUPERBPE:
+ regex_exprs = {
+ "\\p{N}+",
+ "(?=(\\d{3})+(?!\\d))",
+ };
+ break;
+ case LLAMA_VOCAB_PRE_TYPE_BAILINGMOE:
+ regex_exprs = {
+ // original regex from tokenizer.json
+ // "'(?i:[sdmt]|ll|ve|re)|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+"
+ // FIXME? Changed possessive quantifiers (?+ and ++) to greedy to avoid errors and imatrix hanging (tried atomic grouping but it's not supported?)
+ "'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
+ };
+ break;
+ case LLAMA_VOCAB_PRE_TYPE_SEED_CODER:
+ regex_exprs = {
+ // original regex from tokenizer.json
+ // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\r\n]+|\\s*[\r\n]+|\\s+(?!\\S)|\\s+"
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+ };
+ break;
  default:
  // default regex for BPE tokenization pre-processing
  regex_exprs = {
@@ -1212,6 +1236,9 @@ struct fragment_buffer_variant {
  struct llama_vocab::impl {
  uint32_t n_token_types = 0; // for BERT-style token types

+ std::string tokenizer_model;
+ std::string tokenizer_pre;
+
  enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
  enum llama_vocab_pre_type pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;

@@ -1347,9 +1374,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {

  // determine vocab type
  {
- std::string tokenizer_model;
- std::string tokenizer_pre;
-
  ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
  ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);

@@ -1444,7 +1468,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {

  const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
  if (precompiled_charsmap_keyidx != -1) {
- size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
+ const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);
+ GGML_ASSERT(pc_type == GGUF_TYPE_INT8 || pc_type == GGUF_TYPE_UINT8);
+
+ const size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
  const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
  precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
  #ifdef IS_BIG_ENDIAN
@@ -1491,7 +1518,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  tokenizer_pre == "llama3" ||
  tokenizer_pre == "llama-v3" ||
  tokenizer_pre == "llama-bpe"||
- tokenizer_pre == "falcon3") {
+ tokenizer_pre == "falcon3" ||
+ tokenizer_pre == "pixtral") {
  pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
  ignore_merges = true;
  add_bos = true;
@@ -1557,6 +1585,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  pre_type = LLAMA_VOCAB_PRE_TYPE_PORO;
  clean_spaces = false;
  } else if (
+ tokenizer_pre == "glm4" ||
  tokenizer_pre == "chatglm-bpe") {
  pre_type = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
  special_bos_id = LLAMA_TOKEN_NULL;
@@ -1601,9 +1630,26 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  tokenizer_pre == "megrez") {
  pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
  } else if (
- tokenizer_pre == "gpt-4o") {
+ tokenizer_pre == "gpt-4o" ||
+ tokenizer_pre == "llama4") {
  pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O;
  clean_spaces = false;
+ } else if (
+ tokenizer_pre == "superbpe") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_SUPERBPE;
+ clean_spaces = false;
+ } else if (
+ tokenizer_pre == "trillion") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION;
+ clean_spaces = false;
+ } else if (
+ tokenizer_pre == "bailingmoe") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
+ clean_spaces = false;
+ } else if (
+ tokenizer_pre == "seed-coder") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER;
+ clean_spaces = false;
  } else {
  throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
  }
@@ -1781,6 +1827,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  || t.first == "<end_of_turn>"
  || t.first == "<|endoftext|>"
  || t.first == "<EOT>"
+ || t.first == "_<EOT>"
  || t.first == "<|end▁of▁sentence|>" // DeepSeek
  ) {
  special_eot_id = t.second;
@@ -1811,8 +1858,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  if (false
  || t.first == "<|fim_prefix|>" // Qwen
  || t.first == "<fim-prefix>"
+ || t.first == "<fim_prefix>" // Granite
  || t.first == "<|fim▁begin|>" // DeepSeek
  || t.first == "<PRE>"
+ || t.first == "▁<PRE>" // CodeLlama
  ) {
  special_fim_pre_id = t.second;
  if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -1828,8 +1877,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  if (false
  || t.first == "<|fim_suffix|>" // Qwen
  || t.first == "<fim-suffix>"
+ || t.first == "<fim_suffix>" // Granite
  || t.first == "<|fim▁hole|>" // DeepSeek
  || t.first == "<SUF>"
+ || t.first == "▁<SUF>" // CodeLlama
  ) {
  special_fim_suf_id = t.second;
  if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -1845,8 +1896,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  if (false
  || t.first == "<|fim_middle|>" // Qwen
  || t.first == "<fim-middle>"
+ || t.first == "<fim_middle>" // Granite
  || t.first == "<|fim▁end|>" // DeepSeek
  || t.first == "<MID>"
+ || t.first == "▁<MID>" // CodeLlama
  ) {
  special_fim_mid_id = t.second;
  if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -1862,6 +1915,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  if (false
  || t.first == "<|fim_pad|>" // Qwen
  || t.first == "<fim-pad>"
+ || t.first == "<fim_pad>" // Granite
  || t.first == "<PAD>"
  ) {
  special_fim_pad_id = t.second;
@@ -1880,6 +1934,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  || t.first == "<|repo_name|>"
  || t.first == "<fim-repo>"
  || t.first == "<REPO>"
+ || t.first == "<reponame>" // Granite
  ) {
  special_fim_rep_id = t.second;
  if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -1931,6 +1986,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  || t.first == "<|endoftext|>"
  || t.first == "<|eom_id|>"
  || t.first == "<EOT>"
+ || t.first == "_<EOT>"
  ) {
  special_eog_ids.insert(t.second);
  if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -2189,14 +2245,12 @@ void llama_vocab::impl::tokenizer_st_partition(std::forward_list<fragment_buffer
  // find the first occurrence of a given special token in this fragment
  // passing offset argument only limit the "search area" but match coordinates
  // are still relative to the source full raw_text
- auto match = raw_text.find(text, raw_text_base_offset);
+ // string_view begins at pos 0 for the same reason
+ auto match = std::string_view(raw_text.data(), raw_text_base_offset + raw_text_base_length).find(text, raw_text_base_offset);

  // no occurrences found, stop processing this fragment for a given special token
  if (match == std::string::npos) break;

- // check if match is within bounds of offset <-> length
- if (match + text.length() > raw_text_base_offset + raw_text_base_length) break;
-
  #ifdef PRETOKENIZERDEBUG
  LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
  #endif
@@ -2740,6 +2794,14 @@ void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) {
  pimpl->load(ml, kv);
  }

+ std::string llama_vocab::get_tokenizer_model() const {
+ return pimpl->tokenizer_model;
+ }
+
+ std::string llama_vocab::get_tokenizer_pre() const {
+ return pimpl->tokenizer_pre;
+ }
+
  enum llama_vocab_type llama_vocab::get_type() const {
  return pimpl->type;
  }
@@ -2962,6 +3024,20 @@ int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string
  return it->second;
  }

+ std::vector<std::string> llama_vocab::get_bpe_merges() const {
+ std::vector<std::string> result(pimpl->bpe_ranks.size());
+
+ for (const auto & pair : pimpl->bpe_ranks) {
+ result[pair.second] = pair.first.first + " " + pair.first.second;
+ }
+
+ return result;
+ }
+
+ std::vector<char> llama_vocab::get_precompiled_charsmap() const {
+ return pimpl->precompiled_charsmap;
+ }
+
  int32_t llama_vocab::tokenize(
  const char * text,
  int32_t text_len,
package/src/llama.cpp/src/llama-vocab.h
@@ -21,6 +21,9 @@ struct llama_vocab {

  void load(llama_model_loader & ml, const LLM_KV & kv);

+ std::string get_tokenizer_model() const;
+ std::string get_tokenizer_pre() const;
+
  enum llama_vocab_type get_type() const;
  enum llama_vocab_pre_type get_pre_type() const;

@@ -80,6 +83,9 @@ struct llama_vocab {
  int max_token_len() const;

  int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
+ std::vector<std::string> get_bpe_merges() const;
+
+ std::vector<char> get_precompiled_charsmap() const;

  int32_t tokenize(
  const char * text,
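
Editor's note: the accessors declared above expose tokenizer metadata that was previously private to the loader. A hypothetical in-tree sketch follows (it assumes access to the internal llama-vocab.h header, as code such as the new llama-model-saver would have) showing what they return:

    #include "llama-vocab.h"   // internal header, not part of the installed C API

    #include <cstdio>
    #include <string>
    #include <vector>

    static void dump_tokenizer_info(const llama_vocab & vocab) {
        // tokenizer model / pre strings as read from the GGUF metadata at load time
        printf("tokenizer model: %s\n", vocab.get_tokenizer_model().c_str());
        printf("tokenizer pre  : %s\n", vocab.get_tokenizer_pre().c_str());

        // merges come back ordered by BPE rank, each entry formatted as "left right"
        const std::vector<std::string> merges = vocab.get_bpe_merges();
        printf("bpe merges     : %zu\n", merges.size());

        // raw bytes of the precompiled charsmap (empty when the vocab has none)
        printf("charsmap bytes : %zu\n", vocab.get_precompiled_charsmap().size());
    }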
package/src/llama.cpp/src/llama.cpp
@@ -4,6 +4,7 @@
  #include "llama-mmap.h"
  #include "llama-vocab.h"
  #include "llama-model-loader.h"
+ #include "llama-model-saver.h"
  #include "llama-model.h"

  #include "ggml.h"
@@ -92,7 +93,7 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
  model.t_start_us = tm.t_start_us;

  try {
- llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides);
+ llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides, params.tensor_buft_overrides);

  ml.print_info();

@@ -139,6 +140,11 @@ static struct llama_model * llama_model_load_from_file_impl(
  struct llama_model_params params) {
  ggml_time_init();

+ if (!params.vocab_only && ggml_backend_reg_count() == 0) {
+ LLAMA_LOG_ERROR("%s: no backends are loaded. hint: use ggml_backend_load() or ggml_backend_load_all() to load a backend before calling this function\n", __func__);
+ return nullptr;
+ }
+
  unsigned cur_percentage = 0;
  if (params.progress_callback == NULL) {
  params.progress_callback_user_data = &cur_percentage;
@@ -253,6 +259,13 @@ struct llama_model * llama_model_load_from_splits(
  return llama_model_load_from_file_impl(splits.front(), splits, params);
  }

+ void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {
+ llama_model_saver ms(*model);
+ ms.add_kv_from_model();
+ ms.add_tensors_from_model();
+ ms.save(path_model);
+ }
+
  //
  // chat templates
  //
@@ -338,3 +351,4 @@ const char * llama_print_system_info(void) {

  return s.c_str();
  }
+
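
Editor's note: the hunks above add a guard that fails model loading cleanly when no ggml backend is registered, plus a new llama_model_save_to_file() entry point backed by the new llama-model-saver. A minimal end-to-end sketch using the public C API (file names are placeholders):

    #include "llama.h"

    int main() {
        // When ggml is built with dynamic backends, they must be loaded first;
        // otherwise llama_model_load_from_file() now returns NULL with
        // "no backends are loaded" instead of failing later.
        ggml_backend_load_all();

        llama_model_params mparams = llama_model_default_params();
        llama_model * model = llama_model_load_from_file("model.gguf", mparams);
        if (!model) {
            return 1;
        }

        // New in this release: write the loaded model (metadata + tensors) back to GGUF.
        llama_model_save_to_file(model, "model-copy.gguf");

        llama_model_free(model);
        return 0;
    }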