@fugood/llama.node 0.3.16 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (281)
  1. package/CMakeLists.txt +6 -1
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +44 -2
  19. package/lib/index.js +132 -1
  20. package/lib/index.ts +203 -3
  21. package/package.json +2 -1
  22. package/src/EmbeddingWorker.cpp +1 -1
  23. package/src/LlamaCompletionWorker.cpp +374 -19
  24. package/src/LlamaCompletionWorker.h +31 -10
  25. package/src/LlamaContext.cpp +216 -7
  26. package/src/LlamaContext.h +12 -0
  27. package/src/common.hpp +15 -0
  28. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +233 -0
  29. package/src/llama.cpp/.github/workflows/build.yml +89 -767
  30. package/src/llama.cpp/.github/workflows/docker.yml +9 -6
  31. package/src/llama.cpp/.github/workflows/release.yml +716 -0
  32. package/src/llama.cpp/.github/workflows/server.yml +19 -23
  33. package/src/llama.cpp/CMakeLists.txt +11 -1
  34. package/src/llama.cpp/cmake/build-info.cmake +8 -2
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
  36. package/src/llama.cpp/common/CMakeLists.txt +35 -4
  37. package/src/llama.cpp/common/arg.cpp +844 -121
  38. package/src/llama.cpp/common/arg.h +9 -0
  39. package/src/llama.cpp/common/chat.cpp +129 -107
  40. package/src/llama.cpp/common/chat.h +2 -0
  41. package/src/llama.cpp/common/common.cpp +64 -518
  42. package/src/llama.cpp/common/common.h +35 -45
  43. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  44. package/src/llama.cpp/common/llguidance.cpp +31 -47
  45. package/src/llama.cpp/common/minja/chat-template.hpp +23 -11
  46. package/src/llama.cpp/common/minja/minja.hpp +186 -127
  47. package/src/llama.cpp/common/regex-partial.cpp +204 -0
  48. package/src/llama.cpp/common/regex-partial.h +56 -0
  49. package/src/llama.cpp/common/sampling.cpp +60 -50
  50. package/src/llama.cpp/docs/build.md +122 -7
  51. package/src/llama.cpp/examples/CMakeLists.txt +2 -32
  52. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  53. package/src/llama.cpp/examples/embedding/embedding.cpp +9 -12
  54. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  55. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  56. package/src/llama.cpp/examples/parallel/parallel.cpp +89 -15
  57. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  58. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  59. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  60. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  61. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  62. package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
  63. package/src/llama.cpp/examples/training/finetune.cpp +96 -0
  64. package/src/llama.cpp/ggml/CMakeLists.txt +35 -2
  65. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  66. package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
  67. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
  68. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  69. package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
  70. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  71. package/src/llama.cpp/ggml/include/ggml.h +76 -106
  72. package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -8
  73. package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
  74. package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
  75. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  76. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  77. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  78. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  79. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  80. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  81. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  82. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  83. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +66 -33
  84. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  85. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  86. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  87. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  88. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -194
  89. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  90. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1060 -410
  91. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1008 -13533
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +31 -16
  93. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +90 -12
  94. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -13
  95. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +266 -72
  96. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1034 -88
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8796 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  101. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  102. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +252 -0
  103. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  104. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
  105. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  106. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  107. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  108. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  109. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +106 -14
  110. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  111. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -262
  112. package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
  113. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
  114. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +307 -40
  115. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +125 -45
  116. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +10 -8
  117. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +239 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  119. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  120. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +9 -307
  121. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
  122. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
  123. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  124. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +944 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  130. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  131. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +507 -411
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  133. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  134. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
  135. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  136. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  137. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  138. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  140. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  141. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
  142. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +83 -49
  143. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1278 -282
  144. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +32 -0
  145. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +133 -30
  146. package/src/llama.cpp/ggml/src/ggml.c +170 -265
  147. package/src/llama.cpp/ggml/src/gguf.cpp +34 -33
  148. package/src/llama.cpp/include/llama.h +82 -22
  149. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  150. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  151. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  152. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  153. package/src/llama.cpp/requirements/requirements-all.txt +5 -3
  154. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  155. package/src/llama.cpp/scripts/xxd.cmake +1 -1
  156. package/src/llama.cpp/src/CMakeLists.txt +4 -2
  157. package/src/llama.cpp/src/llama-adapter.cpp +43 -1
  158. package/src/llama.cpp/src/llama-arch.cpp +163 -17
  159. package/src/llama.cpp/src/llama-arch.h +16 -0
  160. package/src/llama.cpp/src/llama-batch.cpp +5 -1
  161. package/src/llama.cpp/src/llama-batch.h +2 -1
  162. package/src/llama.cpp/src/llama-chat.cpp +91 -16
  163. package/src/llama.cpp/src/llama-chat.h +7 -2
  164. package/src/llama.cpp/src/llama-context.cpp +479 -575
  165. package/src/llama.cpp/src/llama-context.h +44 -33
  166. package/src/llama.cpp/src/llama-cparams.h +1 -0
  167. package/src/llama.cpp/src/llama-graph.cpp +209 -157
  168. package/src/llama.cpp/src/llama-graph.h +38 -14
  169. package/src/llama.cpp/src/llama-hparams.h +13 -0
  170. package/src/llama.cpp/src/llama-kv-cache.cpp +1604 -543
  171. package/src/llama.cpp/src/llama-kv-cache.h +283 -171
  172. package/src/llama.cpp/src/llama-memory.h +12 -2
  173. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  174. package/src/llama.cpp/src/llama-model-loader.cpp +34 -20
  175. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  176. package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
  177. package/src/llama.cpp/src/llama-model-saver.h +37 -0
  178. package/src/llama.cpp/src/llama-model.cpp +1803 -330
  179. package/src/llama.cpp/src/llama-model.h +21 -2
  180. package/src/llama.cpp/src/llama-quant.cpp +33 -10
  181. package/src/llama.cpp/src/llama-sampling.cpp +25 -7
  182. package/src/llama.cpp/src/llama-vocab.cpp +86 -10
  183. package/src/llama.cpp/src/llama-vocab.h +6 -0
  184. package/src/llama.cpp/src/llama.cpp +15 -1
  185. package/src/llama.cpp/tests/CMakeLists.txt +52 -31
  186. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  187. package/src/llama.cpp/tests/test-backend-ops.cpp +189 -90
  188. package/src/llama.cpp/tests/test-chat-template.cpp +26 -6
  189. package/src/llama.cpp/tests/test-chat.cpp +15 -3
  190. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  191. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  192. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  193. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  194. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  195. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  196. package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
  197. package/src/llama.cpp/tests/test-opt.cpp +33 -21
  198. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  199. package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
  200. package/src/llama.cpp/tests/test-sampling.cpp +1 -1
  201. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  202. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  203. package/src/llama.cpp/tools/CMakeLists.txt +39 -0
  204. package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +3 -3
  205. package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +1 -1
  206. package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +15 -16
  207. package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
  208. package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +623 -274
  209. package/src/llama.cpp/{examples → tools}/main/main.cpp +22 -14
  210. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +47 -0
  211. package/src/llama.cpp/tools/mtmd/clip-impl.h +365 -0
  212. package/src/llama.cpp/tools/mtmd/clip.cpp +3646 -0
  213. package/src/llama.cpp/tools/mtmd/clip.h +99 -0
  214. package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +22 -0
  215. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +370 -0
  216. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
  217. package/src/llama.cpp/tools/mtmd/mtmd.cpp +678 -0
  218. package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
  219. package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +21 -5
  220. package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +53 -3
  221. package/src/llama.cpp/tools/rpc/CMakeLists.txt +4 -0
  222. package/src/llama.cpp/tools/rpc/rpc-server.cpp +322 -0
  223. package/src/llama.cpp/tools/run/CMakeLists.txt +16 -0
  224. package/src/llama.cpp/{examples → tools}/run/run.cpp +30 -30
  225. package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
  226. package/src/llama.cpp/{examples → tools}/server/httplib.h +313 -247
  227. package/src/llama.cpp/{examples → tools}/server/server.cpp +529 -215
  228. package/src/llama.cpp/{examples → tools}/server/utils.hpp +427 -6
  229. package/src/llama.cpp/{examples → tools}/tts/tts.cpp +6 -9
  230. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
  231. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  232. package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
  233. package/src/llama.cpp/examples/infill/infill.cpp +0 -590
  234. package/src/llama.cpp/examples/llava/CMakeLists.txt +0 -66
  235. package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
  236. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
  237. package/src/llama.cpp/examples/llava/clip.cpp +0 -3206
  238. package/src/llama.cpp/examples/llava/clip.h +0 -118
  239. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  240. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  241. package/src/llama.cpp/examples/llava/llava.cpp +0 -574
  242. package/src/llama.cpp/examples/llava/llava.h +0 -49
  243. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  244. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +0 -584
  245. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  246. package/src/llama.cpp/examples/rpc/CMakeLists.txt +0 -2
  247. package/src/llama.cpp/examples/rpc/rpc-server.cpp +0 -171
  248. package/src/llama.cpp/examples/run/CMakeLists.txt +0 -5
  249. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  250. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  251. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  252. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  253. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  254. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  255. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  256. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  257. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  258. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  259. /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
  260. /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
  261. /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
  262. /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
  263. /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
  264. /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
  265. /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
  266. /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
  267. /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
  268. /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
  269. /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
  270. /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
  271. /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
  272. /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
  273. /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
  274. /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
  275. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
  276. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
  277. /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
  278. /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
  279. /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
  280. /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
  281. /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
@@ -0,0 +1,112 @@
+ ied 4 ½ months
+ __ggml_vocab_test__
+ Führer
+ __ggml_vocab_test__
+
+ __ggml_vocab_test__
+
+ __ggml_vocab_test__
+
+ __ggml_vocab_test__
+
+ __ggml_vocab_test__
+
+ __ggml_vocab_test__
+
+
+ __ggml_vocab_test__
+
+
+
+ __ggml_vocab_test__
+
+
+
+
+ __ggml_vocab_test__
+
+
+ __ggml_vocab_test__
+ Hello world
+ __ggml_vocab_test__
+ Hello world
+ __ggml_vocab_test__
+ Hello World
+ __ggml_vocab_test__
+ Hello World
+ __ggml_vocab_test__
+ Hello World!
+ __ggml_vocab_test__
+ Hello, world!
+ __ggml_vocab_test__
+ Hello, world!
+ __ggml_vocab_test__
+ this is 🦙.cpp
+ __ggml_vocab_test__
+ w048 7tuijk dsdfhu
+ __ggml_vocab_test__
+ нещо на Български
+ __ggml_vocab_test__
+ កាន់តែពិសេសអាចខលចេញ
+ __ggml_vocab_test__
+ 🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
+ __ggml_vocab_test__
+ Hello
+ __ggml_vocab_test__
+ Hello
+ __ggml_vocab_test__
+ Hello
+ __ggml_vocab_test__
+ Hello
+ __ggml_vocab_test__
+ Hello
+ __ggml_vocab_test__
+ Hello
+ Hello
+ __ggml_vocab_test__
+ (
+ __ggml_vocab_test__
+
+ =
+ __ggml_vocab_test__
+ ' era
+ __ggml_vocab_test__
+ Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
+ __ggml_vocab_test__
+ !!!!!!
+ __ggml_vocab_test__
+ 3
+ __ggml_vocab_test__
+ 33
+ __ggml_vocab_test__
+ 333
+ __ggml_vocab_test__
+ 3333
+ __ggml_vocab_test__
+ 33333
+ __ggml_vocab_test__
+ 333333
+ __ggml_vocab_test__
+ 3333333
+ __ggml_vocab_test__
+ 33333333
+ __ggml_vocab_test__
+ 333333333
+ __ggml_vocab_test__
+ Cửa Việt
+ __ggml_vocab_test__
+ discards
+ __ggml_vocab_test__
+
+
+
+
+
+
+
+
+
+
+
+ 🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
+ __ggml_vocab_test__
@@ -0,0 +1,46 @@
+ 2014 1032 1052 1032 28504 6972
+ 1070 7088 1258
+
+ 1032
+ 1256
+ 1293
+ 1009
+ 1010
+ 1267
+ 4688
+ 1009 1010
+ 22177 4304
+ 45383 4304
+ 22177 5325
+ 45383 5325
+ 45383 5325 1033
+ 22177 1044 4304 1033
+ 45383 1044 4304 1033
+ 1593 1395 119685 1166 1153 1046 51228
+ 1119 1048 1052 1056 1032 1055 17391 23216 30203 7785 17279
+ 3337 30757 1902 4200 63073 3671
+ 1225 1158 1128 1225 1158 1182 1225 1158 1147 1225 1159 1139 1225 1158 1143 1225 1159 1130 1225 1158 1150 1225 1158 1183 1225 1158 1159 1225 21359 1225 1158 1159 1225 1158 1162 1225 1158 1182 1225 1158 1133 1225 1158 1129 1225 1158 1155 1225 1158 1133 1225 21359 1225 1158 1137
+ 1240 1159 1154 1128 1319 13052 1041 119685 1152 1182 29568 1240 1159 1140 1171 1239 1184 1143 1319 88181 1873 3659 1275 56421 1621 1041 126241 1133 1319 11234 1873 26303 1455 1934 2246 3754 10835 1041
+ 22177
+ 45383
+ 1032 45383
+ 1256 45383
+ 1293 45383
+ 1293 45383 1010 1293 45383
+ 1319
+ 1010 1376
+ 1039 4033
+ 22177 1044 1404 48054 1033 3075 1584 1636 119685 1152 1129 3082 26060 2998 63614 82278 1049 1051 1049 1052 1049 1053 1049 6434 6749
+ 7290 7290 7290
+ 1051
+ 1051 1051
+ 1051 1051 1051
+ 1051 1051 1051 1051
+ 1051 1051 1051 1051 1051
+ 1051 1051 1051 1051 1051 1051
+ 1051 1051 1051 1051 1051 1051 1051
+ 1051 1051 1051 1051 1051 1051 1051 1051
+ 1051 1051 1051 1051 1051 1051 1051 1051 1051
+ 1067 59503 28783
+ 3724 4058
+ 1010 1032 1267 1032 4688 1032 17152 1458 29356 1010 1256 1010 1293 1010 1260 1010 1652 1010 1240 1159 1154 1128 1319 13052 1041 119685 1152 1182 29568 1240 1159 1140 1171 1239 1184 1143 1319 88181 1873 3659 1275 56421 1621 1041 126241 1133 119685 1166 1153 1240 1159 1166 1153 1032 1051 1032 1051 1051 1032 1051 1051 1051 1032 1051 1051 1051 1051 1032 1051 1051 1051 1051 1051 1032 1051 1051 1051 1051 1051 1051 1032 1051 1051 1051 1051 1051 1051 1051 1032 1051 1051 1051 1051 1051 1051 1051 1051 1032 1051 1046 1051 1032 1051 1791 1051 1032 1051 2880 1051 71881 1158 1128 1225 1158 1182 1225 1158 1147 1225 1159 1139 1225 1158 1143 1225 1159 1130 1225 1158 1150 1225 1158 1183 1225 1158 1159 1225 21359 1225 1158 1159 1225 1158 1162 1225 1158 1182 1225 1158 1133 1240 1159 1152 1129 3082 26060 2998 63614 82278 1049 1051 1049 1052 1049 1053 1049 6434 6749 45577 1045 6626 43555 2843 30757 1902 4200 63073 3671 14931 20040 20040 1657 1657 1975 14135 14135 83923 7290 7290 7290 45509 45509 45509 1362 6483 2151 1576 1116 2189 1514 1681 2156 1044 1576 3609 1636 5257 1063 1576 1077 1605 5257 1362 7534 3180 1494 1044 1576 1068 1636 2479 2269 26883 1063 2837 1039 45654 1261 54297 1076
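Note: the two hunks above are new tokenizer regression fixtures (the ggml-vocab-*.gguf.inp/.out pairs added under models/). The convention used by llama.cpp's tokenizer tests is that each section of the .inp file, delimited by __ggml_vocab_test__ lines, is tokenized, and the resulting ids must match the space-separated ids on the corresponding line of the .out file. A minimal reader for such a pair might look like the following sketch (illustrative only, not code from this diff):

#include <fstream>
#include <sstream>
#include <string>
#include <vector>

// Split a .inp file into test sections on the "__ggml_vocab_test__" delimiter.
static std::vector<std::string> read_inp_sections(const std::string & path) {
    std::ifstream f(path);
    std::vector<std::string> sections;
    std::string line, cur;
    bool have_line = false;
    while (std::getline(f, line)) {
        if (line == "__ggml_vocab_test__") {
            sections.push_back(cur);   // sections may be empty or whitespace-only
            cur.clear();
            have_line = false;
        } else {
            if (have_line) cur += '\n';
            cur += line;
            have_line = true;
        }
    }
    return sections;
}

// Read the expected token ids: one space-separated line per .inp section.
static std::vector<std::vector<int>> read_out_ids(const std::string & path) {
    std::ifstream f(path);
    std::vector<std::vector<int>> expected;
    std::string line;
    while (std::getline(f, line)) {
        std::istringstream ss(line);
        std::vector<int> ids;
        for (int id; ss >> id; ) ids.push_back(id);
        expected.push_back(ids);
    }
    return expected;
}

A harness would then assert that both files yield the same number of entries and compare tokenize(sections[i]) against expected[i].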
@@ -1,6 +1,6 @@
- -r ../examples/llava/requirements.txt
- -r ../examples/server/bench/requirements.txt
- -r ../examples/server/tests/requirements.txt
+ -r ../tools/mtmd/requirements.txt
+ -r ../tools/server/bench/requirements.txt
+ -r ../tools/server/tests/requirements.txt

  -r ./requirements-compare-llama-bench.txt
  -r ./requirements-pydantic.txt
@@ -11,3 +11,5 @@
  -r ./requirements-convert_legacy_llama.txt
  -r ./requirements-convert_llama_ggml_to_gguf.txt
  -r ./requirements-tool_bench.txt
+
+ -r ./requirements-gguf_editor_gui.txt
@@ -0,0 +1,3 @@
+ numpy~=1.26.4
+ PySide6~=6.9.0
+ gguf>=0.16.0
@@ -1,5 +1,5 @@
  # CMake equivalent of `xxd -i ${INPUT} ${OUTPUT}`
- # Usage: cmake -DINPUT=examples/server/public/index.html -DOUTPUT=examples/server/index.html.hpp -P scripts/xxd.cmake
+ # Usage: cmake -DINPUT=tools/server/public/index.html -DOUTPUT=tools/server/index.html.hpp -P scripts/xxd.cmake

  SET(INPUT "" CACHE STRING "Input File")
  SET(OUTPUT "" CACHE STRING "Output File")
@@ -23,6 +23,7 @@ add_library(llama
  llama-memory.cpp
  llama-mmap.cpp
  llama-model-loader.cpp
+ llama-model-saver.cpp
  llama-model.cpp
  llama-quant.cpp
  llama-sampling.cpp
@@ -32,8 +33,9 @@ add_library(llama
  unicode.h
  )

- target_include_directories(llama PUBLIC . ../include ../common)
- target_compile_features (llama PUBLIC cxx_std_17) # don't bump
+ target_include_directories(llama PRIVATE .)
+ target_include_directories(llama PUBLIC ../include)
+ target_compile_features (llama PRIVATE cxx_std_17) # don't bump

  target_link_libraries(llama PUBLIC ggml)

@@ -247,6 +247,29 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
          }
      }

+     // get extra buffer types of the CPU
+     // TODO: a more general solution for non-CPU extra buft should be imlpemented in the future
+     // ref: https://github.com/ggml-org/llama.cpp/pull/12593#pullrequestreview-2718659948
+     std::vector<ggml_backend_buffer_type_t> buft_extra;
+     {
+         auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+         if (!cpu_dev) {
+             throw std::runtime_error(format("%s: no CPU backend found", __func__));
+         }
+         auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
+
+         auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
+             ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
+
+         if (ggml_backend_dev_get_extra_bufts_fn) {
+             ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
+             while (extra_bufts && *extra_bufts) {
+                 buft_extra.emplace_back(*extra_bufts);
+                 ++extra_bufts;
+             }
+         }
+     }
+
      // add tensors
      for (auto & it : ab_map) {
          const std::string & name = it.first;
@@ -263,7 +286,26 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
              throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
          }

-         ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
+         auto * buft = ggml_backend_buffer_get_type(model_tensor->buffer);
+
+         // do not load loras to extra buffer types (i.e. bufts for repacking) -> use the CPU in that case
+         for (auto & ex : buft_extra) {
+             if (ex == buft) {
+                 LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));
+
+                 auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+                 if (!cpu_dev) {
+                     throw std::runtime_error(format("%s: no CPU backend found", __func__));
+                 }
+                 buft = ggml_backend_dev_buffer_type(cpu_dev);
+
+                 break;
+             }
+         }
+
+         LLAMA_LOG_DEBUG("%s: lora for '%s' -> '%s'\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));
+
+         ggml_context * dev_ctx = ctx_for_buft(buft);
          // validate tensor shape
          if (is_token_embd) {
              // expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
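Note: the llama-adapter.cpp additions above make LoRA loading skip the CPU backend's "extra" buffer types (buffer types used for weight repacking) and fall back to the plain CPU buffer type instead. The discovery half of that pattern, isolated as a compilable sketch that uses the same ggml registry calls as the hunk (error handling reduced to an empty result; a sketch assuming the ggml-backend.h API at this revision, not a drop-in replacement):

#include <vector>
#include "ggml-backend.h"

// Collect the CPU backend's extra buffer types via the registry's
// proc-address table; returns an empty list if the backend or the
// proc address is unavailable.
static std::vector<ggml_backend_buffer_type_t> cpu_extra_buffer_types() {
    std::vector<ggml_backend_buffer_type_t> bufts;
    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
    if (!cpu_dev) {
        return bufts;
    }
    ggml_backend_reg_t cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
    // the function is exposed through the registry's proc-address table,
    // not as a public symbol, hence the cast
    auto get_extra_bufts = (ggml_backend_dev_get_extra_bufts_t)
        ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
    if (get_extra_bufts) {
        // the function returns a NULL-terminated array of buffer types
        for (ggml_backend_buffer_type_t * p = get_extra_bufts(cpu_dev); p && *p; ++p) {
            bufts.push_back(*p);
        }
    }
    return bufts;
}

A tensor whose buffer type appears in this list is then re-targeted to ggml_backend_dev_buffer_type(cpu_dev), exactly as the second hunk does before calling ctx_for_buft.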
@@ -6,6 +6,7 @@

  static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
      { LLM_ARCH_LLAMA, "llama" },
+     { LLM_ARCH_LLAMA4, "llama4" },
      { LLM_ARCH_DECI, "deci" },
      { LLM_ARCH_FALCON, "falcon" },
      { LLM_ARCH_GROK, "grok" },
@@ -18,6 +19,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
      { LLM_ARCH_REFACT, "refact" },
      { LLM_ARCH_BERT, "bert" },
      { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
+     { LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" },
      { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
      { LLM_ARCH_BLOOM, "bloom" },
      { LLM_ARCH_STABLELM, "stablelm" },
@@ -25,6 +27,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
      { LLM_ARCH_QWEN2, "qwen2" },
      { LLM_ARCH_QWEN2MOE, "qwen2moe" },
      { LLM_ARCH_QWEN2VL, "qwen2vl" },
+     { LLM_ARCH_QWEN3, "qwen3" },
+     { LLM_ARCH_QWEN3MOE, "qwen3moe" },
      { LLM_ARCH_PHI2, "phi2" },
      { LLM_ARCH_PHI3, "phi3" },
      { LLM_ARCH_PHIMOE, "phimoe" },
@@ -51,6 +55,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
      { LLM_ARCH_DEEPSEEK, "deepseek" },
      { LLM_ARCH_DEEPSEEK2, "deepseek2" },
      { LLM_ARCH_CHATGLM, "chatglm" },
+     { LLM_ARCH_GLM4, "glm4" },
      { LLM_ARCH_BITNET, "bitnet" },
      { LLM_ARCH_T5, "t5" },
      { LLM_ARCH_T5ENCODER, "t5encoder" },
@@ -65,6 +70,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
      { LLM_ARCH_GRANITE_MOE, "granitemoe" },
      { LLM_ARCH_CHAMELEON, "chameleon" },
      { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
+     { LLM_ARCH_PLM, "plm" },
+     { LLM_ARCH_BAILINGMOE, "bailingmoe" },
      { LLM_ARCH_UNKNOWN, "(unknown)" },
  };

@@ -73,6 +80,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
      { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
      { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
      { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
+     { LLM_KV_GENERAL_FILE_TYPE, "general.file_type" },
      { LLM_KV_GENERAL_NAME, "general.name" },
      { LLM_KV_GENERAL_AUTHOR, "general.author" },
      { LLM_KV_GENERAL_VERSION, "general.version" },
@@ -99,6 +107,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
      { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
      { LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" },
      { LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" },
+     { LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" },
      { LLM_KV_POOLING_TYPE, "%s.pooling_type" },
      { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
      { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
@@ -111,6 +120,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
      { LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
      { LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },
      { LLM_KV_TOKEN_SHIFT_COUNT, "%s.token_shift_count" },
+     { LLM_KV_INTERLEAVE_MOE_LAYER_STEP, "%s.interleave_moe_layer_step" },

      { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
      { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -132,6 +142,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
      { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
      { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
      { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+     { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
+     { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },

      { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
      { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -230,6 +242,35 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
              { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
          },
      },
+     {
+         LLM_ARCH_LLAMA4,
+         {
+             { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+             { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+             { LLM_TENSOR_OUTPUT, "output" },
+             { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+             { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+             { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+             { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+             { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+             { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+             { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+             { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+             { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+             { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+             { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+             { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
+             { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
+             { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
+             { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+             { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+             { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+             { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+             { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+             { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+         },
+     },
      {
          LLM_ARCH_DECI,
          {
@@ -433,6 +474,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
              { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
          },
      },
+     {
+         LLM_ARCH_NOMIC_BERT_MOE,
+         {
+             { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+             { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+             { LLM_TENSOR_TOKEN_TYPES, "token_types" },
+             { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
+             { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+             { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+             { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
+             { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+             { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+             { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+             { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+             { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+         },
+     },
      {
          LLM_ARCH_JINA_BERT_V2,
          {
@@ -561,6 +620,45 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
              { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
          },
      },
+     {
+         LLM_ARCH_QWEN3,
+         {
+             { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+             { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+             { LLM_TENSOR_OUTPUT, "output" },
+             { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+             { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+             { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+             { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+             { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+             { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+             { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+             { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+             { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+             { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+         },
+     },
+     {
+         LLM_ARCH_QWEN3MOE,
+         {
+             { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+             { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+             { LLM_TENSOR_OUTPUT, "output" },
+             { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+             { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+             { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+             { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+             { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+             { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+             { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+             { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+             { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+             { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+             { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+             { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+         },
+     },
      {
          LLM_ARCH_PHI2,
          {
@@ -1027,6 +1125,8 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
              { LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" },
              { LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" },
              { LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" },
+             { LLM_TENSOR_ATTN_K_B, "blk.%d.attn_k_b" },
+             { LLM_TENSOR_ATTN_V_B, "blk.%d.attn_v_b" },
              { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
              { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
              { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
@@ -1043,6 +1143,22 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
              { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
          },
      },
+     {
+         LLM_ARCH_PLM,
+         {
+             { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+             { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+             { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+             { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+             { LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" },
+             { LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" },
+             { LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" },
+             { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+             { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+             { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+         },
+     },
      {
          LLM_ARCH_CHATGLM,
          {
@@ -1061,6 +1177,25 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
              { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
          },
      },
+     {
+         LLM_ARCH_GLM4,
+         {
+             { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+             { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+             { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+             { LLM_TENSOR_OUTPUT, "output" },
+             { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+             { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+             { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+             { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+             { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+             { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+             { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+             { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+             { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
+         },
+     },
      {
          LLM_ARCH_BITNET,
          {
@@ -1346,6 +1481,9 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
              { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
              { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
              { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+             { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+             { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+             { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
          },
      },
      {
@@ -1392,6 +1530,29 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
              { LLM_TENSOR_POS_NET_ATTN_OUT, "posnet.%d.attn_output" },
          },
      },
+     {
+         LLM_ARCH_BAILINGMOE,
+         {
+             { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+             { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+             { LLM_TENSOR_OUTPUT, "output" },
+             { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+             { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+             { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+             { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+             { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+             { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+             { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+             { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+             { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+             { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+             { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+             { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
+             { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+             { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+             { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+         },
+     },
      {
          LLM_ARCH_UNKNOWN,
          {
@@ -1429,23 +1590,8 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
      {LLM_TENSOR_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
      {LLM_TENSOR_ATTN_KV_A_MQA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
      {LLM_TENSOR_ATTN_KV_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-     {LLM_TENSOR_DEC_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-     {LLM_TENSOR_DEC_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-     {LLM_TENSOR_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-     {LLM_TENSOR_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-     {LLM_TENSOR_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-     {LLM_TENSOR_ATTN_QKV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-     {LLM_TENSOR_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-     {LLM_TENSOR_FFN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-     {LLM_TENSOR_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-     {LLM_TENSOR_FFN_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-     {LLM_TENSOR_FFN_DOWN_SHEXP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-     {LLM_TENSOR_FFN_GATE_SHEXP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-     {LLM_TENSOR_FFN_UP_SHEXP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-     {LLM_TENSOR_ATTN_Q_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-     {LLM_TENSOR_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-     {LLM_TENSOR_ATTN_KV_A_MQA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-     {LLM_TENSOR_ATTN_KV_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+     {LLM_TENSOR_ATTN_K_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+     {LLM_TENSOR_ATTN_V_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
      {LLM_TENSOR_DEC_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
      {LLM_TENSOR_DEC_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
      {LLM_TENSOR_DEC_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
@@ -10,6 +10,7 @@

  enum llm_arch {
      LLM_ARCH_LLAMA,
+     LLM_ARCH_LLAMA4,
      LLM_ARCH_DECI,
      LLM_ARCH_FALCON,
      LLM_ARCH_BAICHUAN,
@@ -22,6 +23,7 @@ enum llm_arch {
      LLM_ARCH_REFACT,
      LLM_ARCH_BERT,
      LLM_ARCH_NOMIC_BERT,
+     LLM_ARCH_NOMIC_BERT_MOE,
      LLM_ARCH_JINA_BERT_V2,
      LLM_ARCH_BLOOM,
      LLM_ARCH_STABLELM,
@@ -29,6 +31,8 @@ enum llm_arch {
      LLM_ARCH_QWEN2,
      LLM_ARCH_QWEN2MOE,
      LLM_ARCH_QWEN2VL,
+     LLM_ARCH_QWEN3,
+     LLM_ARCH_QWEN3MOE,
      LLM_ARCH_PHI2,
      LLM_ARCH_PHI3,
      LLM_ARCH_PHIMOE,
@@ -55,6 +59,7 @@ enum llm_arch {
      LLM_ARCH_DEEPSEEK,
      LLM_ARCH_DEEPSEEK2,
      LLM_ARCH_CHATGLM,
+     LLM_ARCH_GLM4,
      LLM_ARCH_BITNET,
      LLM_ARCH_T5,
      LLM_ARCH_T5ENCODER,
@@ -69,6 +74,8 @@ enum llm_arch {
      LLM_ARCH_GRANITE_MOE,
      LLM_ARCH_CHAMELEON,
      LLM_ARCH_WAVTOKENIZER_DEC,
+     LLM_ARCH_PLM,
+     LLM_ARCH_BAILINGMOE,
      LLM_ARCH_UNKNOWN,
  };

@@ -77,6 +84,7 @@ enum llm_kv {
      LLM_KV_GENERAL_ARCHITECTURE,
      LLM_KV_GENERAL_QUANTIZATION_VERSION,
      LLM_KV_GENERAL_ALIGNMENT,
+     LLM_KV_GENERAL_FILE_TYPE,
      LLM_KV_GENERAL_NAME,
      LLM_KV_GENERAL_AUTHOR,
      LLM_KV_GENERAL_VERSION,
@@ -103,6 +111,7 @@ enum llm_kv {
      LLM_KV_EXPERT_WEIGHTS_SCALE,
      LLM_KV_EXPERT_WEIGHTS_NORM,
      LLM_KV_EXPERT_GATING_FUNC,
+     LLM_KV_MOE_EVERY_N_LAYERS,
      LLM_KV_POOLING_TYPE,
      LLM_KV_LOGIT_SCALE,
      LLM_KV_DECODER_START_TOKEN_ID,
@@ -115,6 +124,7 @@ enum llm_kv {
      LLM_KV_RESIDUAL_SCALE,
      LLM_KV_EMBEDDING_SCALE,
      LLM_KV_TOKEN_SHIFT_COUNT,
+     LLM_KV_INTERLEAVE_MOE_LAYER_STEP,

      LLM_KV_ATTENTION_HEAD_COUNT,
      LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -136,6 +146,8 @@ enum llm_kv {
      LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
      LLM_KV_ATTENTION_SLIDING_WINDOW,
      LLM_KV_ATTENTION_SCALE,
+     LLM_KV_ATTENTION_KEY_LENGTH_MLA,
+     LLM_KV_ATTENTION_VALUE_LENGTH_MLA,

      LLM_KV_ROPE_DIMENSION_COUNT,
      LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -249,6 +261,8 @@ enum llm_tensor {
      LLM_TENSOR_ATTN_Q_NORM,
      LLM_TENSOR_ATTN_K_NORM,
      LLM_TENSOR_LAYER_OUT_NORM,
+     LLM_TENSOR_POST_ATTN_NORM,
+     LLM_TENSOR_POST_MLP_NORM,
      LLM_TENSOR_SSM_IN,
      LLM_TENSOR_SSM_CONV1D,
      LLM_TENSOR_SSM_X,
@@ -296,6 +310,8 @@ enum llm_tensor {
      LLM_TENSOR_ATTN_Q_B,
      LLM_TENSOR_ATTN_KV_A_MQA,
      LLM_TENSOR_ATTN_KV_B,
+     LLM_TENSOR_ATTN_K_B,
+     LLM_TENSOR_ATTN_V_B,
      LLM_TENSOR_ATTN_Q_A_NORM,
      LLM_TENSOR_ATTN_KV_A_NORM,
      LLM_TENSOR_ATTN_SUB_NORM,
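Note: taken together, the llama-arch.cpp and llama-arch.h hunks above follow the usual three-step pattern for wiring a new architecture into llama.cpp: an llm_arch enum value (llama-arch.h), a name entry in LLM_ARCH_NAMES, and a per-arch table in LLM_TENSOR_NAMES whose values are printf-style templates formatted with the block index. A tiny sketch of how such a template becomes a concrete GGUF tensor name (illustrative only, not code from this diff):

#include <cstdio>
#include <string>

// Format a tensor-name template such as "blk.%d.attn_q_norm" for block `blk`.
static std::string tensor_name(const char * templ, int blk) {
    char buf[128];
    std::snprintf(buf, sizeof(buf), templ, blk);
    return buf;
}

int main() {
    // e.g. the QWEN3 entry { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }
    std::printf("%s\n", tensor_name("blk.%d.attn_q_norm", 7).c_str()); // prints blk.7.attn_q_norm
}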