cui-llama.rn 1.6.0 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (285)
  1. package/README.md +35 -7
  2. package/android/src/main/CMakeLists.txt +22 -11
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +42 -6
  4. package/android/src/main/java/com/rnllama/RNLlama.java +139 -4
  5. package/android/src/main/jni.cpp +173 -18
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  11. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  14. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +24 -4
  15. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +22 -2
  16. package/cpp/LICENSE +21 -0
  17. package/cpp/chat.cpp +129 -107
  18. package/cpp/chat.h +2 -0
  19. package/cpp/common.cpp +58 -78
  20. package/cpp/common.h +29 -21
  21. package/cpp/ggml-alloc.c +4 -1
  22. package/cpp/ggml-backend.cpp +9 -5
  23. package/cpp/ggml-backend.h +4 -4
  24. package/cpp/ggml-cpp.h +1 -1
  25. package/cpp/ggml-cpu/amx/amx.cpp +221 -0
  26. package/cpp/ggml-cpu/amx/amx.h +8 -0
  27. package/cpp/ggml-cpu/amx/common.h +91 -0
  28. package/cpp/ggml-cpu/amx/mmq.cpp +2511 -0
  29. package/cpp/ggml-cpu/amx/mmq.h +10 -0
  30. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/binary-ops.h +1 -1
  31. package/cpp/ggml-cpu/common.h +72 -0
  32. package/cpp/{ggml-cpu-aarch64.cpp → ggml-cpu/ggml-cpu-aarch64.cpp} +809 -103
  33. package/cpp/{ggml-cpu-quants.c → ggml-cpu/ggml-cpu-quants.c} +306 -6
  34. package/cpp/{ggml-cpu.c → ggml-cpu/ggml-cpu.c} +114 -55
  35. package/cpp/{ggml-cpu.cpp → ggml-cpu/ggml-cpu.cpp} +32 -16
  36. package/cpp/{ops.cpp → ggml-cpu/ops.cpp} +353 -173
  37. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/ops.h +2 -20
  38. package/cpp/{sgemm.cpp → ggml-cpu/sgemm.cpp} +501 -0
  39. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/simd-mappings.h +7 -3
  40. package/{ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers → cpp/ggml-cpu}/unary-ops.h +1 -1
  41. package/cpp/{vec.cpp → ggml-cpu/vec.cpp} +0 -6
  42. package/{ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers → cpp/ggml-cpu}/vec.h +16 -0
  43. package/cpp/ggml-cpu.h +5 -0
  44. package/cpp/ggml-impl.h +16 -9
  45. package/cpp/ggml-llama-sim.metallib +0 -0
  46. package/cpp/ggml-llama.metallib +0 -0
  47. package/cpp/ggml-metal-impl.h +36 -11
  48. package/cpp/ggml-metal.m +810 -176
  49. package/cpp/ggml-opt.cpp +373 -190
  50. package/cpp/ggml-opt.h +49 -28
  51. package/cpp/ggml-quants.c +0 -6
  52. package/cpp/ggml.c +227 -282
  53. package/cpp/ggml.h +82 -101
  54. package/cpp/gguf.cpp +33 -33
  55. package/cpp/json-schema-to-grammar.cpp +3 -0
  56. package/cpp/llama-adapter.cpp +6 -0
  57. package/cpp/llama-arch.cpp +49 -17
  58. package/cpp/llama-arch.h +9 -0
  59. package/cpp/llama-batch.cpp +8 -2
  60. package/cpp/llama-batch.h +2 -1
  61. package/cpp/llama-chat.cpp +39 -16
  62. package/cpp/llama-chat.h +4 -2
  63. package/cpp/llama-context.cpp +440 -611
  64. package/cpp/llama-context.h +44 -33
  65. package/cpp/llama-cparams.h +1 -0
  66. package/cpp/llama-graph.cpp +214 -291
  67. package/cpp/llama-graph.h +69 -21
  68. package/cpp/llama-hparams.cpp +17 -1
  69. package/cpp/llama-hparams.h +39 -5
  70. package/cpp/llama-kv-cache.cpp +2067 -620
  71. package/cpp/llama-kv-cache.h +410 -108
  72. package/cpp/llama-memory.h +12 -1
  73. package/cpp/llama-model-loader.cpp +24 -15
  74. package/cpp/llama-model-saver.cpp +281 -0
  75. package/cpp/llama-model-saver.h +37 -0
  76. package/cpp/llama-model.cpp +1089 -359
  77. package/cpp/llama-model.h +19 -3
  78. package/cpp/llama-sampling.cpp +20 -7
  79. package/cpp/llama-vocab.cpp +54 -9
  80. package/cpp/llama-vocab.h +6 -0
  81. package/cpp/llama.cpp +14 -0
  82. package/cpp/llama.h +86 -142
  83. package/cpp/minja/chat-template.hpp +9 -5
  84. package/cpp/minja/minja.hpp +69 -36
  85. package/cpp/rn-llama.cpp +602 -190
  86. package/cpp/rn-llama.h +34 -8
  87. package/cpp/sampling.cpp +57 -50
  88. package/cpp/tools/mtmd/clip-impl.h +462 -0
  89. package/cpp/tools/mtmd/clip.cpp +4024 -0
  90. package/cpp/tools/mtmd/clip.h +101 -0
  91. package/cpp/tools/mtmd/miniaudio.h +93468 -0
  92. package/cpp/tools/mtmd/mtmd-audio.cpp +855 -0
  93. package/cpp/tools/mtmd/mtmd-audio.h +62 -0
  94. package/cpp/tools/mtmd/mtmd-helper.cpp +297 -0
  95. package/cpp/tools/mtmd/mtmd.cpp +942 -0
  96. package/cpp/tools/mtmd/mtmd.h +362 -0
  97. package/cpp/tools/mtmd/stb_image.h +7988 -0
  98. package/ios/CMakeLists.txt +20 -10
  99. package/ios/RNLlama.h +6 -0
  100. package/ios/RNLlama.mm +82 -3
  101. package/ios/RNLlamaContext.h +5 -1
  102. package/ios/RNLlamaContext.mm +131 -38
  103. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +2 -0
  104. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +29 -21
  105. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-backend.h +4 -4
  106. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpp.h +1 -1
  107. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +5 -0
  108. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +16 -9
  109. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  110. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-opt.h +49 -28
  111. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +82 -101
  112. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +9 -0
  113. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +2 -1
  114. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +4 -2
  115. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +44 -33
  116. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +1 -0
  117. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +69 -21
  118. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +39 -5
  119. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +410 -108
  120. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +12 -1
  121. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model-saver.h +37 -0
  122. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +19 -3
  123. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +6 -0
  124. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +86 -142
  125. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  126. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +69 -36
  127. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +34 -8
  128. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Info.plist +0 -0
  129. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  130. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
  131. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +2 -0
  132. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +29 -21
  133. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend.h +4 -4
  134. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +1 -1
  135. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +5 -0
  136. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +16 -9
  137. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  138. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-opt.h +49 -28
  139. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +82 -101
  140. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +9 -0
  141. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +2 -1
  142. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +4 -2
  143. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +44 -33
  144. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +1 -0
  145. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +69 -21
  146. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +39 -5
  147. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +410 -108
  148. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +12 -1
  149. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model-saver.h +37 -0
  150. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +19 -3
  151. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +6 -0
  152. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +86 -142
  153. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  154. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +69 -36
  155. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +34 -8
  156. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Info.plist +0 -0
  157. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/_CodeSignature/CodeResources +1 -1
  158. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  159. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  160. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +2 -0
  161. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +29 -21
  162. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-backend.h +4 -4
  163. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpp.h +1 -1
  164. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +5 -0
  165. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +16 -9
  166. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  167. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-opt.h +49 -28
  168. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +82 -101
  169. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +9 -0
  170. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +2 -1
  171. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +4 -2
  172. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +44 -33
  173. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +1 -0
  174. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +69 -21
  175. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +39 -5
  176. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +410 -108
  177. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +12 -1
  178. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model-saver.h +37 -0
  179. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +19 -3
  180. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +6 -0
  181. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +86 -142
  182. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  183. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +69 -36
  184. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +34 -8
  185. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Info.plist +0 -0
  186. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  187. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
  188. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +2 -0
  189. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +29 -21
  190. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend.h +4 -4
  191. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +1 -1
  192. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +5 -0
  193. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +16 -9
  194. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  195. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-opt.h +49 -28
  196. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +82 -101
  197. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +9 -0
  198. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +2 -1
  199. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +4 -2
  200. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +44 -33
  201. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +1 -0
  202. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +69 -21
  203. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +39 -5
  204. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +410 -108
  205. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +12 -1
  206. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model-saver.h +37 -0
  207. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +19 -3
  208. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +6 -0
  209. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +86 -142
  210. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  211. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +69 -36
  212. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +34 -8
  213. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Info.plist +0 -0
  214. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/_CodeSignature/CodeResources +1 -1
  215. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  216. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  217. package/jest/mock.js +33 -7
  218. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  219. package/lib/commonjs/index.js +153 -21
  220. package/lib/commonjs/index.js.map +1 -1
  221. package/lib/module/NativeRNLlama.js.map +1 -1
  222. package/lib/module/index.js +152 -20
  223. package/lib/module/index.js.map +1 -1
  224. package/lib/typescript/NativeRNLlama.d.ts +54 -4
  225. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  226. package/lib/typescript/index.d.ts +72 -6
  227. package/lib/typescript/index.d.ts.map +1 -1
  228. package/package.json +1 -1
  229. package/src/NativeRNLlama.ts +72 -4
  230. package/src/index.ts +212 -38
  231. package/cpp/binary-ops.h +0 -16
  232. package/cpp/ops.h +0 -128
  233. package/cpp/simd-mappings.h +0 -888
  234. package/cpp/unary-ops.h +0 -28
  235. package/cpp/vec.h +0 -802
  236. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/binary-ops.h +0 -16
  237. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  238. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  239. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  240. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  241. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ops.h +0 -128
  242. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/sgemm.h +0 -14
  243. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/simd-mappings.h +0 -888
  244. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/vec.h +0 -802
  245. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  246. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  247. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  248. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  249. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +0 -14
  250. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +0 -28
  251. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +0 -802
  252. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/binary-ops.h +0 -16
  253. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  254. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  255. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  256. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  257. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ops.h +0 -128
  258. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/sgemm.h +0 -14
  259. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/simd-mappings.h +0 -888
  260. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/unary-ops.h +0 -28
  261. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/binary-ops.h +0 -16
  262. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  263. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  264. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  265. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  266. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ops.h +0 -128
  267. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +0 -14
  268. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/simd-mappings.h +0 -888
  269. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +0 -28
  270. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +0 -802
  271. package/lib/commonjs/chat.js +0 -37
  272. package/lib/commonjs/chat.js.map +0 -1
  273. package/lib/module/chat.js +0 -33
  274. package/lib/module/chat.js.map +0 -1
  275. package/lib/typescript/chat.d.ts +0 -10
  276. package/lib/typescript/chat.d.ts.map +0 -1
  277. package/src/chat.ts +0 -44
  278. /package/cpp/{binary-ops.cpp → ggml-cpu/binary-ops.cpp} +0 -0
  279. /package/cpp/{ggml-cpu-aarch64.h → ggml-cpu/ggml-cpu-aarch64.h} +0 -0
  280. /package/cpp/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -0
  281. /package/cpp/{ggml-cpu-quants.h → ggml-cpu/ggml-cpu-quants.h} +0 -0
  282. /package/cpp/{ggml-cpu-traits.cpp → ggml-cpu/ggml-cpu-traits.cpp} +0 -0
  283. /package/cpp/{ggml-cpu-traits.h → ggml-cpu/ggml-cpu-traits.h} +0 -0
  284. /package/cpp/{sgemm.h → ggml-cpu/sgemm.h} +0 -0
  285. /package/cpp/{unary-ops.cpp → ggml-cpu/unary-ops.cpp} +0 -0
package/cpp/llama-arch.cpp CHANGED
@@ -19,6 +19,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_REFACT, "refact" },
     { LLM_ARCH_BERT, "bert" },
     { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
+    { LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" },
     { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
     { LLM_ARCH_BLOOM, "bloom" },
     { LLM_ARCH_STABLELM, "stablelm" },
@@ -54,6 +55,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_DEEPSEEK, "deepseek" },
     { LLM_ARCH_DEEPSEEK2, "deepseek2" },
     { LLM_ARCH_CHATGLM, "chatglm" },
+    { LLM_ARCH_GLM4, "glm4" },
     { LLM_ARCH_BITNET, "bitnet" },
     { LLM_ARCH_T5, "t5" },
     { LLM_ARCH_T5ENCODER, "t5encoder" },
@@ -105,6 +107,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
     { LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" },
     { LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" },
+    { LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" },
     { LLM_KV_POOLING_TYPE, "%s.pooling_type" },
     { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
     { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
@@ -139,6 +142,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
     { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
     { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+    { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
+    { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },

     { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
     { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
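The "%s" placeholder in these metadata keys is substituted with the architecture name when GGUF metadata is read, following the usual llama.cpp convention. A minimal sketch of how the new MLA keys resolve; the "deepseek2" value is just an illustrative architecture name:

    #include <cstdio>

    int main() {
        // LLM_KV_NAMES entries are printf-style patterns: the architecture
        // name fills "%s", so each key resolves per model architecture.
        char key[128];
        std::snprintf(key, sizeof(key), "%s.attention.key_length_mla", "deepseek2");
        std::printf("%s\n", key); // prints: deepseek2.attention.key_length_mla
        return 0;
    }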
@@ -469,6 +474,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_NOMIC_BERT_MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_TOKEN_TYPES, "token_types" },
+            { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_JINA_BERT_V2,
         {
@@ -1102,6 +1125,8 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" },
             { LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" },
             { LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" },
+            { LLM_TENSOR_ATTN_K_B, "blk.%d.attn_k_b" },
+            { LLM_TENSOR_ATTN_V_B, "blk.%d.attn_v_b" },
             { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
             { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
             { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
@@ -1152,6 +1177,25 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
         },
     },
+    {
+        LLM_ARCH_GLM4,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
+        },
+    },
     {
         LLM_ARCH_BITNET,
         {
@@ -1437,6 +1481,9 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
             { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
             { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
         },
     },
     {
@@ -1543,23 +1590,8 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_KV_A_MQA, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_KV_B, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_DEC_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_DEC_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_QKV, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_GATE, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_UP, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_DOWN_SHEXP, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_GATE_SHEXP, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_UP_SHEXP, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_Q_A, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_KV_A_MQA, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_KV_B, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_K_B, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_V_B, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
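In the per-layer tensor maps above, the "%d" placeholder is the block (layer) index. A small sketch, assuming the usual llama.cpp naming convention, of how the new DeepSeek2 MLA tensor names expand for the first two layers:

    #include <cstdio>

    int main() {
        // "blk.%d.attn_k_b" and "blk.%d.attn_v_b" expand once per layer.
        const char * fmts[] = { "blk.%d.attn_k_b", "blk.%d.attn_v_b" };
        for (int il = 0; il < 2; il++) {
            for (const char * fmt : fmts) {
                char name[64];
                std::snprintf(name, sizeof(name), fmt, il);
                std::printf("%s\n", name); // blk.0.attn_k_b, blk.0.attn_v_b, ...
            }
        }
        return 0;
    }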
package/cpp/llama-arch.h CHANGED
@@ -23,6 +23,7 @@ enum llm_arch {
     LLM_ARCH_REFACT,
     LLM_ARCH_BERT,
     LLM_ARCH_NOMIC_BERT,
+    LLM_ARCH_NOMIC_BERT_MOE,
     LLM_ARCH_JINA_BERT_V2,
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,
@@ -58,6 +59,7 @@ enum llm_arch {
     LLM_ARCH_DEEPSEEK,
     LLM_ARCH_DEEPSEEK2,
     LLM_ARCH_CHATGLM,
+    LLM_ARCH_GLM4,
     LLM_ARCH_BITNET,
     LLM_ARCH_T5,
     LLM_ARCH_T5ENCODER,
@@ -109,6 +111,7 @@ enum llm_kv {
     LLM_KV_EXPERT_WEIGHTS_SCALE,
     LLM_KV_EXPERT_WEIGHTS_NORM,
     LLM_KV_EXPERT_GATING_FUNC,
+    LLM_KV_MOE_EVERY_N_LAYERS,
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,
     LLM_KV_DECODER_START_TOKEN_ID,
@@ -143,6 +146,8 @@ enum llm_kv {
     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
     LLM_KV_ATTENTION_SLIDING_WINDOW,
     LLM_KV_ATTENTION_SCALE,
+    LLM_KV_ATTENTION_KEY_LENGTH_MLA,
+    LLM_KV_ATTENTION_VALUE_LENGTH_MLA,

     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -256,6 +261,8 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_Q_NORM,
     LLM_TENSOR_ATTN_K_NORM,
     LLM_TENSOR_LAYER_OUT_NORM,
+    LLM_TENSOR_POST_ATTN_NORM,
+    LLM_TENSOR_POST_MLP_NORM,
     LLM_TENSOR_SSM_IN,
     LLM_TENSOR_SSM_CONV1D,
     LLM_TENSOR_SSM_X,
@@ -303,6 +310,8 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_Q_B,
     LLM_TENSOR_ATTN_KV_A_MQA,
     LLM_TENSOR_ATTN_KV_B,
+    LLM_TENSOR_ATTN_K_B,
+    LLM_TENSOR_ATTN_V_B,
     LLM_TENSOR_ATTN_Q_A_NORM,
     LLM_TENSOR_ATTN_KV_A_NORM,
     LLM_TENSOR_ATTN_SUB_NORM,
package/cpp/llama-batch.cpp CHANGED
@@ -1,5 +1,6 @@
 #include "llama-batch.h"

+#include <cassert>
 #include <cstring>
 #include <algorithm>

@@ -189,7 +190,7 @@ llama_ubatch llama_sbatch::split_seq(size_t n_ubatch) {
     return ubatch;
 }

-void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
+llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
     LM_GGML_ASSERT(batch.n_tokens >= 0);
     this->batch = &batch;
     this->n_embd = n_embd;
@@ -203,6 +204,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
     for (size_t i = 0; i < n_tokens; ++i) {
         ids[i] = i;
     }
+
     if (simple_split) {
         seq.resize(1);
         llama_sbatch_seq & s = seq[0];
@@ -212,6 +214,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
         s.length = n_tokens;
         return;
     }
+
     std::sort(ids.begin(), ids.end(),
         [&batch](size_t a, size_t b) {
             int32_t n_seq_a = batch.n_seq_id ? batch.n_seq_id[a] : 1;
@@ -239,6 +242,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
             return n_seq_a > n_seq_b;
         }
     );
+
     // init seq
     llama_sbatch_seq * last_seq = nullptr;

@@ -262,6 +266,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
         seq.push_back(new_seq);
         last_seq = &seq.back();
     }
+
     // keep shared prompts first at the end, then sort by length descending.
     std::sort(seq.begin(), seq.end(),
         [](llama_sbatch_seq & a, llama_sbatch_seq & b) {
@@ -277,9 +282,10 @@ llama_batch_allocr::llama_batch_allocr(struct llama_batch in_batch, llama_pos p0
     batch = in_batch;
     LM_GGML_ASSERT(batch.n_tokens > 0);
     if (!batch.pos) {
+        assert(p0 >= 0);
         pos.resize(batch.n_tokens);
         for (int32_t i = 0; i < batch.n_tokens; i++) {
-            pos[i] = i + p0;
+            pos[i] = p0 + i;
         }
         batch.pos = pos.data();
     }
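A self-contained sketch of the position-defaulting rule above, using a hypothetical helper that is not part of the library: when batch.pos is null, token i receives position p0 + i, and the new assert rejects a negative p0.

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Hypothetical illustration of llama_batch_allocr's defaulting logic.
    std::vector<int32_t> default_positions(int32_t n_tokens, int32_t p0) {
        assert(p0 >= 0); // new in 1.7.0: implicit positions require p0 >= 0
        std::vector<int32_t> pos(n_tokens);
        for (int32_t i = 0; i < n_tokens; i++) {
            pos[i] = p0 + i; // e.g. p0 = 100 -> {100, 101, 102, ...}
        }
        return pos;
    }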
package/cpp/llama-batch.h CHANGED
@@ -70,7 +70,8 @@ struct llama_sbatch {
     // sequence-wise split
     llama_ubatch split_seq(size_t n_ubatch);

-    void from_batch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
+    llama_sbatch() = default;
+    llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
 };

 // temporary allocate memory for the input batch if needed
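For callers inside the library, the two-step from_batch initialization becomes a single construction. A minimal before/after sketch, assuming a batch and n_embd already in scope (the defaulted llama_sbatch() keeps plain member declarations valid):

    // 1.6.0: default-construct, then populate
    //   llama_sbatch sbatch;
    //   sbatch.from_batch(batch, n_embd, /*simple_split=*/true, /*logits_all=*/false);

    // 1.7.0: construct directly from the batch
    llama_sbatch sbatch(batch, n_embd, /*simple_split=*/true, /*logits_all=*/false);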
package/cpp/llama-chat.cpp CHANGED
@@ -35,6 +35,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "mistral-v3", LLM_CHAT_TEMPLATE_MISTRAL_V3 },
     { "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
     { "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 },
+    { "mistral-v7-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN },
     { "phi3", LLM_CHAT_TEMPLATE_PHI_3 },
     { "phi4", LLM_CHAT_TEMPLATE_PHI_4 },
     { "falcon3", LLM_CHAT_TEMPLATE_FALCON_3 },
@@ -50,8 +51,8 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "deepseek3", LLM_CHAT_TEMPLATE_DEEPSEEK_3 },
     { "command-r", LLM_CHAT_TEMPLATE_COMMAND_R },
     { "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 },
-    { "chatglm3", LLM_CHAT_TEMPLATE_CHATGML_3 },
-    { "chatglm4", LLM_CHAT_TEMPLATE_CHATGML_4 },
+    { "chatglm3", LLM_CHAT_TEMPLATE_CHATGLM_3 },
+    { "chatglm4", LLM_CHAT_TEMPLATE_CHATGLM_4 },
     { "glmedge", LLM_CHAT_TEMPLATE_GLMEDGE },
     { "minicpm", LLM_CHAT_TEMPLATE_MINICPM },
     { "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
@@ -62,6 +63,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "yandex", LLM_CHAT_TEMPLATE_YANDEX },
     { "bailing", LLM_CHAT_TEMPLATE_BAILING },
     { "llama4", LLM_CHAT_TEMPLATE_LLAMA4 },
+    { "smolvlm", LLM_CHAT_TEMPLATE_SMOLVLM },
 };

 llm_chat_template llm_chat_template_from_str(const std::string & name) {
@@ -81,7 +83,9 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
     if (tmpl_contains("<|im_start|>")) {
         return tmpl_contains("<|im_sep|>")
             ? LLM_CHAT_TEMPLATE_PHI_4
-            : LLM_CHAT_TEMPLATE_CHATML;
+            : tmpl_contains("<end_of_utterance>")
+                ? LLM_CHAT_TEMPLATE_SMOLVLM // SmolVLM uses <|im_start|> as BOS, but it is NOT chatml
+                : LLM_CHAT_TEMPLATE_CHATML;
     } else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) {
         if (tmpl_contains("[SYSTEM_PROMPT]")) {
             return LLM_CHAT_TEMPLATE_MISTRAL_V7;
@@ -119,8 +123,12 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         }
     } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
         return LLM_CHAT_TEMPLATE_PHI_3;
+    } else if (tmpl_contains("[gMASK]<sop>")) {
+        return LLM_CHAT_TEMPLATE_CHATGLM_4;
     } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
         return tmpl_contains("</s>") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE;
+    } else if (tmpl_contains("<|{{ item['role'] }}|>") && tmpl_contains("<|begin_of_image|>")) {
+        return LLM_CHAT_TEMPLATE_GLMEDGE;
     } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
         return LLM_CHAT_TEMPLATE_ZEPHYR;
     } else if (tmpl_contains("bos_token + message['role']")) {
@@ -149,9 +157,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_LLAMA_3;
     } else if (tmpl_contains("[gMASK]sop")) {
         // chatglm3-6b
-        return LLM_CHAT_TEMPLATE_CHATGML_3;
-    } else if (tmpl_contains("[gMASK]<sop>")) {
-        return LLM_CHAT_TEMPLATE_CHATGML_4;
+        return LLM_CHAT_TEMPLATE_CHATGLM_3;
     } else if (tmpl_contains(LU8("<用户>"))) {
         // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
         return LLM_CHAT_TEMPLATE_MINICPM;
@@ -197,19 +203,20 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|im_start|>assistant\n";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7 || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN) {
         // Official mistral 'v7' template
         // See: https://huggingface.co/mistralai/Mistral-Large-Instruct-2411#basic-instruct-template-v7
+        //      https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503#basic-instruct-template-v7-tekken
+        const char * trailing_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7 ? " " : "";
         for (auto message : chat) {
             std::string role(message->role);
             std::string content(message->content);
             if (role == "system") {
-                ss << "[SYSTEM_PROMPT] " << content << "[/SYSTEM_PROMPT]";
+                ss << "[SYSTEM_PROMPT]" << trailing_space << content << "[/SYSTEM_PROMPT]";
             } else if (role == "user") {
-                ss << "[INST] " << content << "[/INST]";
-            }
-            else {
-                ss << " " << content << "</s>";
+                ss << "[INST]" << trailing_space << content << "[/INST]";
+            } else {
+                ss << trailing_space << content << "</s>";
             }
         }
     } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1
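The practical difference between the two v7 variants is only the space after the bracketed control tokens. Tracing the branch above with an illustrative system prompt "Be brief." and user message "Hi", the stream renders as:

    mistral-v7:        [SYSTEM_PROMPT] Be brief.[/SYSTEM_PROMPT][INST] Hi[/INST]
    mistral-v7-tekken: [SYSTEM_PROMPT]Be brief.[/SYSTEM_PROMPT][INST]Hi[/INST]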
@@ -432,7 +439,7 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_3) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_3) {
         // chatglm3-6b
         ss << "[gMASK]" << "sop";
         for (auto message : chat) {
@@ -442,14 +449,14 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|assistant|>";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_4) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4) {
         ss << "[gMASK]" << "<sop>";
         for (auto message : chat) {
             std::string role(message->role);
             ss << "<|" << role << "|>" << "\n" << message->content;
         }
         if (add_ass) {
-            ss << "<|assistant|>";
+            ss << "<|assistant|>\n";
         }
     } else if (tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
         for (auto message : chat) {
@@ -620,7 +627,23 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|header_start|>assistant<|header_end|>\n\n";
         }
-    } else {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_SMOLVLM) {
+        // SmolVLM
+        ss << "<|im_start|>"; // uses <|im_start|> as BOS, but the actual content is NOT chatml
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                ss << message->content << "\n\n";
+            } else if (role == "user") {
+                ss << "User: " << message->content << "<end_of_utterance>\n";
+            } else {
+                ss << "Assistant: " << message->content << "<end_of_utterance>\n";
+            }
+        }
+        if (add_ass) {
+            ss << "Assistant:";
+        }
+    } else {
         // template not supported
         return -1;
     }
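Tracing the new SmolVLM branch above for an illustrative system prompt and one user turn, with add_ass set, the stream renders as:

    <|im_start|>You are a helpful assistant.

    User: Describe this image.<end_of_utterance>
    Assistant: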
package/cpp/llama-chat.h CHANGED
@@ -14,6 +14,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_MISTRAL_V3,
     LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
     LLM_CHAT_TEMPLATE_MISTRAL_V7,
+    LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN,
     LLM_CHAT_TEMPLATE_PHI_3,
     LLM_CHAT_TEMPLATE_PHI_4,
     LLM_CHAT_TEMPLATE_FALCON_3,
@@ -29,8 +30,8 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_DEEPSEEK_3,
     LLM_CHAT_TEMPLATE_COMMAND_R,
     LLM_CHAT_TEMPLATE_LLAMA_3,
-    LLM_CHAT_TEMPLATE_CHATGML_3,
-    LLM_CHAT_TEMPLATE_CHATGML_4,
+    LLM_CHAT_TEMPLATE_CHATGLM_3,
+    LLM_CHAT_TEMPLATE_CHATGLM_4,
     LLM_CHAT_TEMPLATE_GLMEDGE,
     LLM_CHAT_TEMPLATE_MINICPM,
     LLM_CHAT_TEMPLATE_EXAONE_3,
@@ -41,6 +42,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_YANDEX,
     LLM_CHAT_TEMPLATE_BAILING,
     LLM_CHAT_TEMPLATE_LLAMA4,
+    LLM_CHAT_TEMPLATE_SMOLVLM,
     LLM_CHAT_TEMPLATE_UNKNOWN,
 };