cui-llama.rn 1.6.0 → 1.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (195)
  1. package/README.md +35 -7
  2. package/android/src/main/CMakeLists.txt +16 -11
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +4 -1
  4. package/android/src/main/jni.cpp +20 -4
  5. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  11. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  13. package/cpp/LICENSE +21 -0
  14. package/cpp/chat.cpp +1 -1
  15. package/cpp/common.cpp +17 -2
  16. package/cpp/common.h +7 -3
  17. package/cpp/ggml-alloc.c +4 -1
  18. package/cpp/ggml-cpp.h +1 -1
  19. package/cpp/ggml-cpu/amx/amx.cpp +221 -0
  20. package/cpp/ggml-cpu/amx/amx.h +8 -0
  21. package/cpp/ggml-cpu/amx/common.h +91 -0
  22. package/cpp/ggml-cpu/amx/mmq.cpp +2511 -0
  23. package/cpp/ggml-cpu/amx/mmq.h +10 -0
  24. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/binary-ops.h +1 -1
  25. package/cpp/ggml-cpu/common.h +72 -0
  26. package/cpp/{ggml-cpu-aarch64.cpp → ggml-cpu/ggml-cpu-aarch64.cpp} +809 -101
  27. package/cpp/{ggml-cpu.c → ggml-cpu/ggml-cpu.c} +109 -42
  28. package/cpp/{ggml-cpu.cpp → ggml-cpu/ggml-cpu.cpp} +3 -0
  29. package/cpp/{ops.cpp → ggml-cpu/ops.cpp} +246 -160
  30. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/ops.h +2 -20
  31. package/cpp/{sgemm.cpp → ggml-cpu/sgemm.cpp} +501 -0
  32. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/simd-mappings.h +7 -3
  33. package/{ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers → cpp/ggml-cpu}/unary-ops.h +1 -1
  34. package/cpp/ggml-cpu.h +5 -0
  35. package/cpp/ggml-impl.h +16 -9
  36. package/cpp/ggml-llama-sim.metallib +0 -0
  37. package/cpp/ggml-llama.metallib +0 -0
  38. package/cpp/ggml-metal.m +492 -47
  39. package/cpp/ggml.c +134 -244
  40. package/cpp/ggml.h +61 -94
  41. package/cpp/json-schema-to-grammar.cpp +3 -0
  42. package/cpp/llama-arch.cpp +46 -17
  43. package/cpp/llama-arch.h +9 -0
  44. package/cpp/llama-batch.cpp +5 -1
  45. package/cpp/llama-batch.h +2 -1
  46. package/cpp/llama-chat.cpp +31 -10
  47. package/cpp/llama-chat.h +3 -2
  48. package/cpp/llama-context.cpp +104 -489
  49. package/cpp/llama-context.h +14 -30
  50. package/cpp/llama-graph.cpp +69 -62
  51. package/cpp/llama-graph.h +21 -18
  52. package/cpp/llama-hparams.h +5 -0
  53. package/cpp/llama-kv-cache.cpp +1497 -391
  54. package/cpp/llama-kv-cache.h +272 -80
  55. package/cpp/llama-memory.h +11 -1
  56. package/cpp/llama-model.cpp +502 -176
  57. package/cpp/llama-model.h +13 -3
  58. package/cpp/llama-sampling.cpp +2 -1
  59. package/cpp/llama-vocab.cpp +8 -1
  60. package/cpp/llama.h +14 -11
  61. package/cpp/rn-llama.cpp +20 -172
  62. package/cpp/rn-llama.h +1 -5
  63. package/ios/CMakeLists.txt +13 -10
  64. package/ios/RNLlama.h +6 -0
  65. package/ios/RNLlama.mm +5 -0
  66. package/ios/RNLlamaContext.mm +26 -28
  67. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +7 -3
  68. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpp.h +1 -1
  69. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +5 -0
  70. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +16 -9
  71. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +61 -94
  72. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +9 -0
  73. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +2 -1
  74. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +3 -2
  75. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +14 -30
  76. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +21 -18
  77. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +5 -0
  78. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +272 -80
  79. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +11 -1
  80. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +13 -3
  81. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +14 -11
  82. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +1 -5
  83. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  84. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
  85. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +7 -3
  86. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +1 -1
  87. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +5 -0
  88. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +16 -9
  89. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +61 -94
  90. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +9 -0
  91. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +2 -1
  92. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +3 -2
  93. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +14 -30
  94. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +21 -18
  95. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +5 -0
  96. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +272 -80
  97. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +11 -1
  98. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +13 -3
  99. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +14 -11
  100. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +1 -5
  101. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  102. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  103. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +7 -3
  104. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpp.h +1 -1
  105. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +5 -0
  106. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +16 -9
  107. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +61 -94
  108. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +9 -0
  109. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +2 -1
  110. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +3 -2
  111. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +14 -30
  112. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +21 -18
  113. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +5 -0
  114. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +272 -80
  115. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +11 -1
  116. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +13 -3
  117. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +14 -11
  118. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +1 -5
  119. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  120. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
  121. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +7 -3
  122. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +1 -1
  123. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +5 -0
  124. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +16 -9
  125. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +61 -94
  126. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +9 -0
  127. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +2 -1
  128. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +3 -2
  129. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +14 -30
  130. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +21 -18
  131. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +5 -0
  132. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +272 -80
  133. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +11 -1
  134. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +13 -3
  135. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +14 -11
  136. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +1 -5
  137. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  138. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  139. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  140. package/lib/module/NativeRNLlama.js.map +1 -1
  141. package/lib/typescript/NativeRNLlama.d.ts +4 -0
  142. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  143. package/package.json +1 -1
  144. package/src/NativeRNLlama.ts +5 -0
  145. package/cpp/binary-ops.h +0 -16
  146. package/cpp/ops.h +0 -128
  147. package/cpp/simd-mappings.h +0 -888
  148. package/cpp/unary-ops.h +0 -28
  149. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/binary-ops.h +0 -16
  150. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  151. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  152. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  153. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  154. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ops.h +0 -128
  155. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/sgemm.h +0 -14
  156. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/simd-mappings.h +0 -888
  157. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/vec.h +0 -802
  158. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  159. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  160. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  161. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  162. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +0 -14
  163. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +0 -28
  164. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +0 -802
  165. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/binary-ops.h +0 -16
  166. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  167. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  168. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  169. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  170. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ops.h +0 -128
  171. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/sgemm.h +0 -14
  172. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/simd-mappings.h +0 -888
  173. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/unary-ops.h +0 -28
  174. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/vec.h +0 -802
  175. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/binary-ops.h +0 -16
  176. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  177. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  178. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  179. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  180. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ops.h +0 -128
  181. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +0 -14
  182. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/simd-mappings.h +0 -888
  183. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +0 -28
  184. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +0 -802
  185. /package/cpp/{binary-ops.cpp → ggml-cpu/binary-ops.cpp} +0 -0
  186. /package/cpp/{ggml-cpu-aarch64.h → ggml-cpu/ggml-cpu-aarch64.h} +0 -0
  187. /package/cpp/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -0
  188. /package/cpp/{ggml-cpu-quants.c → ggml-cpu/ggml-cpu-quants.c} +0 -0
  189. /package/cpp/{ggml-cpu-quants.h → ggml-cpu/ggml-cpu-quants.h} +0 -0
  190. /package/cpp/{ggml-cpu-traits.cpp → ggml-cpu/ggml-cpu-traits.cpp} +0 -0
  191. /package/cpp/{ggml-cpu-traits.h → ggml-cpu/ggml-cpu-traits.h} +0 -0
  192. /package/cpp/{sgemm.h → ggml-cpu/sgemm.h} +0 -0
  193. /package/cpp/{unary-ops.cpp → ggml-cpu/unary-ops.cpp} +0 -0
  194. /package/cpp/{vec.cpp → ggml-cpu/vec.cpp} +0 -0
  195. /package/cpp/{vec.h → ggml-cpu/vec.h} +0 -0
package/cpp/ggml.h CHANGED
@@ -394,8 +394,8 @@ extern "C" {
 
     // precision
     enum lm_ggml_prec {
-        LM_GGML_PREC_DEFAULT,
-        LM_GGML_PREC_F32,
+        LM_GGML_PREC_DEFAULT =  0, // stored as lm_ggml_tensor.op_params, 0 by default
+        LM_GGML_PREC_F32     = 10,
     };
 
     // model file types
@@ -482,6 +482,7 @@ extern "C" {
         LM_GGML_OP_CONV_TRANSPOSE_1D,
         LM_GGML_OP_IM2COL,
         LM_GGML_OP_IM2COL_BACK,
+        LM_GGML_OP_CONV_2D_DW,
         LM_GGML_OP_CONV_TRANSPOSE_2D,
         LM_GGML_OP_POOL_1D,
         LM_GGML_OP_POOL_2D,
@@ -508,17 +509,12 @@ extern "C" {
 
         LM_GGML_OP_UNARY,
 
-        LM_GGML_OP_MAP_UNARY,
-        LM_GGML_OP_MAP_BINARY,
-
-        LM_GGML_OP_MAP_CUSTOM1_F32,
-        LM_GGML_OP_MAP_CUSTOM2_F32,
-        LM_GGML_OP_MAP_CUSTOM3_F32,
-
         LM_GGML_OP_MAP_CUSTOM1,
         LM_GGML_OP_MAP_CUSTOM2,
         LM_GGML_OP_MAP_CUSTOM3,
 
+        LM_GGML_OP_CUSTOM,
+
         LM_GGML_OP_CROSS_ENTROPY_LOSS,
         LM_GGML_OP_CROSS_ENTROPY_LOSS_BACK,
         LM_GGML_OP_OPT_STEP_ADAMW,
@@ -683,6 +679,9 @@ extern "C" {
     LM_GGML_API bool lm_ggml_is_contiguous_1(const struct lm_ggml_tensor * tensor); // contiguous for dims >= 1
     LM_GGML_API bool lm_ggml_is_contiguous_2(const struct lm_ggml_tensor * tensor); // contiguous for dims >= 2
 
+    // true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
+    LM_GGML_API bool lm_ggml_is_contiguous_channels(const struct lm_ggml_tensor * tensor);
+
     LM_GGML_API bool lm_ggml_are_same_shape (const struct lm_ggml_tensor * t0, const struct lm_ggml_tensor * t1);
     LM_GGML_API bool lm_ggml_are_same_stride(const struct lm_ggml_tensor * t0, const struct lm_ggml_tensor * t1);
 
@@ -1666,7 +1665,7 @@ extern "C" {
             struct lm_ggml_tensor  * a,
             struct lm_ggml_tensor  * b);
 
-    // depthwise
+    // depthwise (via im2col and mul_mat)
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_2d_dw(
             struct lm_ggml_context * ctx,
             struct lm_ggml_tensor  * a,  // convolution kernel
@@ -1678,6 +1677,22 @@ extern "C" {
             int                      d0,  // dilation dimension 0
             int                      d1); // dilation dimension 1
 
+    // Depthwise 2D convolution
+    // may be faster than lm_ggml_conv_2d_dw, but not available in all backends
+    // a:   KW    KH    1    C    convolution kernel
+    // b:   W     H     C    N    input data
+    // res: W_out H_out C    N
+    LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_2d_dw_direct(
+            struct lm_ggml_context * ctx,
+            struct lm_ggml_tensor  * a,
+            struct lm_ggml_tensor  * b,
+            int                      stride0,
+            int                      stride1,
+            int                      pad0,
+            int                      pad1,
+            int                      dilation0,
+            int                      dilation1);
+
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_conv_transpose_2d_p0(
             struct lm_ggml_context * ctx,
             struct lm_ggml_tensor  * a,
@@ -1723,24 +1738,29 @@ extern "C" {
             float                    p0,
             float                    p1);
 
-    // nearest interpolate
+    enum lm_ggml_scale_mode {
+        LM_GGML_SCALE_MODE_NEAREST  = 0,
+        LM_GGML_SCALE_MODE_BILINEAR = 1,
+    };
+
+    // interpolate
     // multiplies ne0 and ne1 by scale factor
-    // used in stable-diffusion
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_upscale(
             struct lm_ggml_context * ctx,
            struct lm_ggml_tensor  * a,
-            int                      scale_factor);
+            int                      scale_factor,
+            enum lm_ggml_scale_mode  mode);
 
-    // nearest interpolate
-    // nearest interpolate to specified dimensions
-    // used in tortoise.cpp
+    // interpolate
+    // interpolate scale to specified dimensions
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_upscale_ext(
            struct lm_ggml_context * ctx,
            struct lm_ggml_tensor  * a,
            int                      ne0,
            int                      ne1,
            int                      ne2,
-            int                      ne3);
+            int                      ne3,
+            enum lm_ggml_scale_mode  mode);
 
     // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_pad(
@@ -1917,83 +1937,6 @@ extern "C" {
 
     // custom operators
 
-    typedef void (*lm_ggml_unary_op_f32_t) (const int, float *, const float *);
-    typedef void (*lm_ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
-
-    typedef void (*lm_ggml_custom1_op_f32_t)(struct lm_ggml_tensor *, const struct lm_ggml_tensor *);
-    typedef void (*lm_ggml_custom2_op_f32_t)(struct lm_ggml_tensor *, const struct lm_ggml_tensor *, const struct lm_ggml_tensor *);
-    typedef void (*lm_ggml_custom3_op_f32_t)(struct lm_ggml_tensor *, const struct lm_ggml_tensor *, const struct lm_ggml_tensor *, const struct lm_ggml_tensor *);
-
-    LM_GGML_DEPRECATED(LM_GGML_API struct lm_ggml_tensor * lm_ggml_map_unary_f32(
-            struct lm_ggml_context * ctx,
-            struct lm_ggml_tensor  * a,
-            lm_ggml_unary_op_f32_t   fun),
-        "use lm_ggml_map_custom1 instead");
-
-    LM_GGML_DEPRECATED(LM_GGML_API struct lm_ggml_tensor * lm_ggml_map_unary_inplace_f32(
-            struct lm_ggml_context * ctx,
-            struct lm_ggml_tensor  * a,
-            lm_ggml_unary_op_f32_t   fun),
-        "use lm_ggml_map_custom1_inplace instead");
-
-    LM_GGML_DEPRECATED(LM_GGML_API struct lm_ggml_tensor * lm_ggml_map_binary_f32(
-            struct lm_ggml_context * ctx,
-            struct lm_ggml_tensor  * a,
-            struct lm_ggml_tensor  * b,
-            lm_ggml_binary_op_f32_t  fun),
-        "use lm_ggml_map_custom2 instead");
-
-    LM_GGML_DEPRECATED(LM_GGML_API struct lm_ggml_tensor * lm_ggml_map_binary_inplace_f32(
-            struct lm_ggml_context * ctx,
-            struct lm_ggml_tensor  * a,
-            struct lm_ggml_tensor  * b,
-            lm_ggml_binary_op_f32_t  fun),
-        "use lm_ggml_map_custom2_inplace instead");
-
-    LM_GGML_DEPRECATED(LM_GGML_API struct lm_ggml_tensor * lm_ggml_map_custom1_f32(
-            struct lm_ggml_context   * ctx,
-            struct lm_ggml_tensor    * a,
-            lm_ggml_custom1_op_f32_t   fun),
-        "use lm_ggml_map_custom1 instead");
-
-    LM_GGML_DEPRECATED(LM_GGML_API struct lm_ggml_tensor * lm_ggml_map_custom1_inplace_f32(
-            struct lm_ggml_context   * ctx,
-            struct lm_ggml_tensor    * a,
-            lm_ggml_custom1_op_f32_t   fun),
-        "use lm_ggml_map_custom1_inplace instead");
-
-    LM_GGML_DEPRECATED(LM_GGML_API struct lm_ggml_tensor * lm_ggml_map_custom2_f32(
-            struct lm_ggml_context   * ctx,
-            struct lm_ggml_tensor    * a,
-            struct lm_ggml_tensor    * b,
-            lm_ggml_custom2_op_f32_t   fun),
-        "use lm_ggml_map_custom2 instead");
-
-    LM_GGML_DEPRECATED(LM_GGML_API struct lm_ggml_tensor * lm_ggml_map_custom2_inplace_f32(
-            struct lm_ggml_context   * ctx,
-            struct lm_ggml_tensor    * a,
-            struct lm_ggml_tensor    * b,
-            lm_ggml_custom2_op_f32_t   fun),
-        "use lm_ggml_map_custom2_inplace instead");
-
-    LM_GGML_DEPRECATED(LM_GGML_API struct lm_ggml_tensor * lm_ggml_map_custom3_f32(
-            struct lm_ggml_context   * ctx,
-            struct lm_ggml_tensor    * a,
-            struct lm_ggml_tensor    * b,
-            struct lm_ggml_tensor    * c,
-            lm_ggml_custom3_op_f32_t   fun),
-        "use lm_ggml_map_custom3 instead");
-
-    LM_GGML_DEPRECATED(LM_GGML_API struct lm_ggml_tensor * lm_ggml_map_custom3_inplace_f32(
-            struct lm_ggml_context   * ctx,
-            struct lm_ggml_tensor    * a,
-            struct lm_ggml_tensor    * b,
-            struct lm_ggml_tensor    * c,
-            lm_ggml_custom3_op_f32_t   fun),
-        "use lm_ggml_map_custom3_inplace instead");
-
-    // custom operators v2
-
     typedef void (*lm_ggml_custom1_op_t)(struct lm_ggml_tensor * dst , const struct lm_ggml_tensor * a, int ith, int nth, void * userdata);
     typedef void (*lm_ggml_custom2_op_t)(struct lm_ggml_tensor * dst , const struct lm_ggml_tensor * a, const struct lm_ggml_tensor * b, int ith, int nth, void * userdata);
     typedef void (*lm_ggml_custom3_op_t)(struct lm_ggml_tensor * dst , const struct lm_ggml_tensor * a, const struct lm_ggml_tensor * b, const struct lm_ggml_tensor * c, int ith, int nth, void * userdata);
@@ -2049,6 +1992,30 @@ extern "C" {
             int                     n_tasks,
             void                  * userdata);
 
+    typedef void (*lm_ggml_custom_op_t)(struct lm_ggml_tensor * dst , int ith, int nth, void * userdata);
+
+    LM_GGML_API struct lm_ggml_tensor * lm_ggml_custom_4d(
+            struct lm_ggml_context * ctx,
+            enum lm_ggml_type        type,
+            int64_t                  ne0,
+            int64_t                  ne1,
+            int64_t                  ne2,
+            int64_t                  ne3,
+            struct lm_ggml_tensor ** args,
+            int                      n_args,
+            lm_ggml_custom_op_t      fun,
+            int                      n_tasks,
+            void                   * userdata);
+
+    LM_GGML_API struct lm_ggml_tensor * lm_ggml_custom_inplace(
+            struct lm_ggml_context * ctx,
+            struct lm_ggml_tensor  * a,
+            struct lm_ggml_tensor ** args,
+            int                      n_args,
+            lm_ggml_custom_op_t      fun,
+            int                      n_tasks,
+            void                   * userdata);
+
     // loss function
 
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_cross_entropy_loss(
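Net effect for ggml.h consumers: the legacy map_*_f32 callbacks are removed in favor of a single lm_ggml_custom_op_t callback, and upscaling now takes an explicit scale mode. A minimal sketch of the new custom-op API, based only on the signatures above; the function names here are illustrative, and lm_ggml_nelements plus LM_GGML_N_TASKS_MAX are assumed from the unchanged parts of the ggml API:

// Sketch: one callback now computes the whole custom node. ith/nth identify
// the worker thread; userdata carries arbitrary caller state.
static void my_custom_fill(struct lm_ggml_tensor * dst, int ith, int nth, void * userdata) {
    const float value = *(const float *) userdata;
    float * out = (float *) dst->data;
    const int64_t n = lm_ggml_nelements(dst);    // assumed existing helper
    for (int64_t i = ith; i < n; i += nth) {     // each thread takes a strided slice
        out[i] = value;
    }
}

// Build a custom F32 node the same shape as src; args gives the callback
// access to its input tensors.
static struct lm_ggml_tensor * make_custom(struct lm_ggml_context * ctx,
                                           struct lm_ggml_tensor  * src,
                                           float                  * value) {
    struct lm_ggml_tensor * args[1] = { src };
    return lm_ggml_custom_4d(ctx, LM_GGML_TYPE_F32,
                             src->ne[0], src->ne[1], src->ne[2], src->ne[3],
                             args, 1, my_custom_fill, LM_GGML_N_TASKS_MAX, value);
}

Call sites of lm_ggml_upscale and lm_ggml_upscale_ext likewise need the new trailing argument, e.g. LM_GGML_SCALE_MODE_NEAREST to keep the old behavior.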
package/cpp/json-schema-to-grammar.cpp CHANGED
@@ -16,6 +16,9 @@ using json = nlohmann::ordered_json;
 static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
     auto has_max = max_items != std::numeric_limits<int>::max();
 
+    if (max_items == 0) {
+        return "";
+    }
     if (min_items == 0 && max_items == 1) {
         return item_rule + "?";
     }
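The added guard short-circuits schemas with "maxItems": 0, which previously fell through to the generic repetition builder. A hypothetical check of the two early returns (build_repetition is file-static, so assume it is exposed for testing):

#include <cassert>
#include <string>

static void check_build_repetition() {
    assert(build_repetition("item", 0, 0) == "");      // new: zero-width repetition collapses to an empty rule
    assert(build_repetition("item", 0, 1) == "item?"); // unchanged optional-item path
}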
package/cpp/llama-arch.cpp CHANGED
@@ -19,6 +19,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_REFACT,           "refact"           },
     { LLM_ARCH_BERT,             "bert"             },
     { LLM_ARCH_NOMIC_BERT,       "nomic-bert"       },
+    { LLM_ARCH_NOMIC_BERT_MOE,   "nomic-bert-moe"   },
     { LLM_ARCH_JINA_BERT_V2,     "jina-bert-v2"     },
     { LLM_ARCH_BLOOM,            "bloom"            },
     { LLM_ARCH_STABLELM,         "stablelm"         },
@@ -54,6 +55,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_DEEPSEEK,         "deepseek"         },
     { LLM_ARCH_DEEPSEEK2,        "deepseek2"        },
     { LLM_ARCH_CHATGLM,          "chatglm"          },
+    { LLM_ARCH_GLM4,             "glm4"             },
     { LLM_ARCH_BITNET,           "bitnet"           },
     { LLM_ARCH_T5,               "t5"               },
     { LLM_ARCH_T5ENCODER,        "t5encoder"        },
@@ -105,6 +107,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_WEIGHTS_SCALE,               "%s.expert_weights_scale"               },
     { LLM_KV_EXPERT_WEIGHTS_NORM,                "%s.expert_weights_norm"                },
     { LLM_KV_EXPERT_GATING_FUNC,                 "%s.expert_gating_func"                 },
+    { LLM_KV_MOE_EVERY_N_LAYERS,                 "%s.moe_every_n_layers"                 },
     { LLM_KV_POOLING_TYPE,                       "%s.pooling_type"                       },
     { LLM_KV_LOGIT_SCALE,                        "%s.logit_scale"                        },
     { LLM_KV_DECODER_START_TOKEN_ID,             "%s.decoder_start_token_id"             },
@@ -139,6 +142,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,   "%s.attention.relative_buckets_count"   },
     { LLM_KV_ATTENTION_SLIDING_WINDOW,           "%s.attention.sliding_window"           },
     { LLM_KV_ATTENTION_SCALE,                    "%s.attention.scale"                    },
+    { LLM_KV_ATTENTION_KEY_LENGTH_MLA,           "%s.attention.key_length_mla"           },
+    { LLM_KV_ATTENTION_VALUE_LENGTH_MLA,         "%s.attention.value_length_mla"         },
 
     { LLM_KV_ROPE_DIMENSION_COUNT,               "%s.rope.dimension_count"               },
     { LLM_KV_ROPE_DIMENSION_SECTIONS,            "%s.rope.dimension_sections"            },
@@ -469,6 +474,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_NOMIC_BERT_MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_TOKEN_TYPES,     "token_types" },
+            { LLM_TENSOR_ATTN_OUT_NORM,   "blk.%d.attn_output_norm" },
+            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_LAYER_OUT_NORM,  "blk.%d.layer_output_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_INP,    "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,   "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_JINA_BERT_V2,
         {
@@ -1102,6 +1125,8 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_ATTN_Q_B,        "blk.%d.attn_q_b" },
             { LLM_TENSOR_ATTN_KV_A_MQA,   "blk.%d.attn_kv_a_mqa" },
             { LLM_TENSOR_ATTN_KV_B,       "blk.%d.attn_kv_b" },
+            { LLM_TENSOR_ATTN_K_B,        "blk.%d.attn_k_b" },
+            { LLM_TENSOR_ATTN_V_B,        "blk.%d.attn_v_b" },
             { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
             { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
             { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
@@ -1152,6 +1177,25 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
         },
     },
+    {
+        LLM_ARCH_GLM4,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_ATTN_POST_NORM,  "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_FFN_POST_NORM,   "blk.%d.post_ffw_norm" },
+        },
+    },
     {
         LLM_ARCH_BITNET,
         {
@@ -1543,23 +1587,8 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_ATTN_Q_B,      {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_KV_A_MQA, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_KV_B,     {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_DEC_ATTN_Q,    {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_DEC_ATTN_K,    {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_Q,        {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_K,        {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_V,        {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_QKV,      {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_OUT,      {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_GATE,      {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_DOWN,      {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_UP,        {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_DOWN_SHEXP, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_GATE_SHEXP, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_UP_SHEXP,  {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_Q_A,      {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_Q_B,      {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_KV_A_MQA, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_KV_B,     {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_K_B,      {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_V_B,      {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_Q,    {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_K,    {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_V,    {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
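The LLM_KV_NAMES entries are printf-style patterns; %s is replaced with the architecture name at load time. An illustrative expansion for the new keys (format_kv_key is a hypothetical helper, architecture names taken from the table above):

#include <cstdio>

// e.g. LLM_KV_ATTENTION_KEY_LENGTH_MLA for a deepseek2 model becomes
// "deepseek2.attention.key_length_mla", and LLM_KV_MOE_EVERY_N_LAYERS for
// nomic-bert-moe becomes "nomic-bert-moe.moe_every_n_layers".
static void format_kv_key(char * buf, size_t len, const char * pattern, const char * arch) {
    std::snprintf(buf, len, pattern, arch);
}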
package/cpp/llama-arch.h CHANGED
@@ -23,6 +23,7 @@ enum llm_arch {
     LLM_ARCH_REFACT,
     LLM_ARCH_BERT,
     LLM_ARCH_NOMIC_BERT,
+    LLM_ARCH_NOMIC_BERT_MOE,
     LLM_ARCH_JINA_BERT_V2,
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,
@@ -58,6 +59,7 @@ enum llm_arch {
     LLM_ARCH_DEEPSEEK,
     LLM_ARCH_DEEPSEEK2,
     LLM_ARCH_CHATGLM,
+    LLM_ARCH_GLM4,
     LLM_ARCH_BITNET,
     LLM_ARCH_T5,
     LLM_ARCH_T5ENCODER,
@@ -109,6 +111,7 @@ enum llm_kv {
     LLM_KV_EXPERT_WEIGHTS_SCALE,
     LLM_KV_EXPERT_WEIGHTS_NORM,
     LLM_KV_EXPERT_GATING_FUNC,
+    LLM_KV_MOE_EVERY_N_LAYERS,
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,
     LLM_KV_DECODER_START_TOKEN_ID,
@@ -143,6 +146,8 @@ enum llm_kv {
     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
     LLM_KV_ATTENTION_SLIDING_WINDOW,
     LLM_KV_ATTENTION_SCALE,
+    LLM_KV_ATTENTION_KEY_LENGTH_MLA,
+    LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
 
     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -256,6 +261,8 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_Q_NORM,
     LLM_TENSOR_ATTN_K_NORM,
     LLM_TENSOR_LAYER_OUT_NORM,
+    LLM_TENSOR_POST_ATTN_NORM,
+    LLM_TENSOR_POST_MLP_NORM,
     LLM_TENSOR_SSM_IN,
     LLM_TENSOR_SSM_CONV1D,
     LLM_TENSOR_SSM_X,
@@ -303,6 +310,8 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_Q_B,
     LLM_TENSOR_ATTN_KV_A_MQA,
     LLM_TENSOR_ATTN_KV_B,
+    LLM_TENSOR_ATTN_K_B,
+    LLM_TENSOR_ATTN_V_B,
     LLM_TENSOR_ATTN_Q_A_NORM,
     LLM_TENSOR_ATTN_KV_A_NORM,
     LLM_TENSOR_ATTN_SUB_NORM,
package/cpp/llama-batch.cpp CHANGED
@@ -189,7 +189,7 @@ llama_ubatch llama_sbatch::split_seq(size_t n_ubatch) {
     return ubatch;
 }
 
-void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
+llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
     LM_GGML_ASSERT(batch.n_tokens >= 0);
     this->batch = &batch;
     this->n_embd = n_embd;
@@ -203,6 +203,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
     for (size_t i = 0; i < n_tokens; ++i) {
         ids[i] = i;
     }
+
     if (simple_split) {
         seq.resize(1);
         llama_sbatch_seq & s = seq[0];
@@ -212,6 +213,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
         s.length = n_tokens;
         return;
     }
+
     std::sort(ids.begin(), ids.end(),
         [&batch](size_t a, size_t b) {
             int32_t n_seq_a = batch.n_seq_id ? batch.n_seq_id[a] : 1;
@@ -239,6 +241,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
             return n_seq_a > n_seq_b;
         }
     );
+
     // init seq
     llama_sbatch_seq * last_seq = nullptr;
 
@@ -262,6 +265,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
         seq.push_back(new_seq);
         last_seq = &seq.back();
     }
+
     // keep shared prompts first at the end, then sort by length descending.
     std::sort(seq.begin(), seq.end(),
         [](llama_sbatch_seq & a, llama_sbatch_seq & b) {
package/cpp/llama-batch.h CHANGED
@@ -70,7 +70,8 @@ struct llama_sbatch {
     // sequence-wise split
     llama_ubatch split_seq(size_t n_ubatch);
 
-    void from_batch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
+    llama_sbatch() = default;
+    llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
 };
 
 // temporary allocate memory for the input batch if needed
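For code that builds sbatches, the from_batch() initializer becomes a constructor; a before/after sketch of a call site (variable names illustrative):

// 1.6.0: two-step init
llama_sbatch sbatch;
sbatch.from_batch(batch, n_embd, /*simple_split=*/true, /*logits_all=*/false);

// 1.6.1: single-step construction; the defaulted llama_sbatch() keeps
// default-constructed instances possible.
llama_sbatch sbatch2(batch, n_embd, /*simple_split=*/true, /*logits_all=*/false);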
package/cpp/llama-chat.cpp CHANGED
@@ -50,8 +50,8 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "deepseek3",          LLM_CHAT_TEMPLATE_DEEPSEEK_3  },
     { "command-r",          LLM_CHAT_TEMPLATE_COMMAND_R   },
     { "llama3",             LLM_CHAT_TEMPLATE_LLAMA_3     },
-    { "chatglm3",           LLM_CHAT_TEMPLATE_CHATGML_3   },
-    { "chatglm4",           LLM_CHAT_TEMPLATE_CHATGML_4   },
+    { "chatglm3",           LLM_CHAT_TEMPLATE_CHATGLM_3   },
+    { "chatglm4",           LLM_CHAT_TEMPLATE_CHATGLM_4   },
     { "glmedge",            LLM_CHAT_TEMPLATE_GLMEDGE     },
     { "minicpm",            LLM_CHAT_TEMPLATE_MINICPM     },
     { "exaone3",            LLM_CHAT_TEMPLATE_EXAONE_3    },
@@ -62,6 +62,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "yandex",             LLM_CHAT_TEMPLATE_YANDEX      },
     { "bailing",            LLM_CHAT_TEMPLATE_BAILING     },
     { "llama4",             LLM_CHAT_TEMPLATE_LLAMA4      },
+    { "smolvlm",            LLM_CHAT_TEMPLATE_SMOLVLM     },
 };
 
 llm_chat_template llm_chat_template_from_str(const std::string & name) {
@@ -81,7 +82,9 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
     if (tmpl_contains("<|im_start|>")) {
         return tmpl_contains("<|im_sep|>")
             ? LLM_CHAT_TEMPLATE_PHI_4
-            : LLM_CHAT_TEMPLATE_CHATML;
+            : tmpl_contains("<end_of_utterance>")
+                ? LLM_CHAT_TEMPLATE_SMOLVLM // SmolVLM uses <|im_start|> as BOS, but it is NOT chatml
+                : LLM_CHAT_TEMPLATE_CHATML;
     } else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) {
         if (tmpl_contains("[SYSTEM_PROMPT]")) {
             return LLM_CHAT_TEMPLATE_MISTRAL_V7;
@@ -119,8 +122,12 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         }
     } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
         return LLM_CHAT_TEMPLATE_PHI_3;
+    } else if (tmpl_contains("[gMASK]<sop>")) {
+        return LLM_CHAT_TEMPLATE_CHATGLM_4;
     } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
         return tmpl_contains("</s>") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE;
+    } else if (tmpl_contains("<|{{ item['role'] }}|>") && tmpl_contains("<|begin_of_image|>")) {
+        return LLM_CHAT_TEMPLATE_GLMEDGE;
     } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
         return LLM_CHAT_TEMPLATE_ZEPHYR;
     } else if (tmpl_contains("bos_token + message['role']")) {
@@ -149,9 +156,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_LLAMA_3;
     } else if (tmpl_contains("[gMASK]sop")) {
         // chatglm3-6b
-        return LLM_CHAT_TEMPLATE_CHATGML_3;
-    } else if (tmpl_contains("[gMASK]<sop>")) {
-        return LLM_CHAT_TEMPLATE_CHATGML_4;
+        return LLM_CHAT_TEMPLATE_CHATGLM_3;
     } else if (tmpl_contains(LU8("<用户>"))) {
         // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
         return LLM_CHAT_TEMPLATE_MINICPM;
@@ -432,7 +437,7 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_3) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_3) {
         // chatglm3-6b
         ss << "[gMASK]" << "sop";
         for (auto message : chat) {
@@ -442,14 +447,14 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|assistant|>";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_4) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4) {
         ss << "[gMASK]" << "<sop>";
         for (auto message : chat) {
             std::string role(message->role);
             ss << "<|" << role << "|>" << "\n" << message->content;
         }
         if (add_ass) {
-            ss << "<|assistant|>";
+            ss << "<|assistant|>\n";
         }
     } else if (tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
         for (auto message : chat) {
@@ -620,7 +625,23 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|header_start|>assistant<|header_end|>\n\n";
         }
-    } else {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_SMOLVLM) {
+        // SmolVLM
+        ss << "<|im_start|>"; // uses <|im_start|> as BOS, but the actual content is NOT chatml
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                ss << message->content << "\n\n";
+            } else if (role == "user") {
+                ss << "User: " << message->content << "<end_of_utterance>\n";
+            } else {
+                ss << "Assistant: " << message->content << "<end_of_utterance>\n";
+            }
+        }
+        if (add_ass) {
+            ss << "Assistant:";
+        }
+    } else {
         // template not supported
         return -1;
     }
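Based on the branch added above, a system + user exchange rendered with LLM_CHAT_TEMPLATE_SMOLVLM and add_ass = true would produce the following (sketch; the whitespace follows the ss << statements in the diff, and the message contents are illustrative):

// "<|im_start|>" once as BOS, then role-prefixed turns, each user/assistant
// turn closed by <end_of_utterance>, and a bare "Assistant:" generation prompt.
const char * expected_smolvlm =
    "<|im_start|>You are a helpful assistant.\n\n"
    "User: Describe this image.<end_of_utterance>\n"
    "Assistant:";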
package/cpp/llama-chat.h CHANGED
@@ -29,8 +29,8 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_DEEPSEEK_3,
     LLM_CHAT_TEMPLATE_COMMAND_R,
     LLM_CHAT_TEMPLATE_LLAMA_3,
-    LLM_CHAT_TEMPLATE_CHATGML_3,
-    LLM_CHAT_TEMPLATE_CHATGML_4,
+    LLM_CHAT_TEMPLATE_CHATGLM_3,
+    LLM_CHAT_TEMPLATE_CHATGLM_4,
     LLM_CHAT_TEMPLATE_GLMEDGE,
     LLM_CHAT_TEMPLATE_MINICPM,
     LLM_CHAT_TEMPLATE_EXAONE_3,
@@ -41,6 +41,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_YANDEX,
     LLM_CHAT_TEMPLATE_BAILING,
     LLM_CHAT_TEMPLATE_LLAMA4,
+    LLM_CHAT_TEMPLATE_SMOLVLM,
     LLM_CHAT_TEMPLATE_UNKNOWN,
 };
 