@novastera-oss/llamarn 0.2.7 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (319)
  1. package/android/src/main/cpp/include/llama.h +8 -3
  2. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  3. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  6. package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
  7. package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
  8. package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
  9. package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
  10. package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
  11. package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
  12. package/android/src/main/jniLibs/x86/libggml.so +0 -0
  13. package/android/src/main/jniLibs/x86/libllama.so +0 -0
  14. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  15. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  16. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  17. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  18. package/cpp/LlamaCppModel.cpp +56 -22
  19. package/cpp/build-info.cpp +2 -2
  20. package/cpp/llama.cpp/CMakeLists.txt +1 -2
  21. package/cpp/llama.cpp/README.md +4 -5
  22. package/cpp/llama.cpp/build-xcframework.sh +1 -1
  23. package/cpp/llama.cpp/common/CMakeLists.txt +4 -5
  24. package/cpp/llama.cpp/common/arg.cpp +24 -0
  25. package/cpp/llama.cpp/common/chat.cpp +37 -20
  26. package/cpp/llama.cpp/common/chat.h +2 -0
  27. package/cpp/llama.cpp/common/common.cpp +3 -0
  28. package/cpp/llama.cpp/common/common.h +5 -0
  29. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
  30. package/cpp/llama.cpp/convert_hf_to_gguf.py +860 -23
  31. package/cpp/llama.cpp/convert_hf_to_gguf_update.py +9 -0
  32. package/cpp/llama.cpp/ggml/CMakeLists.txt +8 -2
  33. package/cpp/llama.cpp/ggml/include/ggml-backend.h +1 -1
  34. package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
  35. package/cpp/llama.cpp/ggml/include/ggml.h +206 -10
  36. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +17 -1
  37. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +0 -8
  38. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +36 -18
  39. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +68 -5
  40. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +1 -1
  41. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +16 -2
  42. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +37 -3
  43. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
  44. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +1027 -1038
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
  49. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
  50. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
  51. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
  52. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
  53. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
  54. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
  55. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
  56. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
  57. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +111 -103
  58. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
  59. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
  60. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +1405 -240
  61. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +8 -0
  62. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
  63. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +56 -40
  64. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +212 -34
  65. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +35 -11
  66. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +187 -54
  67. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +71 -29
  68. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  69. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  70. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  71. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  72. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu +22 -0
  73. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +5 -0
  74. package/cpp/llama.cpp/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
  75. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +4 -1
  76. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +8 -4
  77. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +6 -4
  78. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +14 -12
  79. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +5 -3
  80. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +15 -10
  81. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +12 -6
  82. package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +8 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +269 -110
  84. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
  85. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
  86. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +2 -8
  87. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
  88. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
  89. package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cu +21 -27
  90. package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cu +8 -6
  91. package/cpp/llama.cpp/ggml/src/ggml-cuda/softmax.cu +119 -58
  92. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
  93. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +192 -52
  94. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
  95. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  96. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +97 -0
  97. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +11 -0
  98. package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cu +92 -6
  99. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +14 -5
  100. package/cpp/llama.cpp/ggml/src/ggml-impl.h +125 -183
  101. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -2
  102. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +51 -9
  103. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +394 -80
  104. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +616 -239
  105. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
  106. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +3 -0
  107. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +741 -571
  108. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
  109. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/glu.cl +337 -0
  110. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  111. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
  112. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +95 -0
  113. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +24 -11
  114. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +24 -11
  115. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +24 -11
  116. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +24 -11
  117. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
  118. package/cpp/llama.cpp/ggml/src/ggml-quants.c +6 -6
  119. package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  120. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
  121. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -24
  122. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
  123. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
  124. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +99 -166
  125. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +94 -72
  126. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
  127. package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
  128. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +697 -1098
  129. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +18 -9
  130. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +6 -9
  131. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +104 -62
  132. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
  133. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
  134. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
  135. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +132 -201
  136. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
  137. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +39 -38
  138. package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +131 -0
  139. package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  140. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
  141. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  142. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  143. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
  144. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
  145. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +12 -1
  146. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +767 -292
  147. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
  148. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +58 -7
  149. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +28 -23
  150. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +14 -9
  151. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +38 -32
  152. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +32 -27
  153. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +44 -12
  154. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  155. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  156. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  157. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  158. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +15 -0
  159. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
  160. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +128 -72
  161. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +38 -9
  162. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  163. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +12 -3
  164. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  165. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
  166. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
  167. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
  168. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
  169. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +20 -4
  170. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  171. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
  172. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +23 -3
  173. package/cpp/llama.cpp/ggml/src/ggml.c +449 -72
  174. package/cpp/llama.cpp/ggml/src/gguf.cpp +13 -2
  175. package/cpp/llama.cpp/gguf-py/gguf/constants.py +285 -0
  176. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +27 -0
  177. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +137 -21
  178. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +109 -7
  179. package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
  180. package/cpp/llama.cpp/include/llama.h +8 -43
  181. package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
  182. package/cpp/llama.cpp/src/llama-arch.cpp +265 -3
  183. package/cpp/llama.cpp/src/llama-arch.h +36 -1
  184. package/cpp/llama.cpp/src/llama-batch.cpp +596 -359
  185. package/cpp/llama.cpp/src/llama-batch.h +105 -70
  186. package/cpp/llama.cpp/src/llama-chat.cpp +26 -6
  187. package/cpp/llama.cpp/src/llama-chat.h +1 -0
  188. package/cpp/llama.cpp/src/llama-context.cpp +101 -107
  189. package/cpp/llama.cpp/src/llama-context.h +13 -13
  190. package/cpp/llama.cpp/src/llama-graph.cpp +286 -404
  191. package/cpp/llama.cpp/src/llama-graph.h +78 -79
  192. package/cpp/llama.cpp/src/llama-hparams.cpp +11 -1
  193. package/cpp/llama.cpp/src/llama-hparams.h +11 -0
  194. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +74 -66
  195. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +23 -26
  196. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +312 -157
  197. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +79 -46
  198. package/cpp/llama.cpp/src/llama-kv-cells.h +97 -21
  199. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +73 -69
  200. package/cpp/llama.cpp/src/llama-memory-hybrid.h +19 -22
  201. package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +88 -77
  202. package/cpp/llama.cpp/src/llama-memory-recurrent.h +15 -20
  203. package/cpp/llama.cpp/src/llama-memory.cpp +17 -0
  204. package/cpp/llama.cpp/src/llama-memory.h +21 -22
  205. package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
  206. package/cpp/llama.cpp/src/llama-model.cpp +5301 -2922
  207. package/cpp/llama.cpp/src/llama-model.h +40 -0
  208. package/cpp/llama.cpp/src/llama-quant.cpp +88 -5
  209. package/cpp/llama.cpp/src/llama-vocab.cpp +37 -3
  210. package/cpp/llama.cpp/src/llama-vocab.h +42 -0
  211. package/cpp/rn-utils.h +3 -0
  212. package/ios/include/chat.h +2 -0
  213. package/ios/include/common.h +5 -0
  214. package/ios/include/llama.h +8 -43
  215. package/ios/libs/llama.xcframework/Info.plist +19 -19
  216. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  217. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5059 -4863
  218. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h +1 -1
  219. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  220. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +206 -10
  221. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +8 -43
  222. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  223. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  224. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4834
  225. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3889 -3742
  226. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
  227. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  228. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +206 -10
  229. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -43
  230. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  231. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  232. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4834
  233. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3891 -3744
  234. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-backend.h +1 -1
  235. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
  236. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +206 -10
  237. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +8 -43
  238. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-backend.h +1 -1
  239. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
  240. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +206 -10
  241. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +8 -43
  242. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  243. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-backend.h +1 -1
  244. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
  245. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +206 -10
  246. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +8 -43
  247. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  248. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  249. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  250. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5059 -4863
  251. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-backend.h +1 -1
  252. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  253. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +206 -10
  254. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +8 -43
  255. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  256. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  257. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4834
  258. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3889 -3742
  259. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
  260. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  261. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +206 -10
  262. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -43
  263. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  264. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  265. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5095 -4900
  266. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-backend.h +1 -1
  267. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  268. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +206 -10
  269. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +8 -43
  270. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  271. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  272. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5066 -4871
  273. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3919 -3773
  274. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
  275. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  276. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +206 -10
  277. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -43
  278. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  279. package/package.json +1 -1
  280. package/cpp/llama.cpp/ggml/include/ggml-kompute.h +0 -50
  281. package/cpp/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  282. package/cpp/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  283. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
  284. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
  285. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
  286. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
  287. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
  288. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
  289. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
  290. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
  291. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
  292. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
  293. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
  294. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
  295. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
  296. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
  297. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
  298. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
  299. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
  300. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
  301. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
  302. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
  303. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
  304. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
  305. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
  306. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
  307. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
  308. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
  309. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
  310. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
  311. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
  312. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
  313. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
  314. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
  315. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
  316. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
  317. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
  318. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
  319. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
package/cpp/llama.cpp/convert_hf_to_gguf.py
@@ -300,6 +300,7 @@ class ModelBase:
                 gguf.MODEL_TENSOR.POS_EMBD,
                 gguf.MODEL_TENSOR.TOKEN_TYPES,
                 gguf.MODEL_TENSOR.SSM_CONV1D,
+                gguf.MODEL_TENSOR.SHORTCONV_CONV,
                 gguf.MODEL_TENSOR.TIME_MIX_FIRST,
                 gguf.MODEL_TENSOR.TIME_MIX_W1,
                 gguf.MODEL_TENSOR.TIME_MIX_W2,
@@ -310,6 +311,8 @@ class ModelBase:
                 gguf.MODEL_TENSOR.POSNET_NORM2,
                 gguf.MODEL_TENSOR.V_ENC_EMBD_POS,
                 gguf.MODEL_TENSOR.A_ENC_EMBD_POS,
+                gguf.MODEL_TENSOR.ALTUP_CORRECT_COEF,
+                gguf.MODEL_TENSOR.ALTUP_PREDICT_COEF,
             )
         )
         or not new_name.endswith(".weight")
@@ -320,7 +323,11 @@ class ModelBase:
             self.match_model_tensor_name(new_name, key, bid)
             for key in (
                 gguf.MODEL_TENSOR.TOKEN_EMBD,
+                gguf.MODEL_TENSOR.PER_LAYER_TOKEN_EMBD,
                 gguf.MODEL_TENSOR.OUTPUT,
+                gguf.MODEL_TENSOR.ALTUP_ROUTER,
+                gguf.MODEL_TENSOR.LAUREL_L,
+                gguf.MODEL_TENSOR.LAUREL_R,
             )
         ):
             if self.ftype in (
@@ -809,6 +816,30 @@ class TextModel(ModelBase):
         if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
             # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
             res = "minerva-7b"
+        if chkhsh == "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664":
+            # ref: https://huggingface.co/tencent/Hunyuan-A13B-Instruct
+            res = "hunyuan"
+        if chkhsh == "b0a6b1c0bd5998ebd9df08611efde34a4ff03faed45ae09c43e6b31ebd4b94cf":
+            # ref: https://huggingface.co/skt/A.X-4.0
+            res = "a.x-4.0"
+        if chkhsh == "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6":
+            # ref: https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base
+            res = "falcon-h1"
+        if chkhsh == "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86":
+            # ref: https://huggingface.co/tiiuae/Falcon-H1-1B-Base
+            res = "falcon-h1"
+        if chkhsh == "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896":
+            # ref: https://huggingface.co/tiiuae/Falcon-H1-7B-Base
+            res = "falcon-h1"
+        if chkhsh == "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b":
+            # ref: https://huggingface.co/tiiuae/Falcon-H1-34B-Base
+            res = "falcon-h1"
+        if chkhsh == "f6791d196f87ce6b56a7d234be618e0d58f8cda3549416635b2bebcd22cd95c4":
+            # ref: https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct
+            res = "midm-2.0"
+        if chkhsh == "169bf0296a13c4d9b7672313f749eb36501d931022de052aad6e36f2bf34dd51":
+            # ref: https://huggingface.co/LiquidAI/LFM2-Tokenizer
+            res = "lfm2"

         if res is None:
             logger.warning("\n")
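Each chkhsh entry above fingerprints a model's pre-tokenizer: the converter encodes a fixed probe string and hashes the resulting token ids, then maps the digest to a res preset. A minimal sketch of that idea, assuming a Hugging Face AutoTokenizer; the probe text below is a stand-in, not the string the script actually uses:

    # hypothetical illustration of the chkhsh lookup; model id taken from the table above
    from hashlib import sha256
    from transformers import AutoTokenizer

    def probe_pre_tokenizer(model_id: str, probe_text: str) -> str:
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        ids = tokenizer.encode(probe_text)
        # hashing the token ids fingerprints the pre-tokenizer's splitting behavior
        return sha256(str(ids).encode()).hexdigest()

    chkhsh = probe_pre_tokenizer("tiiuae/Falcon-H1-1B-Base", "Hello, world! 123 éè")
    # a digest matching a known entry selects the corresponding res preset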
@@ -921,13 +952,20 @@ class TextModel(ModelBase):
         tokenizer = SentencePieceProcessor()
         tokenizer.LoadFromFile(str(tokenizer_path))

-        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+        vocab_size = self.find_hparam([
+            "vocab_size_per_layer_input",  # gemma3n
+            "vocab_size",
+        ], optional=True) or tokenizer.vocab_size()

         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
         toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size

         for token_id in range(tokenizer.vocab_size()):
+            if token_id >= vocab_size:
+                logger.warning(f'ignore tokens from {token_id}: id is out of range, max={vocab_size - 1}')
+                break
+
             piece = tokenizer.IdToPiece(token_id)
             text = piece.encode("utf-8")
             score = tokenizer.GetScore(token_id)
@@ -2145,7 +2183,6 @@ class Llama4Model(LlamaModel):

     def set_vocab(self):
         self._set_vocab_gpt2()
-        self.gguf_writer.add_add_bos_token(True)

     def set_gguf_parameters(self):
         super().set_gguf_parameters()
@@ -2194,7 +2231,7 @@ class Llama4VisionModel(MmprojModel):
                 name += ".weight"
             if "multi_modal_projector.linear_1" in name:
                 # despite the name with number postfix, this is a single fully connected layer
-                return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC], data_torch)]
+                return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC] + '.weight', data_torch)]
             return [(self.map_tensor_name(name), data_torch)]
         return []

@@ -2731,6 +2768,52 @@ class Qwen2Model(TextModel):
                 yield from super().modify_tensors(data_torch, name, bid)


+@ModelBase.register("Ernie4_5_ForCausalLM")
+class Ernie4_5Model(TextModel):
+    model_arch = gguf.MODEL_ARCH.ERNIE4_5
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        num_heads = self.hparams["num_attention_heads"]
+        num_kv_heads = self.hparams["num_key_value_heads"]
+        head_dim = self.hparams["head_dim"]
+
+        if "ernie." in name:
+            name = name.replace("ernie.", "model.")
+        # split the qkv weights
+        # qkv_proj shape: [(num_heads + 2 * num_kv_heads) * head_dim, hidden_size]
+        if "qkv_proj" in name:
+            name_q = name.replace("qkv_proj.weight", "q_proj.weight")
+            name_k = name.replace("qkv_proj.weight", "k_proj.weight")
+            name_v = name.replace("qkv_proj.weight", "v_proj.weight")
+            total_q_dim = num_heads * head_dim
+            total_k_dim = num_kv_heads * head_dim
+            total_v_dim = num_kv_heads * head_dim
+            q_proj_weight, k_proj_weight, v_proj_weight = data_torch.split([total_q_dim, total_k_dim, total_v_dim], dim=0)
+            return [
+                (self.map_tensor_name(name_q), q_proj_weight),
+                (self.map_tensor_name(name_k), k_proj_weight),
+                (self.map_tensor_name(name_v), v_proj_weight)
+            ]
+        # split the up_gate_proj into gate and up
+        # up_gate_proj shape: [2 * intermediate_size, hidden_size]
+        if "up_gate_proj" in name:
+            name_up = name.replace("up_gate_proj.weight", "up_proj.weight")
+            name_gate = name.replace("up_gate_proj.weight", "gate_proj.weight")
+            dim_half = data_torch.shape[0] // 2
+            gate_proj_weight, up_proj_weight = data_torch.split(dim_half, dim=0)
+            return [
+                (self.map_tensor_name(name_gate), gate_proj_weight),
+                (self.map_tensor_name(name_up), up_proj_weight)
+            ]
+        return [(self.map_tensor_name(name), data_torch)]
+
+
 @ModelBase.register(
     "Qwen2VLModel",
     "Qwen2VLForConditionalGeneration",
@@ -3918,9 +4001,6 @@ class BertModel(TextModel):
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         special_vocab.add_to_gguf(self.gguf_writer)

-        self.gguf_writer.add_add_bos_token(True)
-        self.gguf_writer.add_add_eos_token(True)
-

 @ModelBase.register("DistilBertModel", "DistilBertForMaskedLM", "DistilBertForSequenceClassification")
 class DistilBertModel(BertModel):
@@ -3962,8 +4042,6 @@ class RobertaModel(BertModel):
         bpe_tok_path = self.dir_model / "tokenizer.json"
         if bpe_tok_path.exists():
             self._set_vocab_gpt2()
-            self.gguf_writer.add_add_bos_token(True)
-            self.gguf_writer.add_add_eos_token(True)

         # we need this to validate the size of the token_type embeddings
         # though currently we are passing all zeros to the token_type embeddings
@@ -4223,6 +4301,7 @@ class Gemma2Model(TextModel):
 @ModelBase.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration")
 class Gemma3Model(TextModel):
     model_arch = gguf.MODEL_ARCH.GEMMA3
+    norm_shift = 1.0  # Gemma3RMSNorm adds 1.0 to the norm value

     def set_vocab(self):
         self._set_vocab_sentencepiece()
@@ -4244,9 +4323,8 @@ class Gemma3Model(TextModel):
         self.gguf_writer.add_value_length(hparams.get("head_dim", 256))
         self.gguf_writer.add_file_type(self.ftype)
         self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1_000_000.0))  # for global layers
-        # both attn_logit_softcapping and final_logit_softcapping are removed in Gemma3
+        # attn_logit_softcapping is removed in Gemma3
         assert hparams.get("attn_logit_softcapping") is None
-        assert hparams.get("final_logit_softcapping") is None
         self.gguf_writer.add_sliding_window(hparams["sliding_window"])
         self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4))
         if hparams.get("rope_scaling") is not None:
@@ -4258,7 +4336,7 @@ class Gemma3Model(TextModel):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused

-        if name.startswith("language_model."):
+        if "language_model." in name:
             name = name.replace("language_model.", "")

         elif name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \
@@ -4273,8 +4351,9 @@ class Gemma3Model(TextModel):

         # ref code in Gemma3RMSNorm
         # output = output * (1.0 + self.weight.float())
+        # note: this is not the case on gemma3n
         if name.endswith("norm.weight"):
-            data_torch = data_torch + 1
+            data_torch = data_torch + self.norm_shift

         return [(self.map_tensor_name(name), data_torch)]

@@ -4331,6 +4410,101 @@ class Gemma3VisionModel(MmprojModel):
         return []  # skip other tensors


+@ModelBase.register("Gemma3nForConditionalGeneration")
+class Gemma3NModel(Gemma3Model):
+    model_arch = gguf.MODEL_ARCH.GEMMA3N
+    norm_shift = 0.0  # same value with Gemma3p5RMSNorm scale_shift on python code
+
+    _altup_proj: list[Tensor] = []
+    _altup_unembd: list[Tensor] = []
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        assert self.hparams["altup_num_inputs"] == 4, "Current conversion only supports 4 altup inputs"
+        self._altup_proj = [
+            torch.Tensor(),  # to be replaced
+            torch.Tensor(),  # to be replaced
+            torch.Tensor(),  # to be replaced
+        ]
+        self._altup_unembd = [
+            torch.Tensor(),  # to be replaced
+            torch.Tensor(),  # to be replaced
+            torch.Tensor(),  # to be replaced
+        ]
+
+    def set_vocab(self):
+        super().set_vocab()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_altup_active_idx(self.hparams["altup_active_idx"])
+        self.gguf_writer.add_altup_num_inputs(self.hparams["altup_num_inputs"])
+        self.gguf_writer.add_embedding_length_per_layer_input(self.hparams["hidden_size_per_layer_input"])
+        self.gguf_writer.add_shared_kv_layers(self.hparams["num_kv_shared_layers"])
+
+        activation_sparsity_scale = []
+        for s in self.hparams["activation_sparsity_pattern"]:
+            normal_dist = torch.distributions.normal.Normal(0, 1)
+            std_multiplier = normal_dist.icdf(torch.tensor(s, dtype=torch.float32))
+            activation_sparsity_scale.append(std_multiplier.item())
+        self.gguf_writer.add_activation_sparsity_scale(activation_sparsity_scale)
+
+        sliding_window_pattern = []
+        for t in self.hparams["layer_types"]:
+            sliding_window_pattern.append(t == "sliding_attention")
+        self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
+
+    def _stack_matrices(self, matrices: list[Tensor]) -> Tensor | None:
+        has_all = all(m.numel() > 0 for m in matrices)
+        if not has_all:
+            return None
+        else:
+            return torch.stack(matrices, dim=0)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.endswith("_scale"):
+            name = name + ".weight"
+
+        # TODO: implement self.prediction_coefs.weight.clamp_(...)
+
+        if "language_model." not in name:
+            return []  # skip non-language model tensors
+
+        if "altup_unembed_projections" in name:
+            data_torch = data_torch.to(device="cpu")
+            if ".0." in name:
+                self._altup_unembd[0] = data_torch
+            elif ".1." in name:
+                self._altup_unembd[1] = data_torch
+            elif ".2." in name:
+                self._altup_unembd[2] = data_torch
+            else:
+                raise ValueError(f"Unknown name: {name}")
+            out = self._stack_matrices(self._altup_unembd)
+            if out is not None:
+                return [(self.map_tensor_name("model.altup_unembed_projections.weight"), out)]
+            else:
+                return []
+
+        if "altup_projections" in name:
+            data_torch = data_torch.to(device="cpu")
+            if ".0." in name:
+                self._altup_proj[0] = data_torch
+            elif ".1." in name:
+                self._altup_proj[1] = data_torch
+            elif ".2." in name:
+                self._altup_proj[2] = data_torch
+            else:
+                raise ValueError(f"Unknown name: {name}")
+            out = self._stack_matrices(self._altup_proj)
+            if out is not None:
+                return [(self.map_tensor_name("model.altup_projections.weight"), out)]
+            else:
+                return []
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("Starcoder2ForCausalLM")
 class StarCoder2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.STARCODER2
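In Gemma3NModel.set_gguf_parameters above, each target sparsity fraction s is mapped to a standard-normal quantile (icdf), i.e. the activation cutoff expressed in standard deviations above the mean. A small self-contained check of that mapping:

    import torch

    def sparsity_to_std_multiplier(s: float) -> float:
        normal = torch.distributions.normal.Normal(0, 1)
        return normal.icdf(torch.tensor(s, dtype=torch.float32)).item()

    # zeroing the bottom 95% of activations corresponds to a cutoff of
    # roughly mean + 1.645 * std (the 95th percentile of a standard normal)
    print(sparsity_to_std_multiplier(0.95))  # ~1.6449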
@@ -4629,6 +4803,14 @@ class ARwkv7Model(Rwkv7Model):
 class MambaModel(TextModel):
     model_arch = gguf.MODEL_ARCH.MAMBA

+    def __init__(self, dir_model: Path, *args, **kwargs):
+        # Avoid using AutoConfig for hparams
+        hparams = kwargs.pop("hparams", None)
+        if hparams is None:
+            with open(dir_model / "config.json", "r", encoding="utf-8") as f:
+                hparams = json.load(f)
+        super().__init__(dir_model, *args, hparams=hparams, **kwargs)
+
     def set_vocab(self):
         vocab_size = self.hparams["vocab_size"]
         # Round vocab size to next multiple of 8
@@ -4703,6 +4885,216 @@ class MambaModel(TextModel):
         return [(new_name, data_torch)]


+@ModelBase.register("Mamba2ForCausalLM")
+class Mamba2Model(TextModel):
+    model_arch = gguf.MODEL_ARCH.MAMBA2
+
+    def __init__(self, dir_model: Path, *args, **kwargs):
+        # Avoid using AutoConfig for hparams
+        # It wrongly assumes all Mamba2 models are Mamba-Codestral-7B-v0.1
+        hparams = kwargs.pop("hparams", None)
+        if hparams is None:
+            with open(dir_model / "config.json", "r", encoding="utf-8") as f:
+                hparams = json.load(f)
+        super().__init__(dir_model, *args, hparams=hparams, **kwargs)
+        self.d_model = self.find_hparam(["hidden_size", "d_model", "dim"])
+        self.d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or 2 * self.d_model
+        self.n_group = self.find_hparam(["n_groups"], optional=True) or 1
+
+    def set_vocab(self):
+        vocab_size = self.hparams["vocab_size"]
+        # Round vocab size to next multiple of 16
+        pad_vocab = self.hparams.get("pad_vocab_size_multiple", 16)
+        # pad using ceiling division
+        # ref: https://stackoverflow.com/a/17511341/22827863
+        vocab_size = -(vocab_size // -pad_vocab) * pad_vocab
+        self.hparams["vocab_size"] = vocab_size
+
+        if (self.dir_model / "tokenizer.model").is_file():
+            self._set_vocab_sentencepiece()
+        elif (self.dir_model / "tokenizer.model.v3").is_file():
+            # mamba-codestral
+            raise NotImplementedError(f"Please rename {self.dir_model / 'tokenizer.model.v3'} to {self.dir_model / 'tokenizer.model'}")
+        elif (self.dir_model / "tokenizer.json").is_file():
+            self._set_vocab_gpt2()
+        else:
+            # Use the GPT-NeoX tokenizer when no tokenizer files are present
+            self._set_vocab_builtin("gpt-neox", vocab_size)
+
+    def set_gguf_parameters(self):
+        d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4
+        d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 128
+        head_dim = self.find_hparam(["mamba_d_head", "head_dim"], optional=True) or 64
+
+        rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5
+
+        # Fail early for models which don't have a block expansion factor of 2
+        # TODO: does this really matter?
+        # skip the assertion for FalconH1 Model
+        if self.model_arch != gguf.MODEL_ARCH.FALCON_H1:
+            assert self.d_inner == 2 * self.d_model
+            assert self.d_inner % head_dim == 0
+
+        self.gguf_writer.add_context_length(2**20)  # arbitrary value; for those who use the default
+        self.gguf_writer.add_embedding_length(self.d_model)
+        self.gguf_writer.add_feed_forward_length(0)  # unused, but seemingly required when loading
+        self.gguf_writer.add_head_count(0)  # unused, but seemingly required when loading
+        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_ssm_conv_kernel(d_conv)
+        self.gguf_writer.add_ssm_inner_size(self.d_inner)
+        self.gguf_writer.add_ssm_state_size(d_state)
+        self.gguf_writer.add_ssm_time_step_rank(self.d_inner // head_dim)
+        self.gguf_writer.add_ssm_group_count(self.n_group)
+        self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+
+        if name.startswith("model.backbone") or name.startswith("model.lm_head"):
+            # map Mamba-Codestral-7B-v0.1 tensor names to the names used by Mamba-2
+            name = name.removeprefix("model.")
+
+        if name.endswith(".dt_bias"):
+            name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias"
+
+        new_name = self.map_tensor_name(name)
+
+        if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid):
+            data_torch = data_torch.squeeze()
+        elif any(self.match_model_tensor_name(new_name, t, bid, suffix="") for t in [
+            gguf.MODEL_TENSOR.SSM_A,
+            gguf.MODEL_TENSOR.SSM_D,
+        ]):
+            # unsqueeze A to use similar shape semantics as Mamba-1
+            # (D is also unsqueezed, but for more straightforward broadcast internally)
+            data_torch = data_torch.reshape((*data_torch.shape, 1))
+        elif self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_NORM, bid):
+            data_torch = data_torch.reshape((self.n_group, self.d_inner // self.n_group))
+
+        if name.endswith(".A_log"):
+            logger.debug("A_log --> A ==> " + new_name)
+            data_torch = -torch.exp(data_torch)
+
+        yield (new_name, data_torch)
+
+
+@ModelBase.register("JambaForCausalLM")
+class JambaModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.JAMBA
+
+    def get_vocab_base_pre(self, tokenizer) -> str:
+        del tokenizer  # unused
+
+        return "gpt-2"
+
+    def set_vocab(self):
+        if (self.dir_model / "tokenizer.model").is_file():
+            # Using Jamba's tokenizer.json causes errors on model load
+            # (something about "byte not found in vocab"),
+            # but there's a working tokenizer.model
+            self._set_vocab_sentencepiece()
+        else:
+            # Some Jamba models only have a tokenizer.json, which works.
+            self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        d_model = self.find_hparam(["hidden_size", "mamba_d_model"])
+        d_conv = self.find_hparam(["mamba_d_conv"], optional=True) or 4
+        d_inner = self.hparams["mamba_expand"] * d_model
+        d_state = self.find_hparam(["mamba_d_state"], optional=True) or 16
+        # ceiling division
+        # ref: https://stackoverflow.com/a/17511341/22827863
+        # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58
+        dt_rank = self.find_hparam(["mamba_dt_rank"], optional=True) or -(d_model // -16)
+        rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-6
+        n_kv_head = self.hparams["num_key_value_heads"]
+        attn_offset = self.hparams["attn_layer_offset"]
+        attn_period = self.hparams["attn_layer_period"]
+        n_kv_vec = [0 for _ in range(attn_offset)] + [
+            n_kv_head if (i - attn_offset) % attn_period == 0 else 0 for i in range(attn_offset, self.block_count)
+        ]
+
+        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_context_length(self.find_hparam(["max_position_embeddings", "n_ctx"]))
+        self.gguf_writer.add_embedding_length(d_model)
+        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
+        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
+        self.gguf_writer.add_head_count_kv(n_kv_vec)
+        self.gguf_writer.add_ssm_conv_kernel(d_conv)
+        self.gguf_writer.add_ssm_inner_size(d_inner)
+        self.gguf_writer.add_ssm_state_size(d_state)
+        self.gguf_writer.add_ssm_time_step_rank(dt_rank)
+        self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
+        self.gguf_writer.add_expert_count(self.hparams["num_experts"])
+        self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+
+        # Mini-Jamba
+        name = name.replace(".moe.", ".feed_forward.")
+        if bid is not None:
+            moe_offset = self.hparams["expert_layer_offset"]
+            moe_period = self.hparams["expert_layer_period"]
+
+            if not (bid >= moe_offset and (bid - moe_offset) % moe_period == 0):
+                name = name.replace(".experts.0.", ".")
+
+        # process the experts separately
+        if ".feed_forward.experts." in name:
+            n_experts = self.hparams["num_experts"]
+
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+
+                # merge the experts into a single 3d tensor
+                for wid in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.feed_forward.experts.{xid}.{wid}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    # using the same merged name as qwen2moe
+                    merged_name = f"model.layers.{bid}.mlp.experts.{wid}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    yield new_name, data_torch
+            return
+
+        new_name = self.map_tensor_name(name)
+
+        if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid):
+            data_torch = data_torch.squeeze()
+
+        if name.endswith(".A_log"):
+            logger.debug("A_log --> A ==> " + new_name)
+            data_torch = -torch.exp(data_torch)
+
+        yield (new_name, data_torch)
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
 @ModelBase.register("CohereForCausalLM")
 class CommandR2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.COMMAND_R
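Mamba2Model.set_vocab above pads the vocabulary with the negative-floor-division ceiling trick. A short worked example of the padding (the values below are illustrative):

    def pad_vocab_size(vocab_size: int, pad_vocab: int = 16) -> int:
        # -(v // -p) is ceil(v / p) using only integer floor division
        return -(vocab_size // -pad_vocab) * pad_vocab

    assert pad_vocab_size(50277, 8) == 50280
    assert pad_vocab_size(50280, 8) == 50280   # already a multiple: unchanged
    assert pad_vocab_size(32000, 16) == 32000
    assert pad_vocab_size(32001, 16) == 32016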
@@ -4848,8 +5240,6 @@ class JinaBertV2Model(BertModel):
             self.gguf_writer.add_token_type_count(2)
         else:
             raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel')
-        self.gguf_writer.add_add_bos_token(True)
-        self.gguf_writer.add_add_eos_token(True)


 @ModelBase.register("OpenELMForCausalLM")
@@ -5451,9 +5841,6 @@ class T5Model(TextModel):
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         special_vocab.add_to_gguf(self.gguf_writer)

-        self.gguf_writer.add_add_bos_token(False)
-        self.gguf_writer.add_add_eos_token(True)
-
     def set_gguf_parameters(self):
         if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
             logger.warning("Couldn't find context length in config.json, assuming default value of 512")
@@ -5591,9 +5978,6 @@ class T5EncoderModel(TextModel):
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         special_vocab.add_to_gguf(self.gguf_writer)

-        self.gguf_writer.add_add_bos_token(False)
-        self.gguf_writer.add_add_eos_token(True)
-
     def set_gguf_parameters(self):
         if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
             logger.warning("Couldn't find context length in config.json, assuming default value of 512")
@@ -6072,18 +6456,148 @@ class GraniteMoeModel(GraniteModel):
6072
6456
  (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), up),
6073
6457
  ]
6074
6458
 
6459
+ has_experts = bool(self.hparams.get('num_local_experts'))
6460
+
6075
6461
  if name.endswith("shared_mlp.input_linear.weight"):
6076
6462
  ffn_dim = self.hparams["shared_intermediate_size"]
6077
6463
  assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * shared_intermediate_size"
6078
6464
  gate, up = data_torch.split(ffn_dim, dim=-2)
6465
+ if has_experts:
6466
+ return [
6467
+ (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_SHEXP, bid), gate),
6468
+ (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_SHEXP, bid), up),
6469
+ ]
6470
+ return [
6471
+ (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), gate),
6472
+ (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), up),
6473
+ ]
6474
+
6475
+ if not has_experts and name.endswith("shared_mlp.output_linear.weight"):
6079
6476
  return [
6080
- (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_SHEXP, bid), gate),
6081
- (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_SHEXP, bid), up),
6477
+ (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, bid), data_torch)
6082
6478
  ]
6083
6479
 
6084
6480
  return super().modify_tensors(data_torch, name, bid)
6085
6481
 
6086
6482
 
6483
+ @ModelBase.register("GraniteMoeHybridForCausalLM", "BambaForCausalLM")
6484
+ class GraniteHybridModel(Mamba2Model, GraniteMoeModel):
6485
+ """GraniteHybrid is a hybrid SSM + Attention model that uses Mamba2 SSM
6486
+ layers and optionally uses MoE w/ a shared expert"""
6487
+ model_arch = gguf.MODEL_ARCH.GRANITE_HYBRID
6488
+ undo_permute = True
6489
+
6490
+ def __init__(self, *args, **kwargs):
6491
+
6492
+ # Hybrid mamba models use a prefix for the mamba-specific params.
6493
+ # TODO: Extend this if the prefix(es) need to be configurable
6494
+ self.hparam_prefixes = ["mamba"]
6495
+
6496
+ super().__init__(*args, **kwargs)
6497
+
6498
+ # Lists of which layers use ssm vs attention
6499
+ self._attn_layers = self.get_attn_layers()
6500
+ self._ssm_layers = [
6501
+ i for i in range(self.block_count)
6502
+ if i not in self._attn_layers
6503
+ ]
6504
+
6505
+ # n_group and d_inner are used during reshape_tensors for mamba2
6506
+ self.d_model = self.find_hparam(["hidden_size", "d_model"])
6507
+ self.n_group = self.find_hparam(["n_groups"])
6508
+ self.d_inner = self.find_hparam(["expand"]) * self.d_model
6509
+
6510
+ def get_attn_layers(self):
6511
+ # Explicit list of layer type names
6512
+ if layer_types := self.hparams.get("layer_types"):
6513
+ return [
6514
+ i for i, typ in enumerate(layer_types)
6515
+ if typ == "attention"
6516
+ ]
6517
+
6518
+ # Layer types indicated by index or period
6519
+ attn_layers = self.hparams.get("attn_layer_indices", [])
6520
+ if not attn_layers:
6521
+ attn_period = self.hparams.get("attn_layer_period")
6522
+ assert attn_period, "Didn't find attn_layer_indices or attn_layer_period"
6523
+ attn_offset = self.hparams.get("attn_layer_offset")
6524
+ assert attn_offset is not None, "No attention layer offset set with attn_layer_period"
6525
+ attn_layers = [
6526
+ i for i in range(self.block_count)
6527
+ if i % attn_period == attn_offset
6528
+ ]
6529
+ return attn_layers
6530
+
6531
+ def find_hparam(self, keys: Iterable[str], *args, **kwargs) -> Any:
6532
+ prefixed = []
6533
+ for pfx in self.hparam_prefixes:
6534
+ prefixed.extend(
6535
+ "_".join([pfx, k])
6536
+ for k in keys
6537
+ )
6538
+ keys = list(keys) + prefixed
6539
+ return Mamba2Model.find_hparam(self, keys, *args, **kwargs)
6540
+
6541
+ def modify_tensors(
6542
+ self, data_torch: Tensor, name: str, bid: int | None
6543
+ ) -> Iterable[tuple[str, Tensor]]:
6544
+ if (
6545
+ name.endswith("block_sparse_moe.input_linear.weight")
6546
+ or "shared_mlp" in name
6547
+ ):
6548
+ return GraniteMoeModel.modify_tensors(self, data_torch, name, bid)
6549
+
6550
+ # Determine whether this is a mamba layer or an attention layer
6551
+ if bid in self._ssm_layers:
6552
+ return Mamba2Model.modify_tensors(self, data_torch, name, bid)
6553
+ elif bid in self._attn_layers:
6554
+ return GraniteMoeModel.modify_tensors(self, data_torch, name, bid)
6555
+ return [(self.map_tensor_name(name), data_torch)]
6556
+
6557
+    def set_gguf_parameters(self):
+        """This method merges params from both parents and some that are
+        specific to this model. The result is some duplication of how the params
+        get set. The following warnings are expected during conversion:
+
+        WARNING:Duplicated key name 'granitehybrid.attention.head_count_kv'
+        WARNING:Duplicated key name 'granitehybrid.context_length'
+        """
+        GraniteMoeModel.set_gguf_parameters(self)
+
+        ## Mamba mixer params ##
+        self.gguf_writer.add_ssm_conv_kernel(self.find_hparam(["conv_kernel", "d_conv"]))
+        self.gguf_writer.add_ssm_state_size(self.find_hparam(["state_size", "d_state"]))
+        self.gguf_writer.add_ssm_group_count(self.n_group)
+        self.gguf_writer.add_ssm_inner_size(self.d_inner)
+        # NOTE: The mamba_dt_rank is _not_ the right field for how this is used
+        # in llama.cpp
+        self.gguf_writer.add_ssm_time_step_rank(self.find_hparam(["n_heads"]))
+
+        ## Attention params ##
+        head_count_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"])
+        head_count_kv_vec = [
+            head_count_kv if i in self._attn_layers else 0 for i in range(self.block_count)
+        ]
+        if rope_dim := self.hparams.get("attn_rotary_emb"):
+            self.gguf_writer.add_rope_dimension_count(rope_dim)
+        self.gguf_writer.add_head_count_kv(head_count_kv_vec)
+
+        ## If Bamba, use rope, otherwise don't
+        use_rope = "BambaForCausalLM" in self.hparams["architectures"]
+        self.gguf_writer.add_rope_scaling_finetuned(use_rope)
+        if not use_rope:
+            self.gguf_writer.add_context_length(2**20)
+
+        ## Validation ##
+        d_head = self.find_hparam(["d_head"], optional=True) or 64
+        assert self.hparams.get("hidden_act") in [None, "silu"], "Only SILU activation supported"
+        assert self.d_inner % d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {d_head}"
+
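Note that add_head_count_kv receives a per-layer vector here: attention layers carry the real KV-head count, SSM layers carry 0, which is how the two block types are told apart downstream. Illustrative values:

    # Hypothetical hybrid: 6 blocks, attention at layers 1 and 4, 4 KV heads.
    block_count = 6
    attn_layers = [1, 4]
    head_count_kv = 4

    head_count_kv_vec = [head_count_kv if i in attn_layers else 0 for i in range(block_count)]
    assert head_count_kv_vec == [0, 4, 0, 0, 4, 0]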
+    def set_vocab(self):
+        self.hparams["pad_vocab_size_multiple"] = 8
+        Mamba2Model.set_vocab(self)
+
+
 @ModelBase.register("BailingMoeForCausalLM")
 class BailingMoeModel(TextModel):
     model_arch = gguf.MODEL_ARCH.BAILINGMOE
@@ -6292,6 +6806,321 @@ class UltravoxWhisperEncoderModel(WhisperEncoderModel):
         super().set_gguf_parameters()
         self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"])
 
+
+@ModelBase.register("FalconH1ForCausalLM")
+class FalconH1Model(Mamba2Model):
+    model_arch = gguf.MODEL_ARCH.FALCON_H1
+
+    def __init__(self, *args, **kwargs):
+        # Set the hparam prefixes for Falcon Mamba2
+        self.hparam_prefixes = ["mamba"]
+
+        # Initialize the base Mamba2Model
+        super().__init__(*args, **kwargs)
+
+        # Use Llama conversion for attention
+        self._transformer_model_class = LlamaModel
+
+        # n_group and d_inner are used during reshape_tensors for mamba2
+        self.n_group = self.find_hparam(["n_groups"])
+        self.d_inner = self.find_hparam(["mamba_d_ssm"])
+        self.d_head = self.find_hparam(["d_head"])
+
+        # Initialize any Falcon Mamba2 specific attributes
+        self.has_attention = True  # Falcon Mamba2 has attention components
+
+        # Load Falcon-H1 multipliers from hyperparameters
+        self.attention_in_multiplier = self.find_hparam(["attention_in_multiplier"], optional=True)
+        self.attention_out_multiplier = self.find_hparam(["attention_out_multiplier"], optional=True)
+        self.ssm_in_multiplier = self.find_hparam(["ssm_in_multiplier"], optional=True)
+        self.ssm_out_multiplier = self.find_hparam(["ssm_out_multiplier"], optional=True)
+        self.mlp_multipliers = self.find_hparam(["mlp_multipliers"], optional=True)
+        self.ssm_multipliers = self.find_hparam(["ssm_multipliers"], optional=True)
+        self.intermediate_size = self.find_hparam(["intermediate_size"])
+        self.key_multiplier = self.find_hparam(["key_multiplier"], optional=True)
+
+    def find_hparam(self, keys: Iterable[str], *args, **kwargs) -> Any:
+        prefixed = []
+        for pfx in self.hparam_prefixes:
+            prefixed.extend(
+                "_".join([pfx, k])
+                for k in keys
+            )
+        keys = list(keys) + prefixed
+        return super().find_hparam(keys, *args, **kwargs)
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        tensors = list(super().modify_tensors(data_torch, name, bid))
+        tensor = tensors[0][1]
+
+        if "down_proj" in name:
+            tensor = tensor * self.mlp_multipliers[1]
+        elif "gate_proj" in name:
+            tensor = tensor * self.mlp_multipliers[0]
+        elif "k_proj" in name:
+            tensor = tensor * self.key_multiplier * self.attention_in_multiplier
+        elif "q_proj" in name:
+            tensor = tensor * self.attention_in_multiplier
+        elif "v_proj" in name:
+            tensor = tensor * self.attention_in_multiplier
+        elif "o_proj" in name:
+            tensor = tensor * self.attention_out_multiplier
+        elif "out_proj" in name:
+            tensor = tensor * self.ssm_out_multiplier
+        elif "in_proj" in name:
+            tensor = tensor * self.ssm_in_multiplier
+            zxbcdt_multipliers = self.hparams["ssm_multipliers"]
+            intermediate_size = self.hparams["mamba_d_ssm"]
+            groups_time_state_size = self.hparams["mamba_n_groups"] * self.hparams["mamba_d_state"]
+            tensor[:intermediate_size, :] *= zxbcdt_multipliers[0]
+            tensor[intermediate_size:2 * intermediate_size, :] *= zxbcdt_multipliers[1]
+            tensor[2 * intermediate_size:2 * intermediate_size + groups_time_state_size, :] *= zxbcdt_multipliers[2]
+            tensor[2 * intermediate_size + groups_time_state_size:2 * intermediate_size + 2 * groups_time_state_size, :] *= zxbcdt_multipliers[3]
+            tensor[2 * intermediate_size + 2 * groups_time_state_size:, :] *= zxbcdt_multipliers[4]
+        elif "lm_head" in name:
+            tensor = tensor * self.hparams["lm_head_multiplier"]
+        elif "embed_tokens" in name:
+            tensor = tensor * self.hparams["embedding_multiplier"]
+        elif "mamba.norm" in name:
+            tensor = tensor.reshape(self.n_group, self.d_inner // self.n_group)
+
+        tensors = [(tensors[0][0], tensor)]
+        return tensors
+
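The in_proj branch above scales five row bands of the packed projection separately; in the usual Mamba2 packing these are z, x, B, C and dt (the remainder, one row per SSM head), though that layout is an assumption here, not something the diff states. A sketch with made-up sizes, mirroring the slicing:

    import torch

    # Hypothetical sizes standing in for mamba_d_ssm, mamba_n_groups * mamba_d_state, n_heads.
    d_ssm, gts, n_heads, d_model = 4, 2, 3, 5
    multipliers = [2.0, 3.0, 5.0, 7.0, 11.0]  # stands in for ssm_multipliers

    w = torch.ones(2 * d_ssm + 2 * gts + n_heads, d_model)
    w[:d_ssm, :] *= multipliers[0]                               # z rows
    w[d_ssm:2 * d_ssm, :] *= multipliers[1]                      # x rows
    w[2 * d_ssm:2 * d_ssm + gts, :] *= multipliers[2]            # B rows
    w[2 * d_ssm + gts:2 * d_ssm + 2 * gts, :] *= multipliers[3]  # C rows
    w[2 * d_ssm + 2 * gts:, :] *= multipliers[4]                 # dt rows (the remainder)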
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        ## General Params ##
+        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
+        # Override some Mamba2 defaults
+        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_context_length(self.hparams.get("max_position_embeddings", 0))
+        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
+
+        ## Attention params ##
+        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])  # Override value 0 from Mamba2
+        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
+        self.gguf_writer.add_key_length(self.hparams["head_dim"])
+        self.gguf_writer.add_value_length(self.hparams["head_dim"])
+
+        ## Validation ##
+        assert self.hparams.get("hidden_act") in [None, "silu"], "Only SILU activation supported"
+        assert self.d_inner % self.d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {self.d_head}"
+
+        # Add any other Falcon Mamba2 specific configuration
+        self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
+
+
+@ModelBase.register("HunYuanMoEV1ForCausalLM")
+class HunYuanMoEModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.HUNYUAN_MOE
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # For handling tied embeddings
+        self._tok_embd = None
+
+    def set_vocab(self):
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+
+        # 1. Get the pre-tokenizer identifier hash
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
+        # 2. Reverse-engineer the merges list from mergeable_ranks
+        merges = []
+        vocab = {}
+        mergeable_ranks = tokenizer.mergeable_ranks
+        for token, rank in mergeable_ranks.items():
+            vocab[QwenModel.token_bytes_to_string(token)] = rank
+            if len(token) == 1:
+                continue
+            merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
+            if len(merged) == 2:  # todo this is an assert in Qwen, why?
+                merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
+
+        # 3. Generate the tokens and toktypes lists
+        vocab_size = self.hparams["vocab_size"]
+        assert tokenizer.vocab_size == vocab_size
+        special_tokens = tokenizer.special_tokens
+        reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
+        tokens: list[str] = []
+        toktypes: list[int] = []
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                tokens.append(f"[PAD{i}]")
+                toktypes.append(gguf.TokenType.UNUSED)
+            else:
+                token = reverse_vocab[i]
+                tokens.append(token)
+                if i in special_tokens.values():
+                    toktypes.append(gguf.TokenType.CONTROL)
+                else:
+                    toktypes.append(gguf.TokenType.NORMAL)
+
+        # 4. Write all vocab-related fields to the GGUF writer
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_token_merges(merges)
+
+        # 5. Add special tokens and chat templates
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
+        special_vocab.add_to_gguf(self.gguf_writer)
+        # FIX for BOS token: Overwrite incorrect id read from config.json
+        self.gguf_writer.add_bos_token_id(127959)  # <|bos|>
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+
+        self.gguf_writer.add_expert_count(hparams["num_experts"])
+        self.gguf_writer.add_expert_shared_feed_forward_length(hparams["intermediate_size"])
+
+        moe_intermediate_size = hparams["moe_intermediate_size"]
+        assert all(n == moe_intermediate_size[0] for n in moe_intermediate_size)
+        self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size[0])
+
+        moe_topk = hparams["moe_topk"]
+        assert all(topk == moe_topk[0] for topk in moe_topk)
+        self.gguf_writer.add_expert_used_count(moe_topk[0])
+
+        moe_shared_expert = hparams["num_shared_expert"]
+        assert all(n == moe_shared_expert[0] for n in moe_shared_expert)
+        self.gguf_writer.add_expert_shared_count(moe_shared_expert[0])
+
+        # Rope
+        rope_scaling = hparams.get("rope_scaling", {})
+        if rope_scaling.get("type") == "dynamic":
+            # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
+            # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf)
+            alpha = rope_scaling.get("alpha", 1000)
+            base = hparams.get("rope_theta", 10000.0)
+            dim = (hparams["hidden_size"] // hparams["num_attention_heads"])  # 128
+            scaled_base = base * (alpha ** (dim / (dim - 2)))  # 10000 * (1000 ** (128 / 126)) = 11158839.9251
+            self.gguf_writer.add_rope_freq_base(scaled_base)
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+            self.gguf_writer.add_rope_scaling_factor(1)
+            # There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024)  # 256k context length
+            self.gguf_writer.add_context_length(256 * 1024)  # 256k context length
+
+            # if any of our assumptions about the values are wrong, something has changed and this may need to be updated
+            assert alpha == 1000 and base == 10000.0 and dim == 128 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024], \
+                "HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually"
+
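A quick check of the NTK-aware arithmetic above, using the values the assert pins down (alpha=1000, base=10000, dim=128):

    # Reproduces the scaled_base computation from the branch above.
    alpha = 1000
    base = 10000.0
    dim = 128  # hidden_size // num_attention_heads

    scaled_base = base * (alpha ** (dim / (dim - 2)))
    print(f"{scaled_base:.4f}")  # ~11158839.9251, matching the inline comment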
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name == "model.embed_tokens.weight":
+            self._tok_embd = data_torch.clone()
+
+        if name == "lm_head.weight":
+            if self.hparams.get("tie_word_embeddings", False):
+                logger.info("Skipping tied output layer 'lm_head.weight'")
+                return []
+
+        if name.find("mlp.experts") != -1:
+            n_experts = self.hparams["num_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                # merge the experts into a single 3d tensor
+                tensors: list[tuple[str, Tensor]] = []
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+                    new_name = self.map_tensor_name(merged_name)
+                    tensors.append((new_name, data_torch))
+
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
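The expert-merging path buffers per-expert 2D weights until all n_experts * 3 tensors for a block have arrived, then stacks each projection into a single 3D tensor. A minimal sketch of the stacking step (shapes are illustrative):

    import torch

    # Hypothetical: 4 experts, each with an (out, in) = (8, 16) down_proj weight.
    n_experts = 4
    datas = [torch.randn(8, 16) for _ in range(n_experts)]

    merged = torch.stack(datas, dim=0)  # one expert per leading index
    assert merged.shape == (n_experts, 8, 16)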
+    def prepare_tensors(self):
+        super().prepare_tensors()
+        if self._experts is not None:
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
+@ModelBase.register("SmolLM3ForCausalLM")
+class SmolLM3Model(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.SMOLLM3
+
+    def set_vocab(self):
+        super().set_vocab()
+        # remove unsupported array slicing in chat template
+        # ref: https://huggingface.co/ggml-org/SmolLM3-3B-GGUF/discussions/1
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+        if tokenizer.chat_template is not None:
+            chat_template = tokenizer.chat_template.replace("[:]", "")
+            self.gguf_writer.add_chat_template(chat_template)
+
+
+@ModelBase.register("Lfm2ForCausalLM")
+@ModelBase.register("LFM2ForCausalLM")
+class LFM2Model(TextModel):
+    model_arch = gguf.MODEL_ARCH.LFM2
+
+    def _add_feed_forward_length(self):
+        auto_adjust_ff_dim = self.hparams["block_auto_adjust_ff_dim"]
+        ff_dim = self.hparams["block_ff_dim"]
+        ffn_dim_multiplier = self.hparams["block_ffn_dim_multiplier"]
+        multiple_of = self.hparams["block_multiple_of"]
+
+        if auto_adjust_ff_dim:
+            ff_dim = int(2 * ff_dim / 3)
+            # custom dim factor multiplier
+            if ffn_dim_multiplier is not None:
+                ff_dim = int(ffn_dim_multiplier * ff_dim)
+            ff_dim = multiple_of * ((ff_dim + multiple_of - 1) // multiple_of)
+
+        self.gguf_writer.add_feed_forward_length(ff_dim)
+
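With block_auto_adjust_ff_dim set, the width is scaled to 2/3 (the usual SwiGLU-style correction), optionally multiplied, then rounded up to a multiple of block_multiple_of. Worked numbers (illustrative, not from a real LFM2 config):

    # Hypothetical config values.
    ff_dim = 10000
    ffn_dim_multiplier = 1.0
    multiple_of = 256

    ff_dim = int(2 * ff_dim / 3)                   # 6666
    if ffn_dim_multiplier is not None:
        ff_dim = int(ffn_dim_multiplier * ff_dim)  # 6666
    ff_dim = multiple_of * ((ff_dim + multiple_of - 1) // multiple_of)
    assert ff_dim == 6912                          # rounded up to the next multiple of 256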
+    def set_gguf_parameters(self):
+        # set num_key_value_heads only for attention layers
+        self.hparams["num_key_value_heads"] = [
+            self.hparams["num_key_value_heads"] if layer_type == "full_attention" else 0
+            for layer_type in self.hparams["layer_types"]
+        ]
+
+        super().set_gguf_parameters()
+        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
+        self.gguf_writer.add_shortconv_l_cache(self.hparams["conv_L_cache"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["norm_eps"])
+        self._add_feed_forward_length()
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # conv op requires 2d tensor
+        if 'conv.conv' in name:
+            data_torch = data_torch.squeeze(1)
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
 ###### CONVERSION LOGIC ######
 
 
@@ -6471,12 +7300,20 @@ def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> st
     # maybe we should fallback to text model's arch in that case, since not many models have both
     text_config = hparams.get("text_config", {})
     vision_config = hparams.get("vision_config", {})
-    arch = hparams["architectures"][0]
+    arch = None
+    if (arches := hparams.get("architectures")) is not None and len(arches) > 0:
+        arch = arches[0]
+    elif "ssm_cfg" in hparams:
+        # For non-hf Mamba and Mamba2 models
+        arch = hparams["ssm_cfg"].get("layer", "Mamba") + "ForCausalLM"
+
     # if "architectures" is found in the sub-config, use that instead
     if model_type == ModelType.TEXT and text_config.get("architectures") is not None:
         arch = text_config["architectures"][0]
     elif model_type == ModelType.MMPROJ and vision_config.get("architectures") is not None:
         arch = vision_config["architectures"][0]
+    if arch is None:
+        raise ValueError("Failed to detect model architecture")
     return arch
 
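The new fallback lets configs without an architectures list (plain Mamba/Mamba2 checkpoints) still resolve. A condensed sketch of the detection order, with hypothetical configs:

    # Hypothetical config dicts illustrating the three outcomes.
    hf_style = {"architectures": ["LlamaForCausalLM"]}
    mamba_style = {"ssm_cfg": {"layer": "Mamba2"}}
    empty = {}

    def detect(hparams):
        arch = None
        if (arches := hparams.get("architectures")) is not None and len(arches) > 0:
            arch = arches[0]
        elif "ssm_cfg" in hparams:
            arch = hparams["ssm_cfg"].get("layer", "Mamba") + "ForCausalLM"
        if arch is None:
            raise ValueError("Failed to detect model architecture")
        return arch

    assert detect(hf_style) == "LlamaForCausalLM"
    assert detect(mamba_style) == "Mamba2ForCausalLM"
    # detect(empty) raises ValueError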