@novastera-oss/llamarn 0.2.9 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (314)
  1. package/android/build.gradle +2 -1
  2. package/android/proguard-rules.pro +12 -0
  3. package/android/src/main/cpp/include/llama.h +15 -47
  4. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  8. package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
  9. package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
  10. package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
  11. package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
  12. package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
  13. package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
  14. package/android/src/main/jniLibs/x86/libggml.so +0 -0
  15. package/android/src/main/jniLibs/x86/libllama.so +0 -0
  16. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  17. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  18. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  19. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  20. package/cpp/build-info.cpp +2 -2
  21. package/cpp/llama.cpp/CMakeLists.txt +0 -1
  22. package/cpp/llama.cpp/CMakePresets.json +11 -0
  23. package/cpp/llama.cpp/CODEOWNERS +1 -0
  24. package/cpp/llama.cpp/README.md +8 -8
  25. package/cpp/llama.cpp/build-xcframework.sh +1 -1
  26. package/cpp/llama.cpp/common/CMakeLists.txt +4 -5
  27. package/cpp/llama.cpp/common/arg.cpp +62 -1
  28. package/cpp/llama.cpp/common/chat.cpp +37 -20
  29. package/cpp/llama.cpp/common/chat.h +2 -0
  30. package/cpp/llama.cpp/common/common.cpp +22 -6
  31. package/cpp/llama.cpp/common/common.h +22 -4
  32. package/cpp/llama.cpp/convert_hf_to_gguf.py +1250 -43
  33. package/cpp/llama.cpp/convert_hf_to_gguf_update.py +21 -13
  34. package/cpp/llama.cpp/ggml/CMakeLists.txt +13 -3
  35. package/cpp/llama.cpp/ggml/cmake/ggml-config.cmake.in +85 -47
  36. package/cpp/llama.cpp/ggml/include/ggml-backend.h +1 -1
  37. package/cpp/llama.cpp/ggml/include/ggml-webgpu.h +19 -0
  38. package/cpp/llama.cpp/ggml/include/ggml.h +173 -10
  39. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +1 -1
  40. package/cpp/llama.cpp/ggml/src/ggml-alloc.c +0 -15
  41. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +7 -8
  42. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +44 -38
  43. package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +3 -1
  44. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +126 -8
  45. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +130 -22
  46. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +138 -18
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +11 -3
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +1 -1
  49. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +28 -1
  50. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +109 -12
  51. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +3 -0
  52. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +88 -10
  53. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +343 -1094
  54. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +1206 -163
  55. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +6 -0
  56. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +0 -1
  57. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +1 -1
  58. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +36 -9
  59. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +142 -9
  60. package/cpp/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +3 -3
  61. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +31 -4
  62. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu +86 -17
  63. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +5 -0
  64. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy-utils.cuh +225 -0
  65. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +41 -301
  66. package/cpp/llama.cpp/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
  67. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +85 -64
  68. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +47 -60
  69. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +29 -42
  70. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +46 -59
  71. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +36 -45
  72. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +38 -45
  73. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +23 -36
  74. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +3 -13
  75. package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +8 -0
  76. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +255 -99
  77. package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cu +1 -1
  78. package/cpp/llama.cpp/ggml/src/ggml-cuda/mma.cuh +111 -3
  79. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +6 -4
  80. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +1152 -695
  81. package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cu +92 -5
  82. package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cuh +2 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cu +21 -27
  84. package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cu +8 -6
  85. package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cu +275 -0
  86. package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cuh +7 -0
  87. package/cpp/llama.cpp/ggml/src/ggml-cuda/softmax.cu +119 -58
  88. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
  89. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +192 -52
  90. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +104 -0
  91. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +13 -0
  92. package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cu +92 -6
  93. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +27 -6
  94. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +2 -2
  95. package/cpp/llama.cpp/ggml/src/ggml-impl.h +80 -0
  96. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -2
  97. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +48 -12
  98. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +572 -106
  99. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +599 -105
  100. package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +18 -4
  101. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +5 -0
  102. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +800 -42
  103. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  104. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  105. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
  106. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/glu.cl +337 -0
  107. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
  108. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
  109. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  110. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +79 -0
  111. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
  112. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +95 -0
  113. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +24 -11
  114. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +24 -11
  115. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +24 -11
  116. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +24 -11
  117. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
  118. package/cpp/llama.cpp/ggml/src/ggml-quants.c +6 -6
  119. package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +4 -4
  120. package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  121. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +693 -1034
  122. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +18 -9
  123. package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +14 -26
  124. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +191 -55
  125. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
  126. package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +8 -9
  127. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +15 -18
  128. package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +131 -0
  129. package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  130. package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +2 -6
  131. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +991 -307
  132. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +265 -0
  133. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +59 -12
  134. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +1 -1
  135. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
  136. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +1 -1
  137. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +1 -1
  138. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
  139. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +28 -23
  140. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +14 -9
  141. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +38 -32
  142. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +32 -27
  143. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +44 -12
  144. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  145. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  146. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  147. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  148. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +2 -0
  149. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +17 -0
  150. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
  151. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +3 -8
  152. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +128 -72
  153. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +38 -9
  154. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  155. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +18 -3
  156. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  157. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +1 -4
  158. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
  159. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
  160. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
  161. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rte.comp +5 -0
  162. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
  163. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +20 -4
  164. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  165. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
  166. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +84 -9
  167. package/cpp/llama.cpp/ggml/src/ggml-webgpu/CMakeLists.txt +54 -0
  168. package/cpp/llama.cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +907 -0
  169. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +60 -0
  170. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +35 -0
  171. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  172. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +56 -0
  173. package/cpp/llama.cpp/ggml/src/ggml.c +386 -67
  174. package/cpp/llama.cpp/ggml/src/gguf.cpp +8 -1
  175. package/cpp/llama.cpp/gguf-py/gguf/constants.py +307 -0
  176. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +8 -2
  177. package/cpp/llama.cpp/gguf-py/gguf/metadata.py +4 -0
  178. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_dump.py +24 -1
  179. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +122 -47
  180. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +12 -3
  181. package/cpp/llama.cpp/include/llama.h +15 -47
  182. package/cpp/llama.cpp/models/templates/llama-cpp-rwkv-world.jinja +34 -0
  183. package/cpp/llama.cpp/models/templates/moonshotai-Kimi-K2.jinja +43 -0
  184. package/cpp/llama.cpp/requirements/requirements-all.txt +1 -0
  185. package/cpp/llama.cpp/requirements/requirements-server-bench.txt +5 -0
  186. package/cpp/llama.cpp/src/llama-arch.cpp +316 -3
  187. package/cpp/llama.cpp/src/llama-arch.h +23 -1
  188. package/cpp/llama.cpp/src/llama-batch.cpp +103 -71
  189. package/cpp/llama.cpp/src/llama-batch.h +31 -18
  190. package/cpp/llama.cpp/src/llama-chat.cpp +58 -1
  191. package/cpp/llama.cpp/src/llama-chat.h +3 -0
  192. package/cpp/llama.cpp/src/llama-context.cpp +180 -106
  193. package/cpp/llama.cpp/src/llama-context.h +26 -16
  194. package/cpp/llama.cpp/src/llama-cparams.h +3 -2
  195. package/cpp/llama.cpp/src/llama-graph.cpp +310 -211
  196. package/cpp/llama.cpp/src/llama-graph.h +184 -122
  197. package/cpp/llama.cpp/src/llama-hparams.cpp +47 -1
  198. package/cpp/llama.cpp/src/llama-hparams.h +13 -2
  199. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +38 -22
  200. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +7 -2
  201. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +849 -304
  202. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +143 -47
  203. package/cpp/llama.cpp/src/llama-kv-cells.h +62 -10
  204. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +10 -4
  205. package/cpp/llama.cpp/src/llama-memory-hybrid.h +3 -1
  206. package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +36 -11
  207. package/cpp/llama.cpp/src/llama-memory.cpp +17 -0
  208. package/cpp/llama.cpp/src/llama-memory.h +3 -0
  209. package/cpp/llama.cpp/src/llama-model.cpp +3545 -719
  210. package/cpp/llama.cpp/src/llama-model.h +21 -4
  211. package/cpp/llama.cpp/src/llama-quant.cpp +2 -2
  212. package/cpp/llama.cpp/src/llama-vocab.cpp +376 -10
  213. package/cpp/llama.cpp/src/llama-vocab.h +43 -0
  214. package/cpp/llama.cpp/src/unicode.cpp +207 -0
  215. package/cpp/llama.cpp/src/unicode.h +2 -0
  216. package/ios/include/chat.h +2 -0
  217. package/ios/include/common.h +22 -4
  218. package/ios/include/llama.h +15 -47
  219. package/ios/libs/llama.xcframework/Info.plist +13 -13
  220. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  221. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5267 -4890
  222. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h +1 -1
  223. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +173 -10
  224. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +15 -47
  225. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  226. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  227. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5238 -4861
  228. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4014 -3764
  229. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
  230. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
  231. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +15 -47
  232. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  233. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  234. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5238 -4861
  235. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4016 -3766
  236. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-backend.h +1 -1
  237. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +173 -10
  238. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +15 -47
  239. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-backend.h +1 -1
  240. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +173 -10
  241. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +15 -47
  242. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  243. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-backend.h +1 -1
  244. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +173 -10
  245. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +15 -47
  246. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  247. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  248. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  249. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5267 -4890
  250. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-backend.h +1 -1
  251. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +173 -10
  252. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +15 -47
  253. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  254. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  255. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5238 -4861
  256. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4014 -3764
  257. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
  258. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
  259. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +15 -47
  260. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  261. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  262. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5303 -4926
  263. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-backend.h +1 -1
  264. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +173 -10
  265. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +15 -47
  266. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  267. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  268. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5274 -4897
  269. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4044 -3794
  270. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
  271. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
  272. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +15 -47
  273. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  274. package/package.json +4 -4
  275. package/cpp/llama.cpp/ggml/include/ggml-kompute.h +0 -50
  276. package/cpp/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  277. package/cpp/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  278. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
  279. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
  280. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
  281. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
  282. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
  283. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
  284. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
  285. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
  286. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
  287. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
  288. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
  289. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
  290. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
  291. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
  292. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
  293. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
  294. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
  295. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
  296. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
  297. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
  298. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
  299. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
  300. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
  301. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
  302. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
  303. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
  304. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
  305. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
  306. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
  307. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
  308. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
  309. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
  310. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
  311. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
  312. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
  313. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
  314. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
@@ -300,6 +300,7 @@ class ModelBase:
  gguf.MODEL_TENSOR.POS_EMBD,
  gguf.MODEL_TENSOR.TOKEN_TYPES,
  gguf.MODEL_TENSOR.SSM_CONV1D,
+ gguf.MODEL_TENSOR.SHORTCONV_CONV,
  gguf.MODEL_TENSOR.TIME_MIX_FIRST,
  gguf.MODEL_TENSOR.TIME_MIX_W1,
  gguf.MODEL_TENSOR.TIME_MIX_W2,
@@ -668,6 +669,36 @@ class TextModel(ModelBase):
  # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script
  # or pull the latest version of the model from Huggingface
  # don't edit the hashes manually!
+ if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
+ # ref: https://huggingface.co/THUDM/glm-4-9b-chat
+ res = "chatglm-bpe"
+ if chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516":
+ # ref: https://huggingface.co/THUDM/glm-4-9b-chat
+ res = "chatglm-bpe"
+ if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
+ # ref: https://huggingface.co/THUDM/glm-4-9b-hf
+ res = "glm4"
+ if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
+ # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
+ res = "minerva-7b"
+ if chkhsh == "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664":
+ # ref: https://huggingface.co/tencent/Hunyuan-A13B-Instruct
+ res = "hunyuan"
+ if chkhsh == "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6":
+ # ref: https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base
+ res = "falcon-h1"
+ if chkhsh == "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86":
+ # ref: https://huggingface.co/tiiuae/Falcon-H1-1B-Base
+ res = "falcon-h1"
+ if chkhsh == "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896":
+ # ref: https://huggingface.co/tiiuae/Falcon-H1-7B-Base
+ res = "falcon-h1"
+ if chkhsh == "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b":
+ # ref: https://huggingface.co/tiiuae/Falcon-H1-34B-Base
+ res = "falcon-h1"
+ if chkhsh == "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890":
+ # ref: https://huggingface.co/moonshotai/Kimi-K2-Base
+ res = "kimi-k2"
  if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
  # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
  res = "llama-bpe"
@@ -803,18 +834,18 @@ class TextModel(ModelBase):
  if chkhsh == "d5f1dd6f980fec569fb218a81a7658ac45fc56b38c5a0adeb1c232fbe04ef5ec":
  # ref: https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base
  res = "seed-coder"
- if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
- # ref: https://huggingface.co/THUDM/glm-4-9b-chat
- res = "chatglm-bpe"
- if chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516":
- # ref: https://huggingface.co/THUDM/glm-4-9b-chat
- res = "chatglm-bpe"
- if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
- # ref: https://huggingface.co/THUDM/glm-4-9b-hf
- res = "glm4"
- if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
- # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
- res = "minerva-7b"
+ if chkhsh == "b0a6b1c0bd5998ebd9df08611efde34a4ff03faed45ae09c43e6b31ebd4b94cf":
+ # ref: https://huggingface.co/skt/A.X-4.0
+ res = "a.x-4.0"
+ if chkhsh == "f6791d196f87ce6b56a7d234be618e0d58f8cda3549416635b2bebcd22cd95c4":
+ # ref: https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct
+ res = "midm-2.0"
+ if chkhsh == "169bf0296a13c4d9b7672313f749eb36501d931022de052aad6e36f2bf34dd51":
+ # ref: https://huggingface.co/LiquidAI/LFM2-Tokenizer
+ res = "lfm2"
+ if chkhsh == "2085e1638f6c377a0aa4ead21b27bb4cb941bf800df86ed391011769c1758dfb":
+ # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B
+ res = "exaone4"

  if res is None:
  logger.warning("\n")
@@ -1057,7 +1088,14 @@ class TextModel(ModelBase):
  self.gguf_writer.add_token_list(tokens)
  self.gguf_writer.add_token_types(toktypes)
  special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
- special_vocab.chat_template = "rwkv-world"
+ if special_vocab.chat_template is None:
+ template_path = Path(__file__).parent / "models" / "templates" / "llama-cpp-rwkv-world.jinja"
+ if template_path.is_file():
+ with open(template_path, "r", encoding="utf-8") as f:
+ template = f.read()
+ else:
+ template = "rwkv-world"
+ special_vocab.chat_template = template
  # hack: Add '\n\n' as the EOT token to make it chat normally
  special_vocab._set_special_token("eot", 261)
  # hack: Override these as they have already been set (incorrectly)
@@ -2743,6 +2781,210 @@ class Qwen2Model(TextModel):
  yield from super().modify_tensors(data_torch, name, bid)


+ @ModelBase.register("DreamModel")
+ class DreamModel(TextModel):
+ model_arch = gguf.MODEL_ARCH.DREAM
+
+ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
+ tokens: list[str] = []
+ toktypes: list[int] = []
+
+ from transformers import AutoTokenizer
+ tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+
+ vocab_dict = tokenizer.get_vocab()
+ vocab_size = self.hparams.get("vocab_size", len(vocab_dict))
+ assert max(vocab_dict.values()) < vocab_size
+
+ tokpre = self.get_vocab_base_pre(tokenizer)
+
+ reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab_dict.items()}
+ added_vocab = tokenizer.get_added_vocab()
+
+ for i in range(vocab_size):
+ if i not in reverse_vocab:
+ tokens.append(f"[PAD{i}]")
+ toktypes.append(gguf.TokenType.UNUSED)
+ elif reverse_vocab[i] in added_vocab:
+ tokens.append(reverse_vocab[i])
+ # Check if it's a special token - treat special tokens as CONTROL tokens
+ if hasattr(tokenizer, 'added_tokens_decoder') and i in tokenizer.added_tokens_decoder:
+ if tokenizer.added_tokens_decoder[i].special:
+ toktypes.append(gguf.TokenType.CONTROL)
+ else:
+ toktypes.append(gguf.TokenType.USER_DEFINED)
+ else:
+ # Fallback: treat all added vocab as control tokens for special tokens like <|im_start|>
+ toktypes.append(gguf.TokenType.CONTROL)
+ else:
+ tokens.append(reverse_vocab[i])
+ toktypes.append(gguf.TokenType.NORMAL)
+
+ return tokens, toktypes, tokpre
+
+ def set_vocab(self):
+ try:
+ self._set_vocab_sentencepiece()
+ except FileNotFoundError:
+ self._set_vocab_gpt2()
+
+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+ self._try_set_pooling_type()
+
+ # Dream models use non-causal attention for diffusion
+ self.gguf_writer.add_causal_attention(False)
+ # Handle RoPE scaling similar to Qwen2
+ rope_scaling = self.hparams.get("rope_scaling") or {}
+ if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+ self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
+ self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
+
+ # Add Dream-specific parameters
+ mask_token_id = self.hparams.get("mask_token_id")
+ if mask_token_id is not None:
+ self.gguf_writer.add_mask_token_id(mask_token_id)
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ # Dream model tensors should be mapped directly since it's the base model
+ yield from super().modify_tensors(data_torch, name, bid)
+
+
+ @ModelBase.register("Ernie4_5_ForCausalLM")
+ class Ernie4_5Model(TextModel):
+ model_arch = gguf.MODEL_ARCH.ERNIE4_5
+
+ def set_vocab(self):
+ self._set_vocab_sentencepiece()
+
+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ num_heads = self.hparams["num_attention_heads"]
+ num_kv_heads = self.hparams["num_key_value_heads"]
+ if (head_dim := self.hparams.get("head_dim")) is None:
+ head_dim = self.hparams["hidden_size"] // num_heads
+
+ if "ernie." in name:
+ name = name.replace("ernie.", "model.")
+ # split the qkv weights
+ # qkv_proj shape: [(num_heads + 2 * num_kv_heads) * head_dim, hidden_size]
+ if "qkv_proj" in name:
+ name_q = name.replace("qkv_proj.weight", "q_proj.weight")
+ name_k = name.replace("qkv_proj.weight", "k_proj.weight")
+ name_v = name.replace("qkv_proj.weight", "v_proj.weight")
+ total_q_dim = num_heads * head_dim
+ total_k_dim = num_kv_heads * head_dim
+ total_v_dim = num_kv_heads * head_dim
+ q_proj_weight, k_proj_weight, v_proj_weight = data_torch.split([total_q_dim, total_k_dim, total_v_dim], dim=0)
+ return [
+ (self.map_tensor_name(name_q), q_proj_weight),
+ (self.map_tensor_name(name_k), k_proj_weight),
+ (self.map_tensor_name(name_v), v_proj_weight)
+ ]
+ # split the up_gate_proj into gate and up
+ # up_gate_proj shape: [2 * intermediate_size, hidden_size]
+ if "up_gate_proj" in name:
+ name_up = name.replace("up_gate_proj.weight", "up_proj.weight")
+ name_gate = name.replace("up_gate_proj.weight", "gate_proj.weight")
+ dim_half = data_torch.shape[0] // 2
+ gate_proj_weight, up_proj_weight = data_torch.split(dim_half, dim=0)
+ return [
+ (self.map_tensor_name(name_gate), gate_proj_weight),
+ (self.map_tensor_name(name_up), up_proj_weight)
+ ]
+ return [(self.map_tensor_name(name), data_torch)]
+
+
+ @ModelBase.register("Ernie4_5_MoeForCausalLM")
+ class Ernie4_5MoeModel(Ernie4_5Model):
+ model_arch = gguf.MODEL_ARCH.ERNIE4_5_MOE
+ _experts: list[dict[str, Tensor]] | None = None
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self._experts = [{} for _ in range(self.block_count)]
+
+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+ self.gguf_writer.add_expert_count(self.hparams["moe_num_experts"])
+ self.gguf_writer.add_expert_used_count(self.hparams["moe_k"])
+ self.gguf_writer.add_interleave_moe_layer_step(self.hparams["moe_layer_interval"])
+ self.gguf_writer.add_leading_dense_block_count(self.hparams["moe_layer_start_index"])
+ if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
+ self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
+ if (shared_expert_count := self.hparams.get('moe_num_shared_experts')) is not None:
+ self.gguf_writer.add_expert_shared_count(shared_expert_count)
+ if shared_expert_count > 0 and (shared_expert_intermediate_size := self.hparams.get('intermediate_size')) is not None and (num_key_value_heads := self.hparams.get('num_key_value_heads')) is not None:
+ self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size // num_key_value_heads)
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ # Modify correction bias name as in DeepseekV2
+ if name.endswith("e_score_correction_bias"):
+ name = name.replace("e_score_correction_bias", "e_score_correction.bias")
+
+ # skip Multi-Token Prediction (MTP) layers (again, same as DeepseekV2)
+ match = re.match(r"model.mtp_block.(\d+)", name)
+ if match:
+ return []
+
+ # skip all other MTP tensors for now
+ match = re.match(r"model.mtp_emb_norm.(\d+)", name)
+ if match:
+ return []
+
+ match = re.match(r"model.mtp_hidden_norm.(\d+)", name)
+ if match:
+ return []
+
+ match = re.match(r"model.mtp_linear_proj.(\d+)", name)
+ if match:
+ return []
+
+ # process the experts separately
+ if name.find("mlp.experts") != -1:
+ n_experts = self.hparams["moe_num_experts"]
+ assert bid is not None
+
+ if self._experts is None:
+ self._experts = [{} for _ in range(self.block_count)]
+
+ self._experts[bid][name] = data_torch
+
+ if len(self._experts[bid]) >= n_experts * 3:
+ tensors: list[tuple[str, Tensor]] = []
+
+ # merge the experts into a single 3d tensor
+ for w_name in ["gate_proj", "up_proj", "down_proj"]:
+ datas: list[Tensor] = []
+
+ for xid in range(n_experts):
+ ename_to_retrieve = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+ datas.append(self._experts[bid][ename_to_retrieve])
+ del self._experts[bid][ename_to_retrieve]
+
+ data_torch = torch.stack(datas, dim=0)
+ merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+ new_name = self.map_tensor_name(merged_name)
+ tensors.append((new_name, data_torch))
+
+ return tensors
+ else:
+ return []
+ return [(self.map_tensor_name(name), data_torch)]
+
+ def prepare_tensors(self):
+ super().prepare_tensors()
+
+ if self._experts is not None:
+ # flatten `list[dict[str, Tensor]]` into `list[str]`
+ experts = [k for d in self._experts for k in d.keys()]
+ if len(experts) > 0:
+ raise ValueError(f"Unprocessed experts: {experts}")
+
+
  @ModelBase.register(
  "Qwen2VLModel",
  "Qwen2VLForConditionalGeneration",
@@ -3430,6 +3672,175 @@ class PlamoModel(TextModel):
  return [(new_name, data_torch)]


+ @ModelBase.register("Plamo2ForCausalLM", "PLaMo2ForCausalLM")
+ class Plamo2Model(TextModel):
+ model_arch = gguf.MODEL_ARCH.PLAMO2
+
+ def set_vocab(self):
+ # PLaMo 2 uses a custom tokenizer with a .jsonl file
+ # We need to handle this specially
+ tokenizer_jsonl_path = self.dir_model / "tokenizer.jsonl"
+ tokenizer_config_path = self.dir_model / "tokenizer_config.json"
+
+ if not tokenizer_jsonl_path.is_file():
+ raise FileNotFoundError(f"PLaMo 2 tokenizer file not found: {tokenizer_jsonl_path}")
+
+ # Load tokenizer config
+ with open(tokenizer_config_path, 'r', encoding='utf-8') as f:
+ tokenizer_config = json.load(f)
+
+ # Load tokens from JSONL file (actually a list format)
+ tokens = []
+ scores = []
+ toktypes = []
+
+ with open(tokenizer_jsonl_path, 'r', encoding='utf-8') as f:
+ for line_num, line in enumerate(f):
+ if line.strip():
+ token_data = json.loads(line)
+ # Format: [token, score, type, ?, ?, ?, ?]
+ token = token_data[0].encode("utf-8")
+ score = float(token_data[1])
+ token_type_str = token_data[2] if len(token_data) > 2 else "NORMAL"
+
+ tokens.append(token)
+ scores.append(score)
+
+ # Map token type strings to GGUF token types
+ if token_type_str == "UNKNOWN":
+ toktypes.append(gguf.TokenType.UNKNOWN)
+ elif token_type_str == "CONTROL":
+ toktypes.append(gguf.TokenType.CONTROL)
+ elif token_type_str == "BYTE":
+ toktypes.append(gguf.TokenType.BYTE)
+ else:
+ # Check for PLaMo-2 special tokens
+ token_str = token_data[0]
+ if token_str.startswith("<|plamo:") and token_str.endswith("|>"):
+ toktypes.append(gguf.TokenType.CONTROL)
+ else:
+ toktypes.append(gguf.TokenType.NORMAL)
+
+ vocab_size = self.hparams["vocab_size"]
+ if vocab_size > len(tokens):
+ pad_count = vocab_size - len(tokens)
+ logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
+ for i in range(1, pad_count + 1):
+ tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
+ scores.append(-1000.0)
+ toktypes.append(gguf.TokenType.UNUSED)
+
+ # Use "plamo2" tokenizer type for PLaMo-2's custom Aho-Corasick tokenizer
+ self.gguf_writer.add_tokenizer_model("plamo2")
+ self.gguf_writer.add_tokenizer_pre("default")
+ self.gguf_writer.add_token_list(tokens)
+ self.gguf_writer.add_token_scores(scores)
+ self.gguf_writer.add_token_types(toktypes)
+
+ # Add special tokens from config
+ if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] is not None:
+ token_id = tokens.index(tokenizer_config["bos_token"].encode("utf-8"))
+ self.gguf_writer.add_bos_token_id(token_id)
+ if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] is not None:
+ token_id = tokens.index(tokenizer_config["eos_token"].encode("utf-8"))
+ self.gguf_writer.add_eos_token_id(token_id)
+ if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] is not None:
+ token_id = tokens.index(tokenizer_config["pad_token"].encode("utf-8"))
+ self.gguf_writer.add_pad_token_id(token_id)
+ if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] is not None:
+ token_id = tokens.index(tokenizer_config["sep_token"].encode("utf-8"))
+ self.gguf_writer.add_sep_token_id(token_id)
+ if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] is not None:
+ token_id = tokens.index(tokenizer_config["unk_token"].encode("utf-8"))
+ self.gguf_writer.add_unk_token_id(token_id)
+
+ # Add <|plamo:op|> as EOT to ensure appropriate end of generation
+ self.gguf_writer.add_eot_token_id(4)
+
+ self.gguf_writer.add_add_space_prefix(False)
+
+ def set_gguf_parameters(self):
+ hparams = self.hparams
+ block_count = hparams["num_hidden_layers"]
+ self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
+
+ # Which layers are Mamba layers
+ # PLaMo 2 uses mamba_step to indicate the pattern (e.g., 2 means every other layer)
+ # This logic matches modeling_plamo.py's is_mamba function
+ mamba_step = hparams.get("mamba_step", 2)
+ mamba_enabled = hparams.get("mamba_enabled", True)
+ mamba_layers = []
+
+ if mamba_enabled:
+ for i in range(block_count):
+ if block_count <= (mamba_step // 2):
+ # use attention in last layer
+ is_mamba = (i != block_count - 1)
+ else:
+ is_mamba = (i % mamba_step) != (mamba_step // 2)
+ if is_mamba:
+ mamba_layers.append(0)
+ else:
+ mamba_layers.append(hparams.get("num_key_value_heads", 4))
+
+ if mamba_layers:
+ self.gguf_writer.add_head_count_kv(mamba_layers)
+
+ self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 2048))
+ self.gguf_writer.add_embedding_length(hparams.get("hidden_size", 4096))
+ self.gguf_writer.add_block_count(block_count)
+ self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 32))
+ self.gguf_writer.add_layer_norm_rms_eps(hparams.get("rms_norm_eps", 1e-06))
+ self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 10000))
+
+ # Mamba parameters
+ self.gguf_writer.add_ssm_state_size(hparams.get("mamba_d_state", 64))
+ self.gguf_writer.add_ssm_conv_kernel(hparams.get("mamba_d_conv", 4))
+ self.gguf_writer.add_ssm_time_step_rank(hparams.get("mamba_num_heads", 64))
+ intermediate_size = hparams.get("mamba_num_heads", 64) * hparams.get("hidden_size_per_head", 128)
+ self.gguf_writer.add_ssm_inner_size(intermediate_size)
+ self.gguf_writer.add_ssm_group_count(0)
+
+ # MLP feed forward parameters (for attention layers)
+ self.gguf_writer.add_feed_forward_length(hparams.get("intermediate_size", 13312))
+ self.gguf_writer.add_file_type(self.ftype)
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ del bid # unused
+
+ if name.endswith(".A_log"):
+ data_torch = -torch.exp(data_torch)
+ elif name.endswith(".dt_bias"):
+ name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias"
+ elif name.endswith(".dt_norm_weight"):
+ name = name.rpartition(".dt_norm_weight")[0] + ".dt_norm.weight"
+ elif name.endswith(".B_norm_weight"):
+ name = name.rpartition(".B_norm_weight")[0] + ".B_norm.weight"
+ elif name.endswith(".C_norm_weight"):
+ name = name.rpartition(".C_norm_weight")[0] + ".C_norm.weight"
+ elif name.endswith(".k_weight"):
+ name = name.rpartition(".k_weight")[0] + ".k.weight"
+ elif name.endswith(".q_weight"):
+ name = name.rpartition(".q_weight")[0] + ".q.weight"
+ elif name.endswith(".conv1d.weight"):
+ data_torch = torch.squeeze(data_torch) # remove (, 1, )
+ assert data_torch.ndim == 2
+ elif name.endswith(".pre_mixer_norm.weight"):
+ data_torch += 1.0
+ elif name.endswith(".post_mixer_norm.weight"):
+ data_torch += 1.0 / 5
+ elif name.endswith(".pre_mlp_norm.weight"):
+ data_torch += 1.0
+ elif name.endswith(".post_mlp_norm.weight"):
+ data_torch += 1.0 / (5**1.5)
+ elif name.endswith(".norm.weight"):
+ data_torch += 1.0
+
+ new_name = self.map_tensor_name(name)
+
+ return [(new_name, data_torch)]
+
+
  @ModelBase.register("CodeShellForCausalLM")
  class CodeShellModel(TextModel):
  model_arch = gguf.MODEL_ARCH.CODESHELL
@@ -4362,9 +4773,6 @@ class Gemma3NModel(Gemma3Model):
  ]

  def set_vocab(self):
- with open(self.dir_model / "chat_template.jinja") as f:
- # quick hack to make sure chat template is added
- self.gguf_writer.add_chat_template(f.read())
  super().set_vocab()

  def set_gguf_parameters(self):
@@ -4735,6 +5143,14 @@ class ARwkv7Model(Rwkv7Model):
  class MambaModel(TextModel):
  model_arch = gguf.MODEL_ARCH.MAMBA

+ def __init__(self, dir_model: Path, *args, **kwargs):
+ # Avoid using AutoConfig for hparams
+ hparams = kwargs.pop("hparams", None)
+ if hparams is None:
+ with open(dir_model / "config.json", "r", encoding="utf-8") as f:
+ hparams = json.load(f)
+ super().__init__(dir_model, *args, hparams=hparams, **kwargs)
+
  def set_vocab(self):
  vocab_size = self.hparams["vocab_size"]
  # Round vocab size to next multiple of 8
@@ -4809,36 +5225,246 @@ class MambaModel(TextModel):
  return [(new_name, data_torch)]


- @ModelBase.register("CohereForCausalLM")
- class CommandR2Model(TextModel):
- model_arch = gguf.MODEL_ARCH.COMMAND_R
+ @ModelBase.register("Mamba2ForCausalLM")
+ class Mamba2Model(TextModel):
+ model_arch = gguf.MODEL_ARCH.MAMBA2

- def __init__(self, *args, **kwargs):
- super().__init__(*args, **kwargs)
+ def __init__(self, dir_model: Path, *args, **kwargs):
+ # Avoid using AutoConfig for hparams
+ # It wrongly assumes all Mamba2 models are Mamba-Codestral-7B-v0.1
+ hparams = kwargs.pop("hparams", None)
+ if hparams is None:
+ with open(dir_model / "config.json", "r", encoding="utf-8") as f:
+ hparams = json.load(f)
+ super().__init__(dir_model, *args, hparams=hparams, **kwargs)
+ self.d_model = self.find_hparam(["hidden_size", "d_model", "dim"])
+ self.d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or 2 * self.d_model
+ self.n_group = self.find_hparam(["n_groups"], optional=True) or 1

- # max_position_embeddings = 8192 in config.json but model was actually
- # trained on 128k context length
- # aya-23 models don't have model_max_length specified
- self.hparams["max_position_embeddings"] = self.find_hparam(["model_max_length", "max_position_embeddings"])
+ def set_vocab(self):
+ vocab_size = self.hparams["vocab_size"]
+ # Round vocab size to next multiple of 16
+ pad_vocab = self.hparams.get("pad_vocab_size_multiple", 16)
+ # pad using ceiling division
+ # ref: https://stackoverflow.com/a/17511341/22827863
+ vocab_size = -(vocab_size // -pad_vocab) * pad_vocab
+ self.hparams["vocab_size"] = vocab_size
+
+ if (self.dir_model / "tokenizer.model").is_file():
+ self._set_vocab_sentencepiece()
+ elif (self.dir_model / "tokenizer.model.v3").is_file():
+ # mamba-codestral
+ raise NotImplementedError(f"Please rename {self.dir_model / 'tokenizer.model.v3'} to {self.dir_model / 'tokenizer.model'}")
+ elif (self.dir_model / "tokenizer.json").is_file():
+ self._set_vocab_gpt2()
+ else:
+ # Use the GPT-NeoX tokenizer when no tokenizer files are present
+ self._set_vocab_builtin("gpt-neox", vocab_size)

  def set_gguf_parameters(self):
- super().set_gguf_parameters()
- self.gguf_writer.add_logit_scale(self.hparams["logit_scale"])
- self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+ d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4
+ d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 128
+ head_dim = self.find_hparam(["mamba_d_head", "head_dim"], optional=True) or 64

+ rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5

- @ModelBase.register("Cohere2ForCausalLM")
- class Cohere2Model(TextModel):
- model_arch = gguf.MODEL_ARCH.COHERE2
+ # Fail early for models which don't have a block expansion factor of 2
+ # TODO: does this really matter?
+ # skip the assertion for FalconH1 Model
+ if self.model_arch != gguf.MODEL_ARCH.FALCON_H1:
+ assert self.d_inner == 2 * self.d_model
+ assert self.d_inner % head_dim == 0
+
+ self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
+ self.gguf_writer.add_embedding_length(self.d_model)
+ self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
+ self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading
+ self.gguf_writer.add_block_count(self.block_count)
+ self.gguf_writer.add_ssm_conv_kernel(d_conv)
+ self.gguf_writer.add_ssm_inner_size(self.d_inner)
+ self.gguf_writer.add_ssm_state_size(d_state)
+ self.gguf_writer.add_ssm_time_step_rank(self.d_inner // head_dim)
+ self.gguf_writer.add_ssm_group_count(self.n_group)
+ self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
+ self.gguf_writer.add_file_type(self.ftype)

- def set_gguf_parameters(self):
- super().set_gguf_parameters()
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:

- self.gguf_writer.add_logit_scale(self.hparams["logit_scale"])
- self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
- self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
+ if name.startswith("model.backbone") or name.startswith("model.lm_head"):
+ # map Mamba-Codestral-7B-v0.1 tensor names to the names used by Mamba-2
+ name = name.removeprefix("model.")

- rotary_pct = self.hparams["rotary_pct"]
+ if name.endswith(".dt_bias"):
+ name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias"
+
+ new_name = self.map_tensor_name(name)
+
+ if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid):
+ data_torch = data_torch.squeeze()
+ elif any(self.match_model_tensor_name(new_name, t, bid, suffix="") for t in [
+ gguf.MODEL_TENSOR.SSM_A,
+ gguf.MODEL_TENSOR.SSM_D,
+ ]):
+ # unsqueeze A to use similar shape semantics as Mamba-1
+ # (D is also unsqueezed, but for more straightforward broadcast internally)
+ data_torch = data_torch.reshape((*data_torch.shape, 1))
+ elif self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_NORM, bid):
+ data_torch = data_torch.reshape((self.n_group, self.d_inner // self.n_group))
+
+ if name.endswith(".A_log"):
+ logger.debug("A_log --> A ==> " + new_name)
+ data_torch = -torch.exp(data_torch)
+
+ yield (new_name, data_torch)
+
+
+ @ModelBase.register("JambaForCausalLM")
+ class JambaModel(TextModel):
+ model_arch = gguf.MODEL_ARCH.JAMBA
+
+ def get_vocab_base_pre(self, tokenizer) -> str:
+ del tokenizer # unused
+
+ return "gpt-2"
+
+ def set_vocab(self):
+ if (self.dir_model / "tokenizer.model").is_file():
+ # Using Jamba's tokenizer.json causes errors on model load
+ # (something about "byte not found in vocab"),
+ # but there's a working tokenizer.model
+ self._set_vocab_sentencepiece()
+ else:
+ # Some Jamba models only have a tokenizer.json, which works.
+ self._set_vocab_gpt2()
+
+ def set_gguf_parameters(self):
+ d_model = self.find_hparam(["hidden_size", "mamba_d_model"])
+ d_conv = self.find_hparam(["mamba_d_conv"], optional=True) or 4
+ d_inner = self.hparams["mamba_expand"] * d_model
+ d_state = self.find_hparam(["mamba_d_state"], optional=True) or 16
+ # ceiling division
+ # ref: https://stackoverflow.com/a/17511341/22827863
+ # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58
+ dt_rank = self.find_hparam(["mamba_dt_rank"], optional=True) or -(d_model // -16)
+ rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-6
+ n_kv_head = self.hparams["num_key_value_heads"]
+ attn_offset = self.hparams["attn_layer_offset"]
+ attn_period = self.hparams["attn_layer_period"]
+ n_kv_vec = [0 for _ in range(attn_offset)] + [
+ n_kv_head if (i - attn_offset) % attn_period == 0 else 0 for i in range(attn_offset, self.block_count)
+ ]
+
+ self.gguf_writer.add_block_count(self.block_count)
+ self.gguf_writer.add_context_length(self.find_hparam(["max_position_embeddings", "n_ctx"]))
+ self.gguf_writer.add_embedding_length(d_model)
+ self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
+ self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
+ self.gguf_writer.add_head_count_kv(n_kv_vec)
+ self.gguf_writer.add_ssm_conv_kernel(d_conv)
+ self.gguf_writer.add_ssm_inner_size(d_inner)
+ self.gguf_writer.add_ssm_state_size(d_state)
+ self.gguf_writer.add_ssm_time_step_rank(dt_rank)
+ self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
+ self.gguf_writer.add_expert_count(self.hparams["num_experts"])
+ self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"])
+ self.gguf_writer.add_file_type(self.ftype)
+
+ _experts: list[dict[str, Tensor]] | None = None
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+
+ # Mini-Jamba
+ name = name.replace(".moe.", ".feed_forward.")
+ if bid is not None:
+ moe_offset = self.hparams["expert_layer_offset"]
+ moe_period = self.hparams["expert_layer_period"]
+
+ if not (bid >= moe_offset and (bid - moe_offset) % moe_period == 0):
+ name = name.replace(".experts.0.", ".")
+
+ # process the experts separately
+ if ".feed_forward.experts." in name:
+ n_experts = self.hparams["num_experts"]
+
+ assert bid is not None
+
+ if self._experts is None:
+ self._experts = [{} for _ in range(self.block_count)]
+
+ self._experts[bid][name] = data_torch
+
+ if len(self._experts[bid]) >= n_experts * 3:
+
+ # merge the experts into a single 3d tensor
+ for wid in ["down_proj", "gate_proj", "up_proj"]:
+ datas: list[Tensor] = []
+
+ for xid in range(n_experts):
+ ename = f"model.layers.{bid}.feed_forward.experts.{xid}.{wid}.weight"
+ datas.append(self._experts[bid][ename])
+ del self._experts[bid][ename]
+
+ data_torch = torch.stack(datas, dim=0)
+
+ # using the same merged name as qwen2moe
+ merged_name = f"model.layers.{bid}.mlp.experts.{wid}.weight"
+
+ new_name = self.map_tensor_name(merged_name)
+
+ yield new_name, data_torch
+ return
+
+ new_name = self.map_tensor_name(name)
+
+ if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid):
+ data_torch = data_torch.squeeze()
+
+ if name.endswith(".A_log"):
+ logger.debug("A_log --> A ==> " + new_name)
+ data_torch = -torch.exp(data_torch)
+
+ yield (new_name, data_torch)
+
+ def prepare_tensors(self):
+ super().prepare_tensors()
+
+ if self._experts is not None:
+ # flatten `list[dict[str, Tensor]]` into `list[str]`
+ experts = [k for d in self._experts for k in d.keys()]
+ if len(experts) > 0:
+ raise ValueError(f"Unprocessed experts: {experts}")
+
+
+ @ModelBase.register("CohereForCausalLM")
+ class CommandR2Model(TextModel):
+ model_arch = gguf.MODEL_ARCH.COMMAND_R
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ # max_position_embeddings = 8192 in config.json but model was actually
+ # trained on 128k context length
+ # aya-23 models don't have model_max_length specified
+ self.hparams["max_position_embeddings"] = self.find_hparam(["model_max_length", "max_position_embeddings"])
+
+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+ self.gguf_writer.add_logit_scale(self.hparams["logit_scale"])
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+
+
+ @ModelBase.register("Cohere2ForCausalLM")
+ class Cohere2Model(TextModel):
+ model_arch = gguf.MODEL_ARCH.COHERE2
+
+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+
+ self.gguf_writer.add_logit_scale(self.hparams["logit_scale"])
+ self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
+ self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
+
+ rotary_pct = self.hparams["rotary_pct"]
  hidden_size = self.hparams["hidden_size"]
  num_attention_heads = self.hparams["num_attention_heads"]
  self.gguf_writer.add_rope_dimension_count(int(rotary_pct * (hidden_size // num_attention_heads)))
@@ -5277,7 +5903,58 @@ class DeepseekV2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.DEEPSEEK2

     def set_vocab(self):
-        self._set_vocab_gpt2()
+        try:
+            self._set_vocab_gpt2()
+            return
+        except Exception:
+            pass
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
+        if tokpre == "kimi-k2":
+            # Build merges list using the approach similar to HunYuanMoE
+            merges = []
+            vocab = {}
+            mergeable_ranks = tokenizer.model._mergeable_ranks
+            for token, rank in mergeable_ranks.items():
+                vocab[QwenModel.token_bytes_to_string(token)] = rank
+                if len(token) == 1:
+                    continue
+                merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
+                if len(merged) == 2:
+                    merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
+
+            # Build token list
+            vocab_size = self.hparams["vocab_size"]
+            special_tokens = tokenizer.special_tokens
+            reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
+            tokens: list[str] = []
+            toktypes: list[int] = []
+
+            for i in range(vocab_size):
+                if i not in reverse_vocab:
+                    tokens.append(f"[PAD{i}]")
+                    toktypes.append(gguf.TokenType.UNUSED)
+                else:
+                    token = reverse_vocab[i]
+                    tokens.append(token)
+                    if i in special_tokens.values():
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    else:
+                        toktypes.append(gguf.TokenType.NORMAL)
+
+            self.gguf_writer.add_tokenizer_model("gpt2")
+            self.gguf_writer.add_tokenizer_pre(tokpre)
+            self.gguf_writer.add_token_list(tokens)
+            self.gguf_writer.add_token_types(toktypes)
+            self.gguf_writer.add_token_merges(merges)
+
+            special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
+            special_vocab.add_to_gguf(self.gguf_writer)
+        else:
+            raise NotImplementedError(f"Deepseek pre-tokenizer {tokpre!r} is not supported yet!")

     def set_gguf_parameters(self):

@@ -5809,7 +6486,7 @@ class JaisModel(TextModel):
         self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)


-@ModelBase.register("Glm4ForCausalLM")
+@ModelBase.register("Glm4ForCausalLM", "Glm4vForConditionalGeneration")
 class Glm4Model(TextModel):
     model_arch = gguf.MODEL_ARCH.GLM4

@@ -5831,7 +6508,8 @@ class Glm4Model(TextModel):

     def set_gguf_parameters(self):
         super().set_gguf_parameters()
-        rope_dim = self.hparams["head_dim"]
+        if (rope_dim := self.hparams.get("head_dim")) is None:
+            rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
         self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
         rope_scaling = self.hparams.get("rope_scaling") or {}
         if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
@@ -5839,6 +6517,13 @@ class Glm4Model(TextModel):
             self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
             self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])

+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.startswith("model.visual."):  # ignore visual part of Glm4v
+            return []
+        elif name.startswith("model.language_model."):
+            name = name.replace("language_model.", "")  # for Glm4v
+        return super().modify_tensors(data_torch, name, bid)
+

 @ModelBase.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration")
 class ChatGLMModel(TextModel):
@@ -6106,6 +6791,75 @@ class ExaoneModel(TextModel):
                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))


+@ModelBase.register("Exaone4ForCausalLM")
+class Exaone4Model(TextModel):
+    model_arch = gguf.MODEL_ARCH.EXAONE4
+
+    def set_vocab(self):
+        tokens, toktypes, tokpre = self.get_vocab_base()
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+        if hparams.get("sliding_window") is not None:
+            self.gguf_writer.add_sliding_window(hparams["sliding_window"])
+            if "layer_types" in hparams:
+                self.gguf_writer.add_sliding_window_pattern([t == "sliding_attention" for t in hparams["layer_types"]])
+            elif "sliding_window_pattern" in hparams:
+                sliding_window_pattern = []
+                if isinstance(hparams["sliding_window_pattern"], str):  # e.g. LLLG
+                    for i in range(hparams["num_hidden_layers"]):
+                        sliding_window_pattern.append(hparams["sliding_window_pattern"][i % len(hparams["sliding_window_pattern"])] == "L")
+                if isinstance(hparams["sliding_window_pattern"], int):  # e.g. 4
+                    for i in range(hparams["num_hidden_layers"]):
+                        sliding_window_pattern.append((i + 1) % hparams["sliding_window_pattern"] != 0)
+                if len(sliding_window_pattern) == hparams["num_hidden_layers"]:
+                    self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
+
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
+            if rope_scaling.get("rope_type", '').lower() == "llama3":
+                base = self.hparams.get("rope_theta", 10_000.0)
+                if (dim := self.hparams.get("head_dim")) is None:
+                    dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
+
+                factor = rope_scaling.get("factor", 16.0)
+                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
+                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+
+                low_freq_wavelen = old_context_len / low_freq_factor
+                high_freq_wavelen = old_context_len / high_freq_factor
+
+                rope_factors = []
+                for freq in freqs:
+                    wavelen = 2 * math.pi / freq
+                    if wavelen < high_freq_wavelen:
+                        rope_factors.append(1)
+                    elif wavelen > low_freq_wavelen:
+                        rope_factors.append(factor)
+                    else:
+                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))
+
+                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
+
+
 @ModelBase.register("GraniteForCausalLM")
 class GraniteModel(LlamaModel):
     """Conversion for IBM's GraniteForCausalLM"""
@@ -6170,18 +6924,148 @@ class GraniteMoeModel(GraniteModel):
                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), up),
             ]

+        has_experts = bool(self.hparams.get('num_local_experts'))
+
         if name.endswith("shared_mlp.input_linear.weight"):
             ffn_dim = self.hparams["shared_intermediate_size"]
             assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * shared_intermediate_size"
             gate, up = data_torch.split(ffn_dim, dim=-2)
+            if has_experts:
+                return [
+                    (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_SHEXP, bid), gate),
+                    (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_SHEXP, bid), up),
+                ]
+            return [
+                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), gate),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), up),
+            ]
+
+        if not has_experts and name.endswith("shared_mlp.output_linear.weight"):
             return [
-                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_SHEXP, bid), gate),
-                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_SHEXP, bid), up),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, bid), data_torch)
             ]

         return super().modify_tensors(data_torch, name, bid)


+ @ModelBase.register("GraniteMoeHybridForCausalLM", "BambaForCausalLM")
6952
+ class GraniteHybridModel(Mamba2Model, GraniteMoeModel):
6953
+ """GraniteHybrid is a hybrid SSM + Attention model that uses Mamba2 SSM
6954
+ layers and optionally uses MoE w/ a shared expert"""
6955
+ model_arch = gguf.MODEL_ARCH.GRANITE_HYBRID
6956
+ undo_permute = True
6957
+
6958
+ def __init__(self, *args, **kwargs):
6959
+
6960
+ # Hybrid mamba models use a prefix for the mamba-specific params.
6961
+ # TODO: Extend this if the prefix(es) need to be configurable
6962
+ self.hparam_prefixes = ["mamba"]
6963
+
6964
+ super().__init__(*args, **kwargs)
6965
+
6966
+ # Lists of which layers use ssm vs attention
6967
+ self._attn_layers = self.get_attn_layers()
6968
+ self._ssm_layers = [
6969
+ i for i in range(self.block_count)
6970
+ if i not in self._attn_layers
6971
+ ]
6972
+
6973
+ # n_group and d_inner are used during reshape_tensors for mamba2
6974
+ self.d_model = self.find_hparam(["hidden_size", "d_model"])
6975
+ self.n_group = self.find_hparam(["n_groups"])
6976
+ self.d_inner = self.find_hparam(["expand"]) * self.d_model
6977
+
6978
+ def get_attn_layers(self):
6979
+ # Explicit list of layer type names
6980
+ if layer_types := self.hparams.get("layer_types"):
6981
+ return [
6982
+ i for i, typ in enumerate(layer_types)
6983
+ if typ == "attention"
6984
+ ]
6985
+
6986
+ # Layer types indicated by index or period
6987
+ attn_layers = self.hparams.get("attn_layer_indices", [])
6988
+ if not attn_layers:
6989
+ attn_period = self.hparams.get("attn_layer_period")
6990
+ assert attn_period, "Didn't find attn_layer_indices or attn_layer_period"
6991
+ attn_offset = self.hparams.get("attn_layer_offset")
6992
+ assert attn_offset is not None, "No attention layer offset set with attn_layer_period"
6993
+ attn_layers = [
6994
+ i for i in range(self.block_count)
6995
+ if i % attn_period == attn_offset
6996
+ ]
6997
+ return attn_layers
6998
+
6999
+ def find_hparam(self, keys: Iterable[str], *args, **kwargs) -> Any:
7000
+ prefixed = []
7001
+ for pfx in self.hparam_prefixes:
7002
+ prefixed.extend(
7003
+ "_".join([pfx, k])
7004
+ for k in keys
7005
+ )
7006
+ keys = list(keys) + prefixed
7007
+ return Mamba2Model.find_hparam(self, keys, *args, **kwargs)
7008
+
7009
+ def modify_tensors(
7010
+ self, data_torch: Tensor, name: str, bid: int | None
7011
+ ) -> Iterable[tuple[str, Tensor]]:
7012
+ if (
7013
+ name.endswith("block_sparse_moe.input_linear.weight")
7014
+ or "shared_mlp" in name
7015
+ ):
7016
+ return GraniteMoeModel.modify_tensors(self, data_torch, name, bid)
7017
+
7018
+ # Determine whether this is a mamba layer or an attention layer
7019
+ if bid in self._ssm_layers:
7020
+ return Mamba2Model.modify_tensors(self, data_torch, name, bid)
7021
+ elif bid in self._attn_layers:
7022
+ return GraniteMoeModel.modify_tensors(self, data_torch, name, bid)
7023
+ return [(self.map_tensor_name(name), data_torch)]
7024
+
7025
+ def set_gguf_parameters(self):
7026
+ """This method merges params from both parents and some that are
7027
+ specific to this model. The result is some duplication of how the params
7028
+ get set. The following warnings are expected during conversion:
7029
+
7030
+ WARNING:Duplicated key name 'granitehybrid.attention.head_count_kv'
7031
+ WARNING:Duplicated key name 'granitehybrid.context_length'
7032
+ """
7033
+ GraniteMoeModel.set_gguf_parameters(self)
7034
+
7035
+ ## Mamba mixer params ##
7036
+ self.gguf_writer.add_ssm_conv_kernel(self.find_hparam(["conv_kernel", "d_conv"]))
7037
+ self.gguf_writer.add_ssm_state_size(self.find_hparam(["state_size", "d_state"]))
7038
+ self.gguf_writer.add_ssm_group_count(self.n_group)
7039
+ self.gguf_writer.add_ssm_inner_size(self.d_inner)
7040
+ # NOTE: The mamba_dt_rank is _not_ the right field for how this is used
7041
+ # in llama.cpp
7042
+ self.gguf_writer.add_ssm_time_step_rank(self.find_hparam(["n_heads"]))
7043
+
7044
+ ## Attention params ##
7045
+ head_count_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"])
7046
+ head_count_kv_vec = [
7047
+ head_count_kv if i in self._attn_layers else 0 for i in range(self.block_count)
7048
+ ]
7049
+ if rope_dim := self.hparams.get("attn_rotary_emb"):
7050
+ self.gguf_writer.add_rope_dimension_count(rope_dim)
7051
+ self.gguf_writer.add_head_count_kv(head_count_kv_vec)
7052
+
7053
+ ## If Bamba, use rope, otherwise don't
7054
+ use_rope = "BambaForCausalLM" in self.hparams["architectures"]
7055
+ self.gguf_writer.add_rope_scaling_finetuned(use_rope)
7056
+ if not use_rope:
7057
+ self.gguf_writer.add_context_length(2**20)
7058
+
7059
+ ## Validation ##
7060
+ d_head = self.find_hparam(["d_head"], optional=True) or 64
7061
+ assert self.hparams.get("hidden_act") in [None, "silu"], "Only SILU activation supported"
7062
+ assert self.d_inner % d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {d_head}"
7063
+
7064
+ def set_vocab(self):
7065
+ self.hparams["pad_vocab_size_multiple"] = 8
7066
+ Mamba2Model.set_vocab(self)
7067
+
7068
+
6185
7069
  @ModelBase.register("BailingMoeForCausalLM")
6186
7070
  class BailingMoeModel(TextModel):
6187
7071
  model_arch = gguf.MODEL_ARCH.BAILINGMOE
@@ -6390,6 +7274,321 @@ class UltravoxWhisperEncoderModel(WhisperEncoderModel):
         super().set_gguf_parameters()
         self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"])

+
+@ModelBase.register("FalconH1ForCausalLM")
+class FalconH1Model(Mamba2Model):
+    model_arch = gguf.MODEL_ARCH.FALCON_H1
+
+    def __init__(self, *args, **kwargs):
+        # Set the hparam prefixes for Falcon Mamba2
+        self.hparam_prefixes = ["mamba"]
+
+        # Initialize the base Mamba2Model
+        super().__init__(*args, **kwargs)
+
+        # Use Llama conversion for attention
+        self._transformer_model_class = LlamaModel
+
+        # n_group and d_inner are used during reshape_tensors for mamba2
+        self.n_group = self.find_hparam(["n_groups"])
+        self.d_inner = self.find_hparam(["mamba_d_ssm"])
+        self.d_head = self.find_hparam(["d_head"])
+
+        # Initialize any Falcon Mamba2 specific attributes
+        self.has_attention = True  # Falcon Mamba2 has attention components
+
+        # Load Falcon-H1 multipliers from hyperparameters
+        self.attention_in_multiplier = self.find_hparam(["attention_in_multiplier"], optional=True)
+        self.attention_out_multiplier = self.find_hparam(["attention_out_multiplier"], optional=True)
+        self.ssm_in_multiplier = self.find_hparam(["ssm_in_multiplier"], optional=True)
+        self.ssm_out_multiplier = self.find_hparam(["ssm_out_multiplier"], optional=True)
+        self.mlp_multipliers = self.find_hparam(["mlp_multipliers"], optional=True)
+        self.ssm_multipliers = self.find_hparam(["ssm_multipliers"], optional=True)
+        self.intermediate_size = self.find_hparam(["intermediate_size"])
+        self.key_multiplier = self.find_hparam(["key_multiplier"], optional=True)
+
+    def find_hparam(self, keys: Iterable[str], *args, **kwargs) -> Any:
+        prefixed = []
+        for pfx in self.hparam_prefixes:
+            prefixed.extend(
+                "_".join([pfx, k])
+                for k in keys
+            )
+        keys = list(keys) + prefixed
+        return super().find_hparam(keys, *args, **kwargs)
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        tensors = list(super().modify_tensors(data_torch, name, bid))
+        tensor = tensors[0][1]
+
+        if "down_proj" in name:
+            tensor = tensor * self.mlp_multipliers[1]
+        elif "gate_proj" in name:
+            tensor = tensor * self.mlp_multipliers[0]
+        elif "k_proj" in name:
+            tensor = tensor * self.key_multiplier * self.attention_in_multiplier
+        elif "q_proj" in name:
+            tensor = tensor * self.attention_in_multiplier
+        elif "v_proj" in name:
+            tensor = tensor * self.attention_in_multiplier
+        elif "o_proj" in name:
+            tensor = tensor * self.attention_out_multiplier
+        elif "out_proj" in name:
+            tensor = tensor * self.ssm_out_multiplier
+        elif "in_proj" in name:
+            tensor = tensor * self.ssm_in_multiplier
+            zxbcdt_multipliers = self.hparams["ssm_multipliers"]
+            intermediate_size = self.hparams["mamba_d_ssm"]
+            groups_time_state_size = self.hparams["mamba_n_groups"] * self.hparams["mamba_d_state"]
+            tensor[:intermediate_size, :] *= zxbcdt_multipliers[0]
+            tensor[intermediate_size:2 * intermediate_size, :] *= zxbcdt_multipliers[1]
+            tensor[2 * intermediate_size:2 * intermediate_size + groups_time_state_size, :] *= zxbcdt_multipliers[2]
+            tensor[2 * intermediate_size + groups_time_state_size:2 * intermediate_size + 2 * groups_time_state_size, :] *= zxbcdt_multipliers[3]
+            tensor[2 * intermediate_size + 2 * groups_time_state_size:, :] *= zxbcdt_multipliers[4]
+        elif "lm_head" in name:
+            tensor = tensor * self.hparams["lm_head_multiplier"]
+        elif "embed_tokens" in name:
+            tensor = tensor * self.hparams["embedding_multiplier"]
+        elif "mamba.norm" in name:
+            tensor = tensor.reshape(self.n_group, self.d_inner // self.n_group)
+
+        tensors = [(tensors[0][0], tensor)]
+        return tensors
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        ## General Params ##
+        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
+        # Override some Mamba2 defaults
+        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_context_length(self.hparams.get("max_position_embeddings", 0))
+        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
+
+        ## Attention params ##
+        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])  # Override value 0 from Mamba2
+        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
+        self.gguf_writer.add_key_length(self.hparams["head_dim"])
+        self.gguf_writer.add_value_length(self.hparams["head_dim"])
+
+        ## Validation ##
+        assert self.hparams.get("hidden_act") in [None, "silu"], "Only SILU activation supported"
+        assert self.d_inner % self.d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {self.d_head}"
+
+        # Add any other Falcon Mamba2 specific configuration
+        self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
+
+
+ @ModelBase.register("HunYuanMoEV1ForCausalLM")
7386
+ class HunYuanMoEModel(TextModel):
7387
+ model_arch = gguf.MODEL_ARCH.HUNYUAN_MOE
7388
+
7389
+ def __init__(self, *args, **kwargs):
7390
+ super().__init__(*args, **kwargs)
7391
+ # For handling tied embeddings
7392
+ self._tok_embd = None
7393
+
7394
+ def set_vocab(self):
7395
+ from transformers import AutoTokenizer
7396
+ tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
7397
+
7398
+ # 1. Get the pre-tokenizer identifier hash
7399
+ tokpre = self.get_vocab_base_pre(tokenizer)
7400
+
7401
+ # 2. Reverse-engineer the merges list from mergeable_ranks
7402
+ merges = []
7403
+ vocab = {}
7404
+ mergeable_ranks = tokenizer.mergeable_ranks
7405
+ for token, rank in mergeable_ranks.items():
7406
+ vocab[QwenModel.token_bytes_to_string(token)] = rank
7407
+ if len(token) == 1:
7408
+ continue
7409
+ merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
7410
+ if len(merged) == 2: # todo this is an assert in Qwen, why?
7411
+ merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
7412
+
7413
+ # 3. Generate the tokens and toktypes lists
7414
+ vocab_size = self.hparams["vocab_size"]
7415
+ assert tokenizer.vocab_size == vocab_size
7416
+ special_tokens = tokenizer.special_tokens
7417
+ reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
7418
+ tokens: list[str] = []
7419
+ toktypes: list[int] = []
7420
+ for i in range(vocab_size):
7421
+ if i not in reverse_vocab:
7422
+ tokens.append(f"[PAD{i}]")
7423
+ toktypes.append(gguf.TokenType.UNUSED)
7424
+ else:
7425
+ token = reverse_vocab[i]
7426
+ tokens.append(token)
7427
+ if i in special_tokens.values():
7428
+ toktypes.append(gguf.TokenType.CONTROL)
7429
+ else:
7430
+ toktypes.append(gguf.TokenType.NORMAL)
7431
+
7432
+ # 4. Write all vocab-related fields to the GGUF writer
7433
+ self.gguf_writer.add_tokenizer_model("gpt2")
7434
+ self.gguf_writer.add_tokenizer_pre(tokpre)
7435
+ self.gguf_writer.add_token_list(tokens)
7436
+ self.gguf_writer.add_token_types(toktypes)
7437
+ self.gguf_writer.add_token_merges(merges)
7438
+
7439
+ # 5. Add special tokens and chat templates
7440
+ special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
7441
+ special_vocab.add_to_gguf(self.gguf_writer)
7442
+ # FIX for BOS token: Overwrite incorrect id read from config.json
7443
+ self.gguf_writer.add_bos_token_id(127959) # <|bos|>
7444
+
7445
+ def set_gguf_parameters(self):
7446
+ super().set_gguf_parameters()
7447
+ hparams = self.hparams
7448
+
7449
+ self.gguf_writer.add_expert_count(hparams["num_experts"])
7450
+ self.gguf_writer.add_expert_shared_feed_forward_length(hparams["intermediate_size"])
7451
+
7452
+ moe_intermediate_size = hparams["moe_intermediate_size"]
7453
+ assert all(n == moe_intermediate_size[0] for n in moe_intermediate_size)
7454
+ self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size[0])
7455
+
7456
+ moe_topk = hparams["moe_topk"]
7457
+ assert all(topk == moe_topk[0] for topk in moe_topk)
7458
+ self.gguf_writer.add_expert_used_count(moe_topk[0])
7459
+
7460
+ moe_shared_expert = hparams["num_shared_expert"]
7461
+ assert all(n == moe_shared_expert[0] for n in moe_shared_expert)
7462
+ self.gguf_writer.add_expert_shared_count(moe_shared_expert[0])
7463
+
7464
+ # Rope
7465
+ rope_scaling = hparams.get("rope_scaling", {})
7466
+ if rope_scaling.get("type") == "dynamic":
7467
+ # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
7468
+ # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf)
7469
+ alpha = rope_scaling.get("alpha", 1000)
7470
+ base = hparams.get("rope_theta", 10000.0)
7471
+ dim = (hparams["hidden_size"] // hparams["num_attention_heads"]) # 128
7472
+ scaled_base = base * (alpha ** (dim / (dim - 2))) # 10000 * (1000 ** (128 / 126)) = 11158839.9251
7473
+ self.gguf_writer.add_rope_freq_base(scaled_base)
7474
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
7475
+ self.gguf_writer.add_rope_scaling_factor(1)
7476
+ # There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k
7477
+ self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) # 256k context length
7478
+ self.gguf_writer.add_context_length(256 * 1024) # 256k context length
7479
+
7480
+ # if any of our assumptions about the values are wrong, something has changed and this may need to be updated
7481
+ assert alpha == 1000 and base == 10000.0 and dim == 128 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024] , \
7482
+ "HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually"
7483
+
7484
+ _experts: list[dict[str, Tensor]] | None = None
7485
+
7486
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
7487
+ if name == "model.embed_tokens.weight":
7488
+ self._tok_embd = data_torch.clone()
7489
+
7490
+ if name == "lm_head.weight":
7491
+ if self.hparams.get("tie_word_embeddings", False):
7492
+ logger.info("Skipping tied output layer 'lm_head.weight'")
7493
+ return []
7494
+
7495
+ if name.find("mlp.experts") != -1:
7496
+ n_experts = self.hparams["num_experts"]
7497
+ assert bid is not None
7498
+
7499
+ if self._experts is None:
7500
+ self._experts = [{} for _ in range(self.block_count)]
7501
+
7502
+ self._experts[bid][name] = data_torch
7503
+
7504
+ if len(self._experts[bid]) >= n_experts * 3:
7505
+ # merge the experts into a single 3d tensor
7506
+ tensors: list[tuple[str, Tensor]] = []
7507
+ for w_name in ["down_proj", "gate_proj", "up_proj"]:
7508
+ datas: list[Tensor] = []
7509
+
7510
+ for xid in range(n_experts):
7511
+ ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
7512
+ datas.append(self._experts[bid][ename])
7513
+ del self._experts[bid][ename]
7514
+
7515
+ data_torch = torch.stack(datas, dim=0)
7516
+ merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
7517
+ new_name = self.map_tensor_name(merged_name)
7518
+ tensors.append((new_name, data_torch))
7519
+
7520
+ return tensors
7521
+ else:
7522
+ return []
7523
+
7524
+ return [(self.map_tensor_name(name), data_torch)]
7525
+
7526
+ def prepare_tensors(self):
7527
+ super().prepare_tensors()
7528
+ if self._experts is not None:
7529
+ experts = [k for d in self._experts for k in d.keys()]
7530
+ if len(experts) > 0:
7531
+ raise ValueError(f"Unprocessed experts: {experts}")
7532
+
7533
+
7534
+ @ModelBase.register("SmolLM3ForCausalLM")
7535
+ class SmolLM3Model(LlamaModel):
7536
+ model_arch = gguf.MODEL_ARCH.SMOLLM3
7537
+
7538
+ def set_vocab(self):
7539
+ super().set_vocab()
7540
+ # remove unsupported array slicing in chat template
7541
+ # ref: https://huggingface.co/ggml-org/SmolLM3-3B-GGUF/discussions/1
7542
+ from transformers import AutoTokenizer
7543
+ tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
7544
+ if tokenizer.chat_template is not None:
7545
+ chat_template = tokenizer.chat_template.replace("[:]", "")
7546
+ self.gguf_writer.add_chat_template(chat_template)
7547
+
7548
+
7549
+ @ModelBase.register("Lfm2ForCausalLM")
7550
+ @ModelBase.register("LFM2ForCausalLM")
7551
+ class LFM2Model(TextModel):
7552
+ model_arch = gguf.MODEL_ARCH.LFM2
7553
+
7554
+ def _add_feed_forward_length(self):
7555
+ ff_dim = self.hparams["block_ff_dim"]
7556
+
7557
+ auto_adjust_ff_dim = self.hparams["block_auto_adjust_ff_dim"]
7558
+ ff_dim = self.hparams["block_ff_dim"]
7559
+ ffn_dim_multiplier = self.hparams["block_ffn_dim_multiplier"]
7560
+ multiple_of = self.hparams["block_multiple_of"]
7561
+
7562
+ if auto_adjust_ff_dim:
7563
+ ff_dim = int(2 * ff_dim / 3)
7564
+ # custom dim factor multiplier
7565
+ if ffn_dim_multiplier is not None:
7566
+ ff_dim = int(ffn_dim_multiplier * ff_dim)
7567
+ ff_dim = multiple_of * ((ff_dim + multiple_of - 1) // multiple_of)
7568
+
7569
+ self.gguf_writer.add_feed_forward_length(ff_dim)
7570
+
7571
+ def set_gguf_parameters(self):
7572
+ # set num_key_value_heads only for attention layers
7573
+ self.hparams["num_key_value_heads"] = [
7574
+ self.hparams["num_key_value_heads"] if layer_type == "full_attention" else 0
7575
+ for layer_type in self.hparams["layer_types"]
7576
+ ]
7577
+
7578
+ super().set_gguf_parameters()
7579
+ self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
7580
+ self.gguf_writer.add_shortconv_l_cache(self.hparams["conv_L_cache"])
7581
+ self.gguf_writer.add_layer_norm_rms_eps(self.hparams["norm_eps"])
7582
+ self._add_feed_forward_length()
7583
+
7584
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
7585
+ # conv op requires 2d tensor
7586
+ if 'conv.conv' in name:
7587
+ data_torch = data_torch.squeeze(1)
7588
+
7589
+ return [(self.map_tensor_name(name), data_torch)]
7590
+
7591
+
6393
7592
  ###### CONVERSION LOGIC ######
6394
7593
 
6395
7594
 
@@ -6569,12 +7768,20 @@ def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> str:
     # maybe we should fallback to text model's arch in that case, since not many models have both
     text_config = hparams.get("text_config", {})
     vision_config = hparams.get("vision_config", {})
-    arch = hparams["architectures"][0]
+    arch = None
+    if (arches := hparams.get("architectures")) is not None and len(arches) > 0:
+        arch = arches[0]
+    elif "ssm_cfg" in hparams:
+        # For non-hf Mamba and Mamba2 models
+        arch = hparams["ssm_cfg"].get("layer", "Mamba") + "ForCausalLM"
+
     # if "architectures" is found in the sub-config, use that instead
     if model_type == ModelType.TEXT and text_config.get("architectures") is not None:
         arch = text_config["architectures"][0]
    elif model_type == ModelType.MMPROJ and vision_config.get("architectures") is not None:
        arch = vision_config["architectures"][0]
+    if arch is None:
+        raise ValueError("Failed to detect model architecture")
     return arch
