@novastera-oss/llamarn 0.2.9 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (247)
  1. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  2. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  3. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  5. package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
  6. package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
  7. package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
  8. package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
  9. package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
  10. package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
  11. package/android/src/main/jniLibs/x86/libggml.so +0 -0
  12. package/android/src/main/jniLibs/x86/libllama.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  14. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  15. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  16. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  17. package/cpp/build-info.cpp +2 -2
  18. package/cpp/llama.cpp/CMakeLists.txt +0 -1
  19. package/cpp/llama.cpp/README.md +4 -5
  20. package/cpp/llama.cpp/build-xcframework.sh +1 -1
  21. package/cpp/llama.cpp/common/CMakeLists.txt +4 -5
  22. package/cpp/llama.cpp/common/arg.cpp +17 -0
  23. package/cpp/llama.cpp/common/chat.cpp +37 -20
  24. package/cpp/llama.cpp/common/chat.h +2 -0
  25. package/cpp/llama.cpp/common/common.h +4 -0
  26. package/cpp/llama.cpp/convert_hf_to_gguf.py +745 -6
  27. package/cpp/llama.cpp/convert_hf_to_gguf_update.py +9 -0
  28. package/cpp/llama.cpp/ggml/CMakeLists.txt +7 -2
  29. package/cpp/llama.cpp/ggml/include/ggml-backend.h +1 -1
  30. package/cpp/llama.cpp/ggml/include/ggml.h +173 -10
  31. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +0 -1
  32. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +0 -8
  33. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +36 -18
  34. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +68 -5
  35. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +16 -2
  36. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -1
  37. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +28 -1
  38. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +1203 -163
  39. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +6 -0
  40. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +1 -1
  41. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +33 -9
  42. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +142 -9
  43. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +17 -0
  44. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu +22 -0
  45. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +5 -0
  46. package/cpp/llama.cpp/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
  47. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +4 -1
  48. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +8 -4
  49. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +6 -4
  50. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +14 -12
  51. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +5 -3
  52. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +15 -10
  53. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +8 -6
  54. package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +8 -0
  55. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +185 -79
  56. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +2 -8
  57. package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cu +21 -27
  58. package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cu +8 -6
  59. package/cpp/llama.cpp/ggml/src/ggml-cuda/softmax.cu +119 -58
  60. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
  61. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +192 -52
  62. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +97 -0
  63. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +11 -0
  64. package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cu +92 -6
  65. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +14 -5
  66. package/cpp/llama.cpp/ggml/src/ggml-impl.h +64 -0
  67. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -2
  68. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +35 -9
  69. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +167 -39
  70. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +254 -57
  71. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +3 -0
  72. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +505 -40
  73. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
  74. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/glu.cl +337 -0
  75. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  76. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
  77. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +95 -0
  78. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +24 -11
  79. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +24 -11
  80. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +24 -11
  81. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +24 -11
  82. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
  83. package/cpp/llama.cpp/ggml/src/ggml-quants.c +6 -6
  84. package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  85. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +693 -1034
  86. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +18 -9
  87. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +60 -9
  88. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +15 -18
  89. package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +131 -0
  90. package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  91. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +711 -292
  92. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +58 -7
  93. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +28 -23
  94. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +14 -9
  95. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +38 -32
  96. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +32 -27
  97. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +44 -12
  98. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  99. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  100. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  101. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  102. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +15 -0
  103. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
  104. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +128 -72
  105. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +38 -9
  106. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  107. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +12 -3
  108. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  109. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
  110. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
  111. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
  112. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
  113. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +20 -4
  114. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  115. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
  116. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +23 -3
  117. package/cpp/llama.cpp/ggml/src/ggml.c +382 -61
  118. package/cpp/llama.cpp/ggml/src/gguf.cpp +8 -1
  119. package/cpp/llama.cpp/gguf-py/gguf/constants.py +209 -0
  120. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +8 -2
  121. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +73 -21
  122. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +12 -3
  123. package/cpp/llama.cpp/include/llama.h +0 -40
  124. package/cpp/llama.cpp/src/llama-arch.cpp +210 -3
  125. package/cpp/llama.cpp/src/llama-arch.h +18 -1
  126. package/cpp/llama.cpp/src/llama-batch.cpp +27 -1
  127. package/cpp/llama.cpp/src/llama-batch.h +8 -1
  128. package/cpp/llama.cpp/src/llama-chat.cpp +15 -0
  129. package/cpp/llama.cpp/src/llama-chat.h +1 -0
  130. package/cpp/llama.cpp/src/llama-graph.cpp +119 -184
  131. package/cpp/llama.cpp/src/llama-graph.h +47 -60
  132. package/cpp/llama.cpp/src/llama-hparams.cpp +7 -1
  133. package/cpp/llama.cpp/src/llama-hparams.h +3 -0
  134. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +28 -18
  135. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +4 -2
  136. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +214 -65
  137. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +62 -24
  138. package/cpp/llama.cpp/src/llama-kv-cells.h +62 -10
  139. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +9 -4
  140. package/cpp/llama.cpp/src/llama-memory-hybrid.h +3 -1
  141. package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +20 -10
  142. package/cpp/llama.cpp/src/llama-memory.cpp +17 -0
  143. package/cpp/llama.cpp/src/llama-memory.h +3 -0
  144. package/cpp/llama.cpp/src/llama-model.cpp +2530 -685
  145. package/cpp/llama.cpp/src/llama-model.h +18 -0
  146. package/cpp/llama.cpp/src/llama-quant.cpp +1 -0
  147. package/cpp/llama.cpp/src/llama-vocab.cpp +13 -2
  148. package/cpp/llama.cpp/src/llama-vocab.h +41 -0
  149. package/ios/include/chat.h +2 -0
  150. package/ios/include/common.h +4 -0
  151. package/ios/include/llama.h +0 -40
  152. package/ios/libs/llama.xcframework/Info.plist +19 -19
  153. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  154. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5055 -4886
  155. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h +1 -1
  156. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +173 -10
  157. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +0 -40
  158. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  159. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  160. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4861
  161. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3889 -3764
  162. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
  163. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
  164. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -40
  165. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  166. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  167. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4861
  168. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3891 -3766
  169. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-backend.h +1 -1
  170. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +173 -10
  171. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +0 -40
  172. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-backend.h +1 -1
  173. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +173 -10
  174. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +0 -40
  175. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  176. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-backend.h +1 -1
  177. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +173 -10
  178. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +0 -40
  179. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  180. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  181. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  182. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5059 -4890
  183. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-backend.h +1 -1
  184. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +173 -10
  185. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +0 -40
  186. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  187. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  188. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4861
  189. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3889 -3764
  190. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
  191. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
  192. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -40
  193. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  194. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  195. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5091 -4922
  196. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-backend.h +1 -1
  197. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +173 -10
  198. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +0 -40
  199. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  200. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  201. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5066 -4897
  202. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3919 -3794
  203. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
  204. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
  205. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -40
  206. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  207. package/package.json +1 -1
  208. package/cpp/llama.cpp/ggml/include/ggml-kompute.h +0 -50
  209. package/cpp/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  210. package/cpp/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  211. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
  212. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
  213. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
  214. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
  215. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
  216. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
  217. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
  218. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
  219. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
  220. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
  221. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
  222. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
  223. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
  224. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
  225. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
  226. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
  227. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
  228. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
  229. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
  230. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
  231. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
  232. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
  233. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
  234. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
  235. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
  236. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
  237. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
  238. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
  239. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
  240. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
  241. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
  242. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
  243. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
  244. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
  245. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
  246. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
  247. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
@@ -300,6 +300,7 @@ class ModelBase:
  gguf.MODEL_TENSOR.POS_EMBD,
  gguf.MODEL_TENSOR.TOKEN_TYPES,
  gguf.MODEL_TENSOR.SSM_CONV1D,
+ gguf.MODEL_TENSOR.SHORTCONV_CONV,
  gguf.MODEL_TENSOR.TIME_MIX_FIRST,
  gguf.MODEL_TENSOR.TIME_MIX_W1,
  gguf.MODEL_TENSOR.TIME_MIX_W2,
@@ -815,6 +816,30 @@ class TextModel(ModelBase):
  if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
  # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
  res = "minerva-7b"
+ if chkhsh == "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664":
+ # ref: https://huggingface.co/tencent/Hunyuan-A13B-Instruct
+ res = "hunyuan"
+ if chkhsh == "b0a6b1c0bd5998ebd9df08611efde34a4ff03faed45ae09c43e6b31ebd4b94cf":
+ # ref: https://huggingface.co/skt/A.X-4.0
+ res = "a.x-4.0"
+ if chkhsh == "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6":
+ # ref: https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base
+ res = "falcon-h1"
+ if chkhsh == "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86":
+ # ref: https://huggingface.co/tiiuae/Falcon-H1-1B-Base
+ res = "falcon-h1"
+ if chkhsh == "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896":
+ # ref: https://huggingface.co/tiiuae/Falcon-H1-7B-Base
+ res = "falcon-h1"
+ if chkhsh == "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b":
+ # ref: https://huggingface.co/tiiuae/Falcon-H1-34B-Base
+ res = "falcon-h1"
+ if chkhsh == "f6791d196f87ce6b56a7d234be618e0d58f8cda3549416635b2bebcd22cd95c4":
+ # ref: https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct
+ res = "midm-2.0"
+ if chkhsh == "169bf0296a13c4d9b7672313f749eb36501d931022de052aad6e36f2bf34dd51":
+ # ref: https://huggingface.co/LiquidAI/LFM2-Tokenizer
+ res = "lfm2"

  if res is None:
  logger.warning("\n")
@@ -2743,6 +2768,52 @@ class Qwen2Model(TextModel):
  yield from super().modify_tensors(data_torch, name, bid)


+ @ModelBase.register("Ernie4_5_ForCausalLM")
+ class Ernie4_5Model(TextModel):
+ model_arch = gguf.MODEL_ARCH.ERNIE4_5
+
+ def set_vocab(self):
+ self._set_vocab_sentencepiece()
+
+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ num_heads = self.hparams["num_attention_heads"]
+ num_kv_heads = self.hparams["num_key_value_heads"]
+ head_dim = self.hparams["head_dim"]
+
+ if "ernie." in name:
+ name = name.replace("ernie.", "model.")
+ # split the qkv weights
+ # qkv_proj shape: [(num_heads + 2 * num_kv_heads) * head_dim, hidden_size]
+ if "qkv_proj" in name:
+ name_q = name.replace("qkv_proj.weight", "q_proj.weight")
+ name_k = name.replace("qkv_proj.weight", "k_proj.weight")
+ name_v = name.replace("qkv_proj.weight", "v_proj.weight")
+ total_q_dim = num_heads * head_dim
+ total_k_dim = num_kv_heads * head_dim
+ total_v_dim = num_kv_heads * head_dim
+ q_proj_weight, k_proj_weight, v_proj_weight = data_torch.split([total_q_dim, total_k_dim, total_v_dim], dim=0)
+ return [
+ (self.map_tensor_name(name_q), q_proj_weight),
+ (self.map_tensor_name(name_k), k_proj_weight),
+ (self.map_tensor_name(name_v), v_proj_weight)
+ ]
+ # split the up_gate_proj into gate and up
+ # up_gate_proj shape: [2 * intermediate_size, hidden_size]
+ if "up_gate_proj" in name:
+ name_up = name.replace("up_gate_proj.weight", "up_proj.weight")
+ name_gate = name.replace("up_gate_proj.weight", "gate_proj.weight")
+ dim_half = data_torch.shape[0] // 2
+ gate_proj_weight, up_proj_weight = data_torch.split(dim_half, dim=0)
+ return [
+ (self.map_tensor_name(name_gate), gate_proj_weight),
+ (self.map_tensor_name(name_up), up_proj_weight)
+ ]
+ return [(self.map_tensor_name(name), data_torch)]
+
+
  @ModelBase.register(
  "Qwen2VLModel",
  "Qwen2VLForConditionalGeneration",
@@ -4362,9 +4433,6 @@ class Gemma3NModel(Gemma3Model):
  ]

  def set_vocab(self):
- with open(self.dir_model / "chat_template.jinja") as f:
- # quick hack to make sure chat template is added
- self.gguf_writer.add_chat_template(f.read())
  super().set_vocab()

  def set_gguf_parameters(self):
@@ -4735,6 +4803,14 @@ class ARwkv7Model(Rwkv7Model):
  class MambaModel(TextModel):
  model_arch = gguf.MODEL_ARCH.MAMBA

+ def __init__(self, dir_model: Path, *args, **kwargs):
+ # Avoid using AutoConfig for hparams
+ hparams = kwargs.pop("hparams", None)
+ if hparams is None:
+ with open(dir_model / "config.json", "r", encoding="utf-8") as f:
+ hparams = json.load(f)
+ super().__init__(dir_model, *args, hparams=hparams, **kwargs)
+
  def set_vocab(self):
  vocab_size = self.hparams["vocab_size"]
  # Round vocab size to next multiple of 8
@@ -4809,6 +4885,216 @@ class MambaModel(TextModel):
  return [(new_name, data_torch)]


+ @ModelBase.register("Mamba2ForCausalLM")
+ class Mamba2Model(TextModel):
+ model_arch = gguf.MODEL_ARCH.MAMBA2
+
+ def __init__(self, dir_model: Path, *args, **kwargs):
+ # Avoid using AutoConfig for hparams
+ # It wrongly assumes all Mamba2 models are Mamba-Codestral-7B-v0.1
+ hparams = kwargs.pop("hparams", None)
+ if hparams is None:
+ with open(dir_model / "config.json", "r", encoding="utf-8") as f:
+ hparams = json.load(f)
+ super().__init__(dir_model, *args, hparams=hparams, **kwargs)
+ self.d_model = self.find_hparam(["hidden_size", "d_model", "dim"])
+ self.d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or 2 * self.d_model
+ self.n_group = self.find_hparam(["n_groups"], optional=True) or 1
+
+ def set_vocab(self):
+ vocab_size = self.hparams["vocab_size"]
+ # Round vocab size to next multiple of 16
+ pad_vocab = self.hparams.get("pad_vocab_size_multiple", 16)
+ # pad using ceiling division
+ # ref: https://stackoverflow.com/a/17511341/22827863
+ vocab_size = -(vocab_size // -pad_vocab) * pad_vocab
+ self.hparams["vocab_size"] = vocab_size
+
+ if (self.dir_model / "tokenizer.model").is_file():
+ self._set_vocab_sentencepiece()
+ elif (self.dir_model / "tokenizer.model.v3").is_file():
+ # mamba-codestral
+ raise NotImplementedError(f"Please rename {self.dir_model / 'tokenizer.model.v3'} to {self.dir_model / 'tokenizer.model'}")
+ elif (self.dir_model / "tokenizer.json").is_file():
+ self._set_vocab_gpt2()
+ else:
+ # Use the GPT-NeoX tokenizer when no tokenizer files are present
+ self._set_vocab_builtin("gpt-neox", vocab_size)
+
+ def set_gguf_parameters(self):
+ d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4
+ d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 128
+ head_dim = self.find_hparam(["mamba_d_head", "head_dim"], optional=True) or 64
+
+ rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5
+
+ # Fail early for models which don't have a block expansion factor of 2
+ # TODO: does this really matter?
+ # skip the assertion for FalconH1 Model
+ if self.model_arch != gguf.MODEL_ARCH.FALCON_H1:
+ assert self.d_inner == 2 * self.d_model
+ assert self.d_inner % head_dim == 0
+
+ self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
+ self.gguf_writer.add_embedding_length(self.d_model)
+ self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
+ self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading
+ self.gguf_writer.add_block_count(self.block_count)
+ self.gguf_writer.add_ssm_conv_kernel(d_conv)
+ self.gguf_writer.add_ssm_inner_size(self.d_inner)
+ self.gguf_writer.add_ssm_state_size(d_state)
+ self.gguf_writer.add_ssm_time_step_rank(self.d_inner // head_dim)
+ self.gguf_writer.add_ssm_group_count(self.n_group)
+ self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
+ self.gguf_writer.add_file_type(self.ftype)
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+
+ if name.startswith("model.backbone") or name.startswith("model.lm_head"):
+ # map Mamba-Codestral-7B-v0.1 tensor names to the names used by Mamba-2
+ name = name.removeprefix("model.")
+
+ if name.endswith(".dt_bias"):
+ name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias"
+
+ new_name = self.map_tensor_name(name)
+
+ if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid):
+ data_torch = data_torch.squeeze()
+ elif any(self.match_model_tensor_name(new_name, t, bid, suffix="") for t in [
+ gguf.MODEL_TENSOR.SSM_A,
+ gguf.MODEL_TENSOR.SSM_D,
+ ]):
+ # unsqueeze A to use similar shape semantics as Mamba-1
+ # (D is also unsqueezed, but for more straightforward broadcast internally)
+ data_torch = data_torch.reshape((*data_torch.shape, 1))
+ elif self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_NORM, bid):
+ data_torch = data_torch.reshape((self.n_group, self.d_inner // self.n_group))
+
+ if name.endswith(".A_log"):
+ logger.debug("A_log --> A ==> " + new_name)
+ data_torch = -torch.exp(data_torch)
+
+ yield (new_name, data_torch)
+
+
+ @ModelBase.register("JambaForCausalLM")
+ class JambaModel(TextModel):
+ model_arch = gguf.MODEL_ARCH.JAMBA
+
+ def get_vocab_base_pre(self, tokenizer) -> str:
+ del tokenizer # unused
+
+ return "gpt-2"
+
+ def set_vocab(self):
+ if (self.dir_model / "tokenizer.model").is_file():
+ # Using Jamba's tokenizer.json causes errors on model load
+ # (something about "byte not found in vocab"),
+ # but there's a working tokenizer.model
+ self._set_vocab_sentencepiece()
+ else:
+ # Some Jamba models only have a tokenizer.json, which works.
+ self._set_vocab_gpt2()
+
+ def set_gguf_parameters(self):
+ d_model = self.find_hparam(["hidden_size", "mamba_d_model"])
+ d_conv = self.find_hparam(["mamba_d_conv"], optional=True) or 4
+ d_inner = self.hparams["mamba_expand"] * d_model
+ d_state = self.find_hparam(["mamba_d_state"], optional=True) or 16
+ # ceiling division
+ # ref: https://stackoverflow.com/a/17511341/22827863
+ # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58
+ dt_rank = self.find_hparam(["mamba_dt_rank"], optional=True) or -(d_model // -16)
+ rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-6
+ n_kv_head = self.hparams["num_key_value_heads"]
+ attn_offset = self.hparams["attn_layer_offset"]
+ attn_period = self.hparams["attn_layer_period"]
+ n_kv_vec = [0 for _ in range(attn_offset)] + [
+ n_kv_head if (i - attn_offset) % attn_period == 0 else 0 for i in range(attn_offset, self.block_count)
+ ]
+
+ self.gguf_writer.add_block_count(self.block_count)
+ self.gguf_writer.add_context_length(self.find_hparam(["max_position_embeddings", "n_ctx"]))
+ self.gguf_writer.add_embedding_length(d_model)
+ self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
+ self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
+ self.gguf_writer.add_head_count_kv(n_kv_vec)
+ self.gguf_writer.add_ssm_conv_kernel(d_conv)
+ self.gguf_writer.add_ssm_inner_size(d_inner)
+ self.gguf_writer.add_ssm_state_size(d_state)
+ self.gguf_writer.add_ssm_time_step_rank(dt_rank)
+ self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
+ self.gguf_writer.add_expert_count(self.hparams["num_experts"])
+ self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"])
+ self.gguf_writer.add_file_type(self.ftype)
+
+ _experts: list[dict[str, Tensor]] | None = None
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+
+ # Mini-Jamba
+ name = name.replace(".moe.", ".feed_forward.")
+ if bid is not None:
+ moe_offset = self.hparams["expert_layer_offset"]
+ moe_period = self.hparams["expert_layer_period"]
+
+ if not (bid >= moe_offset and (bid - moe_offset) % moe_period == 0):
+ name = name.replace(".experts.0.", ".")
+
+ # process the experts separately
+ if ".feed_forward.experts." in name:
+ n_experts = self.hparams["num_experts"]
+
+ assert bid is not None
+
+ if self._experts is None:
+ self._experts = [{} for _ in range(self.block_count)]
+
+ self._experts[bid][name] = data_torch
+
+ if len(self._experts[bid]) >= n_experts * 3:
+
+ # merge the experts into a single 3d tensor
+ for wid in ["down_proj", "gate_proj", "up_proj"]:
+ datas: list[Tensor] = []
+
+ for xid in range(n_experts):
+ ename = f"model.layers.{bid}.feed_forward.experts.{xid}.{wid}.weight"
+ datas.append(self._experts[bid][ename])
+ del self._experts[bid][ename]
+
+ data_torch = torch.stack(datas, dim=0)
+
+ # using the same merged name as qwen2moe
+ merged_name = f"model.layers.{bid}.mlp.experts.{wid}.weight"
+
+ new_name = self.map_tensor_name(merged_name)
+
+ yield new_name, data_torch
+ return
+
+ new_name = self.map_tensor_name(name)
+
+ if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid):
+ data_torch = data_torch.squeeze()
+
+ if name.endswith(".A_log"):
+ logger.debug("A_log --> A ==> " + new_name)
+ data_torch = -torch.exp(data_torch)
+
+ yield (new_name, data_torch)
+
+ def prepare_tensors(self):
+ super().prepare_tensors()
+
+ if self._experts is not None:
+ # flatten `list[dict[str, Tensor]]` into `list[str]`
+ experts = [k for d in self._experts for k in d.keys()]
+ if len(experts) > 0:
+ raise ValueError(f"Unprocessed experts: {experts}")
+
+
  @ModelBase.register("CohereForCausalLM")
  class CommandR2Model(TextModel):
  model_arch = gguf.MODEL_ARCH.COMMAND_R
@@ -6170,18 +6456,148 @@ class GraniteMoeModel(GraniteModel):
  (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), up),
  ]

+ has_experts = bool(self.hparams.get('num_local_experts'))
+
  if name.endswith("shared_mlp.input_linear.weight"):
  ffn_dim = self.hparams["shared_intermediate_size"]
  assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * shared_intermediate_size"
  gate, up = data_torch.split(ffn_dim, dim=-2)
+ if has_experts:
+ return [
+ (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_SHEXP, bid), gate),
+ (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_SHEXP, bid), up),
+ ]
  return [
- (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_SHEXP, bid), gate),
- (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_SHEXP, bid), up),
+ (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), gate),
+ (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), up),
+ ]
+
+ if not has_experts and name.endswith("shared_mlp.output_linear.weight"):
+ return [
+ (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, bid), data_torch)
  ]

  return super().modify_tensors(data_torch, name, bid)


+ @ModelBase.register("GraniteMoeHybridForCausalLM", "BambaForCausalLM")
+ class GraniteHybridModel(Mamba2Model, GraniteMoeModel):
+ """GraniteHybrid is a hybrid SSM + Attention model that uses Mamba2 SSM
+ layers and optionally uses MoE w/ a shared expert"""
+ model_arch = gguf.MODEL_ARCH.GRANITE_HYBRID
+ undo_permute = True
+
+ def __init__(self, *args, **kwargs):
+
+ # Hybrid mamba models use a prefix for the mamba-specific params.
+ # TODO: Extend this if the prefix(es) need to be configurable
+ self.hparam_prefixes = ["mamba"]
+
+ super().__init__(*args, **kwargs)
+
+ # Lists of which layers use ssm vs attention
+ self._attn_layers = self.get_attn_layers()
+ self._ssm_layers = [
+ i for i in range(self.block_count)
+ if i not in self._attn_layers
+ ]
+
+ # n_group and d_inner are used during reshape_tensors for mamba2
+ self.d_model = self.find_hparam(["hidden_size", "d_model"])
+ self.n_group = self.find_hparam(["n_groups"])
+ self.d_inner = self.find_hparam(["expand"]) * self.d_model
+
+ def get_attn_layers(self):
+ # Explicit list of layer type names
+ if layer_types := self.hparams.get("layer_types"):
+ return [
+ i for i, typ in enumerate(layer_types)
+ if typ == "attention"
+ ]
+
+ # Layer types indicated by index or period
+ attn_layers = self.hparams.get("attn_layer_indices", [])
+ if not attn_layers:
+ attn_period = self.hparams.get("attn_layer_period")
+ assert attn_period, "Didn't find attn_layer_indices or attn_layer_period"
+ attn_offset = self.hparams.get("attn_layer_offset")
+ assert attn_offset is not None, "No attention layer offset set with attn_layer_period"
+ attn_layers = [
+ i for i in range(self.block_count)
+ if i % attn_period == attn_offset
+ ]
+ return attn_layers
+
+ def find_hparam(self, keys: Iterable[str], *args, **kwargs) -> Any:
+ prefixed = []
+ for pfx in self.hparam_prefixes:
+ prefixed.extend(
+ "_".join([pfx, k])
+ for k in keys
+ )
+ keys = list(keys) + prefixed
+ return Mamba2Model.find_hparam(self, keys, *args, **kwargs)
+
+ def modify_tensors(
+ self, data_torch: Tensor, name: str, bid: int | None
+ ) -> Iterable[tuple[str, Tensor]]:
+ if (
+ name.endswith("block_sparse_moe.input_linear.weight")
+ or "shared_mlp" in name
+ ):
+ return GraniteMoeModel.modify_tensors(self, data_torch, name, bid)
+
+ # Determine whether this is a mamba layer or an attention layer
+ if bid in self._ssm_layers:
+ return Mamba2Model.modify_tensors(self, data_torch, name, bid)
+ elif bid in self._attn_layers:
+ return GraniteMoeModel.modify_tensors(self, data_torch, name, bid)
+ return [(self.map_tensor_name(name), data_torch)]
+
+ def set_gguf_parameters(self):
+ """This method merges params from both parents and some that are
+ specific to this model. The result is some duplication of how the params
+ get set. The following warnings are expected during conversion:
+
+ WARNING:Duplicated key name 'granitehybrid.attention.head_count_kv'
+ WARNING:Duplicated key name 'granitehybrid.context_length'
+ """
+ GraniteMoeModel.set_gguf_parameters(self)
+
+ ## Mamba mixer params ##
+ self.gguf_writer.add_ssm_conv_kernel(self.find_hparam(["conv_kernel", "d_conv"]))
+ self.gguf_writer.add_ssm_state_size(self.find_hparam(["state_size", "d_state"]))
+ self.gguf_writer.add_ssm_group_count(self.n_group)
+ self.gguf_writer.add_ssm_inner_size(self.d_inner)
+ # NOTE: The mamba_dt_rank is _not_ the right field for how this is used
+ # in llama.cpp
+ self.gguf_writer.add_ssm_time_step_rank(self.find_hparam(["n_heads"]))
+
+ ## Attention params ##
+ head_count_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"])
+ head_count_kv_vec = [
+ head_count_kv if i in self._attn_layers else 0 for i in range(self.block_count)
+ ]
+ if rope_dim := self.hparams.get("attn_rotary_emb"):
+ self.gguf_writer.add_rope_dimension_count(rope_dim)
+ self.gguf_writer.add_head_count_kv(head_count_kv_vec)
+
+ ## If Bamba, use rope, otherwise don't
+ use_rope = "BambaForCausalLM" in self.hparams["architectures"]
+ self.gguf_writer.add_rope_scaling_finetuned(use_rope)
+ if not use_rope:
+ self.gguf_writer.add_context_length(2**20)
+
+ ## Validation ##
+ d_head = self.find_hparam(["d_head"], optional=True) or 64
+ assert self.hparams.get("hidden_act") in [None, "silu"], "Only SILU activation supported"
+ assert self.d_inner % d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {d_head}"
+
+ def set_vocab(self):
+ self.hparams["pad_vocab_size_multiple"] = 8
+ Mamba2Model.set_vocab(self)
+
+
  @ModelBase.register("BailingMoeForCausalLM")
  class BailingMoeModel(TextModel):
  model_arch = gguf.MODEL_ARCH.BAILINGMOE
@@ -6390,6 +6806,321 @@ class UltravoxWhisperEncoderModel(WhisperEncoderModel):
  super().set_gguf_parameters()
  self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"])

+
+ @ModelBase.register("FalconH1ForCausalLM")
+ class FalconH1Model(Mamba2Model):
+ model_arch = gguf.MODEL_ARCH.FALCON_H1
+
+ def __init__(self, *args, **kwargs):
+ # Set the hparam prefixes for Falcon Mamba2
+ self.hparam_prefixes = ["mamba"]
+
+ # Initialize the base Mamba2Model
+ super().__init__(*args, **kwargs)
+
+ # Use Llama conversion for attention
+ self._transformer_model_class = LlamaModel
+
+ # n_group and d_inner are used during reshape_tensors for mamba2
+ self.n_group = self.find_hparam(["n_groups"])
+ self.d_inner = self.find_hparam(["mamba_d_ssm"])
+ self.d_head = self.find_hparam(["d_head"])
+
+ # Initialize any Falcon Mamba2 specific attributes
+ self.has_attention = True # Falcon Mamba2 has attention components
+
+ # Load Falcon-H1 multipliers from hyperparameters
+ self.attention_in_multiplier = self.find_hparam(["attention_in_multiplier"], optional=True)
+ self.attention_out_multiplier = self.find_hparam(["attention_out_multiplier"], optional=True)
+ self.ssm_in_multiplier = self.find_hparam(["ssm_in_multiplier"], optional=True)
+ self.ssm_out_multiplier = self.find_hparam(["ssm_out_multiplier"], optional=True)
+ self.mlp_multipliers = self.find_hparam(["mlp_multipliers"], optional=True)
+ self.ssm_multipliers = self.find_hparam(["ssm_multipliers"], optional=True)
+ self.intermediate_size = self.find_hparam(["intermediate_size"])
+ self.key_multiplier = self.find_hparam(["key_multiplier"], optional=True)
+
+ def find_hparam(self, keys: Iterable[str], *args, **kwargs) -> Any:
+ prefixed = []
+ for pfx in self.hparam_prefixes:
+ prefixed.extend(
+ "_".join([pfx, k])
+ for k in keys
+ )
+ keys = list(keys) + prefixed
+ return super().find_hparam(keys, *args, **kwargs)
+
+ def set_vocab(self):
+ self._set_vocab_gpt2()
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ tensors = list(super().modify_tensors(data_torch, name, bid))
+ tensor = tensors[0][1]
+
+ if "down_proj" in name:
+ tensor = tensor * self.mlp_multipliers[1]
+ elif "gate_proj" in name:
+ tensor = tensor * self.mlp_multipliers[0]
+ elif "k_proj" in name:
+ tensor = tensor * self.key_multiplier * self.attention_in_multiplier
+ elif "q_proj" in name:
+ tensor = tensor * self.attention_in_multiplier
+ elif "v_proj" in name:
+ tensor = tensor * self.attention_in_multiplier
+ elif "o_proj" in name:
+ tensor = tensor * self.attention_out_multiplier
+ elif "out_proj" in name:
+ tensor = tensor * self.ssm_out_multiplier
+ elif "in_proj" in name:
+ tensor = tensor * self.ssm_in_multiplier
+ zxbcdt_multipliers = self.hparams["ssm_multipliers"]
+ intermediate_size = self.hparams["mamba_d_ssm"]
+ groups_time_state_size = self.hparams["mamba_n_groups"] * self.hparams["mamba_d_state"]
+ tensor[:intermediate_size, :] *= zxbcdt_multipliers[0]
+ tensor[intermediate_size:2 * intermediate_size, :] *= zxbcdt_multipliers[1]
+ tensor[2 * intermediate_size:2 * intermediate_size + groups_time_state_size, :] *= zxbcdt_multipliers[2]
+ tensor[2 * intermediate_size + groups_time_state_size:2 * intermediate_size + 2 * groups_time_state_size, :] *= zxbcdt_multipliers[3]
+ tensor[2 * intermediate_size + 2 * groups_time_state_size:, :] *= zxbcdt_multipliers[4]
+ elif "lm_head" in name:
+ tensor = tensor * self.hparams["lm_head_multiplier"]
+ elif "embed_tokens" in name:
+ tensor = tensor * self.hparams["embedding_multiplier"]
+ elif "mamba.norm" in name:
+ tensor = tensor.reshape(self.n_group, self.d_inner // self.n_group)
+
+ tensors = [(tensors[0][0], tensor)]
+ return tensors
+
+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+
+ ## General Params ##
+ self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
+ # Override some Mamba2 defaults
+ self.gguf_writer.add_block_count(self.block_count)
+ self.gguf_writer.add_context_length(self.hparams.get("max_position_embeddings", 0))
+ self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
+
+ ## Attention params ##
+ self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) # Override value 0 from Mamba2
+ self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
+ self.gguf_writer.add_key_length(self.hparams["head_dim"])
+ self.gguf_writer.add_value_length(self.hparams["head_dim"])
+
+ ## Validation ##
+ assert self.hparams.get("hidden_act") in [None, "silu"], "Only SILU activation supported"
+ assert self.d_inner % self.d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {self.d_head}"
+
+ # Add any other Falcon Mamba2 specific configuration
+ self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
+
+
+ @ModelBase.register("HunYuanMoEV1ForCausalLM")
+ class HunYuanMoEModel(TextModel):
+ model_arch = gguf.MODEL_ARCH.HUNYUAN_MOE
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ # For handling tied embeddings
+ self._tok_embd = None
+
+ def set_vocab(self):
+ from transformers import AutoTokenizer
+ tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+
+ # 1. Get the pre-tokenizer identifier hash
+ tokpre = self.get_vocab_base_pre(tokenizer)
+
+ # 2. Reverse-engineer the merges list from mergeable_ranks
+ merges = []
+ vocab = {}
+ mergeable_ranks = tokenizer.mergeable_ranks
+ for token, rank in mergeable_ranks.items():
+ vocab[QwenModel.token_bytes_to_string(token)] = rank
+ if len(token) == 1:
+ continue
+ merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
+ if len(merged) == 2: # todo this is an assert in Qwen, why?
+ merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
+
+ # 3. Generate the tokens and toktypes lists
+ vocab_size = self.hparams["vocab_size"]
+ assert tokenizer.vocab_size == vocab_size
+ special_tokens = tokenizer.special_tokens
+ reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
+ tokens: list[str] = []
+ toktypes: list[int] = []
+ for i in range(vocab_size):
+ if i not in reverse_vocab:
+ tokens.append(f"[PAD{i}]")
+ toktypes.append(gguf.TokenType.UNUSED)
+ else:
+ token = reverse_vocab[i]
+ tokens.append(token)
+ if i in special_tokens.values():
+ toktypes.append(gguf.TokenType.CONTROL)
+ else:
+ toktypes.append(gguf.TokenType.NORMAL)
+
+ # 4. Write all vocab-related fields to the GGUF writer
+ self.gguf_writer.add_tokenizer_model("gpt2")
+ self.gguf_writer.add_tokenizer_pre(tokpre)
+ self.gguf_writer.add_token_list(tokens)
+ self.gguf_writer.add_token_types(toktypes)
+ self.gguf_writer.add_token_merges(merges)
+
+ # 5. Add special tokens and chat templates
+ special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
+ special_vocab.add_to_gguf(self.gguf_writer)
+ # FIX for BOS token: Overwrite incorrect id read from config.json
+ self.gguf_writer.add_bos_token_id(127959) # <|bos|>
+
+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+ hparams = self.hparams
+
+ self.gguf_writer.add_expert_count(hparams["num_experts"])
+ self.gguf_writer.add_expert_shared_feed_forward_length(hparams["intermediate_size"])
+
+ moe_intermediate_size = hparams["moe_intermediate_size"]
+ assert all(n == moe_intermediate_size[0] for n in moe_intermediate_size)
+ self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size[0])
+
+ moe_topk = hparams["moe_topk"]
+ assert all(topk == moe_topk[0] for topk in moe_topk)
+ self.gguf_writer.add_expert_used_count(moe_topk[0])
+
+ moe_shared_expert = hparams["num_shared_expert"]
+ assert all(n == moe_shared_expert[0] for n in moe_shared_expert)
+ self.gguf_writer.add_expert_shared_count(moe_shared_expert[0])
+
+ # Rope
+ rope_scaling = hparams.get("rope_scaling", {})
+ if rope_scaling.get("type") == "dynamic":
+ # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
+ # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf)
+ alpha = rope_scaling.get("alpha", 1000)
+ base = hparams.get("rope_theta", 10000.0)
+ dim = (hparams["hidden_size"] // hparams["num_attention_heads"]) # 128
+ scaled_base = base * (alpha ** (dim / (dim - 2))) # 10000 * (1000 ** (128 / 126)) = 11158839.9251
+ self.gguf_writer.add_rope_freq_base(scaled_base)
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+ self.gguf_writer.add_rope_scaling_factor(1)
+ # There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k
+ self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) # 256k context length
+ self.gguf_writer.add_context_length(256 * 1024) # 256k context length
+
+ # if any of our assumptions about the values are wrong, something has changed and this may need to be updated
+ assert alpha == 1000 and base == 10000.0 and dim == 128 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024] , \
+ "HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually"
+
+ _experts: list[dict[str, Tensor]] | None = None
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ if name == "model.embed_tokens.weight":
+ self._tok_embd = data_torch.clone()
+
+ if name == "lm_head.weight":
+ if self.hparams.get("tie_word_embeddings", False):
+ logger.info("Skipping tied output layer 'lm_head.weight'")
+ return []
+
+ if name.find("mlp.experts") != -1:
+ n_experts = self.hparams["num_experts"]
+ assert bid is not None
+
+ if self._experts is None:
+ self._experts = [{} for _ in range(self.block_count)]
+
+ self._experts[bid][name] = data_torch
+
+ if len(self._experts[bid]) >= n_experts * 3:
+ # merge the experts into a single 3d tensor
+ tensors: list[tuple[str, Tensor]] = []
+ for w_name in ["down_proj", "gate_proj", "up_proj"]:
+ datas: list[Tensor] = []
+
+ for xid in range(n_experts):
+ ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+ datas.append(self._experts[bid][ename])
+ del self._experts[bid][ename]
+
+ data_torch = torch.stack(datas, dim=0)
+ merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+ new_name = self.map_tensor_name(merged_name)
+ tensors.append((new_name, data_torch))
+
+ return tensors
+ else:
+ return []
+
+ return [(self.map_tensor_name(name), data_torch)]
+
+ def prepare_tensors(self):
+ super().prepare_tensors()
+ if self._experts is not None:
+ experts = [k for d in self._experts for k in d.keys()]
+ if len(experts) > 0:
+ raise ValueError(f"Unprocessed experts: {experts}")
+
+
+ @ModelBase.register("SmolLM3ForCausalLM")
+ class SmolLM3Model(LlamaModel):
+ model_arch = gguf.MODEL_ARCH.SMOLLM3
+
+ def set_vocab(self):
+ super().set_vocab()
+ # remove unsupported array slicing in chat template
+ # ref: https://huggingface.co/ggml-org/SmolLM3-3B-GGUF/discussions/1
+ from transformers import AutoTokenizer
+ tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+ if tokenizer.chat_template is not None:
+ chat_template = tokenizer.chat_template.replace("[:]", "")
+ self.gguf_writer.add_chat_template(chat_template)
+
+
+ @ModelBase.register("Lfm2ForCausalLM")
+ @ModelBase.register("LFM2ForCausalLM")
+ class LFM2Model(TextModel):
+ model_arch = gguf.MODEL_ARCH.LFM2
+
+ def _add_feed_forward_length(self):
+ ff_dim = self.hparams["block_ff_dim"]
+
+ auto_adjust_ff_dim = self.hparams["block_auto_adjust_ff_dim"]
+ ff_dim = self.hparams["block_ff_dim"]
+ ffn_dim_multiplier = self.hparams["block_ffn_dim_multiplier"]
+ multiple_of = self.hparams["block_multiple_of"]
+
+ if auto_adjust_ff_dim:
+ ff_dim = int(2 * ff_dim / 3)
+ # custom dim factor multiplier
+ if ffn_dim_multiplier is not None:
+ ff_dim = int(ffn_dim_multiplier * ff_dim)
+ ff_dim = multiple_of * ((ff_dim + multiple_of - 1) // multiple_of)
+
+ self.gguf_writer.add_feed_forward_length(ff_dim)
+
+ def set_gguf_parameters(self):
+ # set num_key_value_heads only for attention layers
+ self.hparams["num_key_value_heads"] = [
+ self.hparams["num_key_value_heads"] if layer_type == "full_attention" else 0
+ for layer_type in self.hparams["layer_types"]
+ ]
+
+ super().set_gguf_parameters()
+ self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
+ self.gguf_writer.add_shortconv_l_cache(self.hparams["conv_L_cache"])
+ self.gguf_writer.add_layer_norm_rms_eps(self.hparams["norm_eps"])
+ self._add_feed_forward_length()
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ # conv op requires 2d tensor
+ if 'conv.conv' in name:
+ data_torch = data_torch.squeeze(1)
+
+ return [(self.map_tensor_name(name), data_torch)]
+
+
  ###### CONVERSION LOGIC ######


@@ -6569,12 +7300,20 @@ def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> st
  # maybe we should fallback to text model's arch in that case, since not many models have both
  text_config = hparams.get("text_config", {})
  vision_config = hparams.get("vision_config", {})
- arch = hparams["architectures"][0]
+ arch = None
+ if (arches := hparams.get("architectures")) is not None and len(arches) > 0:
+ arch = arches[0]
+ elif "ssm_cfg" in hparams:
+ # For non-hf Mamba and Mamba2 models
+ arch = hparams["ssm_cfg"].get("layer", "Mamba") + "ForCausalLM"
+
  # if "architectures" is found in the sub-config, use that instead
  if model_type == ModelType.TEXT and text_config.get("architectures") is not None:
  arch = text_config["architectures"][0]
  elif model_type == ModelType.MMPROJ and vision_config.get("architectures") is not None:
  arch = vision_config["architectures"][0]
+ if arch is None:
+ raise ValueError("Failed to detect model architecture")
  return arch
