@novastera-oss/llamarn 0.4.0 → 0.4.3-beta4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
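For reference, a file-level diff like the one below can be reproduced locally with npm's built-in `npm diff` command (npm 7+). The sketch that follows is a minimal, hypothetical Node/TypeScript wrapper around that command, not part of this package: the package name and versions are taken from the header above, and `--diff-name-only` restricts the output to changed file paths (recovering the +added/−removed counts shown in the listing would require parsing the full unified diff instead).

```ts
// Minimal sketch: list files changed between two published versions of a
// package, using npm's built-in `npm diff` (npm >= 7). Assumes `npm` is on
// PATH; on Windows the executable is `npm.cmd`.
import { execFileSync } from "node:child_process";

const pkg = "@novastera-oss/llamarn";
const from = "0.4.0";
const to = "0.4.3-beta4";

// `--diff-name-only` prints only the paths of changed files, one per line,
// which matches the per-file manifest shown below (minus the line counts).
const out = execFileSync(
  "npm",
  ["diff", `--diff=${pkg}@${from}`, `--diff=${pkg}@${to}`, "--diff-name-only"],
  { encoding: "utf8", maxBuffer: 64 * 1024 * 1024 }, // diff output can be large
);

console.log(out.trimEnd());
```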
Files changed (979)
  1. package/RNLlamaCpp.podspec +4 -1
  2. package/android/CMakeLists.txt +13 -3
  3. package/android/src/main/cpp/include/llama.h +44 -21
  4. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  8. package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
  9. package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
  10. package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
  11. package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
  12. package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
  13. package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
  14. package/android/src/main/jniLibs/x86/libggml.so +0 -0
  15. package/android/src/main/jniLibs/x86/libllama.so +0 -0
  16. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  17. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  18. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  19. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  20. package/cpp/LlamaCppModel.cpp +2 -10
  21. package/cpp/SystemUtils.cpp +3 -7
  22. package/cpp/build-info.cpp +2 -2
  23. package/cpp/llama.cpp/CMakeLists.txt +12 -0
  24. package/cpp/llama.cpp/CODEOWNERS +116 -10
  25. package/cpp/llama.cpp/CONTRIBUTING.md +30 -3
  26. package/cpp/llama.cpp/README.md +13 -5
  27. package/cpp/llama.cpp/build-xcframework.sh +5 -0
  28. package/cpp/llama.cpp/cmake/riscv64-spacemit-linux-gnu-gcc.cmake +29 -0
  29. package/cpp/llama.cpp/common/CMakeLists.txt +12 -2
  30. package/cpp/llama.cpp/common/arg.cpp +303 -795
  31. package/cpp/llama.cpp/common/arg.h +2 -3
  32. package/cpp/llama.cpp/common/chat-parser-xml-toolcall.cpp +861 -0
  33. package/cpp/llama.cpp/common/chat-parser-xml-toolcall.h +45 -0
  34. package/cpp/llama.cpp/common/chat-parser.cpp +156 -15
  35. package/cpp/llama.cpp/common/chat-parser.h +13 -0
  36. package/cpp/llama.cpp/common/chat.cpp +1147 -88
  37. package/cpp/llama.cpp/common/chat.h +16 -3
  38. package/cpp/llama.cpp/common/common.cpp +70 -15
  39. package/cpp/llama.cpp/common/common.h +57 -19
  40. package/cpp/llama.cpp/common/download.cpp +1072 -0
  41. package/cpp/llama.cpp/common/download.h +55 -0
  42. package/cpp/llama.cpp/common/http.h +73 -0
  43. package/cpp/llama.cpp/common/json-partial.cpp +70 -2
  44. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +61 -22
  45. package/cpp/llama.cpp/common/json-schema-to-grammar.h +2 -0
  46. package/cpp/llama.cpp/common/log.cpp +59 -2
  47. package/cpp/llama.cpp/common/log.h +12 -4
  48. package/cpp/llama.cpp/common/sampling.cpp +84 -8
  49. package/cpp/llama.cpp/common/sampling.h +3 -1
  50. package/cpp/llama.cpp/common/speculative.cpp +1 -1
  51. package/cpp/llama.cpp/convert_hf_to_gguf.py +1608 -233
  52. package/cpp/llama.cpp/convert_hf_to_gguf_update.py +6 -1
  53. package/cpp/llama.cpp/convert_lora_to_gguf.py +37 -5
  54. package/cpp/llama.cpp/ggml/CMakeLists.txt +47 -28
  55. package/cpp/llama.cpp/ggml/include/ggml-backend.h +19 -1
  56. package/cpp/llama.cpp/ggml/include/ggml-cpu.h +1 -1
  57. package/cpp/llama.cpp/ggml/include/ggml-hexagon.h +19 -0
  58. package/cpp/llama.cpp/ggml/include/ggml-metal.h +1 -6
  59. package/cpp/llama.cpp/ggml/include/ggml-rpc.h +7 -9
  60. package/cpp/llama.cpp/ggml/include/ggml-zdnn.h +2 -1
  61. package/cpp/llama.cpp/ggml/include/ggml.h +199 -6
  62. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +38 -0
  63. package/cpp/llama.cpp/ggml/src/ggml-alloc.c +299 -130
  64. package/cpp/llama.cpp/ggml/src/ggml-backend-impl.h +4 -4
  65. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +21 -5
  66. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +99 -2
  67. package/cpp/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +1 -1
  68. package/cpp/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +1 -0
  69. package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +57 -45
  70. package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +138 -47
  71. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +1584 -1773
  72. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +201 -317
  73. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +146 -187
  74. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +771 -713
  75. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +135 -77
  76. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +5 -2
  77. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +428 -26
  78. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +16 -17
  79. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +318 -145
  80. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  81. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +155 -60
  82. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +8 -8
  83. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -1
  84. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +14 -0
  85. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +10 -9
  86. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +108 -64
  87. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +14 -4
  88. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +530 -87
  89. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +37 -45
  90. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +349 -127
  91. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +947 -1218
  92. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +5 -4
  93. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +143 -29
  94. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +82 -76
  95. package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1025 -0
  96. package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  97. package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  98. package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  99. package/cpp/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +151 -0
  100. package/cpp/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +7 -0
  101. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +233 -28
  102. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +326 -66
  103. package/cpp/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +12 -3
  104. package/cpp/llama.cpp/ggml/src/ggml-cuda/argsort.cu +102 -6
  105. package/cpp/llama.cpp/ggml/src/ggml-cuda/binbcast.cu +110 -76
  106. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +167 -38
  107. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d.cu +6 -11
  108. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +12 -0
  109. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy-utils.cuh +1 -1
  110. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +245 -151
  111. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cuh +1 -5
  112. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +341 -289
  113. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile.cu +49 -0
  114. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile.cuh +1233 -0
  115. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec.cuh +586 -0
  116. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +6 -6
  117. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +48 -0
  118. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +123 -220
  119. package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +41 -39
  120. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +715 -45
  121. package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cu +150 -0
  122. package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cuh +1 -0
  123. package/cpp/llama.cpp/ggml/src/ggml-cuda/mma.cuh +321 -24
  124. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cu +93 -351
  125. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cuh +828 -1
  126. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmid.cu +164 -0
  127. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmid.cuh +5 -0
  128. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +3 -166
  129. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +1 -1
  130. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvf.cu +371 -78
  131. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvf.cuh +3 -2
  132. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cu +279 -147
  133. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cuh +1 -1
  134. package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cu +97 -85
  135. package/cpp/llama.cpp/ggml/src/ggml-cuda/pad.cu +46 -23
  136. package/cpp/llama.cpp/ggml/src/ggml-cuda/pad_reflect_1d.cu +63 -54
  137. package/cpp/llama.cpp/ggml/src/ggml-cuda/quantize.cu +12 -10
  138. package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cu +192 -77
  139. package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cuh +2 -0
  140. package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cu +10 -9
  141. package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cu +137 -75
  142. package/cpp/llama.cpp/ggml/src/ggml-cuda/set.cu +39 -0
  143. package/cpp/llama.cpp/ggml/src/ggml-cuda/set.cuh +7 -0
  144. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
  145. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
  146. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
  147. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
  148. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
  149. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
  150. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
  151. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
  152. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
  153. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
  154. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
  155. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
  156. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
  157. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
  158. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
  159. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
  160. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
  161. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
  162. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
  163. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
  164. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
  165. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
  166. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
  167. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
  168. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
  169. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
  170. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
  171. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
  172. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
  173. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
  174. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
  175. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
  176. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
  177. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
  178. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
  179. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
  180. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
  181. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
  182. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
  183. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
  184. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
  185. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
  186. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
  187. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
  188. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
  189. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +40 -19
  190. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
  191. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
  192. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
  193. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
  194. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
  195. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
  196. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
  197. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
  198. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
  199. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
  200. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
  201. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
  202. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
  203. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
  204. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
  205. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
  206. package/cpp/llama.cpp/ggml/src/ggml-cuda/topk-moe.cu +336 -0
  207. package/cpp/llama.cpp/ggml/src/ggml-cuda/topk-moe.cuh +16 -0
  208. package/cpp/llama.cpp/ggml/src/ggml-cuda/tsembd.cu +3 -3
  209. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +105 -11
  210. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +36 -0
  211. package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cu +87 -6
  212. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +28 -12
  213. package/cpp/llama.cpp/ggml/src/ggml-hexagon/CMakeLists.txt +68 -0
  214. package/cpp/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3807 -0
  215. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/CMakeLists.txt +40 -0
  216. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/act-ops.c +442 -0
  217. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/binary-ops.c +360 -0
  218. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
  219. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-ctx.h +40 -0
  220. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.c +69 -0
  221. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.h +119 -0
  222. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-msg.h +156 -0
  223. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-ops.h +64 -0
  224. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
  225. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-exp.c +93 -0
  226. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.c +60 -0
  227. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +49 -0
  228. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.c +960 -0
  229. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +1032 -0
  230. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/main.c +829 -0
  231. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +2223 -0
  232. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/ops-utils.h +149 -0
  233. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/rope-ops.c +418 -0
  234. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/softmax-ops.c +402 -0
  235. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/unary-ops.c +255 -0
  236. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/worker-pool.c +297 -0
  237. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
  238. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp-utils.c +448 -0
  239. package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp-utils.h +220 -0
  240. package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +8 -13
  241. package/cpp/llama.cpp/ggml/src/ggml-impl.h +110 -12
  242. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +6 -5
  243. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
  244. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  245. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
  246. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-context.m +599 -0
  247. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +1662 -0
  248. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.h +251 -0
  249. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.m +1527 -0
  250. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +244 -39
  251. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +3844 -0
  252. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.h +90 -0
  253. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.cpp +723 -0
  254. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +3453 -1907
  255. package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +3 -1
  256. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +10 -0
  257. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1331 -109
  258. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/cvt.cl +126 -0
  259. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +31 -4
  260. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +35 -7
  261. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +31 -4
  262. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
  263. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
  264. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/get_rows.cl +36 -12
  265. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
  266. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +24 -10
  267. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +24 -10
  268. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
  269. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  270. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  271. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  272. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  273. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  274. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  275. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +29 -20
  276. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +25 -10
  277. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rope.cl +50 -24
  278. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +123 -10
  279. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +2 -2
  280. package/cpp/llama.cpp/ggml/src/ggml-quants.c +1 -0
  281. package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +341 -161
  282. package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
  283. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +6 -5
  284. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +74 -15
  285. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +50 -30
  286. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +10 -4
  287. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +166 -99
  288. package/cpp/llama.cpp/ggml/src/ggml-sycl/count-equal.cpp +79 -0
  289. package/cpp/llama.cpp/ggml/src/ggml-sycl/count-equal.hpp +9 -0
  290. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +72 -94
  291. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +67 -49
  292. package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +21 -31
  293. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +252 -316
  294. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +6 -2
  295. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +9 -6
  296. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +359 -142
  297. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
  298. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
  299. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +80 -60
  300. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +201 -132
  301. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +230 -55
  302. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.hpp +2 -0
  303. package/cpp/llama.cpp/ggml/src/ggml-sycl/pad.cpp +97 -0
  304. package/cpp/llama.cpp/ggml/src/ggml-sycl/pad.hpp +24 -0
  305. package/cpp/llama.cpp/ggml/src/ggml-sycl/pad_reflect_1d.cpp +72 -0
  306. package/cpp/llama.cpp/ggml/src/ggml-sycl/pad_reflect_1d.hpp +8 -0
  307. package/cpp/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
  308. package/cpp/llama.cpp/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
  309. package/cpp/llama.cpp/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
  310. package/cpp/llama.cpp/ggml/src/ggml-sycl/roll.cpp +122 -0
  311. package/cpp/llama.cpp/ggml/src/ggml-sycl/roll.hpp +20 -0
  312. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +50 -41
  313. package/cpp/llama.cpp/ggml/src/ggml-sycl/set.cpp +73 -0
  314. package/cpp/llama.cpp/ggml/src/ggml-sycl/set.hpp +5 -0
  315. package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +45 -36
  316. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +330 -165
  317. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +4 -0
  318. package/cpp/llama.cpp/ggml/src/ggml-sycl/ssm_conv.cpp +127 -0
  319. package/cpp/llama.cpp/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
  320. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +12 -6
  321. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +16 -12
  322. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +38 -18
  323. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +4184 -2159
  324. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
  325. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +2 -2
  326. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +2 -2
  327. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
  328. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +1 -1
  329. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
  330. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +2 -2
  331. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +33 -26
  332. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
  333. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
  334. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +2 -2
  335. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +2 -2
  336. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +2 -2
  337. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +1 -1
  338. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +53 -30
  339. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +1 -1
  340. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +2 -2
  341. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +3 -3
  342. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +13 -6
  343. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
  344. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +2 -2
  345. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +2 -2
  346. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +1 -1
  347. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs.comp → dequant_funcs.glsl} +138 -2
  348. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs_cm2.comp → dequant_funcs_cm2.glsl} +18 -4
  349. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_head.comp → dequant_head.glsl} +1 -1
  350. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
  351. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +1 -1
  352. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +2 -2
  353. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +1 -1
  354. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +3 -2
  355. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +7 -6
  356. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +5 -3
  357. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +1 -1
  358. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +1 -1
  359. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +3 -3
  360. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +3 -3
  361. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
  362. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +1 -1
  363. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +1 -1
  364. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +3 -3
  365. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +1 -1
  366. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +1 -1
  367. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +3 -3
  368. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
  369. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +1 -1
  370. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +1 -1
  371. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +2 -2
  372. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +3 -2
  373. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
  374. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +52 -14
  375. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{flash_attn_base.comp → flash_attn_base.glsl} +50 -12
  376. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +61 -12
  377. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +54 -12
  378. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +5 -1
  379. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
  380. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +2 -2
  381. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +2 -2
  382. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +2 -2
  383. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +2 -2
  384. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +2 -2
  385. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +2 -2
  386. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_binary_head.comp → generic_binary_head.glsl} +10 -2
  387. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +21 -12
  388. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +28 -18
  389. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{glu_head.comp → glu_head.glsl} +1 -1
  390. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +2 -2
  391. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
  392. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
  393. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +15 -7
  394. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +125 -0
  395. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +2 -2
  396. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +2 -2
  397. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +18 -0
  398. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +2 -2
  399. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +1 -1
  400. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +229 -0
  401. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +33 -0
  402. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +1 -1
  403. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +1 -1
  404. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +1 -1
  405. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +1 -1
  406. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +1 -1
  407. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +1 -1
  408. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +1 -1
  409. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +9 -7
  410. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +9 -7
  411. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +3 -5
  412. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +1 -1
  413. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +3 -5
  414. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +3 -5
  415. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +1 -1
  416. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +140 -0
  417. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +106 -634
  418. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +118 -9
  419. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +556 -0
  420. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +70 -0
  421. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +77 -214
  422. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +589 -0
  423. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +78 -0
  424. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +97 -13
  425. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
  426. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +2 -2
  427. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +2 -2
  428. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +1 -1
  429. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +25 -4
  430. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +1 -1
  431. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +55 -5
  432. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +2 -2
  433. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +2 -2
  434. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +2 -2
  435. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +2 -2
  436. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +45 -3
  437. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +2 -2
  438. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +2 -2
  439. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +2 -2
  440. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +227 -0
  441. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +20 -0
  442. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +5 -52
  443. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +5 -35
  444. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +5 -35
  445. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +27 -0
  446. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +5 -41
  447. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
  448. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +2 -2
  449. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +2 -2
  450. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +2 -2
  451. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +2 -2
  452. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +2 -2
  453. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +1 -1
  454. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +6 -2
  455. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
  456. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +2 -2
  457. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +2 -2
  458. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +44 -0
  459. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +140 -0
  460. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
  461. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +2 -2
  462. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +1 -1
  463. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +2 -2
  464. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +2 -2
  465. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +2 -2
  466. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +5 -4
  467. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +171 -0
  468. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
  469. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{types.comp → types.glsl} +79 -29
  470. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +36 -12
  471. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +471 -196
  472. package/cpp/llama.cpp/ggml/src/ggml-webgpu/CMakeLists.txt +8 -0
  473. package/cpp/llama.cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1690 -383
  474. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +188 -0
  475. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
  476. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
  477. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +101 -0
  478. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +57 -10
  479. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
  480. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
  481. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +25 -912
  482. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +97 -0
  483. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl +247 -0
  484. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.tmpl.wgsl +302 -0
  485. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +267 -0
  486. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +123 -0
  487. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
  488. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/scale.tmpl.wgsl +90 -0
  489. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/{set_rows.wgsl → set_rows.tmpl.wgsl} +38 -8
  490. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
  491. package/cpp/llama.cpp/ggml/src/ggml-zdnn/common.hpp +59 -0
  492. package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn.cpp +96 -314
  493. package/cpp/llama.cpp/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  494. package/cpp/llama.cpp/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  495. package/cpp/llama.cpp/ggml/src/ggml-zdnn/utils.cpp +79 -0
  496. package/cpp/llama.cpp/ggml/src/ggml-zdnn/utils.hpp +19 -0
  497. package/cpp/llama.cpp/ggml/src/ggml.c +440 -17
  498. package/cpp/llama.cpp/ggml/src/gguf.cpp +104 -29
  499. package/cpp/llama.cpp/gguf-py/gguf/constants.py +363 -13
  500. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +64 -0
  501. package/cpp/llama.cpp/gguf-py/gguf/lazy.py +8 -3
  502. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_convert_endian.py +6 -0
  503. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +156 -18
  504. package/cpp/llama.cpp/gguf-py/gguf/utility.py +80 -0
  505. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +4 -4
  506. package/cpp/llama.cpp/include/llama.h +44 -21
  507. package/cpp/llama.cpp/media/llama1-icon-transparent.png +0 -0
  508. package/cpp/llama.cpp/media/llama1-icon-transparent.svg +77 -0
  509. package/cpp/llama.cpp/media/llama1-icon.png +0 -0
  510. package/cpp/llama.cpp/media/llama1-icon.svg +87 -0
  511. package/cpp/llama.cpp/requirements/requirements-all.txt +2 -0
  512. package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -3
  513. package/cpp/llama.cpp/requirements/requirements-convert_legacy_llama.txt +3 -1
  514. package/cpp/llama.cpp/requirements/requirements-tool_bench.txt +1 -1
  515. package/cpp/llama.cpp/src/CMakeLists.txt +101 -0
  516. package/cpp/llama.cpp/src/llama-adapter.cpp +33 -0
  517. package/cpp/llama.cpp/src/llama-adapter.h +3 -0
  518. package/cpp/llama.cpp/src/llama-arch.cpp +344 -14
  519. package/cpp/llama.cpp/src/llama-arch.h +50 -0
  520. package/cpp/llama.cpp/src/llama-batch.cpp +63 -31
  521. package/cpp/llama.cpp/src/llama-batch.h +13 -2
  522. package/cpp/llama.cpp/src/llama-chat.cpp +85 -3
  523. package/cpp/llama.cpp/src/llama-chat.h +4 -0
  524. package/cpp/llama.cpp/src/llama-context.cpp +300 -45
  525. package/cpp/llama.cpp/src/llama-context.h +16 -6
  526. package/cpp/llama.cpp/src/llama-cparams.h +2 -1
  527. package/cpp/llama.cpp/src/llama-grammar.cpp +17 -9
  528. package/cpp/llama.cpp/src/llama-graph.cpp +226 -64
  529. package/cpp/llama.cpp/src/llama-graph.h +27 -5
  530. package/cpp/llama.cpp/src/llama-hparams.cpp +53 -2
  531. package/cpp/llama.cpp/src/llama-hparams.h +48 -8
  532. package/cpp/llama.cpp/src/llama-impl.cpp +3 -3
  533. package/cpp/llama.cpp/src/llama-impl.h +2 -0
  534. package/cpp/llama.cpp/src/llama-kv-cache-iswa.cpp +13 -3
  535. package/cpp/llama.cpp/src/llama-kv-cache-iswa.h +2 -0
  536. package/cpp/llama.cpp/src/llama-kv-cache.cpp +120 -62
  537. package/cpp/llama.cpp/src/llama-kv-cache.h +13 -4
  538. package/cpp/llama.cpp/src/llama-kv-cells.h +44 -2
  539. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +19 -9
  540. package/cpp/llama.cpp/src/llama-memory-hybrid.h +2 -0
  541. package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +38 -17
  542. package/cpp/llama.cpp/src/llama-memory-recurrent.h +5 -2
  543. package/cpp/llama.cpp/src/llama-memory.h +3 -0
  544. package/cpp/llama.cpp/src/llama-model-loader.cpp +2 -0
  545. package/cpp/llama.cpp/src/llama-model.cpp +1070 -12614
  546. package/cpp/llama.cpp/src/llama-model.h +40 -4
  547. package/cpp/llama.cpp/src/llama-quant.cpp +14 -6
  548. package/cpp/llama.cpp/src/llama-sampling.cpp +243 -136
  549. package/cpp/llama.cpp/src/llama-vocab.cpp +43 -3
  550. package/cpp/llama.cpp/src/llama-vocab.h +43 -39
  551. package/cpp/llama.cpp/src/llama.cpp +69 -10
  552. package/cpp/llama.cpp/src/models/afmoe.cpp +187 -0
  553. package/cpp/llama.cpp/src/models/apertus.cpp +125 -0
  554. package/cpp/llama.cpp/src/models/arcee.cpp +135 -0
  555. package/cpp/llama.cpp/src/models/arctic.cpp +138 -0
  556. package/cpp/llama.cpp/src/models/arwkv7.cpp +86 -0
  557. package/cpp/llama.cpp/src/models/baichuan.cpp +122 -0
  558. package/cpp/llama.cpp/src/models/bailingmoe.cpp +144 -0
  559. package/cpp/llama.cpp/src/models/bailingmoe2.cpp +135 -0
  560. package/cpp/llama.cpp/src/models/bert.cpp +176 -0
  561. package/cpp/llama.cpp/src/models/bitnet.cpp +160 -0
  562. package/cpp/llama.cpp/src/models/bloom.cpp +101 -0
  563. package/cpp/llama.cpp/src/models/chameleon.cpp +178 -0
  564. package/cpp/llama.cpp/src/models/chatglm.cpp +132 -0
  565. package/cpp/llama.cpp/src/models/codeshell.cpp +111 -0
  566. package/cpp/llama.cpp/src/models/cogvlm.cpp +100 -0
  567. package/cpp/llama.cpp/src/models/cohere2-iswa.cpp +131 -0
  568. package/cpp/llama.cpp/src/models/command-r.cpp +122 -0
  569. package/cpp/llama.cpp/src/models/dbrx.cpp +123 -0
  570. package/cpp/llama.cpp/src/models/deci.cpp +135 -0
  571. package/cpp/llama.cpp/src/models/deepseek.cpp +144 -0
  572. package/cpp/llama.cpp/src/models/deepseek2.cpp +237 -0
  573. package/cpp/llama.cpp/src/models/dots1.cpp +134 -0
  574. package/cpp/llama.cpp/src/models/dream.cpp +105 -0
  575. package/cpp/llama.cpp/src/models/ernie4-5-moe.cpp +150 -0
  576. package/cpp/llama.cpp/src/models/ernie4-5.cpp +110 -0
  577. package/cpp/llama.cpp/src/models/exaone.cpp +114 -0
  578. package/cpp/llama.cpp/src/models/exaone4.cpp +123 -0
  579. package/cpp/llama.cpp/src/models/falcon-h1.cpp +113 -0
  580. package/cpp/llama.cpp/src/models/falcon.cpp +120 -0
  581. package/cpp/llama.cpp/src/models/gemma-embedding.cpp +120 -0
  582. package/cpp/llama.cpp/src/models/gemma.cpp +112 -0
  583. package/cpp/llama.cpp/src/models/gemma2-iswa.cpp +125 -0
  584. package/cpp/llama.cpp/src/models/gemma3-iswa.cpp +131 -0
  585. package/cpp/llama.cpp/src/models/gemma3n-iswa.cpp +377 -0
  586. package/cpp/llama.cpp/src/models/glm4-moe.cpp +153 -0
  587. package/cpp/llama.cpp/src/models/glm4.cpp +127 -0
  588. package/cpp/llama.cpp/src/models/gpt2.cpp +105 -0
  589. package/cpp/llama.cpp/src/models/gptneox.cpp +144 -0
  590. package/cpp/llama.cpp/src/models/granite-hybrid.cpp +196 -0
  591. package/cpp/llama.cpp/src/models/granite.cpp +211 -0
  592. package/cpp/llama.cpp/src/models/graph-context-mamba.cpp +283 -0
  593. package/cpp/llama.cpp/src/models/grok.cpp +159 -0
  594. package/cpp/llama.cpp/src/models/grovemoe.cpp +141 -0
  595. package/cpp/llama.cpp/src/models/hunyuan-dense.cpp +132 -0
  596. package/cpp/llama.cpp/src/models/hunyuan-moe.cpp +154 -0
  597. package/cpp/llama.cpp/src/models/internlm2.cpp +120 -0
  598. package/cpp/llama.cpp/src/models/jais.cpp +86 -0
  599. package/cpp/llama.cpp/src/models/jamba.cpp +106 -0
  600. package/cpp/llama.cpp/src/models/lfm2.cpp +173 -0
  601. package/cpp/llama.cpp/src/models/llada-moe.cpp +122 -0
  602. package/cpp/llama.cpp/src/models/llada.cpp +99 -0
  603. package/cpp/llama.cpp/src/models/llama-iswa.cpp +174 -0
  604. package/cpp/llama.cpp/src/models/llama.cpp +155 -0
  605. package/cpp/llama.cpp/src/models/mamba.cpp +55 -0
  606. package/cpp/llama.cpp/src/models/minicpm3.cpp +199 -0
  607. package/cpp/llama.cpp/src/models/minimax-m2.cpp +124 -0
  608. package/cpp/llama.cpp/src/models/models.h +485 -0
  609. package/cpp/llama.cpp/src/models/mpt.cpp +126 -0
  610. package/cpp/llama.cpp/src/models/nemotron-h.cpp +121 -0
  611. package/cpp/llama.cpp/src/models/nemotron.cpp +122 -0
  612. package/cpp/llama.cpp/src/models/neo-bert.cpp +104 -0
  613. package/cpp/llama.cpp/src/models/olmo.cpp +121 -0
  614. package/cpp/llama.cpp/src/models/olmo2.cpp +150 -0
  615. package/cpp/llama.cpp/src/models/olmoe.cpp +124 -0
  616. package/cpp/llama.cpp/src/models/openai-moe-iswa.cpp +124 -0
  617. package/cpp/llama.cpp/src/models/openelm.cpp +124 -0
  618. package/cpp/llama.cpp/src/models/orion.cpp +123 -0
  619. package/cpp/llama.cpp/src/models/pangu-embedded.cpp +121 -0
  620. package/cpp/llama.cpp/src/models/phi2.cpp +121 -0
  621. package/cpp/llama.cpp/src/models/phi3.cpp +152 -0
  622. package/cpp/llama.cpp/src/models/plamo.cpp +110 -0
  623. package/cpp/llama.cpp/src/models/plamo2.cpp +316 -0
  624. package/cpp/llama.cpp/src/models/plm.cpp +168 -0
  625. package/cpp/llama.cpp/src/models/qwen.cpp +108 -0
  626. package/cpp/llama.cpp/src/models/qwen2.cpp +117 -0
  627. package/cpp/llama.cpp/src/models/qwen2moe.cpp +151 -0
  628. package/cpp/llama.cpp/src/models/qwen2vl.cpp +117 -0
  629. package/cpp/llama.cpp/src/models/qwen3.cpp +117 -0
  630. package/cpp/llama.cpp/src/models/qwen3moe.cpp +124 -0
  631. package/cpp/llama.cpp/src/models/qwen3vl-moe.cpp +149 -0
  632. package/cpp/llama.cpp/src/models/qwen3vl.cpp +141 -0
  633. package/cpp/llama.cpp/src/models/refact.cpp +94 -0
  634. package/cpp/llama.cpp/src/models/rwkv6-base.cpp +162 -0
  635. package/cpp/llama.cpp/src/models/rwkv6.cpp +94 -0
  636. package/cpp/llama.cpp/src/models/rwkv6qwen2.cpp +86 -0
  637. package/cpp/llama.cpp/src/models/rwkv7-base.cpp +135 -0
  638. package/cpp/llama.cpp/src/models/rwkv7.cpp +90 -0
  639. package/cpp/llama.cpp/src/models/seed-oss.cpp +124 -0
  640. package/cpp/llama.cpp/src/models/smallthinker.cpp +120 -0
  641. package/cpp/llama.cpp/src/models/smollm3.cpp +128 -0
  642. package/cpp/llama.cpp/src/models/stablelm.cpp +146 -0
  643. package/cpp/llama.cpp/src/models/starcoder.cpp +100 -0
  644. package/cpp/llama.cpp/src/models/starcoder2.cpp +121 -0
  645. package/cpp/llama.cpp/src/models/t5-dec.cpp +166 -0
  646. package/cpp/llama.cpp/src/models/t5-enc.cpp +96 -0
  647. package/cpp/llama.cpp/src/models/wavtokenizer-dec.cpp +149 -0
  648. package/cpp/llama.cpp/src/models/xverse.cpp +108 -0
  649. package/cpp/llama.cpp/src/unicode.cpp +77 -0
  650. package/cpp/llama.cpp/src/unicode.h +43 -0
  651. package/cpp/llama.cpp/vendor/cpp-httplib/CMakeLists.txt +94 -0
  652. package/cpp/llama.cpp/vendor/cpp-httplib/httplib.cpp +9339 -0
  653. package/cpp/llama.cpp/vendor/cpp-httplib/httplib.h +433 -8222
  654. package/cpp/llama.cpp/vendor/cpp-httplib/patch-boringssl.cmake +6 -0
  655. package/cpp/llama.cpp/vendor/miniaudio/miniaudio.h +4179 -1900
  656. package/cpp/llama.cpp/vendor/minja/chat-template.hpp +9 -2
  657. package/cpp/llama.cpp/vendor/minja/minja.hpp +101 -22
  658. package/cpp/rn-completion.cpp +3 -27
  659. package/ios/include/chat.h +16 -3
  660. package/ios/include/common/minja/chat-template.hpp +9 -2
  661. package/ios/include/common/minja/minja.hpp +101 -22
  662. package/ios/include/common.h +57 -19
  663. package/ios/include/json-schema-to-grammar.h +2 -0
  664. package/ios/include/llama.h +44 -21
  665. package/ios/include/log.h +12 -4
  666. package/ios/include/sampling.h +3 -1
  667. package/ios/libs/llama.xcframework/Info.plist +20 -20
  668. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  669. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +6399 -5557
  670. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h +19 -1
  671. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +1 -1
  672. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-metal.h +1 -6
  673. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +199 -6
  674. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +44 -21
  675. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  676. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  677. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +6362 -5520
  678. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4813 -4241
  679. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +19 -1
  680. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +1 -1
  681. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +1 -6
  682. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +199 -6
  683. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +44 -21
  684. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  685. package/package.json +10 -4
  686. package/cpp/llama.cpp/ggml/src/ggml-cann/Doxyfile +0 -2579
  687. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -371
  688. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
  689. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -379
  690. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
  691. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -495
  692. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -486
  693. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
  694. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
  695. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
  696. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
  697. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
  698. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
  699. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
  700. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
  701. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
  702. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
  703. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
  704. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
  705. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
  706. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
  707. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
  708. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
  709. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
  710. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
  711. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
  712. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
  713. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
  714. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
  715. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
  716. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
  717. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
  718. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
  719. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
  720. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
  721. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
  722. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
  723. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
  724. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
  725. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
  726. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
  727. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
  728. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
  729. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
  730. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
  731. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
  732. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
  733. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
  734. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
  735. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
  736. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
  737. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
  738. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
  739. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
  740. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
  741. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
  742. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
  743. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
  744. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
  745. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
  746. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
  747. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
  748. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
  749. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
  750. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
  751. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
  752. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
  753. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
  754. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
  755. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
  756. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
  757. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
  758. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
  759. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
  760. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
  761. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
  762. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
  763. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
  764. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
  765. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
  766. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
  767. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
  768. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
  769. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
  770. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
  771. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
  772. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
  773. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
  774. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
  775. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
  776. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
  777. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
  778. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
  779. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +0 -6886
  780. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +0 -154
  781. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +0 -105
  782. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +0 -55
  783. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +0 -60
  784. package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn-impl.h +0 -97
  785. package/cpp/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
  786. package/cpp/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
  787. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
  788. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +0 -112
  789. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +0 -46
  790. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
  791. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.inp +0 -112
  792. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.out +0 -46
  793. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
  794. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +0 -112
  795. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +0 -46
  796. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
  797. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +0 -112
  798. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +0 -46
  799. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
  800. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.inp +0 -112
  801. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.out +0 -46
  802. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
  803. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +0 -112
  804. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +0 -46
  805. package/cpp/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
  806. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
  807. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +0 -112
  808. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +0 -46
  809. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
  810. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +0 -112
  811. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +0 -46
  812. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
  813. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.inp +0 -112
  814. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.out +0 -46
  815. package/cpp/llama.cpp/models/ggml-vocab-nomic-bert-moe.gguf +0 -0
  816. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
  817. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +0 -112
  818. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.out +0 -46
  819. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
  820. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +0 -112
  821. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.out +0 -46
  822. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
  823. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.inp +0 -112
  824. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.out +0 -46
  825. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
  826. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +0 -112
  827. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.out +0 -46
  828. package/cpp/llama.cpp/models/templates/ByteDance-Seed-OSS.jinja +0 -171
  829. package/cpp/llama.cpp/models/templates/CohereForAI-c4ai-command-r-plus-tool_use.jinja +0 -202
  830. package/cpp/llama.cpp/models/templates/CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja +0 -156
  831. package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +0 -124
  832. package/cpp/llama.cpp/models/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja +0 -152
  833. package/cpp/llama.cpp/models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja +0 -152
  834. package/cpp/llama.cpp/models/templates/Qwen-QwQ-32B.jinja +0 -62
  835. package/cpp/llama.cpp/models/templates/Qwen-Qwen2.5-7B-Instruct.jinja +0 -54
  836. package/cpp/llama.cpp/models/templates/Qwen-Qwen3-0.6B.jinja +0 -85
  837. package/cpp/llama.cpp/models/templates/README.md +0 -25
  838. package/cpp/llama.cpp/models/templates/deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja +0 -1
  839. package/cpp/llama.cpp/models/templates/deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja +0 -1
  840. package/cpp/llama.cpp/models/templates/fireworks-ai-llama-3-firefunction-v2.jinja +0 -57
  841. package/cpp/llama.cpp/models/templates/google-gemma-2-2b-it.jinja +0 -4
  842. package/cpp/llama.cpp/models/templates/ibm-granite-granite-3.3-2B-Instruct.jinja +0 -59
  843. package/cpp/llama.cpp/models/templates/llama-cpp-deepseek-r1.jinja +0 -76
  844. package/cpp/llama.cpp/models/templates/llama-cpp-rwkv-world.jinja +0 -34
  845. package/cpp/llama.cpp/models/templates/meetkai-functionary-medium-v3.1.jinja +0 -58
  846. package/cpp/llama.cpp/models/templates/meetkai-functionary-medium-v3.2.jinja +0 -287
  847. package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.1-8B-Instruct.jinja +0 -109
  848. package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.2-3B-Instruct.jinja +0 -93
  849. package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.3-70B-Instruct.jinja +0 -109
  850. package/cpp/llama.cpp/models/templates/microsoft-Phi-3.5-mini-instruct.jinja +0 -8
  851. package/cpp/llama.cpp/models/templates/mistralai-Mistral-Nemo-Instruct-2407.jinja +0 -87
  852. package/cpp/llama.cpp/models/templates/moonshotai-Kimi-K2.jinja +0 -43
  853. package/cpp/llama.cpp/models/templates/openai-gpt-oss-120b.jinja +0 -331
  854. package/cpp/llama.cpp/models/templates/unsloth-mistral-Devstral-Small-2507.jinja +0 -105
  855. package/cpp/llama.cpp/prompts/LLM-questions.txt +0 -49
  856. package/cpp/llama.cpp/prompts/alpaca.txt +0 -1
  857. package/cpp/llama.cpp/prompts/assistant.txt +0 -31
  858. package/cpp/llama.cpp/prompts/chat-with-baichuan.txt +0 -4
  859. package/cpp/llama.cpp/prompts/chat-with-bob.txt +0 -7
  860. package/cpp/llama.cpp/prompts/chat-with-qwen.txt +0 -1
  861. package/cpp/llama.cpp/prompts/chat-with-vicuna-v0.txt +0 -7
  862. package/cpp/llama.cpp/prompts/chat-with-vicuna-v1.txt +0 -7
  863. package/cpp/llama.cpp/prompts/chat.txt +0 -28
  864. package/cpp/llama.cpp/prompts/dan-modified.txt +0 -1
  865. package/cpp/llama.cpp/prompts/dan.txt +0 -1
  866. package/cpp/llama.cpp/prompts/mnemonics.txt +0 -93
  867. package/cpp/llama.cpp/prompts/parallel-questions.txt +0 -43
  868. package/cpp/llama.cpp/prompts/reason-act.txt +0 -18
  869. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
  870. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  871. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5524
  872. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4247
  873. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-alloc.h +0 -76
  874. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-backend.h +0 -354
  875. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-blas.h +0 -25
  876. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +0 -145
  877. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-metal.h +0 -66
  878. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +0 -256
  879. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +0 -2492
  880. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/gguf.h +0 -202
  881. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +0 -1391
  882. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Modules/module.modulemap +0 -17
  883. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Resources/Info.plist +0 -32
  884. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-alloc.h +0 -76
  885. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-backend.h +0 -354
  886. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-blas.h +0 -25
  887. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +0 -145
  888. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-metal.h +0 -66
  889. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +0 -256
  890. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +0 -2492
  891. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/gguf.h +0 -202
  892. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +0 -1391
  893. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Modules/module.modulemap +0 -17
  894. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Resources/Info.plist +0 -32
  895. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  896. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-alloc.h +0 -76
  897. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-backend.h +0 -354
  898. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-blas.h +0 -25
  899. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +0 -145
  900. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-metal.h +0 -66
  901. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +0 -256
  902. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +0 -2492
  903. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/gguf.h +0 -202
  904. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +0 -1391
  905. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Modules/module.modulemap +0 -17
  906. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Resources/Info.plist +0 -32
  907. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  908. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  909. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
  910. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  911. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5561
  912. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-alloc.h +0 -76
  913. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-backend.h +0 -354
  914. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-blas.h +0 -25
  915. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +0 -145
  916. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-metal.h +0 -66
  917. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +0 -256
  918. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +0 -2492
  919. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/gguf.h +0 -202
  920. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +0 -1391
  921. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Info.plist +0 -35
  922. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Modules/module.modulemap +0 -17
  923. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  924. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
  925. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  926. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5524
  927. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4246
  928. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-alloc.h +0 -76
  929. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +0 -354
  930. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-blas.h +0 -25
  931. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +0 -145
  932. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +0 -66
  933. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +0 -256
  934. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +0 -2492
  935. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/gguf.h +0 -202
  936. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -1391
  937. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Info.plist +0 -35
  938. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Modules/module.modulemap +0 -17
  939. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  940. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
  941. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  942. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5558
  943. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-alloc.h +0 -76
  944. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-backend.h +0 -354
  945. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-blas.h +0 -25
  946. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +0 -145
  947. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-metal.h +0 -66
  948. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +0 -256
  949. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +0 -2492
  950. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/gguf.h +0 -202
  951. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +0 -1391
  952. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Info.plist +0 -32
  953. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Modules/module.modulemap +0 -17
  954. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  955. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
  956. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  957. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5520
  958. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4243
  959. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-alloc.h +0 -76
  960. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +0 -354
  961. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-blas.h +0 -25
  962. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +0 -145
  963. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +0 -66
  964. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +0 -256
  965. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +0 -2492
  966. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/gguf.h +0 -202
  967. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -1391
  968. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Info.plist +0 -32
  969. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Modules/module.modulemap +0 -17
  970. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  971. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_bfloat16_support.comp → feature-tests/bfloat16.comp} +0 -0
  972. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat_support.comp → feature-tests/coopmat.comp} +0 -0
  973. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat2_support.comp → feature-tests/coopmat2.comp} +0 -0
  974. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_integer_dot_support.comp → feature-tests/integer_dot.comp} +0 -0
  975. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_head.comp → generic_head.glsl} +0 -0
  976. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_unary_head.comp → generic_unary_head.glsl} +0 -0
  977. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{glu_main.comp → glu_main.glsl} +0 -0
  978. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{rte.comp → rte.glsl} +0 -0
  979. /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{utils.comp → utils.glsl} +0 -0
@@ -29,12 +29,29 @@ if 'NO_LOCAL_GGUF' not in os.environ:
     sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf
 from gguf.vocab import MistralTokenizerType, MistralVocab
-from mistral_common.tokens.tokenizers.base import TokenizerVersion
-from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN, DATASET_STD
-from mistral_common.tokens.tokenizers.tekken import Tekkenizer
-from mistral_common.tokens.tokenizers.sentencepiece import (
-    SentencePieceTokenizer,
-)
+
+try:
+    from mistral_common.tokens.tokenizers.base import TokenizerVersion  # pyright: ignore[reportMissingImports]
+    from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN as _MISTRAL_COMMON_DATASET_MEAN, DATASET_STD as _MISTRAL_COMMON_DATASET_STD  # pyright: ignore[reportMissingImports]
+    from mistral_common.tokens.tokenizers.tekken import Tekkenizer  # pyright: ignore[reportMissingImports]
+    from mistral_common.tokens.tokenizers.sentencepiece import (  # pyright: ignore[reportMissingImports]
+        SentencePieceTokenizer,
+    )
+
+    _mistral_common_installed = True
+    _mistral_import_error_msg = ""
+except ImportError:
+    _MISTRAL_COMMON_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
+    _MISTRAL_COMMON_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
+
+    _mistral_common_installed = False
+    TokenizerVersion = None
+    Tekkenizer = None
+    SentencePieceTokenizer = None
+    _mistral_import_error_msg = (
+        "Mistral format requires `mistral-common` to be installed. Please run "
+        "`pip install mistral-common[image,audio]` to install it."
+    )
 
 
 logger = logging.getLogger("hf-to-gguf")
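
The hunk above turns hard imports of `mistral_common` into a guarded optional dependency: module-level names still exist either way, and a flag plus a prepared error message defer the failure to the code path that actually needs the package. Below is a minimal, self-contained sketch of that same guard pattern — `optional_dep` and `load_tokenizer` are hypothetical stand-ins, not names from this package:

```python
# Sketch of the optional-dependency guard pattern, under the assumption that
# `optional_dep` stands in for mistral_common (hypothetical module name).
try:
    from optional_dep import Tokenizer  # hypothetical optional import
    _dep_installed = True
    _dep_error_msg = ""
except ImportError:
    Tokenizer = None  # keep the module-level name defined either way
    _dep_installed = False
    _dep_error_msg = (
        "This feature requires `optional-dep` to be installed. Please run "
        "`pip install optional-dep` to install it."
    )


def load_tokenizer(path: str):
    # The actionable error is raised only when the gated path is taken,
    # mirroring the check the diff adds to ModelBase.__init__.
    if not _dep_installed:
        raise ImportError(_dep_error_msg)
    return Tokenizer(path)
```
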
@@ -73,10 +90,8 @@ class ModelBase:
     use_temp_file: bool
     lazy: bool
     dry_run: bool
-    part_names: list[str]
-    is_safetensors: bool
     hparams: dict[str, Any]
-    tensor_names: set[str] | None
+    model_tensors: dict[str, Callable[[], Tensor]]
     gguf_writer: gguf.GGUFWriter
     model_name: str | None
     metadata_override: Path | None
@@ -93,18 +108,23 @@ class ModelBase:
     # Mistral format specifics
     is_mistral_format: bool = False
     disable_mistral_community_chat_template: bool = False
+    sentence_transformers_dense_modules: bool = False
 
     def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, *, is_big_endian: bool = False,
                  use_temp_file: bool = False, eager: bool = False,
                  metadata_override: Path | None = None, model_name: str | None = None,
                  split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False,
                  small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None,
-                 disable_mistral_community_chat_template: bool = False):
+                 disable_mistral_community_chat_template: bool = False,
+                 sentence_transformers_dense_modules: bool = False):
         if type(self) is ModelBase or \
                 type(self) is TextModel or \
                 type(self) is MmprojModel:
             raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
 
+        if self.is_mistral_format and not _mistral_common_installed:
+            raise ImportError(_mistral_import_error_msg)
+
         self.dir_model = dir_model
         self.ftype = ftype
         self.fname_out = fname_out
@@ -114,25 +134,9 @@ class ModelBase:
         self.lazy = not eager or (remote_hf_model_id is not None)
         self.dry_run = dry_run
         self.remote_hf_model_id = remote_hf_model_id
-        if remote_hf_model_id is not None:
-            self.is_safetensors = True
-
-            def get_remote_tensors() -> Iterator[tuple[str, Tensor]]:
-                logger.info(f"Using remote model with HuggingFace id: {remote_hf_model_id}")
-                remote_tensors = gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id)
-                self.tensor_names = set(name for name in remote_tensors.keys())
-                for name, remote_tensor in remote_tensors.items():
-                    yield (name, LazyTorchTensor.from_remote_tensor(remote_tensor))
-
-            self.get_tensors = get_remote_tensors
-        else:
-            prefix = "model" if not self.is_mistral_format else "consolidated"
-            self.part_names = ModelBase.get_model_part_names(self.dir_model, prefix, ".safetensors")
-            self.is_safetensors = len(self.part_names) > 0
-            if not self.is_safetensors:
-                self.part_names = ModelBase.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
+        self.sentence_transformers_dense_modules = sentence_transformers_dense_modules
         self.hparams = ModelBase.load_hparams(self.dir_model, self.is_mistral_format) if hparams is None else hparams
-        self.tensor_names = None
+        self.model_tensors = self.index_tensors(remote_hf_model_id=remote_hf_model_id)
         self.metadata_override = metadata_override
         self.model_name = model_name
         self.dir_model_card = dir_model  # overridden in convert_lora_to_gguf.py
@@ -148,6 +152,8 @@ class ModelBase:
                 logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})")
                 self.ftype = gguf.LlamaFileType.MOSTLY_BF16
 
+        self.dequant_model()
+
         # Configure GGUF Writer
         self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file,
                                            split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard)
@@ -169,67 +175,292 @@ class ModelBase:
             return None
         raise KeyError(f"could not find any of: {keys}")
 
-    def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
-        tensor_names_from_parts: set[str] = set()
+    def index_tensors(self, remote_hf_model_id: str | None = None) -> dict[str, Callable[[], Tensor]]:
+        tensors: dict[str, Callable[[], Tensor]] = {}
+
+        if remote_hf_model_id is not None:
+            is_safetensors = True
+
+            logger.info(f"Using remote model with HuggingFace id: {remote_hf_model_id}")
+            remote_tensors = gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id)
+            for name, remote_tensor in remote_tensors.items():
+                tensors[name] = lambda r=remote_tensor: LazyTorchTensor.from_remote_tensor(r)
+
+            return tensors
+
+        prefix = "model" if not self.is_mistral_format else "consolidated"
+        part_names: set[str] = set(ModelBase.get_model_part_names(self.dir_model, prefix, ".safetensors"))
+        is_safetensors: bool = len(part_names) > 0
+        if not is_safetensors:
+            part_names = set(ModelBase.get_model_part_names(self.dir_model, "pytorch_model", ".bin"))
+
+        tensor_names_from_index: set[str] = set()
 
         if not self.is_mistral_format:
-            index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin"
+            index_name = "model.safetensors" if is_safetensors else "pytorch_model.bin"
             index_name += ".index.json"
             index_file = self.dir_model / index_name
 
             if index_file.is_file():
-                self.tensor_names = set()
                 logger.info(f"gguf: loading model weight map from '{index_name}'")
                 with open(index_file, "r", encoding="utf-8") as f:
                     index: dict[str, Any] = json.load(f)
                     weight_map = index.get("weight_map")
                     if weight_map is None or not isinstance(weight_map, dict):
                         raise ValueError(f"Can't load 'weight_map' from {index_name!r}")
-                    self.tensor_names.update(weight_map.keys())
+                    tensor_names_from_index.update(weight_map.keys())
+                    part_names |= set(weight_map.values())
             else:
-                self.tensor_names = tensor_names_from_parts
                 weight_map = {}
         else:
-            self.tensor_names = tensor_names_from_parts
             weight_map = {}
 
-        for part_name in self.part_names:
-            logger.info(f"gguf: loading model part '{part_name}'")
+        for part_name in part_names:
+            logger.info(f"gguf: indexing model part '{part_name}'")
             ctx: ContextManager[Any]
-            if self.is_safetensors:
-                from safetensors import safe_open
-                ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu"))
+            if is_safetensors:
+                ctx = cast(ContextManager[Any], gguf.utility.SafetensorsLocal(self.dir_model / part_name))
             else:
                 ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True))
 
             with ctx as model_part:
-                tensor_names_from_parts.update(model_part.keys())
+                assert model_part is not None
 
                 for name in model_part.keys():
-                    if self.is_safetensors:
+                    if is_safetensors:
+                        data: gguf.utility.LocalTensor = model_part[name]
                         if self.lazy:
-                            data = model_part.get_slice(name)
-                            data = LazyTorchTensor.from_safetensors_slice(data)
+                            data_gen = lambda data=data: LazyTorchTensor.from_local_tensor(data)  # noqa: E731
                         else:
-                            data = model_part.get_tensor(name)
+                            dtype = LazyTorchTensor._dtype_str_map[data.dtype]
+                            data_gen = lambda data=data, dtype=dtype: torch.from_numpy(data.mmap_bytes()).view(dtype).reshape(data.shape)  # noqa: E731
                     else:
-                        data = model_part[name]
+                        data_torch: Tensor = model_part[name]
                         if self.lazy:
-                            data = LazyTorchTensor.from_eager(data)
-                    yield name, data
+                            data_gen = lambda data=data_torch: LazyTorchTensor.from_eager(data)  # noqa: E731
+                        else:
+                            data_gen = lambda data=data_torch: data  # noqa: E731
+                    tensors[name] = data_gen
 
         # verify tensor name presence and identify potentially missing files
-        if len(tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0:
-            missing = sorted(self.tensor_names.difference(tensor_names_from_parts))
-            extra = sorted(tensor_names_from_parts.difference(self.tensor_names))
-            missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map))
-            if len(extra) == 0 and len(missing_files) > 0:
-                raise ValueError(f"Missing or incomplete model files: {missing_files}\n"
-                                 f"Missing tensors: {missing}")
+        if len(tensor_names_from_index) > 0:
+            tensor_names_from_parts = set(tensors.keys())
+            if len(tensor_names_from_parts.symmetric_difference(tensor_names_from_index)) > 0:
+                missing = sorted(tensor_names_from_index.difference(tensor_names_from_parts))
+                extra = sorted(tensor_names_from_parts.difference(tensor_names_from_index))
+                missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map))
+                if len(extra) == 0 and len(missing_files) > 0:
+                    raise ValueError(f"Missing or incomplete model files: {missing_files}\n"
+                                     f"Missing tensors: {missing}")
+                else:
+                    raise ValueError("Mismatch between weight map and model parts for tensor names:\n"
+                                     f"Missing tensors: {missing}\n"
+                                     f"Extra tensors: {extra}")
+
+        return tensors
+
+    def dequant_model(self):
+        tensors_to_remove: list[str] = []
+        new_tensors: dict[str, Callable[[], Tensor]] = {}
+
+        if (quant_config := self.hparams.get("quantization_config")) and isinstance(quant_config, dict):
+            quant_method = quant_config.get("quant_method")
+
+            def dequant_bitnet(weight: Tensor, scale: Tensor) -> Tensor:
+                weight = weight.view(torch.uint8)
+                orig_shape = weight.shape
+
+                shift = torch.tensor([0, 2, 4, 6], dtype=torch.uint8).reshape((4, *(1 for _ in range(len(orig_shape)))))
+                data = weight.unsqueeze(0).expand((4, *orig_shape)) >> shift
+                data = data & 3
+                data = (data.float() - 1).reshape((orig_shape[0] * 4, *orig_shape[1:]))
+
+                # The scale is inverted
+                return data / scale.float()
+
+            def dequant_simple(weight: Tensor, scale: Tensor, block_size: Sequence[int] | None = None) -> Tensor:
+                scale = scale.float()
+
+                if block_size is not None:
+                    for i, size in enumerate(block_size):
+                        scale = scale.repeat_interleave(size, i)
+                    # unpad the scale (e.g. when the tensor size isn't a multiple of the block size)
+                    scale = scale[tuple(slice(0, size) for size in weight.shape)]
+
+                return weight.float() * scale
+
+            # ref: https://github.com/ModelCloud/GPTQModel/blob/037c5c0f6c9e33c500d975b038d02e7ca437546d/gptqmodel/nn_modules/qlinear/__init__.py#L437-L476
+            def dequant_gptq(g_idx: Tensor, qweight: Tensor, qzeros: Tensor, scales: Tensor) -> Tensor:
+                bits = quant_config["bits"]
+                assert bits in (2, 3, 4, 8)
+                assert qweight.dtype == qzeros.dtype
+                maxq = (2 ** bits) - 1
+                weight = None
+                zeros = None
+                pack_dtype_bits = qweight.dtype.itemsize * 8
+
+                if bits in [2, 4, 8]:
+                    pack_factor = pack_dtype_bits // bits
+                    wf = torch.tensor(list(range(0, pack_dtype_bits, bits)), dtype=torch.int32).unsqueeze(0)
+                    if self.lazy:
+                        wf = LazyTorchTensor.from_eager(wf)
+
+                    zeros = torch.bitwise_right_shift(
+                        qzeros.unsqueeze(2).expand(-1, -1, pack_factor),
+                        wf.unsqueeze(0)
+                    ).to(torch.int16 if bits == 8 else torch.int8)
+                    zeros = torch.bitwise_and(zeros, maxq).reshape(scales.shape)
+
+                    weight = torch.bitwise_and(
+                        torch.bitwise_right_shift(
+                            qweight.unsqueeze(1).expand(-1, pack_factor, -1),
+                            wf.unsqueeze(-1)
+                        ).to(torch.int16 if bits == 8 else torch.int8),
+                        maxq
+                    )
+                elif bits == 3:
+                    raise NotImplementedError("3-bit gptq dequantization is not yet implemented")
+
+                assert weight is not None
+                assert zeros is not None
+
+                weight = weight.reshape(weight.shape[0] * weight.shape[1], weight.shape[2])
+
+                # gptq_v2 doesn't need to offset zeros
+                if quant_config.get("checkpoint_format", "gptq") == "gptq":
+                    zeros += 1
+
+                return (scales[g_idx].float() * (weight - zeros[g_idx]).float()).T
+
+            def dequant_packed(w: Tensor, scale: Tensor, shape_tensor: Tensor, zero_point: Tensor | None, num_bits: int, group_size: int):
+                assert w.dtype == torch.int32
+                shape = tuple(shape_tensor.tolist())
+                assert len(shape) == 2
+                mask = (1 << num_bits) - 1
+
+                shifts = torch.arange(0, 32 - (num_bits - 1), num_bits, dtype=torch.int32)
+                if self.lazy:
+                    shifts = LazyTorchTensor.from_eager(shifts)
+
+                if zero_point is None:
+                    offset = 1 << (num_bits - 1)
+                else:
+                    assert len(zero_point.shape) == 2
+                    offset = (zero_point.unsqueeze(1) >> shifts.reshape(1, -1, 1)) & mask
+                    offset = offset.reshape(-1, zero_point.shape[1])
+                    # trim padding, and prepare for broadcast
+                    # NOTE: the zero-point is packed along dim 0
+                    offset = offset[:shape[0], :].unsqueeze(-1)
+
+                # extract values
+                # NOTE: the weights are packed along dim 1
+                unpacked = (w.unsqueeze(-1) >> shifts.reshape(1, 1, -1)) & mask
+                unpacked = unpacked.reshape(shape[0], -1)
+
+                # trim padding
+                unpacked = unpacked[:, :shape[1]]
+
+                # prepare for broadcast of the scale
+                unpacked = unpacked.reshape(shape[0], (unpacked.shape[-1] + group_size - 1) // group_size, group_size)
+                unpacked = unpacked - offset
+
+                return (unpacked * scale.unsqueeze(-1).float()).reshape(shape)
+
+            if quant_method == "bitnet":
+                for name in self.model_tensors.keys():
+                    if name.endswith(".weight_scale"):
+                        weight_name = name.removesuffix("_scale")
+                        w = self.model_tensors[weight_name]
+                        s = self.model_tensors[name]
+                        self.model_tensors[weight_name] = lambda w=w, s=s: dequant_bitnet(w(), s())
+                        tensors_to_remove.append(name)
+            elif quant_method == "fp8":
+                block_size = quant_config.get("weight_block_size")
+                for name in self.model_tensors.keys():
+                    if name.endswith(".weight_scale_inv"):
+                        weight_name = name.removesuffix("_scale_inv")
+                        w = self.model_tensors[weight_name]
+                        s = self.model_tensors[name]
+                        self.model_tensors[weight_name] = lambda w=w, s=s, bs=block_size: dequant_simple(w(), s(), bs)
+                        tensors_to_remove.append(name)
+            elif quant_method == "gptq":
+                for name in self.model_tensors.keys():
+                    if name.endswith(".qweight"):
+                        base_name = name.removesuffix(".qweight")
+                        g_idx = self.model_tensors[base_name + ".g_idx"]
+                        qweight = self.model_tensors[base_name + ".qweight"]
+                        qzeros = self.model_tensors[base_name + ".qzeros"]
+                        scales = self.model_tensors[base_name + ".scales"]
+                        new_tensors[base_name + ".weight"] = (
+                            lambda g=g_idx, z=qzeros, w=qweight, s=scales: dequant_gptq(
+                                g(), w(), z(), s()
+                            )
+                        )
+                        tensors_to_remove += [
+                            base_name + n
+                            for n in (
+                                ".g_idx",
+                                ".qzeros",
+                                ".qweight",
+                                ".scales",
+                            )
+                        ]
+            elif quant_method == "compressed-tensors":
+                quant_format = quant_config["format"]
+                groups = quant_config["config_groups"]
+                if len(groups) > 1:
+                    raise NotImplementedError("Can't handle multiple config groups for compressed-tensors yet")
+                weight_config = tuple(groups.values())[0]["weights"]
+
+                if quant_format == "float-quantized" or quant_format == "int-quantized" or quant_format == "naive-quantized":
+                    block_size = weight_config.get("block_structure", None)
+                    strategy = weight_config.get("strategy")
+                    assert strategy == "channel" or strategy == "block"
+                    assert weight_config.get("group_size") is None  # didn't find a model using this yet
+                    for name in self.model_tensors.keys():
+                        if name.endswith(".weight_scale"):
+                            weight_name = name.removesuffix("_scale")
+                            w = self.model_tensors[weight_name]
+                            s = self.model_tensors[name]
+                            self.model_tensors[weight_name] = lambda w=w, s=s: dequant_simple(w(), s(), block_size)
+                            tensors_to_remove.append(name)
+                elif quant_format == "pack-quantized":
+                    assert weight_config.get("strategy") == "group"
+                    assert weight_config.get("type", "int") == "int"
+                    num_bits = weight_config.get("num_bits")
+                    group_size = weight_config.get("group_size")
+                    assert isinstance(num_bits, int)
+                    assert isinstance(group_size, int)
+                    for name in self.model_tensors.keys():
+                        if name.endswith(".weight_packed"):
+                            base_name = name.removesuffix("_packed")
+                            w = self.model_tensors[name]
+                            scale = self.model_tensors[base_name + "_scale"]
+                            shape = self.model_tensors[base_name + "_shape"]
+                            zero_point = self.model_tensors.get(base_name + "_zero_point", lambda: None)
+                            new_tensors[base_name] = (
+                                lambda w=w, scale=scale, shape=shape, zero_point=zero_point: dequant_packed(
+                                    w(), scale(), shape(), zero_point(), num_bits, group_size,
+                                )
+                            )
+                            tensors_to_remove += [base_name + n for n in ("_packed", "_shape", "_scale")]
+                            if (base_name + "_zero_point") in self.model_tensors:
+                                tensors_to_remove.append(base_name + "_zero_point")
+                else:
+                    raise NotImplementedError(f"Quant format {quant_format!r} for method {quant_method!r} is not yet supported")
             else:
-                raise ValueError("Mismatch between weight map and model parts for tensor names:\n"
-                                 f"Missing tensors: {missing}\n"
-                                 f"Extra tensors: {extra}")
+                raise NotImplementedError(f"Quant method is not yet supported: {quant_method!r}")
+
+        for name in tensors_to_remove:
+            if name in self.model_tensors:
+                del self.model_tensors[name]
+
+        for name, value in new_tensors.items():
+            self.model_tensors[name] = value
+
+    def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
+        for name, gen in self.model_tensors.items():
+            yield name, gen()
 
     def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str:
         if key not in gguf.MODEL_TENSORS[self.model_arch]:
@@ -302,10 +533,6 @@ class ModelBase:
                 # data = data_torch.squeeze().numpy()
                 data = data_torch.numpy()
 
-                # if data ends up empty, it means data_torch was a scalar tensor -> restore
-                if len(data.shape) == 0:
-                    data = data_torch.numpy()
-
                 n_dims = len(data.shape)
                 data_qtype: gguf.GGMLQuantizationType | bool = self.tensor_force_quant(name, new_name, bid, n_dims)
 
@@ -592,6 +819,21 @@ class TextModel(ModelBase):
         if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
             self.gguf_writer.add_expert_used_count(n_experts_used)
             logger.info(f"gguf: experts used count = {n_experts_used}")
+        if (n_expert_groups := self.hparams.get("n_group")) is not None:
+            self.gguf_writer.add_expert_group_count(n_expert_groups)
+            logger.info(f"gguf: expert groups count = {n_expert_groups}")
+        if (n_group_used := self.hparams.get("topk_group")) is not None:
+            self.gguf_writer.add_expert_group_used_count(n_group_used)
+            logger.info(f"gguf: expert groups used count = {n_group_used}")
+
+        if (score_func := self.find_hparam(["score_function", "scoring_func", "score_func"], optional=True)) is not None:
+            if score_func == "sigmoid":
+                self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
+            elif score_func == "softmax":
+                self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
+            else:
+                raise ValueError(f"Unsupported expert score gating function value: {score_func}")
+            logger.info(f"gguf: expert score gating function = {score_func}")
 
         if (head_dim := self.hparams.get("head_dim")) is not None:
             self.gguf_writer.add_key_length(head_dim)
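
The hunk above only records which expert gating function a MoE checkpoint uses; the runtime applies it during routing. As a toy illustration of the difference between the two recorded values (this is not llama.cpp's routing code): softmax produces scores that sum to 1 across experts, while sigmoid scores each expert independently, which changes how the selected experts' outputs are weighted even when the top-k ranking is identical.

```python
import math

def route(logits: list[float], top_k: int, score_func: str) -> list[int]:
    """Toy expert selection under the two gating functions named above."""
    if score_func == "softmax":
        m = max(logits)
        e = [math.exp(x - m) for x in logits]
        scores = [x / sum(e) for x in e]          # normalized: sums to 1
    elif score_func == "sigmoid":
        scores = [1 / (1 + math.exp(-x)) for x in logits]  # independent per expert
    else:
        raise ValueError(f"Unsupported expert score gating function value: {score_func}")
    # both functions are monotonic, so the chosen experts are the same;
    # the difference is in the gate weights applied downstream
    return sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_k]

print(route([0.3, 2.0, -1.0, 1.5], top_k=2, score_func="sigmoid"))  # [1, 3]
print(route([0.3, 2.0, -1.0, 1.5], top_k=2, score_func="softmax"))  # [1, 3]
```
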
@@ -739,6 +981,9 @@
         if chkhsh == "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c":
             # ref: https://huggingface.co/Qwen/Qwen3-Embedding-0.6B
             res = "qwen2"
+        if chkhsh == "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273":
+            # ref: https://huggingface.co/alvarobartt/grok-2-tokenizer
+            res = "grok-2"
         if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
             # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
             res = "llama-bpe"
@@ -889,6 +1134,18 @@
         if chkhsh == "a1e163ecab2e718a4c829d1148b6e86824ec36163bb71941c3dca9cd5ac25756":
             # ref: https://huggingface.co/JetBrains/Mellum-4b-base
             res = "mellum"
+        if chkhsh == "49fc0303c9e0d2c2c565c510f64b2d9b271276acdcdadff733249eda9f7d59df":
+            # ref: https://huggingface.co/arcee-ai/Trinity-Tokenizer
+            res = "afmoe"
+        if chkhsh == "9b1be57e70d20d9501b2b3186e792d81181ae36ada3903c26f9fea418cf87206":
+            # ref: https://huggingface.co/inclusionAI/Ling-mini-base-2.0
+            res = "bailingmoe2"
+        if chkhsh == "53e325976a6e142379c19b09afcae354f2f496f147afa8f9e189a33fe4e3024e":
+            # ref: https://huggingface.co/ibm-granite/granite-docling-258M
+            res = "granite-docling"
+        if chkhsh == "f4f37b6c8eb9ea29b3eac6bb8c8487c5ab7885f8d8022e67edc1c68ce8403e95":
+            # ref: https://huggingface.co/MiniMaxAI/MiniMax-M2
+            res = "minimax-m2"
 
         if res is None:
             logger.warning("\n")
@@ -1323,6 +1580,7 @@ class MmprojModel(ModelBase):
         self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count)
 
         # load preprocessor config
+        self.preprocessor_config = {}
         if not self.is_mistral_format:
             with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
                 self.preprocessor_config = json.load(f)
@@ -1337,6 +1595,17 @@
     def set_type(self):
         self.gguf_writer.add_type(gguf.GGUFType.MMPROJ)
 
+    def prepare_metadata(self, vocab_only: bool):
+        super().prepare_metadata(vocab_only=vocab_only)
+
+        output_type: str = self.ftype.name.partition("_")[2]
+
+        if self.fname_out.is_dir():
+            fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=output_type, model_type=None)
+            self.fname_out = self.fname_out / f"mmproj-{fname_default}.gguf"
+        else:
+            self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type)
+
     def set_gguf_parameters(self):
         self.gguf_writer.add_file_type(self.ftype)
 
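
A quick sanity check on the `output_type` extraction added above, assuming `ftype` names of the `MOSTLY_*` form as in gguf-py's `LlamaFileType`: `str.partition` splits on the first separator only, so quantization suffixes that themselves contain underscores survive intact.

```python
# partition("_") splits on the FIRST underscore only, so suffixes like
# Q8_0 are preserved (names assumed from gguf-py's LlamaFileType enum).
assert "MOSTLY_F16".partition("_")[2] == "F16"
assert "MOSTLY_Q8_0".partition("_")[2] == "Q8_0"
print("output_type suffix extraction ok")
```
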
@@ -1345,16 +1614,17 @@
         self.gguf_writer.add_vision_projection_dim(self.n_embd_text)
 
         # vision config
-        self.gguf_writer.add_vision_image_size(self.find_vparam(["image_size"]))
+        self.image_size = self.find_vparam(["image_size"])
+        self.gguf_writer.add_vision_image_size(self.image_size)
         self.gguf_writer.add_vision_patch_size(self.find_vparam(["patch_size"]))
         self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size"]))
         self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size"]))
         self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys))
-        self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads"]))
+        self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads", "num_heads"]))
 
         # preprocessor config
-        image_mean = DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"]
-        image_std = DATASET_STD if self.is_mistral_format else self.preprocessor_config["image_std"]
+        image_mean = _MISTRAL_COMMON_DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"]
+        image_std = _MISTRAL_COMMON_DATASET_STD if self.is_mistral_format else self.preprocessor_config["image_std"]
 
         self.gguf_writer.add_vision_image_mean(image_mean)
         self.gguf_writer.add_vision_image_std(image_std)
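
The mean/std written above are consumed at inference time for per-channel image normalization. A small sketch of how such constants are typically applied (the fallback tuples are the ones hard-coded in the import hunk earlier; the image here is dummy data, and this is an illustration rather than the package's preprocessing code):

```python
import numpy as np

# Fallback constants from the guarded-import hunk above.
DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
DATASET_STD = (0.26862954, 0.26130258, 0.27577711)

image = np.random.rand(224, 224, 3).astype(np.float32)  # HWC, values in [0, 1]
normalized = (image - np.array(DATASET_MEAN)) / np.array(DATASET_STD)
print(normalized.mean(axis=(0, 1)))  # roughly centered near 0 per channel
```
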
@@ -1403,11 +1673,9 @@ class GPTNeoXModel(TextModel):
     model_arch = gguf.MODEL_ARCH.GPTNEOX
 
     def set_gguf_parameters(self):
-        block_count = self.hparams["num_hidden_layers"]
-
         self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
-        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
         self.gguf_writer.add_rope_dimension_count(
             int(self.hparams["rotary_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])),
@@ -1465,7 +1733,7 @@ class BloomModel(TextModel):
         self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
         self.gguf_writer.add_embedding_length(n_embed)
         self.gguf_writer.add_feed_forward_length(4 * n_embed)
-        self.gguf_writer.add_block_count(self.hparams["n_layer"])
+        self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_head_count(n_head)
         self.gguf_writer.add_head_count_kv(n_head)
         self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
@@ -1528,10 +1796,9 @@ class MPTModel(TextModel):
             self.gguf_writer.add_unk_token_id(0)
 
     def set_gguf_parameters(self):
-        block_count = self.hparams["n_layers"]
         self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
         self.gguf_writer.add_embedding_length(self.hparams["d_model"])
-        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_feed_forward_length(4 * self.hparams["d_model"])
         self.gguf_writer.add_head_count(self.hparams["n_heads"])
         if kv_n_heads := self.hparams["attn_config"].get("kv_n_heads"):
@@ -1564,7 +1831,6 @@ class OrionModel(TextModel):
         self._set_vocab_sentencepiece()
 
     def set_gguf_parameters(self):
-        block_count = self.hparams["num_hidden_layers"]
         head_count = self.hparams["num_attention_heads"]
         head_count_kv = self.hparams.get("num_key_value_heads", head_count)
 
@@ -1582,7 +1848,7 @@
         self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
         self.gguf_writer.add_context_length(ctx_length)
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
-        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
         self.gguf_writer.add_head_count(head_count)
         self.gguf_writer.add_head_count_kv(head_count_kv)
@@ -1599,7 +1865,6 @@ class BaichuanModel(TextModel):
         self._set_vocab_sentencepiece()
 
     def set_gguf_parameters(self):
-        block_count = self.hparams["num_hidden_layers"]
         head_count = self.hparams["num_attention_heads"]
         head_count_kv = self.hparams.get("num_key_value_heads", head_count)
 
@@ -1616,7 +1881,7 @@
         self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
         self.gguf_writer.add_context_length(ctx_length)
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
-        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
         self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
         self.gguf_writer.add_head_count(head_count)
@@ -1723,7 +1988,6 @@ class XverseModel(TextModel):
         special_vocab.add_to_gguf(self.gguf_writer)
 
     def set_gguf_parameters(self):
-        block_count = self.hparams["num_hidden_layers"]
         head_count = self.hparams["num_attention_heads"]
         head_count_kv = self.hparams.get("num_key_value_heads", head_count)
 
@@ -1740,7 +2004,7 @@
         self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
         self.gguf_writer.add_context_length(ctx_length)
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
-        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
         self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
         self.gguf_writer.add_head_count(head_count)
@@ -1783,10 +2047,6 @@ class FalconModel(TextModel):
     model_arch = gguf.MODEL_ARCH.FALCON
 
     def set_gguf_parameters(self):
-        block_count = self.hparams.get("num_hidden_layers")
-        if block_count is None:
-            block_count = self.hparams["n_layer"]  # old name
-
         n_head = self.hparams.get("num_attention_heads")
         if n_head is None:
             n_head = self.hparams["n_head"]  # old name
@@ -1799,7 +2059,7 @@
         self.gguf_writer.add_tensor_data_layout("jploski")  # qkv tensor transform
         self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
         self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"])
-        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_head_count(n_head)
         self.gguf_writer.add_head_count_kv(n_head_kv)
         self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
@@ -1837,12 +2097,10 @@ class StarCoderModel(TextModel):
     model_arch = gguf.MODEL_ARCH.STARCODER
 
     def set_gguf_parameters(self):
-        block_count = self.hparams["n_layer"]
-
         self.gguf_writer.add_context_length(self.hparams["n_positions"])
         self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
         self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
-        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_head_count(self.hparams["n_head"])
         self.gguf_writer.add_head_count_kv(1)
         self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
@@ -1872,14 +2130,12 @@ class RefactModel(TextModel):
1872
2130
  multiple_of = 256
1873
2131
  ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
1874
2132
 
1875
- block_count = self.hparams["n_layer"]
1876
-
1877
2133
  # refact uses Alibi. So this is from config.json which might be used by training.
1878
2134
  self.gguf_writer.add_context_length(self.hparams["n_positions"])
1879
2135
  self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
1880
2136
 
1881
2137
  self.gguf_writer.add_feed_forward_length(ff_dim)
1882
- self.gguf_writer.add_block_count(block_count)
2138
+ self.gguf_writer.add_block_count(self.block_count)
1883
2139
  self.gguf_writer.add_head_count(self.hparams["n_head"])
1884
2140
  self.gguf_writer.add_head_count_kv(1)
1885
2141
  self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
@@ -1926,11 +2182,10 @@ class StableLMModel(TextModel):
1926
2182
 
1927
2183
  def set_gguf_parameters(self):
1928
2184
  hparams = self.hparams
1929
- block_count = hparams["num_hidden_layers"]
1930
2185
 
1931
2186
  self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
1932
2187
  self.gguf_writer.add_embedding_length(hparams["hidden_size"])
1933
- self.gguf_writer.add_block_count(block_count)
2188
+ self.gguf_writer.add_block_count(self.block_count)
1934
2189
  self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
1935
2190
  rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"])
1936
2191
  self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
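The hunks above all make the same mechanical change: each model subclass used to look up its own layer-count hparam and pass a local `block_count` to `add_block_count`, and now relies on a shared `self.block_count` attribute instead. A minimal sketch of the base-class pattern this refactor assumes (the exact key list in the real `ModelBase` may differ):

```python
# Sketch only: illustrates how a base class can resolve the layer count once
# under any of the common config.json names, so subclasses stop duplicating it.
class ModelBase:
    def __init__(self, hparams: dict):
        self.hparams = hparams
        self.block_count = self.find_hparam(
            ["n_layers", "num_hidden_layers", "n_layer", "num_layers"])

    def find_hparam(self, keys: list[str], optional: bool = False):
        for key in keys:
            if key in self.hparams:
                return self.hparams[key]
        if optional:
            return None
        raise KeyError(f"could not find any of: {keys}")
```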
@@ -2023,6 +2278,9 @@ class LlamaModel(TextModel):
  self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)

  def _set_vocab_mistral(self):
+ if not _mistral_common_installed:
+ raise ImportError(_mistral_import_error_msg)
+
  vocab = MistralVocab(self.dir_model)
  logger.info(
  f"Converting tokenizer {vocab.tokenizer_type} of size {vocab.vocab_size}."
@@ -2273,24 +2531,93 @@ class ArceeModel(LlamaModel):
  self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])


+ @ModelBase.register("AfmoeForCausalLM")
+ class AfmoeModel(LlamaModel):
+ model_arch = gguf.MODEL_ARCH.AFMOE
+
+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+
+ # MoE parameters
+ if (n_experts := self.hparams.get("num_experts")) is not None:
+ self.gguf_writer.add_expert_count(n_experts)
+ if (n_shared_experts := self.hparams.get("num_shared_experts")) is not None:
+ self.gguf_writer.add_expert_shared_count(n_shared_experts)
+ if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
+ self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
+ if (n_dense_layers := self.hparams.get("num_dense_layers")) is not None:
+ self.gguf_writer.add_leading_dense_block_count(n_dense_layers)
+
+ # Route normalization and scaling
+ if (route_norm := self.hparams.get("route_norm")) is not None:
+ self.gguf_writer.add_expert_weights_norm(route_norm)
+ if (route_scale := self.hparams.get("route_scale")) is not None:
+ self.gguf_writer.add_expert_weights_scale(route_scale)
+
+ # Sliding window attention
+ if (sliding_window := self.hparams.get("sliding_window")) is not None:
+ self.gguf_writer.add_sliding_window(sliding_window)
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ # Handle expert weights - they're already merged in the HF format
+ # process the experts separately
+ if name.find("mlp.experts") != -1:
+ n_experts = self.hparams["num_experts"]
+ assert bid is not None
+
+ if self._experts is None:
+ self._experts = [{} for _ in range(self.block_count)]
+
+ self._experts[bid][name] = data_torch
+
+ if len(self._experts[bid]) >= n_experts * 3:
+ tensors: list[tuple[str, Tensor]] = []
+
+ # merge the experts into a single 3d tensor
+ for w_name in ["gate_proj", "up_proj", "down_proj"]:
+ datas: list[Tensor] = []
+
+ for xid in range(n_experts):
+ ename_to_retrieve = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+ datas.append(self._experts[bid][ename_to_retrieve])
+ del self._experts[bid][ename_to_retrieve]
+
+ data_torch = torch.stack(datas, dim=0)
+ merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+ new_name = self.map_tensor_name(merged_name)
+ tensors.append((new_name, data_torch))
+
+ return tensors
+ else:
+ return []
+
+ if name.endswith(".expert_bias"):
+ name = name.replace(".expert_bias", ".expert_bias.bias")
+
+ return [(self.map_tensor_name(name), data_torch)]
+
+
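The new `AfmoeModel.modify_tensors` above buffers each layer's per-expert 2D weights until all `n_experts * 3` projections have arrived, then stacks each projection family into one 3D tensor. The shape effect in isolation, with small hypothetical sizes:

```python
import torch

n_experts, n_ff, n_embd = 4, 16, 32
# one 2D weight per expert, as they appear in the HF checkpoint
expert_weights = [torch.randn(n_ff, n_embd) for _ in range(n_experts)]

# what the converter does once a layer's experts are complete:
merged = torch.stack(expert_weights, dim=0)
assert merged.shape == (n_experts, n_ff, n_embd)  # one 3D tensor per projection
```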
  @ModelBase.register(
  "LlavaForConditionalGeneration", # pixtral
  "Mistral3ForConditionalGeneration", # mistral small 3.1
  )
  class LlavaVisionModel(MmprojModel):
  img_break_tok_id = -1
+ use_break_tok = True

  def __init__(self, *args, **kwargs):
  super().__init__(*args, **kwargs)
  if self.hparams.get("model_type") == "pixtral":
  # layer_norm_eps is not in config.json, it is hard-coded in modeling_pixtral.py
  self.hparams["layer_norm_eps"] = self.hparams.get("layer_norm_eps", 1e-5)
- self.img_break_tok_id = self.get_token_id("[IMG_BREAK]")
+ if self.use_break_tok:
+ self.img_break_tok_id = self.get_token_id("[IMG_BREAK]")
  elif self.is_mistral_format:
  # hparams is already vision config here so norm_eps is only defined in global_config.
  self.hparams["norm_eps"] = self.global_config.get("norm_eps", None)
  assert self.hparams["norm_eps"] is not None, "norm_eps not found in params.json"
- self.img_break_tok_id = self.find_vparam(["image_break_token_id"])
+ if self.use_break_tok:
+ self.img_break_tok_id = self.find_vparam(["image_break_token_id"])
  else:
  raise ValueError(f"Unsupported model type: {self.hparams['model_type']}")
  logger.info(f"Image break token id: {self.img_break_tok_id}")
@@ -2376,6 +2703,10 @@ class SmolVLMModel(MmprojModel):
  self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("scale_factor", 2))
  self.gguf_writer.add_vision_use_gelu(True)

+ # Add the preprocessor longest edge size
+ preproc_image_size = self.preprocessor_config.get("size", {}).get("longest_edge", self.image_size)
+ self.gguf_writer.add_vision_preproc_image_size(preproc_image_size)
+
  def tensor_force_quant(self, name, new_name, bid, n_dims):
  if ".embeddings." in name:
  return gguf.GGMLQuantizationType.F32
@@ -2391,7 +2722,10 @@ class SmolVLMModel(MmprojModel):
  return [] # skip other tensors


- @ModelBase.register("Llama4ForConditionalGeneration")
+ @ModelBase.register(
+ "Llama4ForConditionalGeneration",
+ "Llama4ForCausalLM",
+ )
  class Llama4Model(LlamaModel):
  model_arch = gguf.MODEL_ARCH.LLAMA4
  undo_permute = False
@@ -2409,6 +2743,10 @@ class Llama4Model(LlamaModel):
  super().set_gguf_parameters()
  self.gguf_writer.add_interleave_moe_layer_step(self.hparams["interleave_moe_layer_step"])
  self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size_moe"])
+ if "layer_types" in self.hparams:
+ if all(lt == "full_attention" for lt in self.hparams["layer_types"]):
+ # all layers are full attention (for MobileLLM), disable swa
+ self.gguf_writer.add_sliding_window(0)

  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
  if name.startswith("language_model."):
@@ -2686,12 +3024,20 @@ class BitnetModel(TextModel):
  yield (new_name, data_torch)


- @ModelBase.register("GrokForCausalLM")
+ @ModelBase.register("GrokForCausalLM", "Grok1ForCausalLM")
  class GrokModel(TextModel):
  model_arch = gguf.MODEL_ARCH.GROK

  def set_vocab(self):
- self._set_vocab_sentencepiece()
+ if (self.dir_model / 'tokenizer.model').is_file():
+ self._set_vocab_sentencepiece()
+ return
+
+ if not (self.dir_model / 'tokenizer.json').is_file() or not (self.dir_model / 'chat_template.jinja').is_file():
+ logger.error('Error: Missing vocab and chat template, download files from https://huggingface.co/alvarobartt/grok-2-tokenizer')
+ sys.exit(1)
+
+ self._set_vocab_gpt2()

  def __init__(self, *args, **kwargs):
  super().__init__(*args, **kwargs)
@@ -2699,11 +3045,46 @@ class GrokModel(TextModel):
  def set_gguf_parameters(self):
  super().set_gguf_parameters()

- _experts: list[dict[str, Tensor]] | None = None
+ self.gguf_writer.add_attn_logit_softcapping(self.hparams.get("attn_logit_softcapping", 30.0))
+ self.gguf_writer.add_router_logit_softcapping(self.hparams.get("router_logit_softcapping", 30.0))
+ if (final_logit_softcap := self.hparams.get("final_logit_softcapping")):
+ self.gguf_writer.add_final_logit_softcapping(final_logit_softcap)
+
+ if (rope_dim := self.hparams.get("head_dim")) is None:
+ rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+
+ if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
+ self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
+
+ # Treat "original" as "yarn", seems to have been a mistake
+ if self.hparams.get("rope_type") in ("yarn", "original"):
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+ self.gguf_writer.add_rope_scaling_factor(self.hparams["scaling_factor"])
+ self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["original_max_position_embeddings"])
+ self.gguf_writer.add_rope_scaling_yarn_ext_factor(self.hparams["extrapolation_factor"])
+ self.gguf_writer.add_rope_scaling_yarn_attn_factor(self.hparams["attn_factor"])
+ self.gguf_writer.add_rope_scaling_yarn_beta_fast(self.hparams["beta_fast"])
+ self.gguf_writer.add_rope_scaling_yarn_beta_slow(self.hparams["beta_slow"])
+
+ if temp_len := self.hparams.get("attn_temperature_len"):
+ self.gguf_writer.add_attn_temperature_length(temp_len)
+
+ self.gguf_writer.add_attn_output_scale(self.hparams.get("attn_output_multiplier", rope_dim**-0.5))
+ self.gguf_writer.add_embedding_scale(self.hparams["embedding_multiplier_scale"])
+ self.gguf_writer.add_logit_scale(self.hparams["output_multiplier_scale"])
+
+ _experts: list[dict[str, list[Tensor]]] | None = None
+ _cur_expert = ""

  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ tensors: list[tuple[str, Tensor]] = []
+ is_expert = ".moe." in name or ".block_sparse_moe.experts." in name
+
+ if not is_expert:
+ tensors.append((self.map_tensor_name(name), data_torch))
+
  # process the experts separately
- if name.find(".moe.") != -1:
+ if is_expert or self._cur_expert:
  n_experts = self.hparams["num_local_experts"]

  assert bid is not None
@@ -2711,32 +3092,41 @@ class GrokModel(TextModel):
  if self._experts is None:
  self._experts = [{} for _ in range(self.block_count)]

- self._experts[bid][name] = data_torch
-
- if len(self._experts[bid]) >= n_experts * 3:
- tensors: list[tuple[str, Tensor]] = []
+ # concatenate split tensors
+ if name in self._experts[bid]:
+ self._cur_expert = name
+ self._experts[bid][name].append(data_torch)
+ return []
+ elif is_expert:
+ self._cur_expert = name
+ self._experts[bid][name] = [data_torch]
+ return []
+ else:
+ self._cur_expert = ""

- # merge the experts into a single 3d tensor
- for wid in ["linear", "linear_1", "linear_v"]:
- datas: list[Tensor] = []
+ for bid in range(self.block_count):
+ if len(self._experts[bid]) >= n_experts * 3:
+ # merge the experts into a single 3d tensor
+ for wid in [("linear", "w1", 0), ("linear_1", "w2", 1), ("linear_v", "w3", 0)]:
+ datas: list[Tensor] = []

- for xid in range(n_experts):
- ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid}.weight"
- datas.append(self._experts[bid][ename])
- del self._experts[bid][ename]
+ for xid in range(n_experts):
+ ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid[0]}.weight"
+ if ename not in self._experts[bid]:
+ ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid[1]}.weight"
+ tensor_list = self._experts[bid][ename]
+ datas.append(torch.cat(tensor_list, dim=wid[2]) if len(tensor_list) > 1 else tensor_list[0])
+ del self._experts[bid][ename]

- data_torch = torch.stack(datas, dim=0)
+ data_torch = torch.stack(datas, dim=0)

- merged_name = f"transformer.decoder_layer.{bid}.moe.{wid}.weight"
+ merged_name = f"transformer.decoder_layer.{bid}.moe.{wid[0]}.weight"

- new_name = self.map_tensor_name(merged_name)
+ new_name = self.map_tensor_name(merged_name)

- tensors.append((new_name, data_torch))
- return tensors
- else:
- return []
+ yield (new_name, data_torch)

- return [(self.map_tensor_name(name), data_torch)]
+ yield from tensors
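Two behavioural changes in the Grok hunk above: expert weights may now arrive split into several shards under the same tensor name, so each name accumulates a list that is re-joined with `torch.cat` along a per-projection axis (`wid[2]`) before the usual `torch.stack`; and because merging is deferred until a non-expert tensor signals the end of the expert block, the method `yield`s results instead of returning one list. A shape-level sketch of the cat-then-stack step, with hypothetical sizes:

```python
import torch

# two shards of one expert's weight, split along dim 0
shards = [torch.randn(8, 16), torch.randn(8, 16)]
whole = torch.cat(shards, dim=0) if len(shards) > 1 else shards[0]
assert whole.shape == (16, 16)

# once every expert is re-assembled, they are stacked as before
experts = [whole.clone() for _ in range(4)]
assert torch.stack(experts, dim=0).shape == (4, 16, 16)
```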
  @ModelBase.register("DbrxForCausalLM")
@@ -2746,7 +3136,7 @@ class DbrxModel(TextModel):
2746
3136
  def set_gguf_parameters(self):
2747
3137
  ffn_config = self.hparams["ffn_config"]
2748
3138
  attn_config = self.hparams["attn_config"]
2749
- self.gguf_writer.add_block_count(self.hparams["n_layers"])
3139
+ self.gguf_writer.add_block_count(self.block_count)
2750
3140
 
2751
3141
  self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
2752
3142
  self.gguf_writer.add_embedding_length(self.hparams["d_model"])
@@ -2948,7 +3338,7 @@ class QwenModel(TextModel):
2948
3338
 
2949
3339
  def set_gguf_parameters(self):
2950
3340
  self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
2951
- self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
3341
+ self.gguf_writer.add_block_count(self.block_count)
2952
3342
  self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
2953
3343
  self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
2954
3344
  self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
@@ -3605,7 +3995,43 @@ class Qwen2MoeModel(TextModel):
3605
3995
  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
3606
3996
  # process the experts separately
3607
3997
  name = name.replace("language_model.", "") # InternVL
3608
- if name.startswith("mlp") or name.startswith("vision_model") or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector"):
3998
+
3999
+ # handle aggregated expert tensors
4000
+ # GGUF stores dimensions reversed from PyTorch, so:
4001
+ # PyTorch (A,B,C) -> GGUF writes [C,B,A] -> GGML reads ne={C,B,A}
4002
+ # Input shapes from HF: (n_expert, n_ff_exp, n_embd) or (n_expert, n_embd, n_ff_exp)
4003
+ # Expected GGML ne: {n_embd, n_ff_exp, n_expert} for gate/up, {n_ff_exp, n_embd, n_expert} for down
4004
+ if name.endswith("mlp.experts.down_proj") or name.endswith("mlp.experts.down_proj.weight"):
4005
+ mapped = f"{name}.weight" if not name.endswith(".weight") else name
4006
+ # Input: (n_expert=128, n_ff_exp=768, n_embd=2048)
4007
+ # Want GGML ne: {n_ff_exp, n_embd, n_expert} = {768, 2048, 128}
4008
+ # Need PyTorch: (128, 2048, 768) [reversed of GGML]
4009
+ # So: permute(0, 2, 1): (128, 768, 2048) -> (128, 2048, 768)
4010
+ permuted = data_torch.permute(0, 2, 1).contiguous()
4011
+ return [(self.map_tensor_name(mapped), permuted)]
4012
+
4013
+ if name.endswith("mlp.experts.gate_up_proj") or name.endswith("mlp.experts.gate_up_proj.weight"):
4014
+ if data_torch.ndim < 3 or data_torch.shape[-1] % 2 != 0:
4015
+ raise ValueError(f"Unexpected gate_up_proj shape for {name}: {tuple(data_torch.shape)}")
4016
+ split_dim = data_torch.shape[-1] // 2
4017
+ gate = data_torch[..., :split_dim].contiguous()
4018
+ up = data_torch[..., split_dim:].contiguous()
4019
+ # Input gate/up: (n_expert=128, n_embd=2048, n_ff_exp=768)
4020
+ # Want GGML ne: {n_embd, n_ff_exp, n_expert} = {2048, 768, 128}
4021
+ # Need PyTorch: (128, 768, 2048) [reversed of GGML]
4022
+ # So: permute(0, 2, 1): (128, 2048, 768) -> (128, 768, 2048)
4023
+ base_name = name.removesuffix(".weight")
4024
+ base = base_name.rsplit('.', 1)[0]
4025
+ mapped_gate = f"{base}.gate_proj.weight"
4026
+ mapped_up = f"{base}.up_proj.weight"
4027
+ perm_gate = gate.permute(0, 2, 1).contiguous()
4028
+ perm_up = up.permute(0, 2, 1).contiguous()
4029
+ return [
4030
+ (self.map_tensor_name(mapped_gate), perm_gate),
4031
+ (self.map_tensor_name(mapped_up), perm_up),
4032
+ ]
4033
+
4034
+ if name.startswith("mlp") or name.startswith("vision_model") or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector") or name.startswith("model.visual"):
3609
4035
  # skip visual tensors
3610
4036
  return []
3611
4037
  if name.find("experts") != -1:
@@ -3656,11 +4082,29 @@ class Qwen2MoeModel(TextModel):
  class Qwen3Model(Qwen2Model):
  model_arch = gguf.MODEL_ARCH.QWEN3

+ # extra logic for rerank models
+ is_rerank: bool = False
+ is_tied_embeddings: bool = False
+ token_false_id: int | None = None
+ token_true_id: int | None = None
+
  def __init__(self, *args, **kwargs):
  super().__init__(*args, **kwargs)
+
+ # track for intern-s1-mini
  hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
  self.origin_hf_arch = hparams.get('architectures', [None])[0]

+ # a bit hacky, but currently the only way to detect if this is a rerank model
+ # ref: https://huggingface.co/Qwen/Qwen3-Reranker-0.6B
+ readme_path = self.dir_model / "README.md"
+ readme_text = ""
+ if readme_path.exists():
+ with readme_path.open("r", encoding="utf-8") as f:
+ readme_text = f.read()
+ if "# Qwen3-Reranker" in readme_text:
+ self._find_rerank_config()
+
  def set_vocab(self):
  # deal with intern-s1-mini
  if self.origin_hf_arch == 'InternS1ForConditionalGeneration':
@@ -3669,6 +4113,57 @@ class Qwen3Model(Qwen2Model):

  super().set_vocab()

+ def _find_rerank_config(self):
+ from transformers import AutoTokenizer
+ tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+
+ self.is_rerank = True
+ self.is_tied_embeddings = self.hparams.get("tie_word_embeddings", False)
+ self.token_false_id = tokenizer.convert_tokens_to_ids("no")
+ self.token_true_id = tokenizer.convert_tokens_to_ids("yes")
+ self.sep_token_id = tokenizer.convert_tokens_to_ids("|")
+
+ assert self.token_false_id is not None and self.token_true_id is not None
+
+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+ if self.is_rerank:
+ self.gguf_writer.add_pooling_type(gguf.PoolingType.RANK)
+ self.gguf_writer.add_classifier_output_labels(["yes", "no"])
+ self.gguf_writer.add_chat_template([{
+ "name": "rerank",
+ "template": "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n"
+ "<|im_start|>user\n<Instruct>: Given a web search query, retrieve relevant passages that answer the query\n<Query>: {query}\n<Document>: {document}<|im_end|>\n"
+ "<|im_start|>assistant\n<think>\n\n</think>\n\n"
+ }])
+
+ def _get_cls_out_tensor(self, data_torch: Tensor) -> Tensor:
+ # extract "yes" and "no" tokens from the output lm_head tensor
+ false_row = data_torch[self.token_false_id]
+ true_row = data_torch[self.token_true_id]
+ return torch.stack([true_row, false_row], dim=0)
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ if "model.vision_" in name:
+ # skip multimodal tensors
+ return []
+
+ if self.is_rerank:
+ is_tied_head = self.is_tied_embeddings and "embed_tokens" in name
+ is_real_head = not self.is_tied_embeddings and "lm_head" in name
+ if is_tied_head or is_real_head:
+ cls_out_head = (
+ gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.CLS_OUT] + ".weight",
+ self._get_cls_out_tensor(data_torch),
+ )
+ if is_tied_head:
+ embed = (self.map_tensor_name(name), data_torch)
+ return [cls_out_head, embed]
+ if is_real_head:
+ return [cls_out_head]
+
+ return super().modify_tensors(data_torch, name, bid)
+
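The rerank path above turns a causal LM head into a two-class classifier: it extracts only the "yes" and "no" token rows from `lm_head` (or from `embed_tokens` when embeddings are tied) and stores them as a 2×n_embd `cls.output` tensor, yes row first, matching `add_classifier_output_labels(["yes", "no"])`. In isolation, with hypothetical sizes and token ids:

```python
import torch

n_vocab, n_embd = 4096, 64
lm_head = torch.randn(n_vocab, n_embd)
token_true_id, token_false_id = 1234, 567  # hypothetical ids for "yes"/"no"

# row order must match the classifier output labels ["yes", "no"]
cls_out = torch.stack([lm_head[token_true_id], lm_head[token_false_id]], dim=0)
assert cls_out.shape == (2, n_embd)
```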

  @ModelBase.register("Qwen3MoeForCausalLM")
  class Qwen3MoeModel(Qwen2MoeModel):
@@ -3688,12 +4183,193 @@ class Qwen3MoeModel(Qwen2MoeModel):
  super().set_vocab()


+ @ModelBase.register("Qwen3VLForConditionalGeneration", "Qwen3VLMoeForConditionalGeneration")
+ class Qwen3VLVisionModel(MmprojModel):
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ assert self.hparams_vision is not None
+ # Compute image_size if not present
+ if "image_size" not in self.hparams_vision:
+ # For Qwen3VL/Qwen3VLMoe, compute from num_position_embeddings
+ num_pos = self.hparams_vision.get("num_position_embeddings", 2304)
+ patch_size = self.hparams_vision.get("patch_size", 16)
+ # num_position_embeddings = (image_size / patch_size) ** 2
+ # So image_size = sqrt(num_position_embeddings) * patch_size
+ image_size = int(num_pos**0.5 * patch_size)
+ self.hparams_vision["image_size"] = image_size
+
+ # Rename config values for compatibility
+ self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_heads")
+ self.hparams_vision["num_hidden_layers"] = self.hparams_vision.get("depth")
+
+ self.is_deepstack_layers = [False] * int(self.hparams_vision["num_hidden_layers"] or 0)
+ for idx in self.hparams_vision.get("deepstack_visual_indexes", []):
+ self.is_deepstack_layers[idx] = True
+
+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+ self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN3VL)
+ self.gguf_writer.add_vision_use_gelu(True)
+
+ if self.hparams_vision is not None:
+ merge_size = self.hparams_vision.get("spatial_merge_size")
+ if merge_size is not None:
+ self.gguf_writer.add_vision_spatial_merge_size(int(merge_size))
+
+ # Use text config's rms_norm_eps for vision attention layernorm eps
+ rms_norm_eps = self.global_config.get("text_config", {}).get("rms_norm_eps", 1e-6)
+ self.gguf_writer.add_vision_attention_layernorm_eps(rms_norm_eps)
+
+ if self.is_deepstack_layers:
+ self.gguf_writer.add_vision_is_deepstack_layers(self.is_deepstack_layers)
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ assert self.hparams_vision is not None
+ # Skip text model tensors - they go in the text model file
+ if name.startswith("model.language_model.") or name.startswith("lm_head."):
+ return []
+
+ if name.startswith("model.visual."):
+ name = name.replace("model.visual.", "visual.", 1)
+
+ if name.startswith("visual.deepstack_merger_list."):
+ prefix, rest = name.split(".", maxsplit=3)[2:]
+ # prefix is the layer index, convert to absolute clip layer index!
+ idx = self.hparams_vision.get("deepstack_visual_indexes", [])[int(prefix)]
+ target = rest
+
+ tensor_type: gguf.MODEL_TENSOR
+ if target.startswith("norm."):
+ tensor_type = gguf.MODEL_TENSOR.V_DS_NORM
+ suffix = target.split(".", 1)[1]
+ elif target.startswith("linear_fc1."):
+ tensor_type = gguf.MODEL_TENSOR.V_DS_FC1
+ suffix = target.split(".", 1)[1]
+ elif target.startswith("linear_fc2."):
+ tensor_type = gguf.MODEL_TENSOR.V_DS_FC2
+ suffix = target.split(".", 1)[1]
+ else:
+ raise ValueError(f"Unexpected deepstack tensor: {name}")
+
+ new_name = self.format_tensor_name(tensor_type, idx, suffix=f".{suffix}")
+ return [(new_name, data_torch)]
+
+ if name.startswith("visual.merger."):
+ suffix = name.split(".", 2)[2]
+ if suffix.startswith("linear_fc"):
+ fc_idx_str, tail = suffix.split(".", 1)
+ fc_num = int(fc_idx_str.replace("linear_fc", ""))
+ # Qwen3VL has linear_fc1 and linear_fc2
+ # Map to indices 0 and 2 (matching Qwen2VL which uses indices 0 and 2)
+ if fc_num == 1:
+ fc_idx = 0
+ elif fc_num == 2:
+ fc_idx = 2
+ else:
+ raise ValueError(f"unexpected fc index {fc_num} in {name}")
+ new_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, fc_idx, suffix=f".{tail}")
+ elif suffix.startswith("norm."):
+ new_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_POST_NORM, suffix=f".{suffix.split('.', 1)[1]}")
+ else:
+ raise ValueError(f"Unexpected merger tensor: {name}")
+ return [(new_name, data_torch)]
+
+ if name == "visual.patch_embed.proj.weight":
+ # split Conv3D into Conv2Ds along temporal dimension
+ c1, c2, kt, _, _ = data_torch.shape
+ del c1, c2
+ if kt != 2:
+ raise ValueError("Current implementation only supports temporal_patch_size of 2")
+ return [
+ (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight", data_torch[:, :, 0, ...]),
+ (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight.1", data_torch[:, :, 1, ...]),
+ ]
+
+ if name == "visual.patch_embed.proj.bias":
+ # Include the bias - it's used by the C++ code
+ return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".bias", data_torch)]
+
+ if name.startswith("visual."):
+ return [(self.map_tensor_name(name), data_torch)]
+
+ # Fall back to parent class for other tensors
+ return super().modify_tensors(data_torch, name, bid)
+
+
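Note the patch-embedding trick near the end of the vision model's `modify_tensors`: the HF checkpoint stores one Conv3D kernel of shape (out_ch, in_ch, kt, kh, kw) with temporal size kt = 2, and the converter slices it into two Conv2D kernels (`.weight` and `.weight.1`) so the C++ side can run plain 2D convolutions per frame. Shape-wise, with scaled-down sizes:

```python
import torch

out_ch, in_ch, kt, kh, kw = 8, 3, 2, 16, 16
conv3d = torch.randn(out_ch, in_ch, kt, kh, kw)

frame0 = conv3d[:, :, 0, ...]  # becomes ".weight"
frame1 = conv3d[:, :, 1, ...]  # becomes ".weight.1"
assert frame0.shape == frame1.shape == (out_ch, in_ch, kh, kw)
```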
+ @ModelBase.register("Qwen3VLForConditionalGeneration")
+ class Qwen3VLTextModel(Qwen3Model):
+ model_arch = gguf.MODEL_ARCH.QWEN3VL
+
+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+
+ # Handle MRoPE (Multi-axis Rotary Position Embedding) for Qwen3-VL
+ text_config = self.hparams.get("text_config", {})
+ # rope_scaling is deprecated in V5, use rope_parameters instead
+ rope_scaling = text_config.get("rope_scaling") or text_config.get("rope_parameters") or {}
+
+ if rope_scaling.get("mrope_section"):
+ # mrope_section contains [time, height, width] dimensions
+ mrope_section = rope_scaling["mrope_section"]
+ # Pad to 4 dimensions [time, height, width, extra]
+ while len(mrope_section) < 4:
+ mrope_section.append(0)
+ self.gguf_writer.add_rope_dimension_sections(mrope_section[:4])
+
+ logger.info(f"MRoPE sections: {mrope_section[:4]}")
+
+ vision_config = self.hparams.get("vision_config", {})
+ deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", []))
+ self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num)
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ # Skip vision tensors - they go in the mmproj file
+ if name.startswith("model.visual."):
+ return []
+
+ return super().modify_tensors(data_torch, name, bid)
+
+
+ @ModelBase.register("Qwen3VLMoeForConditionalGeneration")
+ class Qwen3VLMoeTextModel(Qwen3MoeModel):
+ model_arch = gguf.MODEL_ARCH.QWEN3VLMOE
+
+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+
+ # Handle MRoPE (Multi-axis Rotary Position Embedding) for Qwen3-VL
+ text_config = self.hparams.get("text_config", {})
+ # rope_scaling is deprecated in V5, use rope_parameters instead
+ rope_scaling = text_config.get("rope_scaling") or text_config.get("rope_parameters") or {}
+
+ if rope_scaling.get("mrope_section"):
+ # mrope_section contains [time, height, width] dimensions
+ mrope_section = rope_scaling["mrope_section"]
+ # Pad to 4 dimensions [time, height, width, extra]
+ while len(mrope_section) < 4:
+ mrope_section.append(0)
+ self.gguf_writer.add_rope_dimension_sections(mrope_section[:4])
+
+ logger.info(f"MRoPE sections: {mrope_section[:4]}")
+
+ vision_config = self.hparams.get("vision_config", {})
+ deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", []))
+ self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num)
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ # Skip vision tensors - they go in the mmproj file
+ if name.startswith("model.visual."):
+ return []
+
+ return super().modify_tensors(data_torch, name, bid)
+
+
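Both Qwen3-VL text-model classes normalize `mrope_section` the same way: the config lists up to three rotary sub-dimension sizes [time, height, width], and the writer expects exactly four entries, so the list is zero-padded. A standalone version of that step, as a sketch:

```python
def normalize_mrope_section(rope_scaling: dict) -> list[int] | None:
    """Pad mrope_section to [time, height, width, extra]; None if absent."""
    section = rope_scaling.get("mrope_section")
    if not section:
        return None
    section = list(section)
    while len(section) < 4:
        section.append(0)
    return section[:4]

assert normalize_mrope_section({"mrope_section": [24, 20, 20]}) == [24, 20, 20, 0]
assert normalize_mrope_section({}) is None
```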
  @ModelBase.register("GPT2LMHeadModel")
  class GPT2Model(TextModel):
  model_arch = gguf.MODEL_ARCH.GPT2

  def set_gguf_parameters(self):
- self.gguf_writer.add_block_count(self.hparams["n_layer"])
+ self.gguf_writer.add_block_count(self.block_count)
  self.gguf_writer.add_context_length(self.hparams["n_ctx"])
  self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
  self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
@@ -3725,8 +4401,6 @@ class Phi2Model(TextModel):
  model_arch = gguf.MODEL_ARCH.PHI2

  def set_gguf_parameters(self):
- block_count = self.find_hparam(["num_hidden_layers", "n_layer"])
-
  rot_pct = self.find_hparam(["partial_rotary_factor"])
  n_embd = self.find_hparam(["hidden_size", "n_embd"])
  n_head = self.find_hparam(["num_attention_heads", "n_head"])
@@ -3735,7 +4409,7 @@ class Phi2Model(TextModel):

  self.gguf_writer.add_embedding_length(n_embd)
  self.gguf_writer.add_feed_forward_length(4 * n_embd)
- self.gguf_writer.add_block_count(block_count)
+ self.gguf_writer.add_block_count(self.block_count)
  self.gguf_writer.add_head_count(n_head)
  self.gguf_writer.add_head_count_kv(n_head)
  self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_epsilon", "layer_norm_eps"]))
@@ -3853,8 +4527,6 @@ class Phi3MiniModel(TextModel):
  special_vocab.add_to_gguf(self.gguf_writer)

  def set_gguf_parameters(self):
- block_count = self.find_hparam(["num_hidden_layers", "n_layer"])
-
  n_embd = self.find_hparam(["hidden_size", "n_embd"])
  n_head = self.find_hparam(["num_attention_heads", "n_head"])
  n_head_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"])
@@ -3868,7 +4540,7 @@ class Phi3MiniModel(TextModel):
  self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds)
  self.gguf_writer.add_embedding_length(n_embd)
  self.gguf_writer.add_feed_forward_length(self.find_hparam(["intermediate_size"]))
- self.gguf_writer.add_block_count(block_count)
+ self.gguf_writer.add_block_count(self.block_count)
  self.gguf_writer.add_head_count(n_head)
  self.gguf_writer.add_head_count_kv(n_head_kv)
  self.gguf_writer.add_layer_norm_rms_eps(rms_eps)
@@ -3988,12 +4660,11 @@ class PlamoModel(TextModel):

  def set_gguf_parameters(self):
  hparams = self.hparams
- block_count = hparams["num_hidden_layers"]

  self.gguf_writer.add_context_length(4096) # not in config.json
  self.gguf_writer.add_embedding_length(hparams["hidden_size"])
  self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
- self.gguf_writer.add_block_count(block_count)
+ self.gguf_writer.add_block_count(self.block_count)
  self.gguf_writer.add_head_count(hparams["num_attention_heads"])
  self.gguf_writer.add_head_count_kv(5) # hparams["num_key_value_heads"]) is wrong
  self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
@@ -4116,7 +4787,6 @@ class Plamo2Model(TextModel):

  def set_gguf_parameters(self):
  hparams = self.hparams
- block_count = hparams["num_hidden_layers"]
  self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])

  # Which layers are Mamba layers
@@ -4124,27 +4794,32 @@ class Plamo2Model(TextModel):
  # This logic matches modeling_plamo.py's is_mamba function
  mamba_step = hparams.get("mamba_step", 2)
  mamba_enabled = hparams.get("mamba_enabled", True)
- mamba_layers = []
+ num_key_value_heads = []
+ num_attention_heads = []

  if mamba_enabled:
- for i in range(block_count):
- if block_count <= (mamba_step // 2):
+ for i in range(self.block_count):
+ if self.block_count <= (mamba_step // 2):
  # use attention in last layer
- is_mamba = (i != block_count - 1)
+ is_mamba = (i != self.block_count - 1)
  else:
  is_mamba = (i % mamba_step) != (mamba_step // 2)
  if is_mamba:
- mamba_layers.append(0)
+ num_key_value_heads.append(0)
+ num_attention_heads.append(0)
  else:
- mamba_layers.append(hparams.get("num_key_value_heads", 4))
+ num_key_value_heads.append(hparams.get("num_key_value_heads", 4))
+ num_attention_heads.append(hparams.get("num_attention_heads", 32))

- if mamba_layers:
- self.gguf_writer.add_head_count_kv(mamba_layers)
+ if num_key_value_heads and num_attention_heads:
+ self.gguf_writer.add_head_count_kv(num_key_value_heads)
+ self.gguf_writer.add_head_count(num_attention_heads)

  self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 2048))
  self.gguf_writer.add_embedding_length(hparams.get("hidden_size", 4096))
- self.gguf_writer.add_block_count(block_count)
- self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 32))
+ self.gguf_writer.add_key_length(hparams.get("hidden_size_per_head", 128))
+ self.gguf_writer.add_value_length(hparams.get("hidden_size_per_head", 128))
+ self.gguf_writer.add_block_count(self.block_count)
  self.gguf_writer.add_layer_norm_rms_eps(hparams.get("rms_norm_eps", 1e-06))
  self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 10000))
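The PLaMo-2 change above generalizes the per-layer metadata for this hybrid Mamba/attention architecture: instead of a single KV-head list, the converter now writes parallel per-layer lists for both attention heads and KV heads, with zeros marking Mamba layers (one attention layer per `mamba_step` blocks). The layout logic on its own, mirroring the diff's control flow with the same defaults:

```python
def plamo2_head_layout(block_count: int, mamba_step: int = 2,
                       n_head: int = 32, n_kv: int = 4) -> tuple[list[int], list[int]]:
    heads, kv_heads = [], []
    for i in range(block_count):
        if block_count <= mamba_step // 2:
            is_mamba = i != block_count - 1  # keep attention in the last layer
        else:
            is_mamba = (i % mamba_step) != (mamba_step // 2)
        heads.append(0 if is_mamba else n_head)
        kv_heads.append(0 if is_mamba else n_kv)
    return heads, kv_heads

# with mamba_step=2, odd-indexed layers are attention layers
assert plamo2_head_layout(4) == ([0, 32, 0, 32], [0, 4, 0, 4])
```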
 
@@ -4201,12 +4876,10 @@ class CodeShellModel(TextModel):
  model_arch = gguf.MODEL_ARCH.CODESHELL

  def set_gguf_parameters(self):
- block_count = self.hparams["n_layer"]
-
  self.gguf_writer.add_context_length(self.hparams["n_positions"])
  self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
  self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
- self.gguf_writer.add_block_count(block_count)
+ self.gguf_writer.add_block_count(self.block_count)
  self.gguf_writer.add_head_count(self.hparams["n_head"])
  self.gguf_writer.add_head_count_kv(self.hparams["num_query_groups"])
  self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
@@ -4215,27 +4888,6 @@ class CodeShellModel(TextModel):
  self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
  self.gguf_writer.add_rope_scaling_factor(1.0)

- _has_tok_embd = False
-
- def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
- del bid # unused
-
- output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
- tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD)
-
- new_name = self.map_tensor_name(name)
-
- # assuming token_embd.weight is seen before output.weight
- if not self._has_tok_embd and new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
- # even though the tensor file(s) does not contain the word embeddings they are still in the weight map
- if self.tensor_names and "transformer.wte.weight" in self.tensor_names:
- logger.debug(f"{tok_embd_name} not found before {output_name}, assuming they are tied")
- self.tensor_names.remove("transformer.wte.weight")
- elif new_name == tok_embd_name:
- self._has_tok_embd = True
-
- return [(new_name, data_torch)]
-

  @ModelBase.register("InternLM2ForCausalLM")
  class InternLM2Model(TextModel):
@@ -4369,7 +5021,7 @@ class InternLM2Model(TextModel):

  def set_gguf_parameters(self):
  self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
- self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
+ self.gguf_writer.add_block_count(self.block_count)
  self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
  self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
  self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
@@ -4990,11 +5642,10 @@ class GemmaModel(TextModel):

  def set_gguf_parameters(self):
  hparams = self.hparams
- block_count = hparams["num_hidden_layers"]

  self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
  self.gguf_writer.add_embedding_length(hparams["hidden_size"])
- self.gguf_writer.add_block_count(block_count)
+ self.gguf_writer.add_block_count(self.block_count)
  self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
  self.gguf_writer.add_head_count(hparams["num_attention_heads"])
  self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"])
@@ -5030,11 +5681,10 @@ class Gemma2Model(TextModel):

  def set_gguf_parameters(self):
  hparams = self.hparams
- block_count = hparams["num_hidden_layers"]

  self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
  self.gguf_writer.add_embedding_length(hparams["hidden_size"])
- self.gguf_writer.add_block_count(block_count)
+ self.gguf_writer.add_block_count(self.block_count)
  self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
  self.gguf_writer.add_head_count(hparams["num_attention_heads"])
  self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"])
@@ -5078,12 +5728,11 @@ class Gemma3Model(TextModel):

  def set_gguf_parameters(self):
  hparams = self.hparams
- block_count = hparams["num_hidden_layers"]

  # some default values are not specified in the hparams
  self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 131072))
  self.gguf_writer.add_embedding_length(hparams["hidden_size"])
- self.gguf_writer.add_block_count(block_count)
+ self.gguf_writer.add_block_count(self.block_count)
  self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
  self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 8))
  self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-6))
@@ -5126,6 +5775,80 @@ class Gemma3Model(TextModel):
  return [(self.map_tensor_name(name), data_torch)]


+ @ModelBase.register("Gemma3TextModel")
+ class EmbeddingGemma(Gemma3Model):
+ model_arch = gguf.MODEL_ARCH.GEMMA_EMBEDDING
+ module_paths = []
+ dense_features_dims = {}
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ if self.sentence_transformers_dense_modules:
+ # read modules.json to determine if model has Dense layers
+ modules_file = self.dir_model / "modules.json"
+ if modules_file.is_file():
+ with open(modules_file, encoding="utf-8") as modules_json_file:
+ mods = json.load(modules_json_file)
+ for mod in mods:
+ if mod["type"] == "sentence_transformers.models.Dense":
+ mod_path = mod["path"]
+ # check if model.safetensors file for Dense layer exists
+ model_tensors_file = self.dir_model / mod_path / "model.safetensors"
+ if model_tensors_file.is_file():
+ self.module_paths.append(mod_path)
+ # read config.json of the Dense layer to get in/out features
+ mod_conf_file = self.dir_model / mod_path / "config.json"
+ if mod_conf_file.is_file():
+ with open(mod_conf_file, encoding="utf-8") as mod_conf_json_file:
+ mod_conf = json.load(mod_conf_json_file)
+ # hparams dense_2_feat_out and dense_3_feat_in are required when loading model's dense weights
+ prefix = self._get_dense_prefix(mod_path)
+ if mod_conf["in_features"] is not None and mod_conf["out_features"] is not None:
+ self.dense_features_dims[prefix] = (mod_conf["in_features"], mod_conf["out_features"])
+
+ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+ from safetensors.torch import load_file
+ module_paths = list(self.module_paths)
+ for i, module_path in enumerate(module_paths):
+ tensors_file = self.dir_model / module_path / "model.safetensors"
+ local_tensors = load_file(tensors_file)
+ tensor_name = self._get_dense_prefix(module_path)
+ for name, local_tensor in local_tensors.items():
+ if not name.endswith(".weight"):
+ continue
+ orig_name = name.replace("linear", tensor_name)
+ name = self.map_tensor_name(orig_name)
+ yield name, local_tensor.clone()
+
+ @staticmethod
+ def _get_dense_prefix(module_path) -> str:
+ """Get the tensor name prefix for the Dense layer from module path."""
+ tensor_name = "dense_2" if module_path == "2_Dense" else "dense_3"
+ return tensor_name
+
+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+
+ # Override the sliding window size as it gets adjusted by the Gemma3TextConfig
+ # constructor. We want to use the value from the original model's config.json.
+ # ref: https://github.com/huggingface/transformers/pull/40700
+ with open(self.dir_model / "config.json", "r", encoding="utf-8") as f:
+ config = json.load(f)
+ orig_sliding_window = config.get("sliding_window")
+ if orig_sliding_window is None:
+ raise ValueError("sliding_window not found in model config - this is required for the model")
+
+ logger.info(f"Using original sliding_window from config: {orig_sliding_window} "
+ f"instead of {self.hparams['sliding_window']}")
+ self.gguf_writer.add_sliding_window(orig_sliding_window)
+ if self.sentence_transformers_dense_modules:
+ for dense, dims in self.dense_features_dims.items():
+ logger.info(f"Setting dense layer {dense} in/out features to {dims}")
+ self.gguf_writer.add_dense_features_dims(dense, dims[0], dims[1])
+
+ self._try_set_pooling_type()
+
+
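EmbeddingGemma is a sentence-transformers checkpoint, so its projection weights live outside the main model file: `modules.json` lists `Dense` modules (conventionally under `2_Dense` and `3_Dense`), each with its own `config.json` and `model.safetensors`, and `generate_extra_tensors` streams those weights into the GGUF under `dense_2`/`dense_3` prefixes. A minimal, self-contained reader for that on-disk layout (a sketch assuming the standard sentence-transformers directory convention shown in the diff):

```python
import json
from pathlib import Path

def read_dense_modules(model_dir: Path) -> dict[str, tuple[int, int]]:
    """Map Dense-layer prefixes ('dense_2'/'dense_3') to (in_features, out_features)."""
    dims: dict[str, tuple[int, int]] = {}
    mods = json.loads((model_dir / "modules.json").read_text(encoding="utf-8"))
    for mod in mods:
        if mod["type"] != "sentence_transformers.models.Dense":
            continue
        sub = model_dir / mod["path"]
        if not (sub / "model.safetensors").is_file():
            continue  # Dense module listed but weights not shipped
        conf = json.loads((sub / "config.json").read_text(encoding="utf-8"))
        prefix = "dense_2" if mod["path"] == "2_Dense" else "dense_3"
        dims[prefix] = (conf["in_features"], conf["out_features"])
    return dims
```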
  @ModelBase.register("Gemma3ForConditionalGeneration")
5130
5853
  class Gemma3VisionModel(MmprojModel):
5131
5854
  def set_gguf_parameters(self):
@@ -5285,7 +6008,6 @@ class Rwkv6Model(TextModel):
5285
6008
  self._set_vocab_rwkv_world()
5286
6009
 
5287
6010
  def set_gguf_parameters(self):
5288
- block_count = self.hparams["num_hidden_layers"]
5289
6011
  head_size = self.hparams["head_size"]
5290
6012
  hidden_size = self.hparams["hidden_size"]
5291
6013
  layer_norm_eps = self.hparams["layer_norm_epsilon"]
@@ -5297,7 +6019,7 @@ class Rwkv6Model(TextModel):
5297
6019
  # RWKV isn't context limited
5298
6020
  self.gguf_writer.add_context_length(1048576)
5299
6021
  self.gguf_writer.add_embedding_length(hidden_size)
5300
- self.gguf_writer.add_block_count(block_count)
6022
+ self.gguf_writer.add_block_count(self.block_count)
5301
6023
  self.gguf_writer.add_layer_norm_eps(layer_norm_eps)
5302
6024
  self.gguf_writer.add_rescale_every_n_layers(rescale_every_n_layers)
5303
6025
  self.gguf_writer.add_wkv_head_size(head_size)
@@ -5361,7 +6083,6 @@ class RWKV6Qwen2Model(Rwkv6Model):
5361
6083
  self._set_vocab_gpt2()
5362
6084
 
5363
6085
  def set_gguf_parameters(self):
5364
- block_count = self.hparams["num_hidden_layers"]
5365
6086
  num_attention_heads = self.hparams["num_attention_heads"]
5366
6087
  num_key_value_heads = self.hparams["num_key_value_heads"]
5367
6088
  hidden_size = self.hparams["hidden_size"]
@@ -5374,7 +6095,7 @@ class RWKV6Qwen2Model(Rwkv6Model):
5374
6095
  # RWKV isn't context limited
5375
6096
  self.gguf_writer.add_context_length(1048576)
5376
6097
  self.gguf_writer.add_embedding_length(hidden_size)
5377
- self.gguf_writer.add_block_count(block_count)
6098
+ self.gguf_writer.add_block_count(self.block_count)
5378
6099
  self.gguf_writer.add_wkv_head_size(head_size)
5379
6100
  self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
5380
6101
  self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
@@ -5415,7 +6136,6 @@ class Rwkv7Model(TextModel):
5415
6136
  return max(1, round(hidden_size ** exponent * multiplier / 32)) * 32
5416
6137
 
5417
6138
  def set_gguf_parameters(self):
5418
- block_count = self.hparams["num_hidden_layers"]
5419
6139
  try:
5420
6140
  head_size = self.hparams["head_size"]
5421
6141
  layer_norm_eps = self.hparams["layer_norm_epsilon"]
@@ -5440,7 +6160,7 @@ class Rwkv7Model(TextModel):
5440
6160
  # RWKV isn't context limited
5441
6161
  self.gguf_writer.add_context_length(1048576)
5442
6162
  self.gguf_writer.add_embedding_length(hidden_size)
5443
- self.gguf_writer.add_block_count(block_count)
6163
+ self.gguf_writer.add_block_count(self.block_count)
5444
6164
  self.gguf_writer.add_layer_norm_eps(layer_norm_eps)
5445
6165
  self.gguf_writer.add_wkv_head_size(head_size)
5446
6166
  self.gguf_writer.add_decay_lora_rank(lora_rank_decay)
@@ -5534,7 +6254,6 @@ class ARwkv7Model(Rwkv7Model):
5534
6254
  self._set_vocab_gpt2()
5535
6255
 
5536
6256
  def set_gguf_parameters(self):
5537
- block_count = self.hparams["num_hidden_layers"]
5538
6257
  hidden_size = self.hparams["hidden_size"]
5539
6258
  head_size = self.hparams["head_size"]
5540
6259
  rms_norm_eps = self.hparams["rms_norm_eps"]
@@ -5551,7 +6270,7 @@ class ARwkv7Model(Rwkv7Model):
5551
6270
  # RWKV isn't context limited
5552
6271
  self.gguf_writer.add_context_length(1048576)
5553
6272
  self.gguf_writer.add_embedding_length(hidden_size)
5554
- self.gguf_writer.add_block_count(block_count)
6273
+ self.gguf_writer.add_block_count(self.block_count)
5555
6274
  self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
5556
6275
  self.gguf_writer.add_wkv_head_size(head_size)
5557
6276
  self.gguf_writer.add_decay_lora_rank(lora_rank_decay)
@@ -5749,20 +6468,12 @@ class Mamba2Model(TextModel):
5749
6468
  class JambaModel(TextModel):
5750
6469
  model_arch = gguf.MODEL_ARCH.JAMBA
5751
6470
 
5752
- def get_vocab_base_pre(self, tokenizer) -> str:
5753
- del tokenizer # unused
5754
-
5755
- return "gpt-2"
5756
-
5757
6471
  def set_vocab(self):
5758
6472
  if (self.dir_model / "tokenizer.model").is_file():
5759
- # Using Jamba's tokenizer.json causes errors on model load
5760
- # (something about "byte not found in vocab"),
5761
- # but there's a working tokenizer.model
5762
6473
  self._set_vocab_sentencepiece()
5763
6474
  else:
5764
- # Some Jamba models only have a tokenizer.json, which works.
5765
- self._set_vocab_gpt2()
6475
+ self._set_vocab_llama_hf()
6476
+ self.gguf_writer.add_add_space_prefix(False)
5766
6477
 
5767
6478
  def set_gguf_parameters(self):
5768
6479
  d_model = self.find_hparam(["hidden_size", "mamba_d_model"])
@@ -5932,9 +6643,34 @@ class SeedOssModel(TextModel):
5932
6643
 
5933
6644
 
5934
6645
  @ModelBase.register("Olmo2ForCausalLM")
6646
+ @ModelBase.register("Olmo3ForCausalLM")
5935
6647
  class Olmo2Model(TextModel):
5936
6648
  model_arch = gguf.MODEL_ARCH.OLMO2
5937
6649
 
6650
+ def set_gguf_parameters(self):
6651
+ super().set_gguf_parameters()
6652
+
6653
+ rope_scaling = self.hparams.get("rope_scaling") or {}
6654
+ if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
6655
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
6656
+ self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
6657
+ self.gguf_writer.add_rope_scaling_attn_factors(rope_scaling["attention_factor"])
6658
+ self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
6659
+
6660
+ if "sliding_window" in self.hparams:
6661
+ self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
6662
+
6663
+ sliding_window_pattern = []
6664
+ if "layer_types" in self.hparams:
6665
+ sliding_window_pattern = [t == "sliding_attention" for t in self.hparams["layer_types"]]
6666
+ else:
6667
+ # Olmo2 does not use sliding window attention.
6668
+ # Olmo3 defaults to using sliding window for all layers except every 4th.
6669
+ for i in range(self.hparams["num_hidden_layers"]):
6670
+ sliding_window_pattern.append((i + 1) % 4 != 0)
6671
+
6672
+ self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
6673
+
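When `layer_types` is missing from the config, the Olmo3 default above is reconstructed arithmetically: every fourth layer (1-indexed) is full attention, the rest use the sliding window. The same pattern in isolation:

```python
def olmo3_swa_pattern(n_layers: int) -> list[bool]:
    # True marks a sliding-window layer; every 4th layer stays full attention
    return [(i + 1) % 4 != 0 for i in range(n_layers)]

assert olmo3_swa_pattern(8) == [True, True, True, False, True, True, True, False]
```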
5938
6674
 
5939
6675
  @ModelBase.register("OlmoeForCausalLM")
5940
6676
  class OlmoeModel(TextModel):
@@ -6417,13 +7153,6 @@ class DeepseekV2Model(TextModel):
6417
7153
  self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
6418
7154
  self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
6419
7155
 
6420
- if hparams["scoring_func"] == "sigmoid":
6421
- self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
6422
- elif hparams["scoring_func"] == "softmax":
6423
- self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
6424
- else:
6425
- raise ValueError(f"Unsupported scoring_func value: {hparams['scoring_func']}")
6426
-
6427
7156
  self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
6428
7157
 
6429
7158
  rope_scaling = self.hparams.get("rope_scaling") or {}
@@ -6518,6 +7247,94 @@ class DeepseekV2Model(TextModel):
6518
7247
  raise ValueError(f"Unprocessed experts: {experts}")
6519
7248
 
6520
7249
 
7250
+ @ModelBase.register("MiniMaxM2ForCausalLM")
7251
+ class MiniMaxM2Model(TextModel):
7252
+ model_arch = gguf.MODEL_ARCH.MINIMAXM2
7253
+ _experts_cache: dict[int, dict[str, Tensor]] = {}
7254
+
7255
+ def __init__(self, *args, **kwargs):
7256
+ super().__init__(*args, **kwargs)
7257
+ self.hparams["num_experts"] = self.hparams["num_local_experts"]
7258
+
7259
+ def set_gguf_parameters(self):
7260
+ super().set_gguf_parameters()
7261
+
7262
+ self.gguf_writer.add_expert_feed_forward_length(self.find_hparam(["intermediate_size"]))
7263
+ self.gguf_writer.add_rope_dimension_count(self.find_hparam(["rotary_dim"]))
7264
+
7265
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
7266
+ if name.endswith("e_score_correction_bias"):
7267
+ name = name.replace("e_score_correction_bias", "e_score_correction.bias")
7268
+
7269
+ # merge expert weights
7270
+ if 'experts' in name:
7271
+ n_experts = self.hparams["num_experts"]
7272
+ assert bid is not None
7273
+
7274
+ expert_cache = self._experts_cache.setdefault(bid, {})
7275
+ expert_cache[name] = data_torch
7276
+ expert_weights = ["w1", "w2", "w3"]
7277
+
7278
+ # not enough expert weights to merge
7279
+ if len(expert_cache) < n_experts * len(expert_weights):
7280
+ return []
7281
+
7282
+ tensors: list[tuple[str, Tensor]] = []
7283
+ for w_name in expert_weights:
7284
+ datas: list[Tensor] = []
7285
+
7286
+ for xid in range(n_experts):
7287
+ ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight"
7288
+ datas.append(expert_cache[ename])
7289
+ del expert_cache[ename]
7290
+
7291
+ data_torch = torch.stack(datas, dim=0)
7292
+ merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight"
7293
+ new_name = self.map_tensor_name(merged_name)
7294
+ tensors.append((new_name, data_torch))
7295
+
7296
+ del self._experts_cache[bid]
7297
+ return tensors
7298
+
7299
+ return super().modify_tensors(data_torch, name, bid)
7300
+
7301
+
7302
+ @ModelBase.register("PanguEmbeddedForCausalLM")
7303
+ class PanguEmbeddedModel(TextModel):
7304
+ model_arch = gguf.MODEL_ARCH.PANGU_EMBED
7305
+
7306
+ def set_vocab(self):
7307
+ self._set_vocab_sentencepiece()
7308
+
7309
+ tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
7310
+ if tokenizer_config_file.is_file():
7311
+ with open(tokenizer_config_file, "r", encoding="utf-8") as f:
7312
+ tokenizer_config_json = json.load(f)
7313
+ if "add_prefix_space" in tokenizer_config_json:
7314
+ self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
7315
+
7316
+ def set_gguf_parameters(self):
7317
+ super().set_gguf_parameters()
7318
+ hparams = self.hparams
7319
+ self.gguf_writer.add_vocab_size(hparams["vocab_size"])
7320
+
7321
+ # PanguEmbedded's hparam loaded from config.json without head_dim
7322
+ if (rope_dim := hparams.get("head_dim")) is None:
7323
+ rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
7324
+ self.gguf_writer.add_rope_dimension_count(rope_dim)
7325
+
7326
+ if hparams.get("head_dim") is None:
7327
+ self.gguf_writer.add_key_length(rope_dim)
7328
+ self.gguf_writer.add_value_length(rope_dim)
7329
+
7330
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
7331
+ if name == "lm_head.weight":
7332
+ if self.hparams.get("tie_word_embeddings", False):
7333
+ logger.info("Skipping tied output layer 'lm_head.weight'")
7334
+ return []
7335
+ return [(self.map_tensor_name(name), data_torch)]
7336
+
7337
+
 @ModelBase.register("Dots1ForCausalLM")
 class Dots1Model(Qwen2MoeModel):
     model_arch = gguf.MODEL_ARCH.DOTS1
@@ -6533,11 +7350,6 @@ class Dots1Model(Qwen2MoeModel):
         self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"])
         self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"])
 
-        if self.hparams["scoring_func"] == "noaux_tc":
-            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
-        else:
-            raise ValueError(f"Unsupported scoring_func value: {self.hparams['scoring_func']}")
-
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
         if name.endswith("e_score_correction_bias"):
             name = name.replace("e_score_correction_bias", "e_score_correction.bias")
@@ -6573,6 +7385,7 @@ class PLMModel(TextModel):
 @ModelBase.register("T5ForConditionalGeneration")
 @ModelBase.register("MT5ForConditionalGeneration")
 @ModelBase.register("UMT5ForConditionalGeneration")
+@ModelBase.register("UMT5Model")
 class T5Model(TextModel):
     model_arch = gguf.MODEL_ARCH.T5
 
@@ -6681,7 +7494,9 @@ class T5Model(TextModel):
         self.gguf_writer.add_context_length(n_ctx)
         self.gguf_writer.add_embedding_length(self.hparams["d_model"])
         self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
-        self.gguf_writer.add_block_count(self.hparams["num_layers"])
+        self.gguf_writer.add_block_count(self.block_count)
+        if (dec_n_layer := self.hparams.get("num_decoder_layers")) is not None:
+            self.gguf_writer.add_decoder_block_count(dec_n_layer)
         self.gguf_writer.add_head_count(self.hparams["num_heads"])
         self.gguf_writer.add_key_length(self.hparams["d_kv"])
         self.gguf_writer.add_value_length(self.hparams["d_kv"])
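Note: this and several later hunks replace per-model layer-count lookups (num_layers, n_layer, num_hidden_layers, ...) with the shared self.block_count attribute; T5 additionally records a separate decoder depth when num_decoder_layers is present. A sketch of the fallback chain such a shared attribute implies (the helper name is hypothetical, not the converter's actual implementation):

def resolve_block_count(hparams: dict) -> int:
    # mirrors a find_hparam-style search over the keys different checkpoints use
    for key in ("num_hidden_layers", "num_layers", "n_layer", "n_layers"):
        if key in hparams:
            return int(hparams[key])
    raise KeyError("no layer-count key found in hparams")

assert resolve_block_count({"n_layer": 24}) == 24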
@@ -6818,7 +7633,7 @@ class T5EncoderModel(TextModel):
         self.gguf_writer.add_context_length(n_ctx)
         self.gguf_writer.add_embedding_length(self.hparams["d_model"])
         self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
-        self.gguf_writer.add_block_count(self.hparams["num_layers"])
+        self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_head_count(self.hparams["num_heads"])
         self.gguf_writer.add_key_length(self.hparams["d_kv"])
         self.gguf_writer.add_value_length(self.hparams["d_kv"])
@@ -6881,7 +7696,7 @@ class JaisModel(TextModel):
         self._set_vocab_gpt2()
 
     def set_gguf_parameters(self):
-        self.gguf_writer.add_block_count(self.hparams["n_layer"])
+        self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_context_length(self.hparams["n_positions"])
         self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
         self.gguf_writer.add_feed_forward_length(self.hparams["n_inner"])
@@ -6995,12 +7810,6 @@ class Glm4MoeModel(TextModel):
         special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # 151329
         special_vocab._set_special_token("eom", tokenizer.get_added_vocab()["<|observation|>"]) # 151338
 
-        # Patch broken chat template
-        if isinstance(special_vocab.chat_template, str) and "visible_text(m.content).endswith" in special_vocab.chat_template:
-            special_vocab.chat_template = special_vocab.chat_template.replace(
-                """{{ visible_text(m.content) }}\n{{- '/nothink' if (enable_thinking is defined and not enable_thinking and not visible_text(m.content).endswith("/nothink")) else '' -}}""",
-                """{% set content = visible_text(m.content) %}{{ content }}\n{{- '/nothink' if (enable_thinking is defined and not enable_thinking and not content.endswith("/nothink")) else '' -}}""")
-
         special_vocab.add_to_gguf(self.gguf_writer)
 
     def set_gguf_parameters(self):
@@ -7229,7 +8038,7 @@ class ChatGLMModel(TextModel):
         self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
         self.gguf_writer.add_embedding_length(n_embed)
         self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", self.hparams.get("intermediate_size", 4 * n_embed)))
-        self.gguf_writer.add_block_count(self.hparams.get("num_layers", self.hparams["num_hidden_layers"]))
+        self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_head_count(n_head)
         self.gguf_writer.add_head_count_kv(n_head_kv)
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon", 1e-5))
@@ -7311,7 +8120,6 @@ class ExaoneModel(TextModel):
         num_kv_heads = hparams.get("num_key_value_heads", num_heads)
         layer_norm_eps = hparams["layer_norm_epsilon"]
         intermediate_size = hparams["intermediate_size"] if "intermediate_size" in hparams else 4 * embed_dim
-        num_layers = hparams["num_layers"]
         # ignore for now as EXAONE-3.0-7.8B-Instruct attention_dropout is 0.0
         # attention_dropout_rate = hparams["attention_dropout"]
         # ignore for now as EXAONE-3.0-7.8B-Instruct embed_dropout is 0.0
@@ -7322,7 +8130,7 @@ class ExaoneModel(TextModel):
         self.gguf_writer.add_context_length(max_position_embeddings)
         self.gguf_writer.add_layer_norm_rms_eps(layer_norm_eps)
         self.gguf_writer.add_feed_forward_length(intermediate_size)
-        self.gguf_writer.add_block_count(num_layers)
+        self.gguf_writer.add_block_count(self.block_count)
         self.gguf_writer.add_file_type(self.ftype)
 
         if (rope_theta := self.hparams.get("rope_theta")) is not None:
@@ -7545,6 +8353,21 @@ class GraniteHybridModel(Mamba2Model, GraniteMoeModel):
             if i not in self._attn_layers
         ]
 
+        # There are some models in this family that are non-hybrid, but keep the
+        # same parent class by setting all layers to "attention". If this is the
+        # case, the model architecture needs to be updated to a standard
+        # "granite" or "granitemoe" model
+        if not self._ssm_layers:
+            has_experts = self.find_hparam(["num_experts_per_tok"], optional=True)
+            new_arch = (
+                gguf.MODEL_ARCH.GRANITE_MOE
+                if has_experts else
+                gguf.MODEL_ARCH.GRANITE
+            )
+            self.model_arch = new_arch
+            self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[new_arch]
+            self.gguf_writer.add_architecture()
+
         # n_group and d_inner are used during reshape_tensors for mamba2
         # NOTE: Explicitly include hparam prefix for d_model to
         # disambiguate with top-level head_dim
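Note: the non-hybrid fallback above rewrites the architecture after the writer has been constructed, so it re-emits the architecture key. A reduced sketch of the decision itself, assuming experts are signalled by num_experts_per_tok (the helper and arch strings are illustrative, not the converter's API):

def pick_granite_arch(hparams: dict, ssm_layers: list) -> str:
    if ssm_layers:  # hybrid: keep the mamba2/attention mix
        return "granitehybrid"
    return "granitemoe" if hparams.get("num_experts_per_tok") else "granite"

assert pick_granite_arch({"num_experts_per_tok": 8}, []) == "granitemoe"
assert pick_granite_arch({}, []) == "granite"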
@@ -7629,8 +8452,11 @@ class GraniteHybridModel(Mamba2Model, GraniteMoeModel):
         self.gguf_writer.add_rope_dimension_count(rope_dim)
         self.gguf_writer.add_head_count_kv(head_count_kv_vec)
 
-        ## If Bamba, use rope, otherwise don't
-        use_rope = "BambaForCausalLM" in self.hparams["architectures"]
+        ## If Bamba or non-hybrid, use rope, otherwise don't
+        use_rope = (
+            "BambaForCausalLM" in self.hparams["architectures"]
+            or not self._ssm_layers
+        )
         self.gguf_writer.add_rope_scaling_finetuned(use_rope)
         if not use_rope:
             self.gguf_writer.add_context_length(2**20)
@@ -7801,6 +8627,209 @@ class BailingMoeModel(TextModel):
                 raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@ModelBase.register("BailingMoeV2ForCausalLM")
+class BailingMoeV2Model(TextModel):
+    model_arch = gguf.MODEL_ARCH.BAILINGMOE2
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        if nextn_layers := self.hparams.get("num_nextn_predict_layers", 0):
+            self.block_count = self.hparams["num_hidden_layers"] + nextn_layers
+            self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        if (rope_dim := hparams.get("head_dim")) is None:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+
+        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
+        else:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+        self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
+        self.gguf_writer.add_expert_shared_feed_forward_length(hparams.get("moe_shared_expert_intermediate_size", hparams["moe_intermediate_size"] * hparams["num_shared_experts"]))
+        self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
+        self.gguf_writer.add_expert_count(hparams["num_experts"])
+        self.gguf_writer.add_expert_shared_count(hparams["num_shared_experts"])
+        self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
+
+        if (nextn_layers := self.hparams.get("num_nextn_predict_layers")) is not None:
+            self.gguf_writer.add_nextn_predict_layers(nextn_layers)
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if "mlp.experts" in name:
+            n_experts = self.hparams["num_experts"]
+            assert bid is not None
+
+            tensors: list[tuple[str, Tensor]] = []
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+
+            return tensors
+
+        if name.endswith(".expert_bias"):
+            name = name.replace(".expert_bias", ".expert_bias.bias")
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
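Note: BailingMoeV2 above (and GroveMoE below) detect YaRN scaling by accepting either the newer rope_type key or the legacy type key in rope_scaling. A small check of that lookup with an illustrative config fragment:

rope_scaling = {"rope_type": "yarn", "factor": 4.0, "original_max_position_embeddings": 32768}
is_yarn = rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling
assert is_yarn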
+ @ModelBase.register("GroveMoeForCausalLM", "modeling_grove_moe.GroveMoeForCausalLM")
8719
+ class GroveMoeModel(TextModel):
8720
+ model_arch = gguf.MODEL_ARCH.GROVEMOE
8721
+
8722
+ def set_gguf_parameters(self):
8723
+ super().set_gguf_parameters()
8724
+ if (n_experts := self.hparams.get("num_experts")) is not None:
8725
+ self.gguf_writer.add_expert_count(n_experts)
8726
+ if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
8727
+ self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
8728
+ logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}")
8729
+ # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L299
8730
+ self.gguf_writer.add_expert_chunk_feed_forward_length(self.hparams.get("head_dim") or 128)
8731
+ # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L298
8732
+ self.gguf_writer.add_experts_per_group(2)
8733
+ # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L376
8734
+ self.gguf_writer.add_expert_group_scale(0.05)
8735
+ # YaRN is not enabled by default
8736
+ # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts
8737
+ rope_scaling = self.hparams.get("rope_scaling") or {}
8738
+ if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
8739
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
8740
+ self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
8741
+ self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
8742
+
8743
+ _experts: list[dict[str, Tensor]] | None = None
8744
+ _chunk_experts: list[dict[str, Tensor]] | None = None
8745
+
8746
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
8747
+ if name.endswith(".expert_bias"):
8748
+ # FIXME?: Unused https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L303
8749
+ return []
8750
+
8751
+ # process the experts separately
8752
+ if name.find("chunk_experts") != -1:
8753
+ n_experts = self.hparams["num_experts"] // 2 # see add_experts_per_group
8754
+ assert bid is not None
8755
+
8756
+ if self._chunk_experts is None:
8757
+ self._chunk_experts = [{} for _ in range(self.block_count)]
8758
+
8759
+ self._chunk_experts[bid][name] = data_torch
8760
+
8761
+ if len(self._chunk_experts[bid]) >= n_experts * 3:
8762
+ tensors: list[tuple[str, Tensor]] = []
8763
+
8764
+ # merge the experts into a single 3d tensor
8765
+ for w_name in ["down_proj", "gate_proj", "up_proj"]:
8766
+ datas: list[Tensor] = []
8767
+
8768
+ for xid in range(n_experts):
8769
+ ename = f"model.layers.{bid}.mlp.chunk_experts.{xid}.{w_name}.weight"
8770
+ datas.append(self._chunk_experts[bid][ename])
8771
+ del self._chunk_experts[bid][ename]
8772
+
8773
+ data_torch = torch.stack(datas, dim=0)
8774
+
8775
+ merged_name = f"model.layers.{bid}.mlp.chunk_experts.{w_name}.weight"
8776
+
8777
+ new_name = self.map_tensor_name(merged_name)
8778
+
8779
+ tensors.append((new_name, data_torch))
8780
+ return tensors
8781
+ else:
8782
+ return []
8783
+ elif name.find("experts") != -1:
8784
+ n_experts = self.hparams["num_experts"]
8785
+ assert bid is not None
8786
+
8787
+ if self._experts is None:
8788
+ self._experts = [{} for _ in range(self.block_count)]
8789
+
8790
+ self._experts[bid][name] = data_torch
8791
+
8792
+ if len(self._experts[bid]) >= n_experts * 3:
8793
+ tensors: list[tuple[str, Tensor]] = []
8794
+
8795
+ # merge the experts into a single 3d tensor
8796
+ for w_name in ["down_proj", "gate_proj", "up_proj"]:
8797
+ datas: list[Tensor] = []
8798
+
8799
+ for xid in range(n_experts):
8800
+ ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
8801
+ datas.append(self._experts[bid][ename])
8802
+ del self._experts[bid][ename]
8803
+
8804
+ data_torch = torch.stack(datas, dim=0)
8805
+
8806
+ merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
8807
+
8808
+ new_name = self.map_tensor_name(merged_name)
8809
+
8810
+ tensors.append((new_name, data_torch))
8811
+ return tensors
8812
+ else:
8813
+ return []
8814
+
8815
+ return [(self.map_tensor_name(name), data_torch)]
8816
+
8817
+ def prepare_tensors(self):
8818
+ super().prepare_tensors()
8819
+
8820
+ if self._chunk_experts is not None:
8821
+ # flatten `list[dict[str, Tensor]]` into `list[str]`
8822
+ chunk_experts = [k for d in self._chunk_experts for k in d.keys()]
8823
+ if len(chunk_experts) > 0:
8824
+ raise ValueError(f"Unprocessed adjugate experts: {chunk_experts}")
8825
+
8826
+ if self._experts is not None:
8827
+ # flatten `list[dict[str, Tensor]]` into `list[str]`
8828
+ experts = [k for d in self._experts for k in d.keys()]
8829
+ if len(experts) > 0:
8830
+ raise ValueError(f"Unprocessed experts: {experts}")
8831
+
8832
+
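Note: GroveMoE maintains two expert banks, the regular experts plus the chunk ("adjugate") experts shared between expert pairs, which is why the chunk bank is sized num_experts // 2 when experts_per_group is the hardcoded 2. A toy check of that bookkeeping with hypothetical counts:

num_experts = 8
experts_per_group = 2  # hardcoded upstream, per the FIXME links above
num_chunk_experts = num_experts // experts_per_group
assert num_chunk_experts == 4  # the chunk-expert merge waits for 4 * 3 tensors per layer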
 @ModelBase.register("ChameleonForConditionalGeneration")
 @ModelBase.register("ChameleonForCausalLM") # obsolete
 class ChameleonModel(TextModel):
@@ -8163,6 +9192,76 @@ class HunYuanMoEModel(TextModel):
                 raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@ModelBase.register("LLaDAMoEModel", "LLaDAMoEModelLM")
+class LLaDAMoEModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.LLADA_MOE
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        if (n_experts := self.hparams.get("num_experts")) is not None:
+            self.gguf_writer.add_expert_count(n_experts)
+
+        if (expert_intermediate_size := self.hparams.get("expert_intermediate_size")) is not None:
+            self.gguf_writer.add_expert_feed_forward_length(expert_intermediate_size)
+
+        # number of experts used per token (top-k)
+        if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
+            self.gguf_writer.add_expert_used_count(n_experts_used)
+
+        self.gguf_writer.add_mask_token_id(156895)
+        self.gguf_writer.add_causal_attention(False)
+        self.gguf_writer.add_diffusion_shift_logits(False)
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    # Copied from: Qwen2MoeModel
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # process the experts separately
+        if name.find("experts") != -1:
+            n_experts = self.hparams["num_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    # Copied from: Qwen2MoeModel
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
 @ModelBase.register("HunYuanDenseV1ForCausalLM")
 class HunYuanModel(TextModel):
     model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE
@@ -8259,21 +9358,18 @@ class HunYuanModel(TextModel):
 class SmolLM3Model(LlamaModel):
     model_arch = gguf.MODEL_ARCH.SMOLLM3
 
-    def set_vocab(self):
-        super().set_vocab()
-        # remove unsupported array slicing in chat template
-        # ref: https://huggingface.co/ggml-org/SmolLM3-3B-GGUF/discussions/1
-        from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
-        if tokenizer.chat_template is not None:
-            chat_template = tokenizer.chat_template.replace("[:]", "")
-            self.gguf_writer.add_chat_template(chat_template)
-
 
 @ModelBase.register("GptOssForCausalLM")
 class GptOssModel(TextModel):
     model_arch = gguf.MODEL_ARCH.GPT_OSS
 
+    # TODO: remove once MXFP4 is supported more generally
+    def dequant_model(self):
+        quant_config = self.hparams.get("quantization_config")
+        if quant_config is not None and quant_config.get("quant_method") == "mxfp4":
+            return
+        return super().dequant_model()
+
     def transform_nibble_layout(self, tensor):
         assert tensor.dtype == torch.uint8
         assert tensor.shape[-1] == 16
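Note: dequant_model above returns early for MXFP4 checkpoints so their quantized tensors pass through untouched instead of being dequantized generically. A sketch of the guard with an illustrative quantization_config fragment:

quant_config = {"quant_method": "mxfp4"}  # illustrative config.json fragment
keep_quantized = quant_config is not None and quant_config.get("quant_method") == "mxfp4"
assert keep_quantized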
@@ -8443,6 +9539,75 @@ class LFM2Model(TextModel):
         return [(self.map_tensor_name(name), data_torch)]
 
 
+@ModelBase.register("Lfm2MoeForCausalLM")
+class LFM2MoeModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.LFM2MOE
+
+    def set_gguf_parameters(self):
+        # set num_key_value_heads only for attention layers
+        self.hparams["num_key_value_heads"] = [
+            self.hparams["num_key_value_heads"] if layer_type == "full_attention" else 0
+            for layer_type in self.hparams["layer_types"]
+        ]
+
+        super().set_gguf_parameters()
+
+        self.gguf_writer.add_expert_count(self.hparams["num_experts"])
+        self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"])
+        self.gguf_writer.add_leading_dense_block_count(self.hparams["num_dense_layers"])
+        self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
+
+        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
+        self.gguf_writer.add_shortconv_l_cache(self.hparams["conv_L_cache"])
+
+    # cache for experts weights for merging
+    _experts_cache: dict[int, dict[str, Tensor]] = {}
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # conv op requires 2d tensor
+        if 'conv.conv' in name:
+            data_torch = data_torch.squeeze(1)
+
+        if name.endswith(".expert_bias"):
+            name = name.replace(".expert_bias", ".expert_bias.bias")
+
+        # merge expert weights
+        if 'experts' in name:
+            n_experts = self.hparams["num_experts"]
+            assert bid is not None
+
+            expert_cache = self._experts_cache.setdefault(bid, {})
+            expert_cache[name] = data_torch
+            expert_weights = ["w1", "w2", "w3"]
+
+            # not enough expert weights to merge
+            if len(expert_cache) < n_experts * len(expert_weights):
+                return []
+
+            tensors: list[tuple[str, Tensor]] = []
+            for w_name in expert_weights:
+                datas: list[Tensor] = []
+
+                for xid in range(n_experts):
+                    ename = f"model.layers.{bid}.feed_forward.experts.{xid}.{w_name}.weight"
+                    datas.append(expert_cache[ename])
+                    del expert_cache[ename]
+
+                data_torch = torch.stack(datas, dim=0)
+                merged_name = f"layers.{bid}.feed_forward.experts.{w_name}.weight"
+                new_name = self.map_tensor_name(merged_name)
+                tensors.append((new_name, data_torch))
+
+            del self._experts_cache[bid]
+            return tensors
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+        assert not self._experts_cache
+
+
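Note: LFM2-MoE interleaves short-conv layers with full attention, so num_key_value_heads is rewritten into a per-layer list with zeros marking the conv layers before the generic parameter writer runs. A toy example with a hypothetical layer_types:

layer_types = ["conv", "full_attention", "conv", "full_attention"]  # illustrative
num_kv_heads = 8
per_layer = [num_kv_heads if t == "full_attention" else 0 for t in layer_types]
assert per_layer == [0, 8, 0, 8]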
 @ModelBase.register("Lfm2VlForConditionalGeneration")
 class LFM2VLModel(MmprojModel):
     def __init__(self, *args, **kwargs):
@@ -8561,6 +9726,43 @@ class SmallThinkerModel(TextModel):
                 raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@ModelBase.register("ApertusForCausalLM")
+class ApertusModel(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.APERTUS
+    undo_permute = False
+
+    _alpha_n = {}
+    _alpha_p = {}
+    _beta = {}
+    _eps = {}
+
+    def modify_tensors(self, data_torch, name, bid):
+        # Handle xIELU activation parameters
+        n_layers = self.hparams["num_hidden_layers"]
+        if name.endswith(".act_fn.alpha_n"):
+            self._alpha_n[bid] = data_torch.to("cpu").float().item()
+            if (len(self._alpha_n) == n_layers):
+                self.gguf_writer.add_xielu_alpha_n([self._alpha_n[k] for k in sorted(self._alpha_n)])
+            return []
+        if name.endswith(".act_fn.alpha_p"):
+            self._alpha_p[bid] = data_torch.to("cpu").float().item()
+            if (len(self._alpha_p) == n_layers):
+                self.gguf_writer.add_xielu_alpha_p([self._alpha_p[k] for k in sorted(self._alpha_p)])
+            return []
+        if name.endswith(".act_fn.beta"):
+            self._beta[bid] = data_torch.to("cpu").float().item()
+            if (len(self._beta) == n_layers):
+                self.gguf_writer.add_xielu_beta([self._beta[k] for k in sorted(self._beta)])
+            return []
+        if name.endswith(".act_fn.eps"):
+            self._eps[bid] = data_torch.to("cpu").float().item()
+            if (len(self._eps) == n_layers):
+                self.gguf_writer.add_xielu_eps([self._eps[k] for k in sorted(self._eps)])
+            return []
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
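Note: Apertus accumulates four scalar xIELU parameters per layer and flushes each list to the writer only after all layers have been seen, sorting by layer id so arrival order does not matter. A reduced sketch of the accumulate-then-flush pattern (names hypothetical):

n_layers = 2
alpha_n: dict[int, float] = {}

def on_scalar(bid: int, value: float) -> list:
    alpha_n[bid] = value
    if len(alpha_n) == n_layers:
        ordered = [alpha_n[k] for k in sorted(alpha_n)]  # stands in for add_xielu_alpha_n
        print("flush:", ordered)
    return []  # the scalar is consumed, not emitted as a tensor

on_scalar(1, 0.5)
on_scalar(0, 0.8)  # prints flush: [0.8, 0.5]; arrival order is irrelevant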
 class MistralModel(LlamaModel):
     model_arch = gguf.MODEL_ARCH.LLAMA
     model_name = "Mistral"
@@ -8570,7 +9772,7 @@ class MistralModel(LlamaModel):
 
     @staticmethod
     def get_community_chat_template(vocab: MistralVocab, templates_dir: Path, is_mistral_format: bool):
-        assert TokenizerVersion is not None, "mistral_common is not installed"
+        assert TokenizerVersion is not None and Tekkenizer is not None and SentencePieceTokenizer is not None, _mistral_import_error_msg
         assert isinstance(vocab.tokenizer, (Tekkenizer, SentencePieceTokenizer)), (
             f"Expected Tekkenizer or SentencePieceTokenizer, got {type(vocab.tokenizer)}"
         )
@@ -8638,6 +9840,21 @@ class PixtralModel(LlavaVisionModel):
         return super().map_tensor_name(name, try_suffixes)
 
 
+ @ModelBase.register("LightOnOCRForConditionalGeneration")
9844
+ class LightOnOCRVisionModel(LlavaVisionModel):
9845
+ is_mistral_format = False
9846
+ use_break_tok = False
9847
+
9848
+ def set_gguf_parameters(self):
9849
+ super().set_gguf_parameters()
9850
+ self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LIGHTONOCR)
9851
+
9852
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
9853
+ name = name.replace("model.vision_encoder.", "vision_tower.")
9854
+ name = name.replace("model.vision_projection.", "multi_modal_projector.")
9855
+ return super().modify_tensors(data_torch, name, bid)
9856
+
9857
+
 @ModelBase.register("KimiVLForConditionalGeneration")
 class KimiVLModel(MmprojModel):
     def __init__(self, *args, **kwargs):
@@ -8674,6 +9891,144 @@ class KimiVLModel(MmprojModel):
 
         return [] # skip other tensors
 
+
+@ModelBase.register("CogVLMForCausalLM")
+class CogVLMVisionModel(MmprojModel):
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.COGVLM)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        if not name.startswith("model.vision."):
+            return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
+@ModelBase.register("CogVLMForCausalLM")
+class CogVLMModel(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.COGVLM
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        # block vision tensors
+        if name.startswith("model.vision."):
+            return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
+@ModelBase.register("JanusForConditionalGeneration")
+class JanusProModel(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.LLAMA  # reuse Llama arch
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # Skip vision, aligner, and generation tensors
+        skip_prefixes = (
+            'model.vision_model.',
+            'model.aligner.',
+            'model.vqmodel.',
+            'model.generation_embeddings.',
+            'model.generation_aligner.',
+            'model.generation_head.',
+        )
+        if name.startswith(skip_prefixes):
+            return []
+
+        if name.startswith('model.language_model.'):
+            name = name.replace('model.language_model.', 'model.')
+        elif name.startswith('language_model.'):
+            name = name.replace('language_model.', '')
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
+@ModelBase.register("JanusForConditionalGeneration")
+class JanusProVisionModel(MmprojModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        assert self.hparams_vision is not None
+        if "intermediate_size" not in self.hparams_vision:
+            mlp_ratio = self.hparams_vision.get("mlp_ratio")
+            hidden_size = self.hparams_vision.get("hidden_size")
+            if mlp_ratio is not None and hidden_size is not None:
+                self.hparams_vision["intermediate_size"] = int(round(hidden_size * mlp_ratio))
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        assert self.hparams_vision is not None
+
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.JANUS_PRO)
+
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-6))
+
+        hidden_act = str(self.hparams_vision.get("hidden_act", "")).lower()
+        if hidden_act == "gelu":
+            self.gguf_writer.add_vision_use_gelu(True)
+        elif hidden_act == "silu":
+            self.gguf_writer.add_vision_use_silu(True)
+
+    def _map_aligner_tensor(self, data_torch: Tensor, name: str) -> Iterable[tuple[str, Tensor]]:
+        """Map aligner tensors to projector format"""
+        suffix = ".bias" if name.endswith(".bias") else ".weight"
+
+        if name.startswith("model.aligner."):
+            local_name = name[len("model.aligner."):]
+        elif name.startswith("aligner."):
+            local_name = name[len("aligner."):]
+        else:
+            raise ValueError(f"Unsupported Janus aligner prefix: {name}")
+
+        if local_name.startswith("fc1."):
+            mm_index = 0
+        elif local_name.startswith("hidden_layers."):
+            parts = local_name.split(".", 2)
+            if len(parts) < 3:
+                raise ValueError(f"Unexpected Janus aligner tensor name: {name}")
+            mm_index = int(parts[1]) + 1
+        else:
+            raise ValueError(f"Unsupported Janus aligner tensor: {name}")
+
+        tensor_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, mm_index, suffix=suffix)
+        return [(tensor_name, data_torch)]
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        # Skip language model tensors as they will be handled by `JanusProModel`
+        if name.startswith(('model.language_model.', 'language_model.')):
+            return []
+
+        # Skip generation-related components
+        skip_generation_prefixes = (
+            'model.vqmodel.',
+            'vqmodel.',
+            'model.generation_embeddings.',
+            'generation_embeddings.',
+            'model.generation_aligner.',
+            'generation_aligner.',
+            'model.generation_head.',
+            'generation_head.',
+        )
+        if name.startswith(skip_generation_prefixes):
+            return []
+
+        # Handle aligner tensors
+        if name.startswith(('model.aligner.', 'aligner.')):
+            return list(self._map_aligner_tensor(data_torch, name))
+
+        # Handle vision tensors
+        if name.startswith(('model.vision_model.', 'vision_model.')):
+            return [(self.map_tensor_name(name), data_torch)]
+
+        return []
+
+
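Note: the Janus aligner mapping flattens fc1 plus hidden_layers.N into a single projector index: fc1 becomes index 0 and hidden_layers.N becomes N + 1. A tiny check of that arithmetic against the tensor names used in the diff:

def aligner_mm_index(local_name: str) -> int:
    if local_name.startswith("fc1."):
        return 0
    if local_name.startswith("hidden_layers."):
        return int(local_name.split(".", 2)[1]) + 1
    raise ValueError(local_name)

assert aligner_mm_index("fc1.weight") == 0
assert aligner_mm_index("hidden_layers.0.bias") == 1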
 ###### CONVERSION LOGIC ######
 
 
@@ -8728,7 +10083,17 @@ class LazyTorchTensor(gguf.LazyBase):
     def from_safetensors_slice(cls, st_slice: Any) -> Tensor:
         dtype = cls._dtype_str_map[st_slice.get_dtype()]
         shape: tuple[int, ...] = tuple(st_slice.get_shape())
-        lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[:])
+        lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[...] if len(s.get_shape()) == 0 else s[:])
+        return cast(torch.Tensor, lazy)
+
+    @classmethod
+    def from_local_tensor(cls, t: gguf.utility.LocalTensor) -> Tensor:
+        def load_tensor(tensor: gguf.utility.LocalTensor) -> Tensor:
+            dtype = cls._dtype_str_map[tensor.dtype]
+            return torch.from_numpy(tensor.mmap_bytes()).view(dtype).reshape(tensor.shape)
+        dtype = cls._dtype_str_map[t.dtype]
+        shape = t.shape
+        lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(t,), func=lambda r: load_tensor(r))
         return cast(torch.Tensor, lazy)
 
     @classmethod
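Note: the from_safetensors_slice change handles rank-0 tensors: a scalar slice cannot be indexed with [:], only with [...]. A minimal torch illustration of the distinction (plain tensors here rather than safetensors slices):

import torch

scalar = torch.tensor(3.14)   # rank-0
vector = torch.tensor([1.0])  # rank-1
assert scalar[...].shape == ()   # Ellipsis indexing works for any rank
assert vector[:].shape == (1,)   # slicing a rank-0 tensor would raise IndexError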
@@ -8836,6 +10201,13 @@ def parse_args() -> argparse.Namespace:
         )
     )
 
+    parser.add_argument(
+        "--sentence-transformers-dense-modules", action="store_true",
+        help=("Whether to include sentence-transformers dense modules. "
+              "It can be used for sentence-transformers models, like google/embeddinggemma-300m. "
+              "By default these modules are not included.")
+    )
+
     args = parser.parse_args()
     if not args.print_supported_models and args.model is None:
         parser.error("the following arguments are required: model")
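Note: a hypothetical invocation of the new flag together with --remote, which (per the hunk below) widens the snapshot download to include the dense-module safetensors; the output filename is illustrative:

python convert_hf_to_gguf.py google/embeddinggemma-300m --remote \
    --sentence-transformers-dense-modules --outfile embeddinggemma-300m.gguf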
@@ -8898,9 +10270,13 @@ def main() -> None:
     if args.remote:
         hf_repo_id = args.model
        from huggingface_hub import snapshot_download
+        allowed_patterns = ["LICENSE", "*.json", "*.md", "*.txt", "tokenizer.model"]
+        if args.sentence_transformers_dense_modules:
+            # include sentence-transformers dense modules safetensors files
+            allowed_patterns.append("*.safetensors")
         local_dir = snapshot_download(
             repo_id=hf_repo_id,
-            allow_patterns=["LICENSE", "*.json", "*.md", "*.txt", "tokenizer.model"])
+            allow_patterns=allowed_patterns)
         dir_model = Path(local_dir)
         logger.info(f"Downloaded config and tokenizer to {local_dir}")
     else:
@@ -8936,11 +10312,9 @@ def main() -> None:
 
     logger.info(f"Loading model: {dir_model.name}")
 
-    if args.mmproj:
-        if "mmproj" not in fname_out.name:
-            fname_out = ModelBase.add_prefix_to_filename(fname_out, "mmproj-")
-
     is_mistral_format = args.mistral_format
+    if is_mistral_format and not _mistral_common_installed:
+        raise ImportError(_mistral_import_error_msg)
     disable_mistral_community_chat_template = args.disable_mistral_community_chat_template
 
     with torch.inference_mode():
@@ -8968,7 +10342,8 @@ def main() -> None:
             split_max_tensors=args.split_max_tensors,
             split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
             small_first_shard=args.no_tensor_first_split,
-            remote_hf_model_id=hf_repo_id, disable_mistral_community_chat_template=disable_mistral_community_chat_template
+            remote_hf_model_id=hf_repo_id, disable_mistral_community_chat_template=disable_mistral_community_chat_template,
+            sentence_transformers_dense_modules=args.sentence_transformers_dense_modules
         )
 
     if args.vocab_only: