@novastera-oss/llamarn 0.3.1 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (347) hide show
  1. package/README.md +86 -3
  2. package/RNLlamaCpp.podspec +1 -1
  3. package/android/CMakeLists.txt +11 -3
  4. package/android/generated/jni/react/renderer/components/RNLlamaCppSpec/RNLlamaCppSpecJSI.h +49 -4
  5. package/android/src/main/cpp/include/llama.h +53 -114
  6. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  10. package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
  11. package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
  12. package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
  13. package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
  14. package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
  15. package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
  16. package/android/src/main/jniLibs/x86/libggml.so +0 -0
  17. package/android/src/main/jniLibs/x86/libllama.so +0 -0
  18. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  19. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  20. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  21. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  22. package/cpp/LlamaCppModel.cpp +2 -10
  23. package/cpp/PureCppImpl.cpp +71 -4
  24. package/cpp/SystemUtils.cpp +3 -7
  25. package/cpp/build-info.cpp +2 -2
  26. package/cpp/llama.cpp/CMakeLists.txt +2 -0
  27. package/cpp/llama.cpp/CODEOWNERS +1 -1
  28. package/cpp/llama.cpp/Makefile +6 -1605
  29. package/cpp/llama.cpp/README.md +5 -1
  30. package/cpp/llama.cpp/common/arg.cpp +230 -51
  31. package/cpp/llama.cpp/common/chat-parser.cpp +9 -1
  32. package/cpp/llama.cpp/common/chat.cpp +539 -8
  33. package/cpp/llama.cpp/common/chat.h +8 -1
  34. package/cpp/llama.cpp/common/common.cpp +60 -15
  35. package/cpp/llama.cpp/common/common.h +64 -15
  36. package/cpp/llama.cpp/common/speculative.cpp +135 -54
  37. package/cpp/llama.cpp/common/speculative.h +8 -1
  38. package/cpp/llama.cpp/convert_hf_to_gguf.py +1216 -109
  39. package/cpp/llama.cpp/convert_hf_to_gguf_update.py +19 -6
  40. package/cpp/llama.cpp/convert_lora_to_gguf.py +1 -1
  41. package/cpp/llama.cpp/flake.nix +0 -5
  42. package/cpp/llama.cpp/ggml/CMakeLists.txt +6 -3
  43. package/cpp/llama.cpp/ggml/cmake/ggml-config.cmake.in +71 -70
  44. package/cpp/llama.cpp/ggml/include/ggml-opt.h +25 -6
  45. package/cpp/llama.cpp/ggml/include/ggml-zdnn.h +16 -0
  46. package/cpp/llama.cpp/ggml/include/ggml.h +90 -3
  47. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +13 -1
  48. package/cpp/llama.cpp/ggml/src/ggml-alloc.c +1 -0
  49. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +10 -0
  50. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +113 -17
  51. package/cpp/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +4 -4
  52. package/cpp/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +14 -0
  53. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +701 -585
  54. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +13 -3
  55. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +52 -0
  56. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +274 -91
  57. package/cpp/llama.cpp/ggml/src/ggml-common.h +17 -0
  58. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +2 -2
  59. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +132 -596
  60. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +14 -286
  61. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +90 -569
  62. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +162 -589
  63. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +55 -341
  64. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +3 -58
  65. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +371 -298
  66. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +54 -314
  67. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +184 -675
  68. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +4679 -1657
  69. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +33 -2
  70. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +8 -0
  71. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +26 -1
  72. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +21 -24
  73. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +16 -7
  74. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +232 -123
  75. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +428 -23
  76. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +4 -8
  77. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +35 -0
  78. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.h +8 -0
  79. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +458 -46
  80. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +22 -0
  81. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +39 -14
  82. package/cpp/llama.cpp/ggml/src/ggml-cpu/traits.cpp +2 -2
  83. package/cpp/llama.cpp/ggml/src/ggml-cpu/traits.h +1 -1
  84. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +20 -1
  85. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +122 -5
  86. package/cpp/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +9 -11
  87. package/cpp/llama.cpp/ggml/src/ggml-cuda/add-id.cu +58 -0
  88. package/cpp/llama.cpp/ggml/src/ggml-cuda/add-id.cuh +3 -0
  89. package/cpp/llama.cpp/ggml/src/ggml-cuda/binbcast.cu +275 -170
  90. package/cpp/llama.cpp/ggml/src/ggml-cuda/binbcast.cuh +2 -0
  91. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +103 -65
  92. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv-transpose-1d.cu +1 -4
  93. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d.cu +171 -0
  94. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d.cuh +5 -0
  95. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu +33 -7
  96. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +13 -0
  97. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy-utils.cuh +2 -10
  98. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +3 -4
  99. package/cpp/llama.cpp/ggml/src/ggml-cuda/dequantize.cuh +14 -40
  100. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +83 -27
  101. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +116 -57
  102. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +45 -18
  103. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +56 -29
  104. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +61 -39
  105. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +70 -49
  106. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +70 -21
  107. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +162 -50
  108. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cuh +2 -0
  109. package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +5 -4
  110. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +208 -97
  111. package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cu +46 -35
  112. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +56 -2
  113. package/cpp/llama.cpp/ggml/src/ggml-cuda/mma.cuh +95 -51
  114. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cu +427 -0
  115. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cuh +5 -0
  116. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +204 -57
  117. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +252 -168
  118. package/cpp/llama.cpp/ggml/src/ggml-cuda/{mmv.cu → mmvf.cu} +53 -53
  119. package/cpp/llama.cpp/ggml/src/ggml-cuda/{mmv.cuh → mmvf.cuh} +3 -3
  120. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cu +10 -5
  121. package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cu +192 -19
  122. package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cuh +5 -0
  123. package/cpp/llama.cpp/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
  124. package/cpp/llama.cpp/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
  125. package/cpp/llama.cpp/ggml/src/ggml-cuda/pad_reflect_1d.cu +82 -0
  126. package/cpp/llama.cpp/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
  127. package/cpp/llama.cpp/ggml/src/ggml-cuda/reduce_rows.cuh +53 -0
  128. package/cpp/llama.cpp/ggml/src/ggml-cuda/roll.cu +67 -0
  129. package/cpp/llama.cpp/ggml/src/ggml-cuda/roll.cuh +5 -0
  130. package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cu +1 -8
  131. package/cpp/llama.cpp/ggml/src/ggml-cuda/softcap.cu +34 -0
  132. package/cpp/llama.cpp/ggml/src/ggml-cuda/softcap.cuh +5 -0
  133. package/cpp/llama.cpp/ggml/src/ggml-cuda/softmax.cu +16 -10
  134. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +153 -71
  135. package/cpp/llama.cpp/ggml/src/ggml-cuda/sum.cu +6 -10
  136. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +21 -4
  137. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
  138. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +75 -0
  139. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +2 -0
  140. package/cpp/llama.cpp/ggml/src/ggml-cuda/vecdotq.cuh +110 -22
  141. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
  142. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +14 -25
  143. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +2 -1
  144. package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +10 -2
  145. package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -0
  146. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +31 -20
  147. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +342 -131
  148. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +464 -134
  149. package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +0 -4
  150. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +8 -0
  151. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1108 -176
  152. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/add.cl +107 -0
  153. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  154. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/div.cl +66 -0
  155. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +343 -0
  156. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +343 -0
  157. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +346 -0
  158. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/glu.cl +41 -0
  159. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +49 -0
  160. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul.cl +73 -0
  161. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +132 -0
  162. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +133 -0
  163. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  164. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  165. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/norm.cl +80 -0
  166. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +10 -2
  167. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +10 -2
  168. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +10 -2
  169. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +10 -2
  170. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sub.cl +66 -0
  171. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/transpose.cl +20 -0
  172. package/cpp/llama.cpp/ggml/src/ggml-opt.cpp +97 -41
  173. package/cpp/llama.cpp/ggml/src/ggml-quants.c +110 -16
  174. package/cpp/llama.cpp/ggml/src/ggml-quants.h +6 -0
  175. package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +22 -9
  176. package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  177. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +0 -212
  178. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +213 -1
  179. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +117 -238
  180. package/cpp/llama.cpp/ggml/src/ggml-sycl/quantize.hpp +133 -0
  181. package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +94 -0
  182. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1666 -633
  183. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +41 -1
  184. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
  185. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +13 -4
  186. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +39 -29
  187. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +107 -43
  188. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +2 -2
  189. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +18 -0
  190. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +21 -0
  191. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
  192. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +20 -0
  193. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +21 -0
  194. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +16 -1
  195. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +44 -8
  196. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +44 -16
  197. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +26 -1
  198. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +2 -17
  199. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +2 -0
  200. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +37 -1
  201. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +11 -7
  202. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +109 -55
  203. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +71 -41
  204. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +6 -0
  205. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +111 -0
  206. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
  207. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +49 -11
  208. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
  209. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +9 -3
  210. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
  211. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +38 -5
  212. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
  213. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +55 -0
  214. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/utils.comp +25 -0
  215. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +75 -20
  216. package/cpp/llama.cpp/ggml/src/ggml-webgpu/CMakeLists.txt +2 -2
  217. package/cpp/llama.cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +807 -412
  218. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +72 -22
  219. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +8 -8
  220. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +1794 -0
  221. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +82 -0
  222. package/cpp/llama.cpp/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  223. package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn-impl.h +97 -0
  224. package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn.cpp +846 -0
  225. package/cpp/llama.cpp/ggml/src/ggml.c +204 -50
  226. package/cpp/llama.cpp/gguf-py/gguf/constants.py +187 -2
  227. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +11 -2
  228. package/cpp/llama.cpp/gguf-py/gguf/quants.py +53 -4
  229. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_convert_endian.py +67 -63
  230. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_new_metadata.py +7 -1
  231. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +120 -16
  232. package/cpp/llama.cpp/gguf-py/gguf/utility.py +5 -1
  233. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +284 -1
  234. package/cpp/llama.cpp/gguf-py/tests/test_quants.py +14 -5
  235. package/cpp/llama.cpp/include/llama.h +53 -114
  236. package/cpp/llama.cpp/models/templates/ByteDance-Seed-OSS.jinja +171 -0
  237. package/cpp/llama.cpp/models/templates/README.md +2 -1
  238. package/cpp/llama.cpp/models/templates/ibm-granite-granite-3.3-2B-Instruct.jinja +59 -0
  239. package/cpp/llama.cpp/models/templates/openai-gpt-oss-120b.jinja +331 -0
  240. package/cpp/llama.cpp/models/templates/unsloth-mistral-Devstral-Small-2507.jinja +105 -0
  241. package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -1
  242. package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +0 -6
  243. package/cpp/llama.cpp/requirements/requirements-pydantic.txt +1 -1
  244. package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
  245. package/cpp/llama.cpp/src/llama-adapter.cpp +68 -4
  246. package/cpp/llama.cpp/src/llama-adapter.h +3 -0
  247. package/cpp/llama.cpp/src/llama-arch.cpp +192 -2
  248. package/cpp/llama.cpp/src/llama-arch.h +18 -0
  249. package/cpp/llama.cpp/src/llama-batch.cpp +2 -2
  250. package/cpp/llama.cpp/src/llama-chat.cpp +47 -6
  251. package/cpp/llama.cpp/src/llama-chat.h +3 -0
  252. package/cpp/llama.cpp/src/llama-context.cpp +61 -252
  253. package/cpp/llama.cpp/src/llama-context.h +10 -15
  254. package/cpp/llama.cpp/src/llama-cparams.h +0 -1
  255. package/cpp/llama.cpp/src/llama-graph.cpp +180 -85
  256. package/cpp/llama.cpp/src/llama-graph.h +90 -51
  257. package/cpp/llama.cpp/src/llama-hparams.cpp +34 -3
  258. package/cpp/llama.cpp/src/llama-hparams.h +21 -6
  259. package/cpp/llama.cpp/src/{llama-kv-cache-unified-iswa.cpp → llama-kv-cache-iswa.cpp} +79 -56
  260. package/cpp/llama.cpp/src/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +30 -28
  261. package/cpp/llama.cpp/src/{llama-kv-cache-unified.cpp → llama-kv-cache.cpp} +240 -632
  262. package/cpp/llama.cpp/src/{llama-kv-cache-unified.h → llama-kv-cache.h} +39 -74
  263. package/cpp/llama.cpp/src/llama-kv-cells.h +21 -21
  264. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +41 -35
  265. package/cpp/llama.cpp/src/llama-memory-hybrid.h +26 -29
  266. package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +13 -9
  267. package/cpp/llama.cpp/src/llama-memory-recurrent.h +10 -14
  268. package/cpp/llama.cpp/src/llama-memory.h +13 -10
  269. package/cpp/llama.cpp/src/llama-model-loader.cpp +2 -0
  270. package/cpp/llama.cpp/src/llama-model-loader.h +3 -2
  271. package/cpp/llama.cpp/src/llama-model.cpp +1959 -419
  272. package/cpp/llama.cpp/src/llama-model.h +28 -4
  273. package/cpp/llama.cpp/src/llama-quant.cpp +40 -4
  274. package/cpp/llama.cpp/src/llama-vocab.cpp +51 -2
  275. package/cpp/llama.cpp/src/llama-vocab.h +1 -0
  276. package/cpp/llama.cpp/vendor/minja/chat-template.hpp +16 -7
  277. package/cpp/llama.cpp/vendor/minja/minja.hpp +47 -12
  278. package/cpp/rn-completion.cpp +3 -27
  279. package/ios/generated/RNLlamaCppSpec/RNLlamaCppSpec.h +30 -0
  280. package/ios/generated/RNLlamaCppSpecJSI.h +49 -4
  281. package/ios/include/chat.h +8 -1
  282. package/ios/include/common/minja/chat-template.hpp +16 -7
  283. package/ios/include/common/minja/minja.hpp +47 -12
  284. package/ios/include/common.h +64 -15
  285. package/ios/include/llama.h +53 -114
  286. package/ios/include/speculative.h +8 -1
  287. package/ios/libs/llama.xcframework/Info.plist +18 -18
  288. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  289. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5557 -5267
  290. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-opt.h +25 -6
  291. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +90 -3
  292. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +53 -114
  293. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  294. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  295. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5520 -5238
  296. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4241 -4014
  297. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +25 -6
  298. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +90 -3
  299. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +53 -114
  300. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  301. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  302. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5519 -5238
  303. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4242 -4016
  304. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +25 -6
  305. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +90 -3
  306. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +53 -114
  307. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +25 -6
  308. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +90 -3
  309. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +53 -114
  310. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  311. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +25 -6
  312. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +90 -3
  313. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +53 -114
  314. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  315. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  316. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  317. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5556 -5267
  318. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +25 -6
  319. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +90 -3
  320. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +53 -114
  321. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  322. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  323. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5519 -5238
  324. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4241 -4014
  325. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +25 -6
  326. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +90 -3
  327. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +53 -114
  328. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  329. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  330. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5553 -5303
  331. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +25 -6
  332. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +90 -3
  333. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +53 -114
  334. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  335. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  336. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5515 -5274
  337. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4238 -4044
  338. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +25 -6
  339. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +90 -3
  340. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +53 -114
  341. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  342. package/lib/module/NativeRNLlamaCpp.js.map +1 -1
  343. package/lib/typescript/src/NativeRNLlamaCpp.d.ts +5 -0
  344. package/lib/typescript/src/NativeRNLlamaCpp.d.ts.map +1 -1
  345. package/package.json +1 -2
  346. package/src/NativeRNLlamaCpp.ts +7 -0
  347. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +0 -56
@@ -105,6 +105,7 @@ class Keys:
105
105
  EXPERT_WEIGHTS_NORM = "{arch}.expert_weights_norm"
106
106
  EXPERT_GATING_FUNC = "{arch}.expert_gating_func"
107
107
  MOE_EVERY_N_LAYERS = "{arch}.moe_every_n_layers"
108
+ NEXTN_PREDICT_LAYERS = "{arch}.nextn_predict_layers"
108
109
  POOLING_TYPE = "{arch}.pooling_type"
109
110
  LOGIT_SCALE = "{arch}.logit_scale"
110
111
  DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id"
@@ -230,8 +231,10 @@ class Keys:
230
231
  MIDDLE_ID = "tokenizer.ggml.middle_token_id"
231
232
 
232
233
  class Adapter:
233
- TYPE = "adapter.type"
234
- LORA_ALPHA = "adapter.lora.alpha"
234
+ TYPE = "adapter.type"
235
+ LORA_ALPHA = "adapter.lora.alpha"
236
+ LORA_TASK_NAME = "adapter.lora.task_name"
237
+ LORA_PROMPT_PREFIX = "adapter.lora.prompt_prefix"
235
238
 
236
239
  class IMatrix:
237
240
  CHUNK_COUNT = "imatrix.chunk_count"
@@ -279,6 +282,9 @@ class Keys:
279
282
  class Projector:
280
283
  STACK_FACTOR = "clip.audio.projector.stack_factor"
281
284
 
285
+ class Diffusion:
286
+ SHIFT_LOGITS = "diffusion.shift_logits"
287
+
282
288
  #
283
289
  # recommended mapping of model tensor names for storage in gguf
284
290
  #
@@ -311,6 +317,7 @@ class MODEL_ARCH(IntEnum):
311
317
  NOMIC_BERT_MOE = auto()
312
318
  NEO_BERT = auto()
313
319
  JINA_BERT_V2 = auto()
320
+ JINA_BERT_V3 = auto()
314
321
  BLOOM = auto()
315
322
  STABLELM = auto()
316
323
  QWEN = auto()
@@ -354,11 +361,13 @@ class MODEL_ARCH(IntEnum):
354
361
  DEEPSEEK2 = auto()
355
362
  CHATGLM = auto()
356
363
  GLM4 = auto()
364
+ GLM4_MOE = auto()
357
365
  BITNET = auto()
358
366
  T5 = auto()
359
367
  T5ENCODER = auto()
360
368
  JAIS = auto()
361
369
  NEMOTRON = auto()
370
+ NEMOTRON_H = auto()
362
371
  EXAONE = auto()
363
372
  EXAONE4 = auto()
364
373
  GRANITE = auto()
@@ -373,9 +382,14 @@ class MODEL_ARCH(IntEnum):
373
382
  ERNIE4_5 = auto()
374
383
  ERNIE4_5_MOE = auto()
375
384
  HUNYUAN_MOE = auto()
385
+ HUNYUAN_DENSE = auto()
376
386
  SMOLLM3 = auto()
387
+ GPT_OSS = auto()
377
388
  LFM2 = auto()
378
389
  DREAM = auto()
390
+ SMALLTHINKER = auto()
391
+ LLADA = auto()
392
+ SEED_OSS = auto()
379
393
 
380
394
 
381
395
  class VISION_PROJECTOR_TYPE(IntEnum):
@@ -408,6 +422,7 @@ class MODEL_TENSOR(IntEnum):
408
422
  ATTN_OUT_NORM = auto()
409
423
  ATTN_POST_NORM = auto()
410
424
  ATTN_ROT_EMBD = auto()
425
+ ATTN_SINKS = auto()
411
426
  FFN_GATE_INP = auto()
412
427
  FFN_GATE_INP_SHEXP = auto()
413
428
  FFN_NORM = auto()
@@ -608,6 +623,13 @@ class MODEL_TENSOR(IntEnum):
608
623
  A_MMPROJ_FC = auto()
609
624
  A_MM_NORM_PRE = auto()
610
625
  A_MM_NORM_MID = auto()
626
+ # nextn/mtp
627
+ NEXTN_EH_PROJ = auto()
628
+ NEXTN_EMBED_TOKENS = auto()
629
+ NEXTN_ENORM = auto()
630
+ NEXTN_HNORM = auto()
631
+ NEXTN_SHARED_HEAD_HEAD = auto()
632
+ NEXTN_SHARED_HEAD_NORM = auto()
611
633
 
612
634
 
613
635
  MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@@ -629,6 +651,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
629
651
  MODEL_ARCH.NOMIC_BERT_MOE: "nomic-bert-moe",
630
652
  MODEL_ARCH.NEO_BERT: "neo-bert",
631
653
  MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2",
654
+ MODEL_ARCH.JINA_BERT_V3: "jina-bert-v3",
632
655
  MODEL_ARCH.BLOOM: "bloom",
633
656
  MODEL_ARCH.STABLELM: "stablelm",
634
657
  MODEL_ARCH.QWEN: "qwen",
@@ -672,11 +695,13 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
672
695
  MODEL_ARCH.DEEPSEEK2: "deepseek2",
673
696
  MODEL_ARCH.CHATGLM: "chatglm",
674
697
  MODEL_ARCH.GLM4: "glm4",
698
+ MODEL_ARCH.GLM4_MOE: "glm4moe",
675
699
  MODEL_ARCH.BITNET: "bitnet",
676
700
  MODEL_ARCH.T5: "t5",
677
701
  MODEL_ARCH.T5ENCODER: "t5encoder",
678
702
  MODEL_ARCH.JAIS: "jais",
679
703
  MODEL_ARCH.NEMOTRON: "nemotron",
704
+ MODEL_ARCH.NEMOTRON_H: "nemotron_h",
680
705
  MODEL_ARCH.EXAONE: "exaone",
681
706
  MODEL_ARCH.EXAONE4: "exaone4",
682
707
  MODEL_ARCH.GRANITE: "granite",
@@ -692,9 +717,14 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
692
717
  MODEL_ARCH.ERNIE4_5_MOE: "ernie4_5-moe",
693
718
  MODEL_ARCH.FALCON_H1: "falcon-h1",
694
719
  MODEL_ARCH.HUNYUAN_MOE: "hunyuan-moe",
720
+ MODEL_ARCH.HUNYUAN_DENSE: "hunyuan-dense",
695
721
  MODEL_ARCH.SMOLLM3: "smollm3",
722
+ MODEL_ARCH.GPT_OSS: "gpt-oss",
696
723
  MODEL_ARCH.LFM2: "lfm2",
697
724
  MODEL_ARCH.DREAM: "dream",
725
+ MODEL_ARCH.SMALLTHINKER: "smallthinker",
726
+ MODEL_ARCH.LLADA: "llada",
727
+ MODEL_ARCH.SEED_OSS: "seed_oss",
698
728
  }
699
729
 
700
730
  VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
@@ -725,6 +755,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
725
755
  MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
726
756
  MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
727
757
  MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
758
+ MODEL_TENSOR.ATTN_SINKS: "blk.{bid}.attn_sinks",
728
759
  MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
729
760
  MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
730
761
  MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm",
@@ -927,6 +958,13 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
927
958
  MODEL_TENSOR.A_MMPROJ_FC: "mm.a.fc",
928
959
  MODEL_TENSOR.A_MM_NORM_PRE: "mm.a.norm_pre",
929
960
  MODEL_TENSOR.A_MM_NORM_MID: "mm.a.norm_mid",
961
+ # NextN/MTP
962
+ MODEL_TENSOR.NEXTN_EH_PROJ: "blk.{bid}.nextn.eh_proj",
963
+ MODEL_TENSOR.NEXTN_EMBED_TOKENS: "blk.{bid}.nextn.embed_tokens",
964
+ MODEL_TENSOR.NEXTN_ENORM: "blk.{bid}.nextn.enorm",
965
+ MODEL_TENSOR.NEXTN_HNORM: "blk.{bid}.nextn.hnorm",
966
+ MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD: "blk.{bid}.nextn.shared_head_head",
967
+ MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM: "blk.{bid}.nextn.shared_head_norm",
930
968
  }
931
969
 
932
970
  MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@@ -1202,6 +1240,18 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
1202
1240
  MODEL_TENSOR.LAYER_OUT_NORM,
1203
1241
  MODEL_TENSOR.CLS,
1204
1242
  ],
1243
+ MODEL_ARCH.JINA_BERT_V3: [
1244
+ MODEL_TENSOR.TOKEN_EMBD,
1245
+ MODEL_TENSOR.TOKEN_EMBD_NORM,
1246
+ MODEL_TENSOR.TOKEN_TYPES,
1247
+ MODEL_TENSOR.OUTPUT_NORM,
1248
+ MODEL_TENSOR.ATTN_OUT_NORM,
1249
+ MODEL_TENSOR.ATTN_QKV,
1250
+ MODEL_TENSOR.ATTN_OUT,
1251
+ MODEL_TENSOR.FFN_DOWN,
1252
+ MODEL_TENSOR.FFN_UP,
1253
+ MODEL_TENSOR.LAYER_OUT_NORM,
1254
+ ],
1205
1255
  MODEL_ARCH.MPT: [
1206
1256
  MODEL_TENSOR.TOKEN_EMBD,
1207
1257
  MODEL_TENSOR.OUTPUT_NORM,
@@ -1316,6 +1366,21 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
1316
1366
  MODEL_TENSOR.FFN_DOWN,
1317
1367
  MODEL_TENSOR.FFN_UP,
1318
1368
  ],
1369
+ MODEL_ARCH.LLADA: [
1370
+ MODEL_TENSOR.TOKEN_EMBD,
1371
+ MODEL_TENSOR.OUTPUT_NORM,
1372
+ MODEL_TENSOR.OUTPUT,
1373
+ MODEL_TENSOR.ROPE_FREQS,
1374
+ MODEL_TENSOR.ATTN_NORM,
1375
+ MODEL_TENSOR.ATTN_Q,
1376
+ MODEL_TENSOR.ATTN_K,
1377
+ MODEL_TENSOR.ATTN_V,
1378
+ MODEL_TENSOR.ATTN_OUT,
1379
+ MODEL_TENSOR.FFN_NORM,
1380
+ MODEL_TENSOR.FFN_GATE,
1381
+ MODEL_TENSOR.FFN_DOWN,
1382
+ MODEL_TENSOR.FFN_UP,
1383
+ ],
1319
1384
  MODEL_ARCH.QWEN2VL: [
1320
1385
  MODEL_TENSOR.TOKEN_EMBD,
1321
1386
  MODEL_TENSOR.OUTPUT_NORM,
@@ -1928,6 +1993,20 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
1928
1993
  MODEL_TENSOR.FFN_DOWN,
1929
1994
  MODEL_TENSOR.FFN_UP,
1930
1995
  ],
1996
+ MODEL_ARCH.SEED_OSS: [
1997
+ MODEL_TENSOR.TOKEN_EMBD,
1998
+ MODEL_TENSOR.ATTN_NORM,
1999
+ MODEL_TENSOR.ATTN_Q,
2000
+ MODEL_TENSOR.ATTN_K,
2001
+ MODEL_TENSOR.ATTN_V,
2002
+ MODEL_TENSOR.ATTN_OUT,
2003
+ MODEL_TENSOR.ATTN_POST_NORM,
2004
+ MODEL_TENSOR.FFN_GATE,
2005
+ MODEL_TENSOR.FFN_DOWN,
2006
+ MODEL_TENSOR.FFN_UP,
2007
+ MODEL_TENSOR.OUTPUT_NORM,
2008
+ MODEL_TENSOR.OUTPUT,
2009
+ ],
1931
2010
  MODEL_ARCH.OLMOE: [
1932
2011
  MODEL_TENSOR.TOKEN_EMBD,
1933
2012
  MODEL_TENSOR.OUTPUT_NORM,
@@ -2100,6 +2179,37 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
2100
2179
  MODEL_TENSOR.ATTN_POST_NORM,
2101
2180
  MODEL_TENSOR.FFN_POST_NORM,
2102
2181
  ],
2182
+ MODEL_ARCH.GLM4_MOE: [
2183
+ MODEL_TENSOR.TOKEN_EMBD,
2184
+ MODEL_TENSOR.OUTPUT_NORM,
2185
+ MODEL_TENSOR.OUTPUT,
2186
+ MODEL_TENSOR.ATTN_NORM,
2187
+ MODEL_TENSOR.ATTN_POST_NORM,
2188
+ MODEL_TENSOR.ATTN_Q,
2189
+ MODEL_TENSOR.ATTN_K,
2190
+ MODEL_TENSOR.ATTN_V,
2191
+ MODEL_TENSOR.ATTN_OUT,
2192
+ MODEL_TENSOR.ATTN_Q_NORM,
2193
+ MODEL_TENSOR.ATTN_K_NORM,
2194
+ MODEL_TENSOR.FFN_GATE,
2195
+ MODEL_TENSOR.FFN_DOWN,
2196
+ MODEL_TENSOR.FFN_UP,
2197
+ MODEL_TENSOR.FFN_GATE_INP,
2198
+ MODEL_TENSOR.FFN_GATE_EXP,
2199
+ MODEL_TENSOR.FFN_DOWN_EXP,
2200
+ MODEL_TENSOR.FFN_UP_EXP,
2201
+ MODEL_TENSOR.FFN_GATE_SHEXP,
2202
+ MODEL_TENSOR.FFN_DOWN_SHEXP,
2203
+ MODEL_TENSOR.FFN_UP_SHEXP,
2204
+ MODEL_TENSOR.FFN_EXP_PROBS_B,
2205
+ # NextN/MTP tensors - preserved but unused
2206
+ MODEL_TENSOR.NEXTN_EH_PROJ,
2207
+ MODEL_TENSOR.NEXTN_EMBED_TOKENS,
2208
+ MODEL_TENSOR.NEXTN_ENORM,
2209
+ MODEL_TENSOR.NEXTN_HNORM,
2210
+ MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD,
2211
+ MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM,
2212
+ ],
2103
2213
  MODEL_ARCH.BITNET: [
2104
2214
  MODEL_TENSOR.ATTN_Q,
2105
2215
  MODEL_TENSOR.ATTN_K,
@@ -2189,6 +2299,25 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
2189
2299
  MODEL_TENSOR.FFN_DOWN,
2190
2300
  MODEL_TENSOR.FFN_UP,
2191
2301
  ],
2302
+ MODEL_ARCH.NEMOTRON_H: [
2303
+ MODEL_TENSOR.TOKEN_EMBD,
2304
+ MODEL_TENSOR.OUTPUT_NORM,
2305
+ MODEL_TENSOR.OUTPUT,
2306
+ MODEL_TENSOR.ATTN_NORM,
2307
+ MODEL_TENSOR.SSM_IN,
2308
+ MODEL_TENSOR.SSM_CONV1D,
2309
+ MODEL_TENSOR.SSM_DT,
2310
+ MODEL_TENSOR.SSM_A,
2311
+ MODEL_TENSOR.SSM_D,
2312
+ MODEL_TENSOR.SSM_NORM,
2313
+ MODEL_TENSOR.SSM_OUT,
2314
+ MODEL_TENSOR.ATTN_Q,
2315
+ MODEL_TENSOR.ATTN_K,
2316
+ MODEL_TENSOR.ATTN_V,
2317
+ MODEL_TENSOR.ATTN_OUT,
2318
+ MODEL_TENSOR.FFN_DOWN,
2319
+ MODEL_TENSOR.FFN_UP,
2320
+ ],
2192
2321
  MODEL_ARCH.EXAONE: [
2193
2322
  MODEL_TENSOR.TOKEN_EMBD,
2194
2323
  MODEL_TENSOR.OUTPUT_NORM,
@@ -2449,6 +2578,22 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
2449
2578
  MODEL_TENSOR.FFN_DOWN_SHEXP,
2450
2579
  MODEL_TENSOR.FFN_UP_SHEXP,
2451
2580
  ],
2581
+ MODEL_ARCH.HUNYUAN_DENSE: [
2582
+ MODEL_TENSOR.TOKEN_EMBD,
2583
+ MODEL_TENSOR.OUTPUT_NORM,
2584
+ MODEL_TENSOR.OUTPUT,
2585
+ MODEL_TENSOR.ATTN_NORM,
2586
+ MODEL_TENSOR.ATTN_Q,
2587
+ MODEL_TENSOR.ATTN_Q_NORM,
2588
+ MODEL_TENSOR.ATTN_K,
2589
+ MODEL_TENSOR.ATTN_K_NORM,
2590
+ MODEL_TENSOR.ATTN_V,
2591
+ MODEL_TENSOR.ATTN_OUT,
2592
+ MODEL_TENSOR.FFN_NORM,
2593
+ MODEL_TENSOR.FFN_GATE,
2594
+ MODEL_TENSOR.FFN_DOWN,
2595
+ MODEL_TENSOR.FFN_UP,
2596
+ ],
2452
2597
  MODEL_ARCH.SMOLLM3: [
2453
2598
  MODEL_TENSOR.TOKEN_EMBD,
2454
2599
  MODEL_TENSOR.OUTPUT_NORM,
@@ -2465,6 +2610,22 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
2465
2610
  MODEL_TENSOR.FFN_DOWN,
2466
2611
  MODEL_TENSOR.FFN_UP,
2467
2612
  ],
2613
+ MODEL_ARCH.GPT_OSS: [
2614
+ MODEL_TENSOR.TOKEN_EMBD,
2615
+ MODEL_TENSOR.OUTPUT_NORM,
2616
+ MODEL_TENSOR.OUTPUT,
2617
+ MODEL_TENSOR.ATTN_NORM,
2618
+ MODEL_TENSOR.ATTN_POST_NORM,
2619
+ MODEL_TENSOR.ATTN_Q,
2620
+ MODEL_TENSOR.ATTN_K,
2621
+ MODEL_TENSOR.ATTN_V,
2622
+ MODEL_TENSOR.ATTN_OUT,
2623
+ MODEL_TENSOR.ATTN_SINKS,
2624
+ MODEL_TENSOR.FFN_GATE_INP,
2625
+ MODEL_TENSOR.FFN_GATE_EXP,
2626
+ MODEL_TENSOR.FFN_DOWN_EXP,
2627
+ MODEL_TENSOR.FFN_UP_EXP,
2628
+ ],
2468
2629
  MODEL_ARCH.LFM2: [
2469
2630
  MODEL_TENSOR.TOKEN_EMBD,
2470
2631
  MODEL_TENSOR.TOKEN_EMBD_NORM,
@@ -2482,6 +2643,25 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
2482
2643
  MODEL_TENSOR.ATTN_K,
2483
2644
  MODEL_TENSOR.ATTN_V,
2484
2645
  MODEL_TENSOR.ATTN_OUT,
2646
+ MODEL_TENSOR.OUTPUT,
2647
+ ],
2648
+ MODEL_ARCH.SMALLTHINKER: [
2649
+ MODEL_TENSOR.TOKEN_EMBD,
2650
+ MODEL_TENSOR.OUTPUT_NORM,
2651
+ MODEL_TENSOR.OUTPUT,
2652
+ MODEL_TENSOR.ATTN_NORM,
2653
+ MODEL_TENSOR.ATTN_Q,
2654
+ MODEL_TENSOR.ATTN_K,
2655
+ MODEL_TENSOR.ATTN_V,
2656
+ MODEL_TENSOR.ATTN_OUT,
2657
+ MODEL_TENSOR.FFN_NORM,
2658
+ MODEL_TENSOR.FFN_GATE,
2659
+ MODEL_TENSOR.FFN_DOWN,
2660
+ MODEL_TENSOR.FFN_UP,
2661
+ MODEL_TENSOR.FFN_GATE_INP,
2662
+ MODEL_TENSOR.FFN_GATE_EXP,
2663
+ MODEL_TENSOR.FFN_DOWN_EXP,
2664
+ MODEL_TENSOR.FFN_UP_EXP,
2485
2665
  ],
2486
2666
  # TODO
2487
2667
  }
@@ -2601,6 +2781,7 @@ class GGMLQuantizationType(IntEnum):
2601
2781
  BF16 = 30
2602
2782
  TQ1_0 = 34
2603
2783
  TQ2_0 = 35
2784
+ MXFP4 = 39
2604
2785
 
2605
2786
 
2606
2787
  class ExpertGatingFuncType(IntEnum):
@@ -2704,6 +2885,9 @@ class VisionProjectorType:
2704
2885
  INTERNVL = "internvl"
2705
2886
  QWEN2A = "qwen2a" # audio
2706
2887
  QWEN25O = "qwen2.5o" # omni
2888
+ VOXTRAL = "voxtral"
2889
+ LFM2 = "lfm2"
2890
+ KIMIVL = "kimivl"
2707
2891
 
2708
2892
 
2709
2893
  # Items here are (block size, type size)
@@ -2740,6 +2924,7 @@ GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
2740
2924
  GGMLQuantizationType.BF16: (1, 2),
2741
2925
  GGMLQuantizationType.TQ1_0: (256, 2 + 4 * 13),
2742
2926
  GGMLQuantizationType.TQ2_0: (256, 2 + 64),
2927
+ GGMLQuantizationType.MXFP4: (32, 1 + 16),
2743
2928
  }
2744
2929
 
2745
2930
 
@@ -138,8 +138,9 @@ class GGUFWriter:
138
138
  size = prod(shape)
139
139
 
140
140
  if "_exps." in name:
141
- expert_params += (size // shape[-3])
142
- expert_sum += shape[-3]
141
+ expert_count = shape[-2 if ".bias" in name else -3]
142
+ expert_params += (size // expert_count)
143
+ expert_sum += expert_count
143
144
  n_expert_tensors += 1
144
145
  else:
145
146
  shared_params += size
@@ -753,6 +754,9 @@ class GGUFWriter:
753
754
  def add_moe_every_n_layers(self, value: int) -> None:
754
755
  self.add_uint32(Keys.LLM.MOE_EVERY_N_LAYERS.format(arch=self.arch), value)
755
756
 
757
+ def add_nextn_predict_layers(self, count: int) -> None:
758
+ self.add_uint32(Keys.LLM.NEXTN_PREDICT_LAYERS.format(arch=self.arch), count)
759
+
756
760
  def add_swin_norm(self, value: bool) -> None:
757
761
  self.add_bool(Keys.LLM.SWIN_NORM.format(arch=self.arch), value)
758
762
 
@@ -1047,6 +1051,11 @@ class GGUFWriter:
1047
1051
  def add_audio_stack_factor(self, value: int) -> None:
1048
1052
  self.add_uint32(Keys.ClipAudio.Projector.STACK_FACTOR, value)
1049
1053
 
1054
+ # diffusion models
1055
+
1056
+ def add_diffusion_shift_logits(self, value: bool) -> None:
1057
+ self.add_bool(Keys.Diffusion.SHIFT_LOGITS, value)
1058
+
1050
1059
  def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
1051
1060
  pack_prefix = ''
1052
1061
  if not skip_pack_prefix:
@@ -228,8 +228,7 @@ class Q4_0(__Quant, qtype=GGMLQuantizationType.Q4_0):
228
228
  d = max / -8
229
229
  with np.errstate(divide="ignore"):
230
230
  id = np.where(d == 0, 0, 1 / d)
231
- # FIXME: Q4_0's reference rounding is cursed and depends on FMA
232
- qs = np.trunc((np.float64(blocks) * np.float64(id)) + np.float64(8.5), dtype=np.float32).astype(np.uint8).clip(0, 15)
231
+ qs = np.trunc((blocks * id) + np.float32(8.5), dtype=np.float32).astype(np.uint8).clip(0, 15)
233
232
 
234
233
  qs = qs.reshape((n_blocks, 2, cls.block_size // 2))
235
234
  qs = qs[..., 0, :] | (qs[..., 1, :] << np.uint8(4))
@@ -300,8 +299,7 @@ class Q5_0(__Quant, qtype=GGMLQuantizationType.Q5_0):
300
299
  d = max / -16
301
300
  with np.errstate(divide="ignore"):
302
301
  id = np.where(d == 0, 0, 1 / d)
303
- # FIXME: Q5_0's reference rounding is cursed and depends on FMA
304
- q = np.trunc((np.float64(blocks) * np.float64(id)) + np.float64(16.5), dtype=np.float32).astype(np.uint8).clip(0, 31)
302
+ q = np.trunc((blocks * id) + np.float32(16.5), dtype=np.float32).astype(np.uint8).clip(0, 31)
305
303
 
306
304
  qs = q.reshape((n_blocks, 2, cls.block_size // 2))
307
305
  qs = (qs[..., 0, :] & np.uint8(0x0F)) | (qs[..., 1, :] << np.uint8(4))
@@ -655,6 +653,57 @@ class TQ2_0(__Quant, qtype=GGMLQuantizationType.TQ2_0):
655
653
  return (d * qs.astype(np.float32))
656
654
 
657
655
 
656
+ class MXFP4(__Quant, qtype=GGMLQuantizationType.MXFP4):
657
+ # e2m1 values (doubled)
658
+ # ref: https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
659
+ kvalues = (0, 1, 2, 3, 4, 6, 8, 12, 0, -1, -2, -3, -4, -6, -8, -12)
660
+
661
+ @staticmethod
662
+ # see ggml_e8m0_to_fp32_half in ggml-impl.h
663
+ def e8m0_to_fp32_half(x: np.ndarray) -> np.ndarray:
664
+ bits = np.where(x < 2, np.uint32(0x00200000) << np.uint32(x), np.uint32(x - 1) << np.uint32(23))
665
+ return bits.view(np.float32)
666
+
667
+ @classmethod
668
+ def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
669
+ n_blocks = blocks.shape[0]
670
+
671
+ d = abs(blocks).max(axis=-1, keepdims=True)
672
+
673
+ with np.errstate(divide="ignore"):
674
+ e = np.where(d > 0, np.floor(np.log2(d)) - 2 + 127, 0).astype(np.uint8)
675
+
676
+ d = cls.e8m0_to_fp32_half(e)
677
+
678
+ kvalues = np.array(cls.kvalues, dtype=np.int8).reshape((1, 1, 16))
679
+
680
+ errs = np.abs(d.reshape((n_blocks, 1, 1)) * kvalues.astype(np.float32) - blocks.reshape((n_blocks, cls.block_size, 1)))
681
+ best = np.argmin(errs, axis=-1, keepdims=True)
682
+
683
+ qs = best.reshape(n_blocks, 2, cls.block_size // 2).astype(np.uint8)
684
+ qs = qs[:, 0] | (qs[:, 1] << np.uint8(4))
685
+
686
+ qs = qs.reshape((n_blocks, cls.block_size // 2))
687
+
688
+ return np.concatenate([e, qs], axis=-1)
689
+
690
+ @classmethod
691
+ def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
692
+ n_blocks = blocks.shape[0]
693
+
694
+ e, qs = np.hsplit(blocks, [1])
695
+
696
+ d = cls.e8m0_to_fp32_half(e)
697
+
698
+ qs = qs.reshape((n_blocks, 1, cls.block_size // 2)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 2, 1))
699
+ qs = (qs & np.uint8(0x0F)).view(np.int8)
700
+
701
+ kvalues = np.array(cls.kvalues, dtype=np.int8).reshape(1, 1, 16)
702
+ qs = np.take_along_axis(kvalues, qs, axis=-1).reshape((n_blocks, cls.block_size))
703
+
704
+ return (d * qs.astype(np.float32))
705
+
706
+
658
707
  class IQ2_XXS(__Quant, qtype=GGMLQuantizationType.IQ2_XXS):
659
708
  ksigns: bytes = (
660
709
  b"\x00\x81\x82\x03\x84\x05\x06\x87\x88\x09\x0a\x8b\x0c\x8d\x8e\x0f"
@@ -19,6 +19,61 @@ import gguf
19
19
  logger = logging.getLogger("gguf-convert-endian")
20
20
 
21
21
 
22
+ def byteswap_q4_0(tensor, block_offs):
23
+ # Each block_q4_0 consists of an f16 delta (scaling factor) followed by 16 int8 quantizations.
24
+
25
+ # Byte-Swap f16 sized delta field
26
+ delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16)
27
+ delta.byteswap(inplace=True)
28
+
29
+
30
+ def byteswap_q8_0(tensor, block_offs):
31
+ # Each block_q8_0 consists of an f16 delta (scaling factor) followed by 32 int8 quantizations.
32
+
33
+ # Byte-Swap f16 sized delta field
34
+ delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16)
35
+ delta.byteswap(inplace=True)
36
+
37
+
38
+ def byteswap_q4_k(tensor, block_offs):
39
+ # Each block_q4_k consists of 2 f16 values followed by 140 int8 values.
40
+
41
+ # Byte-Swap f16 sized fields
42
+ delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16)
43
+ delta.byteswap(inplace=True)
44
+
45
+ delta = tensor.data[block_offs + 2:block_offs + 4].view(dtype=np.uint16)
46
+ delta.byteswap(inplace=True)
47
+
48
+
49
+ def byteswap_q6_k(tensor, block_offs):
50
+ # Each block_q6_k consists of 208 int8 values followed by 1 f16 value.
51
+
52
+ # Byte-Swap f16 sized field
53
+ delta = tensor.data[block_offs + 208:block_offs + 210].view(dtype=np.uint16)
54
+ delta.byteswap(inplace=True)
55
+
56
+
57
+ byteswap_tensors = {
58
+ gguf.GGMLQuantizationType.Q4_0: {
59
+ "block_size": 18, # 18 bytes = <f16 delta scaling factor> + 16 * <int8 quant>
60
+ "byteswap_func": byteswap_q4_0,
61
+ },
62
+ gguf.GGMLQuantizationType.Q8_0: {
63
+ "block_size": 34, # 34 bytes = <f16 delta scaling factor> + 32 * <int8 quant>
64
+ "byteswap_func": byteswap_q8_0,
65
+ },
66
+ gguf.GGMLQuantizationType.Q4_K: {
67
+ "block_size": 144, # 144 bytes = 2 * <f16 delta scaling factor> + 140 * <int8 quant>
68
+ "byteswap_func": byteswap_q4_k,
69
+ },
70
+ gguf.GGMLQuantizationType.Q6_K: {
71
+ "block_size": 210, # 210 bytes = <f16 delta scaling factor> + 208 * <int8 quant>
72
+ "byteswap_func": byteswap_q6_k,
73
+ },
74
+ }
75
+
76
+
22
77
  def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None:
23
78
  file_endian = reader.endianess.name
24
79
  if reader.byte_order == 'S':
@@ -32,13 +87,11 @@ def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None
32
87
  sys.exit(0)
33
88
  logger.info("* Checking tensors for conversion compatibility")
34
89
  for tensor in reader.tensors:
35
- if tensor.tensor_type not in (
36
- gguf.GGMLQuantizationType.F32,
37
- gguf.GGMLQuantizationType.F16,
38
- gguf.GGMLQuantizationType.Q8_0,
39
- gguf.GGMLQuantizationType.Q4_K,
40
- gguf.GGMLQuantizationType.Q6_K,
41
- ):
90
+ if tensor.tensor_type not in byteswap_tensors and \
91
+ tensor.tensor_type not in (
92
+ gguf.GGMLQuantizationType.F32,
93
+ gguf.GGMLQuantizationType.F16,
94
+ ):
42
95
  raise ValueError(f"Cannot handle type {tensor.tensor_type.name} for tensor {repr(tensor.name)}")
43
96
  logger.info(f"* Preparing to convert from {file_endian} to {order}")
44
97
  if args.dry_run:
@@ -72,78 +125,29 @@ def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None
72
125
  part.byteswap(inplace=True)
73
126
 
74
127
  # Byte-swap tensor data if necessary
75
- if tensor.tensor_type == gguf.GGMLQuantizationType.Q8_0:
76
- # Handle Q8_0 tensor blocks (block_q8_0)
77
- # Specific handling of block_q8_0 is required.
78
- # Each block_q8_0 consists of an f16 delta (scaling factor) followed by 32 int8 quantizations.
79
-
80
- block_size = 34 # 34 bytes = <f16 delta scaling factor> + 32 * <int8 quant>
81
-
82
- n_blocks = len(tensor.data) // block_size
83
- for block_num in (inner_pbar := tqdm(range(n_blocks), desc="Byte-swapping Blocks", leave=False)):
84
- block_offs = block_num * block_size
85
-
86
- # Byte-Swap f16 sized delta field
87
- delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16)
88
- delta.byteswap(inplace=True)
89
-
90
- # Byte-Swap Q8 weights
91
- if block_num % 100000 == 0:
92
- inner_pbar.set_description(f"Byte-swapping Blocks [{(n_blocks - block_num) // n_blocks}]")
93
-
94
- elif tensor.tensor_type == gguf.GGMLQuantizationType.Q4_K:
95
- # Handle Q4_K tensor blocks (block_q4_k)
96
- # Specific handling of block_q4_k is required.
97
- # Each block_q4_k consists of 2 f16 values followed by 140 int8 values.
98
-
128
+ if tensor.tensor_type in byteswap_tensors:
99
129
  # first flatten structure
130
+ oldshape = tensor.data.shape
100
131
  newshape = 1
101
132
  for i in tensor.data.shape:
102
133
  newshape *= i
103
134
 
104
135
  tensor.data.resize(newshape)
105
136
 
106
- block_size = 144
107
- n_blocks = len(tensor.data) // block_size
108
- for block_num in (inner_pbar := tqdm(range(n_blocks), desc="Byte-swapping Blocks", leave=False)):
109
- block_offs = block_num * block_size
110
-
111
- # Byte-Swap f16 sized fields
112
- delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16)
113
- delta.byteswap(inplace=True)
114
-
115
- delta = tensor.data[block_offs + 2:block_offs + 4].view(dtype=np.uint16)
116
- delta.byteswap(inplace=True)
117
-
118
- # Byte-Swap
119
- if block_num % 100000 == 0:
120
- inner_pbar.set_description(f"Byte-swapping Blocks [{(n_blocks - block_num) // n_blocks}]")
121
-
122
- elif tensor.tensor_type == gguf.GGMLQuantizationType.Q6_K:
123
- # Handle Q6_K tensor blocks (block_q6_k)
124
- # Specific handling of block_q6_k is required.
125
- # Each block_q6_k consists of 208 int8 values followed by 1 f16 value.
126
-
127
- # first flatten structure
128
- newshape = 1
129
- for i in tensor.data.shape:
130
- newshape *= i
131
-
132
- tensor.data.resize(newshape)
137
+ block_size = byteswap_tensors[tensor.tensor_type]["block_size"]
138
+ byteswap_func = byteswap_tensors[tensor.tensor_type]["byteswap_func"]
133
139
 
134
- block_size = 210
135
140
  n_blocks = len(tensor.data) // block_size
136
141
  for block_num in (inner_pbar := tqdm(range(n_blocks), desc="Byte-swapping Blocks", leave=False)):
137
142
  block_offs = block_num * block_size
138
143
 
139
- # Byte-Swap f16 sized field
140
- delta = tensor.data[block_offs + 208:block_offs + 210].view(dtype=np.uint16)
141
- delta.byteswap(inplace=True)
144
+ byteswap_func(tensor, block_offs)
142
145
 
143
- # Byte-Swap
144
146
  if block_num % 100000 == 0:
145
147
  inner_pbar.set_description(f"Byte-swapping Blocks [{(n_blocks - block_num) // n_blocks}]")
146
148
 
149
+ # restore old shape in case it's ever used
150
+ tensor.data.resize(oldshape)
147
151
  else:
148
152
  # Handle other tensor types
149
153
  tensor.data.byteswap(inplace=True)
@@ -111,6 +111,7 @@ def main() -> None:
111
111
  parser.add_argument("--general-description", type=str, help="The models general.description", metavar='"Description ..."')
112
112
  parser.add_argument("--chat-template", type=str, help="Chat template string (or JSON string containing templates)", metavar='"{% ... %} ..."')
113
113
  parser.add_argument("--chat-template-config", type=Path, help="Config file containing chat template(s)", metavar='tokenizer_config.json')
114
+ parser.add_argument("--chat-template-file", type=Path, help="Jinja file containing chat template", metavar='chat_template.jinja')
114
115
  parser.add_argument("--pre-tokenizer", type=str, help="The models tokenizer.ggml.pre", metavar='"pre tokenizer"')
115
116
  parser.add_argument("--remove-metadata", action="append", type=str, help="Remove metadata (by key name) from output model", metavar='general.url')
116
117
  parser.add_argument("--special-token", action="append", type=str, help="Special token by value", nargs=2, metavar=(' | '.join(token_names.keys()), '"<token>"'))
@@ -134,12 +135,17 @@ def main() -> None:
134
135
  new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = MetadataDetails(gguf.GGUFValueType.STRING, json.loads(args.chat_template) if args.chat_template.startswith('[') else args.chat_template)
135
136
 
136
137
  if args.chat_template_config:
137
- with open(args.chat_template_config, 'r') as fp:
138
+ with open(args.chat_template_config, 'r', encoding='utf-8') as fp:
138
139
  config = json.load(fp)
139
140
  template = config.get('chat_template')
140
141
  if template:
141
142
  new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = MetadataDetails(gguf.GGUFValueType.STRING, template)
142
143
 
144
+ if args.chat_template_file:
145
+ with open(args.chat_template_file, 'r', encoding='utf-8') as fp:
146
+ template = fp.read()
147
+ new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = MetadataDetails(gguf.GGUFValueType.STRING, template)
148
+
143
149
  if args.pre_tokenizer:
144
150
  new_metadata[gguf.Keys.Tokenizer.PRE] = MetadataDetails(gguf.GGUFValueType.STRING, args.pre_tokenizer)
145
151