@novastera-oss/llamarn 0.3.1 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (347)
  1. package/README.md +86 -3
  2. package/RNLlamaCpp.podspec +1 -1
  3. package/android/CMakeLists.txt +11 -3
  4. package/android/generated/jni/react/renderer/components/RNLlamaCppSpec/RNLlamaCppSpecJSI.h +49 -4
  5. package/android/src/main/cpp/include/llama.h +53 -114
  6. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  10. package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
  11. package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
  12. package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
  13. package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
  14. package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
  15. package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
  16. package/android/src/main/jniLibs/x86/libggml.so +0 -0
  17. package/android/src/main/jniLibs/x86/libllama.so +0 -0
  18. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  19. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  20. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  21. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  22. package/cpp/LlamaCppModel.cpp +2 -10
  23. package/cpp/PureCppImpl.cpp +71 -4
  24. package/cpp/SystemUtils.cpp +3 -7
  25. package/cpp/build-info.cpp +2 -2
  26. package/cpp/llama.cpp/CMakeLists.txt +2 -0
  27. package/cpp/llama.cpp/CODEOWNERS +1 -1
  28. package/cpp/llama.cpp/Makefile +6 -1605
  29. package/cpp/llama.cpp/README.md +5 -1
  30. package/cpp/llama.cpp/common/arg.cpp +230 -51
  31. package/cpp/llama.cpp/common/chat-parser.cpp +9 -1
  32. package/cpp/llama.cpp/common/chat.cpp +539 -8
  33. package/cpp/llama.cpp/common/chat.h +8 -1
  34. package/cpp/llama.cpp/common/common.cpp +60 -15
  35. package/cpp/llama.cpp/common/common.h +64 -15
  36. package/cpp/llama.cpp/common/speculative.cpp +135 -54
  37. package/cpp/llama.cpp/common/speculative.h +8 -1
  38. package/cpp/llama.cpp/convert_hf_to_gguf.py +1216 -109
  39. package/cpp/llama.cpp/convert_hf_to_gguf_update.py +19 -6
  40. package/cpp/llama.cpp/convert_lora_to_gguf.py +1 -1
  41. package/cpp/llama.cpp/flake.nix +0 -5
  42. package/cpp/llama.cpp/ggml/CMakeLists.txt +6 -3
  43. package/cpp/llama.cpp/ggml/cmake/ggml-config.cmake.in +71 -70
  44. package/cpp/llama.cpp/ggml/include/ggml-opt.h +25 -6
  45. package/cpp/llama.cpp/ggml/include/ggml-zdnn.h +16 -0
  46. package/cpp/llama.cpp/ggml/include/ggml.h +90 -3
  47. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +13 -1
  48. package/cpp/llama.cpp/ggml/src/ggml-alloc.c +1 -0
  49. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +10 -0
  50. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +113 -17
  51. package/cpp/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +4 -4
  52. package/cpp/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +14 -0
  53. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +701 -585
  54. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +13 -3
  55. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +52 -0
  56. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +274 -91
  57. package/cpp/llama.cpp/ggml/src/ggml-common.h +17 -0
  58. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +2 -2
  59. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +132 -596
  60. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +14 -286
  61. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +90 -569
  62. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +162 -589
  63. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +55 -341
  64. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +3 -58
  65. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +371 -298
  66. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +54 -314
  67. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +184 -675
  68. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +4679 -1657
  69. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +33 -2
  70. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +8 -0
  71. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +26 -1
  72. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +21 -24
  73. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +16 -7
  74. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +232 -123
  75. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +428 -23
  76. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +4 -8
  77. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +35 -0
  78. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.h +8 -0
  79. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +458 -46
  80. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +22 -0
  81. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +39 -14
  82. package/cpp/llama.cpp/ggml/src/ggml-cpu/traits.cpp +2 -2
  83. package/cpp/llama.cpp/ggml/src/ggml-cpu/traits.h +1 -1
  84. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +20 -1
  85. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +122 -5
  86. package/cpp/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +9 -11
  87. package/cpp/llama.cpp/ggml/src/ggml-cuda/add-id.cu +58 -0
  88. package/cpp/llama.cpp/ggml/src/ggml-cuda/add-id.cuh +3 -0
  89. package/cpp/llama.cpp/ggml/src/ggml-cuda/binbcast.cu +275 -170
  90. package/cpp/llama.cpp/ggml/src/ggml-cuda/binbcast.cuh +2 -0
  91. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +103 -65
  92. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv-transpose-1d.cu +1 -4
  93. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d.cu +171 -0
  94. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d.cuh +5 -0
  95. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu +33 -7
  96. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +13 -0
  97. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy-utils.cuh +2 -10
  98. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +3 -4
  99. package/cpp/llama.cpp/ggml/src/ggml-cuda/dequantize.cuh +14 -40
  100. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +83 -27
  101. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +116 -57
  102. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +45 -18
  103. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +56 -29
  104. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +61 -39
  105. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +70 -49
  106. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +70 -21
  107. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +162 -50
  108. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cuh +2 -0
  109. package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +5 -4
  110. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +208 -97
  111. package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cu +46 -35
  112. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +56 -2
  113. package/cpp/llama.cpp/ggml/src/ggml-cuda/mma.cuh +95 -51
  114. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cu +427 -0
  115. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cuh +5 -0
  116. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +204 -57
  117. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +252 -168
  118. package/cpp/llama.cpp/ggml/src/ggml-cuda/{mmv.cu → mmvf.cu} +53 -53
  119. package/cpp/llama.cpp/ggml/src/ggml-cuda/{mmv.cuh → mmvf.cuh} +3 -3
  120. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cu +10 -5
  121. package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cu +192 -19
  122. package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cuh +5 -0
  123. package/cpp/llama.cpp/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
  124. package/cpp/llama.cpp/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
  125. package/cpp/llama.cpp/ggml/src/ggml-cuda/pad_reflect_1d.cu +82 -0
  126. package/cpp/llama.cpp/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
  127. package/cpp/llama.cpp/ggml/src/ggml-cuda/reduce_rows.cuh +53 -0
  128. package/cpp/llama.cpp/ggml/src/ggml-cuda/roll.cu +67 -0
  129. package/cpp/llama.cpp/ggml/src/ggml-cuda/roll.cuh +5 -0
  130. package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cu +1 -8
  131. package/cpp/llama.cpp/ggml/src/ggml-cuda/softcap.cu +34 -0
  132. package/cpp/llama.cpp/ggml/src/ggml-cuda/softcap.cuh +5 -0
  133. package/cpp/llama.cpp/ggml/src/ggml-cuda/softmax.cu +16 -10
  134. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +153 -71
  135. package/cpp/llama.cpp/ggml/src/ggml-cuda/sum.cu +6 -10
  136. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +21 -4
  137. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
  138. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +75 -0
  139. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +2 -0
  140. package/cpp/llama.cpp/ggml/src/ggml-cuda/vecdotq.cuh +110 -22
  141. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
  142. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +14 -25
  143. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +2 -1
  144. package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +10 -2
  145. package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -0
  146. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +31 -20
  147. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +342 -131
  148. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +464 -134
  149. package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +0 -4
  150. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +8 -0
  151. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1108 -176
  152. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/add.cl +107 -0
  153. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  154. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/div.cl +66 -0
  155. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +343 -0
  156. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +343 -0
  157. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +346 -0
  158. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/glu.cl +41 -0
  159. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +49 -0
  160. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul.cl +73 -0
  161. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +132 -0
  162. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +133 -0
  163. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  164. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  165. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/norm.cl +80 -0
  166. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +10 -2
  167. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +10 -2
  168. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +10 -2
  169. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +10 -2
  170. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sub.cl +66 -0
  171. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/transpose.cl +20 -0
  172. package/cpp/llama.cpp/ggml/src/ggml-opt.cpp +97 -41
  173. package/cpp/llama.cpp/ggml/src/ggml-quants.c +110 -16
  174. package/cpp/llama.cpp/ggml/src/ggml-quants.h +6 -0
  175. package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +22 -9
  176. package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  177. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +0 -212
  178. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +213 -1
  179. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +117 -238
  180. package/cpp/llama.cpp/ggml/src/ggml-sycl/quantize.hpp +133 -0
  181. package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +94 -0
  182. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1666 -633
  183. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +41 -1
  184. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
  185. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +13 -4
  186. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +39 -29
  187. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +107 -43
  188. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +2 -2
  189. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +18 -0
  190. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +21 -0
  191. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
  192. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +20 -0
  193. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +21 -0
  194. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +16 -1
  195. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +44 -8
  196. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +44 -16
  197. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +26 -1
  198. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +2 -17
  199. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +2 -0
  200. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +37 -1
  201. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +11 -7
  202. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +109 -55
  203. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +71 -41
  204. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +6 -0
  205. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +111 -0
  206. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
  207. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +49 -11
  208. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
  209. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +9 -3
  210. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
  211. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +38 -5
  212. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
  213. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +55 -0
  214. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/utils.comp +25 -0
  215. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +75 -20
  216. package/cpp/llama.cpp/ggml/src/ggml-webgpu/CMakeLists.txt +2 -2
  217. package/cpp/llama.cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +807 -412
  218. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +72 -22
  219. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +8 -8
  220. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +1794 -0
  221. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +82 -0
  222. package/cpp/llama.cpp/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  223. package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn-impl.h +97 -0
  224. package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn.cpp +846 -0
  225. package/cpp/llama.cpp/ggml/src/ggml.c +204 -50
  226. package/cpp/llama.cpp/gguf-py/gguf/constants.py +187 -2
  227. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +11 -2
  228. package/cpp/llama.cpp/gguf-py/gguf/quants.py +53 -4
  229. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_convert_endian.py +67 -63
  230. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_new_metadata.py +7 -1
  231. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +120 -16
  232. package/cpp/llama.cpp/gguf-py/gguf/utility.py +5 -1
  233. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +284 -1
  234. package/cpp/llama.cpp/gguf-py/tests/test_quants.py +14 -5
  235. package/cpp/llama.cpp/include/llama.h +53 -114
  236. package/cpp/llama.cpp/models/templates/ByteDance-Seed-OSS.jinja +171 -0
  237. package/cpp/llama.cpp/models/templates/README.md +2 -1
  238. package/cpp/llama.cpp/models/templates/ibm-granite-granite-3.3-2B-Instruct.jinja +59 -0
  239. package/cpp/llama.cpp/models/templates/openai-gpt-oss-120b.jinja +331 -0
  240. package/cpp/llama.cpp/models/templates/unsloth-mistral-Devstral-Small-2507.jinja +105 -0
  241. package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -1
  242. package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +0 -6
  243. package/cpp/llama.cpp/requirements/requirements-pydantic.txt +1 -1
  244. package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
  245. package/cpp/llama.cpp/src/llama-adapter.cpp +68 -4
  246. package/cpp/llama.cpp/src/llama-adapter.h +3 -0
  247. package/cpp/llama.cpp/src/llama-arch.cpp +192 -2
  248. package/cpp/llama.cpp/src/llama-arch.h +18 -0
  249. package/cpp/llama.cpp/src/llama-batch.cpp +2 -2
  250. package/cpp/llama.cpp/src/llama-chat.cpp +47 -6
  251. package/cpp/llama.cpp/src/llama-chat.h +3 -0
  252. package/cpp/llama.cpp/src/llama-context.cpp +61 -252
  253. package/cpp/llama.cpp/src/llama-context.h +10 -15
  254. package/cpp/llama.cpp/src/llama-cparams.h +0 -1
  255. package/cpp/llama.cpp/src/llama-graph.cpp +180 -85
  256. package/cpp/llama.cpp/src/llama-graph.h +90 -51
  257. package/cpp/llama.cpp/src/llama-hparams.cpp +34 -3
  258. package/cpp/llama.cpp/src/llama-hparams.h +21 -6
  259. package/cpp/llama.cpp/src/{llama-kv-cache-unified-iswa.cpp → llama-kv-cache-iswa.cpp} +79 -56
  260. package/cpp/llama.cpp/src/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +30 -28
  261. package/cpp/llama.cpp/src/{llama-kv-cache-unified.cpp → llama-kv-cache.cpp} +240 -632
  262. package/cpp/llama.cpp/src/{llama-kv-cache-unified.h → llama-kv-cache.h} +39 -74
  263. package/cpp/llama.cpp/src/llama-kv-cells.h +21 -21
  264. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +41 -35
  265. package/cpp/llama.cpp/src/llama-memory-hybrid.h +26 -29
  266. package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +13 -9
  267. package/cpp/llama.cpp/src/llama-memory-recurrent.h +10 -14
  268. package/cpp/llama.cpp/src/llama-memory.h +13 -10
  269. package/cpp/llama.cpp/src/llama-model-loader.cpp +2 -0
  270. package/cpp/llama.cpp/src/llama-model-loader.h +3 -2
  271. package/cpp/llama.cpp/src/llama-model.cpp +1959 -419
  272. package/cpp/llama.cpp/src/llama-model.h +28 -4
  273. package/cpp/llama.cpp/src/llama-quant.cpp +40 -4
  274. package/cpp/llama.cpp/src/llama-vocab.cpp +51 -2
  275. package/cpp/llama.cpp/src/llama-vocab.h +1 -0
  276. package/cpp/llama.cpp/vendor/minja/chat-template.hpp +16 -7
  277. package/cpp/llama.cpp/vendor/minja/minja.hpp +47 -12
  278. package/cpp/rn-completion.cpp +3 -27
  279. package/ios/generated/RNLlamaCppSpec/RNLlamaCppSpec.h +30 -0
  280. package/ios/generated/RNLlamaCppSpecJSI.h +49 -4
  281. package/ios/include/chat.h +8 -1
  282. package/ios/include/common/minja/chat-template.hpp +16 -7
  283. package/ios/include/common/minja/minja.hpp +47 -12
  284. package/ios/include/common.h +64 -15
  285. package/ios/include/llama.h +53 -114
  286. package/ios/include/speculative.h +8 -1
  287. package/ios/libs/llama.xcframework/Info.plist +18 -18
  288. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  289. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5557 -5267
  290. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-opt.h +25 -6
  291. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +90 -3
  292. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +53 -114
  293. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  294. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  295. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5520 -5238
  296. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4241 -4014
  297. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +25 -6
  298. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +90 -3
  299. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +53 -114
  300. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  301. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  302. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5519 -5238
  303. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4242 -4016
  304. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +25 -6
  305. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +90 -3
  306. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +53 -114
  307. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +25 -6
  308. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +90 -3
  309. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +53 -114
  310. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  311. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +25 -6
  312. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +90 -3
  313. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +53 -114
  314. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  315. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  316. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  317. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5556 -5267
  318. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +25 -6
  319. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +90 -3
  320. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +53 -114
  321. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  322. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  323. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5519 -5238
  324. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4241 -4014
  325. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +25 -6
  326. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +90 -3
  327. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +53 -114
  328. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  329. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  330. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5553 -5303
  331. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +25 -6
  332. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +90 -3
  333. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +53 -114
  334. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  335. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  336. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5515 -5274
  337. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4238 -4044
  338. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +25 -6
  339. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +90 -3
  340. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +53 -114
  341. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  342. package/lib/module/NativeRNLlamaCpp.js.map +1 -1
  343. package/lib/typescript/src/NativeRNLlamaCpp.d.ts +5 -0
  344. package/lib/typescript/src/NativeRNLlamaCpp.d.ts.map +1 -1
  345. package/package.json +1 -2
  346. package/src/NativeRNLlamaCpp.ts +7 -0
  347. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +0 -56
@@ -582,9 +582,6 @@ FILE * ggml_fopen(const char * fname, const char * mode) {
582
582
  #endif
583
583
 
584
584
  }
585
- static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc);
586
- static void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);
587
- static void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc);
588
585
 
589
586
  static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
590
587
  [GGML_TYPE_I8] = {
@@ -690,6 +687,14 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
690
687
  .is_quantized = true,
691
688
  .from_float_ref = (ggml_from_float_t) quantize_row_q8_1_ref,
692
689
  },
690
+ [GGML_TYPE_MXFP4] = {
691
+ .type_name = "mxfp4",
692
+ .blck_size = QK_MXFP4,
693
+ .type_size = sizeof(block_mxfp4),
694
+ .is_quantized = true,
695
+ .to_float = (ggml_to_float_t) dequantize_row_mxfp4,
696
+ .from_float_ref = (ggml_from_float_t)quantize_row_mxfp4_ref,
697
+ },
693
698
  [GGML_TYPE_Q2_K] = {
694
699
  .type_name = "q2_K",
695
700
  .blck_size = QK_K,
@@ -917,6 +922,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
917
922
 
918
923
  "DUP",
919
924
  "ADD",
925
+ "ADD_ID",
920
926
  "ADD1",
921
927
  "ACC",
922
928
  "SUB",
@@ -969,6 +975,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
969
975
  "IM2COL",
970
976
  "IM2COL_BACK",
971
977
  "CONV_2D",
978
+ "CONV_3D",
972
979
  "CONV_2D_DW",
973
980
  "CONV_TRANSPOSE_2D",
974
981
  "POOL_1D",
@@ -1006,17 +1013,19 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
1006
1013
  "CROSS_ENTROPY_LOSS",
1007
1014
  "CROSS_ENTROPY_LOSS_BACK",
1008
1015
  "OPT_STEP_ADAMW",
1016
+ "OPT_STEP_SGD",
1009
1017
 
1010
1018
  "GLU",
1011
1019
  };
1012
1020
 
1013
- static_assert(GGML_OP_COUNT == 86, "GGML_OP_COUNT != 86");
1021
+ static_assert(GGML_OP_COUNT == 89, "GGML_OP_COUNT != 89");
1014
1022
 
1015
1023
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1016
1024
  "none",
1017
1025
 
1018
1026
  "x",
1019
1027
  "x+y",
1028
+ "x[i]+y",
1020
1029
  "x+y",
1021
1030
  "view(x,nb,offset)+=y->x",
1022
1031
  "x-y",
@@ -1069,6 +1078,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1069
1078
  "im2col(x)",
1070
1079
  "im2col_back(x)",
1071
1080
  "conv_2d(x)",
1081
+ "conv_3d(x)",
1072
1082
  "conv_2d_dw(x)",
1073
1083
  "conv_transpose_2d(x)",
1074
1084
  "pool_1d(x)",
@@ -1106,15 +1116,15 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
1106
1116
  "cross_entropy_loss(x,y)",
1107
1117
  "cross_entropy_loss_back(x,y)",
1108
1118
  "adamw(x)",
1119
+ "sgd(x)",
1109
1120
 
1110
1121
  "glu(x)",
1111
1122
  };
1112
1123
 
1113
- static_assert(GGML_OP_COUNT == 86, "GGML_OP_COUNT != 86");
1124
+ static_assert(GGML_OP_COUNT == 89, "GGML_OP_COUNT != 89");
1114
1125
 
1115
1126
  static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
1116
1127
 
1117
-
1118
1128
  static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
1119
1129
  "ABS",
1120
1130
  "SGN",
@@ -1140,11 +1150,12 @@ static const char * GGML_GLU_OP_NAME[GGML_GLU_OP_COUNT] = {
1140
1150
  "REGLU",
1141
1151
  "GEGLU",
1142
1152
  "SWIGLU",
1153
+ "SWIGLU_OAI",
1143
1154
  "GEGLU_ERF",
1144
1155
  "GEGLU_QUICK",
1145
1156
  };
1146
1157
 
1147
- static_assert(GGML_GLU_OP_COUNT == 5, "GGML_GLU_OP_COUNT != 5");
1158
+ static_assert(GGML_GLU_OP_COUNT == 6, "GGML_GLU_OP_COUNT != 6");
1148
1159
 
1149
1160
 
1150
1161
  static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
@@ -1312,6 +1323,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
1312
1323
  case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break;
1313
1324
  case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break;
1314
1325
  case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break;
1326
+ case GGML_FTYPE_MOSTLY_MXFP4: wtype = GGML_TYPE_MXFP4; break;
1315
1327
  case GGML_FTYPE_MOSTLY_Q2_K: wtype = GGML_TYPE_Q2_K; break;
1316
1328
  case GGML_FTYPE_MOSTLY_Q3_K: wtype = GGML_TYPE_Q3_K; break;
1317
1329
  case GGML_FTYPE_MOSTLY_Q4_K: wtype = GGML_TYPE_Q4_K; break;
@@ -1962,6 +1974,27 @@ struct ggml_tensor * ggml_add_cast(
1962
1974
  return ggml_add_cast_impl(ctx, a, b, type);
1963
1975
  }
1964
1976
 
1977
+ struct ggml_tensor * ggml_add_id(
1978
+ struct ggml_context * ctx,
1979
+ struct ggml_tensor * a,
1980
+ struct ggml_tensor * b,
1981
+ struct ggml_tensor * ids) {
1982
+
1983
+ GGML_ASSERT(a->ne[0] == b->ne[0]);
1984
+ GGML_ASSERT(a->ne[1] == ids->ne[0]);
1985
+ GGML_ASSERT(a->ne[2] == ids->ne[1]);
1986
+ GGML_ASSERT(ids->type == GGML_TYPE_I32);
1987
+
1988
+ struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
1989
+
1990
+ result->op = GGML_OP_ADD_ID;
1991
+ result->src[0] = a;
1992
+ result->src[1] = b;
1993
+ result->src[2] = ids;
1994
+
1995
+ return result;
1996
+ }
1997
+
1965
1998
  // ggml_add1
1966
1999
 
1967
2000
  static struct ggml_tensor * ggml_add1_impl(
@@ -2812,6 +2845,19 @@ struct ggml_tensor * ggml_geglu_quick_split(
2812
2845
  return ggml_glu_impl(ctx, a, b, GGML_GLU_OP_GEGLU_QUICK, false);
2813
2846
  }
2814
2847
 
2848
+ struct ggml_tensor * ggml_swiglu_oai(
2849
+ struct ggml_context * ctx,
2850
+ struct ggml_tensor * a,
2851
+ struct ggml_tensor * b,
2852
+ float alpha,
2853
+ float limit) {
2854
+ struct ggml_tensor * result = ggml_glu_impl(ctx, a, b, GGML_GLU_OP_SWIGLU_OAI, false);
2855
+ ggml_set_op_params_f32(result, 2, alpha);
2856
+ ggml_set_op_params_f32(result, 3, limit);
2857
+
2858
+ return result;
2859
+ }
2860
+
2815
2861
  // ggml_norm
2816
2862
 
2817
2863
  static struct ggml_tensor * ggml_norm_impl(
@@ -3779,6 +3825,22 @@ struct ggml_tensor * ggml_soft_max_ext(
3779
3825
  return ggml_soft_max_impl(ctx, a, mask, scale, max_bias, false);
3780
3826
  }
3781
3827
 
3828
+ void ggml_soft_max_add_sinks(
3829
+ struct ggml_tensor * a,
3830
+ struct ggml_tensor * sinks) {
3831
+ if (!sinks) {
3832
+ a->src[2] = NULL;
3833
+ return;
3834
+ }
3835
+
3836
+ GGML_ASSERT(a->op == GGML_OP_SOFT_MAX);
3837
+ GGML_ASSERT(a->src[2] == NULL);
3838
+ GGML_ASSERT(a->src[0]->ne[2] == sinks->ne[0]);
3839
+ GGML_ASSERT(sinks->type == GGML_TYPE_F32);
3840
+
3841
+ a->src[2] = sinks;
3842
+ }
3843
+
3782
3844
  // ggml_soft_max_ext_back
3783
3845
 
3784
3846
  static struct ggml_tensor * ggml_soft_max_ext_back_impl(
@@ -3826,6 +3888,7 @@ static struct ggml_tensor * ggml_rope_impl(
3826
3888
  struct ggml_tensor * b,
3827
3889
  struct ggml_tensor * c,
3828
3890
  int n_dims,
3891
+ int sections[GGML_MROPE_SECTIONS],
3829
3892
  int mode,
3830
3893
  int n_ctx_orig,
3831
3894
  float freq_base,
@@ -3839,15 +3902,19 @@ static struct ggml_tensor * ggml_rope_impl(
3839
3902
 
3840
3903
  GGML_ASSERT(ggml_is_vector(b));
3841
3904
  GGML_ASSERT(b->type == GGML_TYPE_I32);
3842
- GGML_ASSERT(a->ne[2] == b->ne[0]);
3905
+
3906
+ bool mrope_used = mode & GGML_ROPE_TYPE_MROPE;
3907
+ if (mrope_used) {
3908
+ GGML_ASSERT(a->ne[2] * 4 == b->ne[0]); // mrope expecting 4 position ids per token
3909
+ } else {
3910
+ GGML_ASSERT(a->ne[2] == b->ne[0]);
3911
+ }
3843
3912
 
3844
3913
  if (c) {
3845
3914
  GGML_ASSERT(c->type == GGML_TYPE_F32);
3846
3915
  GGML_ASSERT(c->ne[0] >= n_dims / 2);
3847
3916
  }
3848
3917
 
3849
- int sections[4] = {0, 0, 0, 0};
3850
-
3851
3918
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
3852
3919
 
3853
3920
  int32_t params[15] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
@@ -3857,7 +3924,11 @@ static struct ggml_tensor * ggml_rope_impl(
3857
3924
  memcpy(params + 8, &attn_factor, sizeof(float));
3858
3925
  memcpy(params + 9, &beta_fast, sizeof(float));
3859
3926
  memcpy(params + 10, &beta_slow, sizeof(float));
3860
- memcpy(params + 11, &sections, sizeof(int)*4);
3927
+ if (mrope_used) {
3928
+ memcpy(params + 11, sections, sizeof(int32_t) * GGML_MROPE_SECTIONS);
3929
+ } else {
3930
+ memset(params + 11, 0, sizeof(int32_t) * GGML_MROPE_SECTIONS);
3931
+ }
3861
3932
  ggml_set_op_params(result, params, sizeof(params));
3862
3933
 
3863
3934
  result->op = GGML_OP_ROPE;
@@ -3875,7 +3946,7 @@ struct ggml_tensor * ggml_rope(
3875
3946
  int n_dims,
3876
3947
  int mode) {
3877
3948
  return ggml_rope_impl(
3878
- ctx, a, b, NULL, n_dims, mode, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, false
3949
+ ctx, a, b, NULL, n_dims, NULL, mode, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, false
3879
3950
  );
3880
3951
  }
3881
3952
 
@@ -3885,7 +3956,7 @@ struct ggml_tensor * ggml_rope_multi(
3885
3956
  struct ggml_tensor * b,
3886
3957
  struct ggml_tensor * c,
3887
3958
  int n_dims,
3888
- int sections[4],
3959
+ int sections[GGML_MROPE_SECTIONS],
3889
3960
  int mode,
3890
3961
  int n_ctx_orig,
3891
3962
  float freq_base,
@@ -3894,36 +3965,31 @@ struct ggml_tensor * ggml_rope_multi(
3894
3965
  float attn_factor,
3895
3966
  float beta_fast,
3896
3967
  float beta_slow) {
3897
- // Multimodal Rotary Position Embedding
3898
- GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported");
3899
-
3900
- GGML_ASSERT(ggml_is_vector(b));
3901
- GGML_ASSERT(b->type == GGML_TYPE_I32);
3902
- GGML_ASSERT(a->ne[2] * 4 == b->ne[0]); // mrope expecting 4 position ids per token
3903
-
3904
- if (c) {
3905
- GGML_ASSERT(c->type == GGML_TYPE_F32);
3906
- GGML_ASSERT(c->ne[0] >= n_dims / 2);
3907
- }
3908
-
3909
- struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
3910
-
3911
- int32_t params[11 + 4] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
3912
- memcpy(params + 5, &freq_base, sizeof(float));
3913
- memcpy(params + 6, &freq_scale, sizeof(float));
3914
- memcpy(params + 7, &ext_factor, sizeof(float));
3915
- memcpy(params + 8, &attn_factor, sizeof(float));
3916
- memcpy(params + 9, &beta_fast, sizeof(float));
3917
- memcpy(params + 10, &beta_slow, sizeof(float));
3918
- memcpy(&params[11], sections, sizeof(int)*4);
3919
- ggml_set_op_params(result, params, sizeof(params));
3920
-
3921
- result->op = GGML_OP_ROPE;
3922
- result->src[0] = a;
3923
- result->src[1] = b;
3924
- result->src[2] = c;
3968
+ return ggml_rope_impl(
3969
+ ctx, a, b, c, n_dims, sections, mode, n_ctx_orig, freq_base, freq_scale,
3970
+ ext_factor, attn_factor, beta_fast, beta_slow, false
3971
+ );
3972
+ }
3925
3973
 
3926
- return result;
3974
+ struct ggml_tensor * ggml_rope_multi_inplace(
3975
+ struct ggml_context * ctx,
3976
+ struct ggml_tensor * a,
3977
+ struct ggml_tensor * b,
3978
+ struct ggml_tensor * c,
3979
+ int n_dims,
3980
+ int sections[GGML_MROPE_SECTIONS],
3981
+ int mode,
3982
+ int n_ctx_orig,
3983
+ float freq_base,
3984
+ float freq_scale,
3985
+ float ext_factor,
3986
+ float attn_factor,
3987
+ float beta_fast,
3988
+ float beta_slow) {
3989
+ return ggml_rope_impl(
3990
+ ctx, a, b, c, n_dims, sections, mode, n_ctx_orig, freq_base, freq_scale,
3991
+ ext_factor, attn_factor, beta_fast, beta_slow, true
3992
+ );
3927
3993
  }
3928
3994
 
3929
3995
  struct ggml_tensor * ggml_rope_inplace(
@@ -3933,7 +3999,7 @@ struct ggml_tensor * ggml_rope_inplace(
3933
3999
  int n_dims,
3934
4000
  int mode) {
3935
4001
  return ggml_rope_impl(
3936
- ctx, a, b, NULL, n_dims, mode, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, true
4002
+ ctx, a, b, NULL, n_dims, NULL, mode, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, true
3937
4003
  );
3938
4004
  }
3939
4005
 
@@ -3952,7 +4018,7 @@ struct ggml_tensor * ggml_rope_ext(
3952
4018
  float beta_fast,
3953
4019
  float beta_slow) {
3954
4020
  return ggml_rope_impl(
3955
- ctx, a, b, c, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
4021
+ ctx, a, b, c, n_dims, NULL, mode, n_ctx_orig, freq_base, freq_scale,
3956
4022
  ext_factor, attn_factor, beta_fast, beta_slow, false
3957
4023
  );
3958
4024
  }
@@ -3972,7 +4038,7 @@ struct ggml_tensor * ggml_rope_ext_inplace(
3972
4038
  float beta_fast,
3973
4039
  float beta_slow) {
3974
4040
  return ggml_rope_impl(
3975
- ctx, a, b, c, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
4041
+ ctx, a, b, c, n_dims, NULL, mode, n_ctx_orig, freq_base, freq_scale,
3976
4042
  ext_factor, attn_factor, beta_fast, beta_slow, true
3977
4043
  );
3978
4044
  }
@@ -3991,7 +4057,7 @@ struct ggml_tensor * ggml_rope_custom(
3991
4057
  float beta_fast,
3992
4058
  float beta_slow) {
3993
4059
  return ggml_rope_impl(
3994
- ctx, a, b, NULL, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
4060
+ ctx, a, b, NULL, n_dims, NULL, mode, n_ctx_orig, freq_base, freq_scale,
3995
4061
  ext_factor, attn_factor, beta_fast, beta_slow, false
3996
4062
  );
3997
4063
  }
@@ -4010,7 +4076,7 @@ struct ggml_tensor * ggml_rope_custom_inplace(
4010
4076
  float beta_fast,
4011
4077
  float beta_slow) {
4012
4078
  return ggml_rope_impl(
4013
- ctx, a, b, NULL, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
4079
+ ctx, a, b, NULL, n_dims, NULL, mode, n_ctx_orig, freq_base, freq_scale,
4014
4080
  ext_factor, attn_factor, beta_fast, beta_slow, true
4015
4081
  );
4016
4082
  }
@@ -4208,14 +4274,13 @@ struct ggml_tensor * ggml_conv_1d_dw(
4208
4274
  int s0,
4209
4275
  int p0,
4210
4276
  int d0) {
4211
- struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], 1, a->ne[1], a->ne[2]);
4212
4277
  struct ggml_tensor * new_b = ggml_reshape_4d(ctx, b, b->ne[0], 1, b->ne[1], b->ne[2]);
4213
4278
 
4214
- struct ggml_tensor * im2col = ggml_im2col(ctx, new_a, new_b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16);
4279
+ struct ggml_tensor * im2col = ggml_im2col(ctx, a, new_b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16);
4215
4280
 
4216
4281
  struct ggml_tensor * result = ggml_mul_mat(ctx, im2col, a);
4217
4282
 
4218
- result = ggml_reshape_3d(ctx, result, b->ne[0], b->ne[1], 1);
4283
+ result = ggml_reshape_3d(ctx, result, result->ne[0], result->ne[2], 1);
4219
4284
 
4220
4285
  return result;
4221
4286
  }
@@ -4417,6 +4482,56 @@ struct ggml_tensor * ggml_conv_2d_direct(
4417
4482
  return result;
4418
4483
  }
4419
4484
 
4485
+ // ggml_conv_3d
4486
+
4487
+ struct ggml_tensor * ggml_conv_3d(
4488
+ struct ggml_context * ctx,
4489
+ struct ggml_tensor * a,
4490
+ struct ggml_tensor * b,
4491
+ int s0,
4492
+ int s1,
4493
+ int s2,
4494
+ int p0,
4495
+ int p1,
4496
+ int p2,
4497
+ int d0,
4498
+ int d1,
4499
+ int d2,
4500
+ int c,
4501
+ int n,
4502
+ int oc) {
4503
+
4504
+ GGML_ASSERT(a->ne[3] == (int64_t) c * oc);
4505
+ GGML_ASSERT(b->ne[3] == (int64_t) c * n);
4506
+
4507
+ int64_t ne[4];
4508
+ ne[0] = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
4509
+ ne[1] = ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1);
4510
+ ne[2] = ggml_calc_conv_output_size(b->ne[2], a->ne[2], s2, p2, d2);
4511
+ ne[3] = (int64_t) oc * n;
4512
+
4513
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
4514
+
4515
+ ggml_set_op_params_i32(result, 0, s0);
4516
+ ggml_set_op_params_i32(result, 1, s1);
4517
+ ggml_set_op_params_i32(result, 2, s2);
4518
+ ggml_set_op_params_i32(result, 3, p0);
4519
+ ggml_set_op_params_i32(result, 4, p1);
4520
+ ggml_set_op_params_i32(result, 5, p2);
4521
+ ggml_set_op_params_i32(result, 6, d0);
4522
+ ggml_set_op_params_i32(result, 7, d1);
4523
+ ggml_set_op_params_i32(result, 8, d2);
4524
+ ggml_set_op_params_i32(result, 9, c);
4525
+ ggml_set_op_params_i32(result, 10, n);
4526
+ ggml_set_op_params_i32(result, 11, oc);
4527
+
4528
+ result->op = GGML_OP_CONV_3D;
4529
+ result->src[0] = a;
4530
+ result->src[1] = b;
4531
+
4532
+ return result;
4533
+ }
4534
+
4420
4535
  // ggml_conv_transpose_2d_p0
4421
4536
 
4422
4537
  static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {
@@ -4812,6 +4927,22 @@ enum ggml_prec ggml_flash_attn_ext_get_prec(
4812
4927
  return (enum ggml_prec) prec_i32;
4813
4928
  }
4814
4929
 
4930
+ void ggml_flash_attn_ext_add_sinks(
4931
+ struct ggml_tensor * a,
4932
+ struct ggml_tensor * sinks) {
4933
+ if (!sinks) {
4934
+ a->src[4] = NULL;
4935
+ return;
4936
+ }
4937
+
4938
+ GGML_ASSERT(a->op == GGML_OP_FLASH_ATTN_EXT);
4939
+ GGML_ASSERT(a->src[4] == NULL);
4940
+ GGML_ASSERT(a->src[0]->ne[2] == sinks->ne[0]);
4941
+ GGML_ASSERT(sinks->type == GGML_TYPE_F32);
4942
+
4943
+ a->src[4] = sinks;
4944
+ }
4945
+
4815
4946
  // ggml_flash_attn_back
4816
4947
 
4817
4948
  struct ggml_tensor * ggml_flash_attn_back(
@@ -5527,6 +5658,28 @@ struct ggml_tensor * ggml_opt_step_adamw(
5527
5658
  return result;
5528
5659
  }
5529
5660
 
5661
+ // opt_step_sgd
5662
+
5663
+ struct ggml_tensor * ggml_opt_step_sgd(
5664
+ struct ggml_context * ctx,
5665
+ struct ggml_tensor * a,
5666
+ struct ggml_tensor * grad,
5667
+ struct ggml_tensor * params) {
5668
+ GGML_ASSERT(a->flags & GGML_TENSOR_FLAG_PARAM);
5669
+ GGML_ASSERT(ggml_are_same_shape(a, grad));
5670
+ GGML_ASSERT(params->type == GGML_TYPE_F32);
5671
+ GGML_ASSERT(ggml_nelements(params) == 2);
5672
+
5673
+ struct ggml_tensor * result = ggml_view_tensor(ctx, a);
5674
+
5675
+ result->op = GGML_OP_OPT_STEP_SGD;
5676
+ result->src[0] = a;
5677
+ result->src[1] = grad;
5678
+ result->src[2] = params;
5679
+
5680
+ return result;
5681
+ }
5682
+
5530
5683
  ////////////////////////////////////////////////////////////////////////////////
5531
5684
 
5532
5685
  struct ggml_hash_set ggml_hash_set_new(size_t size) {
@@ -6872,6 +7025,7 @@ size_t ggml_quantize_chunk(
6872
7025
  case GGML_TYPE_Q5_0: result = quantize_q5_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
6873
7026
  case GGML_TYPE_Q5_1: result = quantize_q5_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
6874
7027
  case GGML_TYPE_Q8_0: result = quantize_q8_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
7028
+ case GGML_TYPE_MXFP4: result = quantize_mxfp4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
6875
7029
  case GGML_TYPE_Q2_K: result = quantize_q2_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
6876
7030
  case GGML_TYPE_Q3_K: result = quantize_q3_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
6877
7031
  case GGML_TYPE_Q4_K: result = quantize_q4_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;