@novastera-oss/llamarn 0.3.1 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (347)
  1. package/README.md +86 -3
  2. package/RNLlamaCpp.podspec +1 -1
  3. package/android/CMakeLists.txt +11 -3
  4. package/android/generated/jni/react/renderer/components/RNLlamaCppSpec/RNLlamaCppSpecJSI.h +49 -4
  5. package/android/src/main/cpp/include/llama.h +53 -114
  6. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  10. package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
  11. package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
  12. package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
  13. package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
  14. package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
  15. package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
  16. package/android/src/main/jniLibs/x86/libggml.so +0 -0
  17. package/android/src/main/jniLibs/x86/libllama.so +0 -0
  18. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  19. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  20. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  21. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  22. package/cpp/LlamaCppModel.cpp +2 -10
  23. package/cpp/PureCppImpl.cpp +71 -4
  24. package/cpp/SystemUtils.cpp +3 -7
  25. package/cpp/build-info.cpp +2 -2
  26. package/cpp/llama.cpp/CMakeLists.txt +2 -0
  27. package/cpp/llama.cpp/CODEOWNERS +1 -1
  28. package/cpp/llama.cpp/Makefile +6 -1605
  29. package/cpp/llama.cpp/README.md +5 -1
  30. package/cpp/llama.cpp/common/arg.cpp +230 -51
  31. package/cpp/llama.cpp/common/chat-parser.cpp +9 -1
  32. package/cpp/llama.cpp/common/chat.cpp +539 -8
  33. package/cpp/llama.cpp/common/chat.h +8 -1
  34. package/cpp/llama.cpp/common/common.cpp +60 -15
  35. package/cpp/llama.cpp/common/common.h +64 -15
  36. package/cpp/llama.cpp/common/speculative.cpp +135 -54
  37. package/cpp/llama.cpp/common/speculative.h +8 -1
  38. package/cpp/llama.cpp/convert_hf_to_gguf.py +1216 -109
  39. package/cpp/llama.cpp/convert_hf_to_gguf_update.py +19 -6
  40. package/cpp/llama.cpp/convert_lora_to_gguf.py +1 -1
  41. package/cpp/llama.cpp/flake.nix +0 -5
  42. package/cpp/llama.cpp/ggml/CMakeLists.txt +6 -3
  43. package/cpp/llama.cpp/ggml/cmake/ggml-config.cmake.in +71 -70
  44. package/cpp/llama.cpp/ggml/include/ggml-opt.h +25 -6
  45. package/cpp/llama.cpp/ggml/include/ggml-zdnn.h +16 -0
  46. package/cpp/llama.cpp/ggml/include/ggml.h +90 -3
  47. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +13 -1
  48. package/cpp/llama.cpp/ggml/src/ggml-alloc.c +1 -0
  49. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +10 -0
  50. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +113 -17
  51. package/cpp/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +4 -4
  52. package/cpp/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +14 -0
  53. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +701 -585
  54. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +13 -3
  55. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +52 -0
  56. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +274 -91
  57. package/cpp/llama.cpp/ggml/src/ggml-common.h +17 -0
  58. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +2 -2
  59. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +132 -596
  60. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +14 -286
  61. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +90 -569
  62. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +162 -589
  63. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +55 -341
  64. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +3 -58
  65. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +371 -298
  66. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +54 -314
  67. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +184 -675
  68. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +4679 -1657
  69. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +33 -2
  70. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +8 -0
  71. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +26 -1
  72. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +21 -24
  73. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +16 -7
  74. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +232 -123
  75. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +428 -23
  76. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +4 -8
  77. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +35 -0
  78. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.h +8 -0
  79. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +458 -46
  80. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +22 -0
  81. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +39 -14
  82. package/cpp/llama.cpp/ggml/src/ggml-cpu/traits.cpp +2 -2
  83. package/cpp/llama.cpp/ggml/src/ggml-cpu/traits.h +1 -1
  84. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +20 -1
  85. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +122 -5
  86. package/cpp/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +9 -11
  87. package/cpp/llama.cpp/ggml/src/ggml-cuda/add-id.cu +58 -0
  88. package/cpp/llama.cpp/ggml/src/ggml-cuda/add-id.cuh +3 -0
  89. package/cpp/llama.cpp/ggml/src/ggml-cuda/binbcast.cu +275 -170
  90. package/cpp/llama.cpp/ggml/src/ggml-cuda/binbcast.cuh +2 -0
  91. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +103 -65
  92. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv-transpose-1d.cu +1 -4
  93. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d.cu +171 -0
  94. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d.cuh +5 -0
  95. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu +33 -7
  96. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +13 -0
  97. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy-utils.cuh +2 -10
  98. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +3 -4
  99. package/cpp/llama.cpp/ggml/src/ggml-cuda/dequantize.cuh +14 -40
  100. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +83 -27
  101. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +116 -57
  102. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +45 -18
  103. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +56 -29
  104. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +61 -39
  105. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +70 -49
  106. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +70 -21
  107. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +162 -50
  108. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cuh +2 -0
  109. package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +5 -4
  110. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +208 -97
  111. package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cu +46 -35
  112. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +56 -2
  113. package/cpp/llama.cpp/ggml/src/ggml-cuda/mma.cuh +95 -51
  114. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cu +427 -0
  115. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cuh +5 -0
  116. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +204 -57
  117. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +252 -168
  118. package/cpp/llama.cpp/ggml/src/ggml-cuda/{mmv.cu → mmvf.cu} +53 -53
  119. package/cpp/llama.cpp/ggml/src/ggml-cuda/{mmv.cuh → mmvf.cuh} +3 -3
  120. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cu +10 -5
  121. package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cu +192 -19
  122. package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cuh +5 -0
  123. package/cpp/llama.cpp/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
  124. package/cpp/llama.cpp/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
  125. package/cpp/llama.cpp/ggml/src/ggml-cuda/pad_reflect_1d.cu +82 -0
  126. package/cpp/llama.cpp/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
  127. package/cpp/llama.cpp/ggml/src/ggml-cuda/reduce_rows.cuh +53 -0
  128. package/cpp/llama.cpp/ggml/src/ggml-cuda/roll.cu +67 -0
  129. package/cpp/llama.cpp/ggml/src/ggml-cuda/roll.cuh +5 -0
  130. package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cu +1 -8
  131. package/cpp/llama.cpp/ggml/src/ggml-cuda/softcap.cu +34 -0
  132. package/cpp/llama.cpp/ggml/src/ggml-cuda/softcap.cuh +5 -0
  133. package/cpp/llama.cpp/ggml/src/ggml-cuda/softmax.cu +16 -10
  134. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +153 -71
  135. package/cpp/llama.cpp/ggml/src/ggml-cuda/sum.cu +6 -10
  136. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +21 -4
  137. package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
  138. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +75 -0
  139. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +2 -0
  140. package/cpp/llama.cpp/ggml/src/ggml-cuda/vecdotq.cuh +110 -22
  141. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
  142. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +14 -25
  143. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +2 -1
  144. package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +10 -2
  145. package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -0
  146. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +31 -20
  147. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +342 -131
  148. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +464 -134
  149. package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +0 -4
  150. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +8 -0
  151. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1108 -176
  152. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/add.cl +107 -0
  153. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  154. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/div.cl +66 -0
  155. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +343 -0
  156. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +343 -0
  157. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +346 -0
  158. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/glu.cl +41 -0
  159. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +49 -0
  160. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul.cl +73 -0
  161. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +132 -0
  162. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +133 -0
  163. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  164. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  165. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/norm.cl +80 -0
  166. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +10 -2
  167. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +10 -2
  168. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +10 -2
  169. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +10 -2
  170. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sub.cl +66 -0
  171. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/transpose.cl +20 -0
  172. package/cpp/llama.cpp/ggml/src/ggml-opt.cpp +97 -41
  173. package/cpp/llama.cpp/ggml/src/ggml-quants.c +110 -16
  174. package/cpp/llama.cpp/ggml/src/ggml-quants.h +6 -0
  175. package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +22 -9
  176. package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  177. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +0 -212
  178. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +213 -1
  179. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +117 -238
  180. package/cpp/llama.cpp/ggml/src/ggml-sycl/quantize.hpp +133 -0
  181. package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +94 -0
  182. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1666 -633
  183. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +41 -1
  184. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
  185. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +13 -4
  186. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +39 -29
  187. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +107 -43
  188. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +2 -2
  189. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +18 -0
  190. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +21 -0
  191. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
  192. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +20 -0
  193. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +21 -0
  194. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +16 -1
  195. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +44 -8
  196. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +44 -16
  197. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +26 -1
  198. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +2 -17
  199. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +2 -0
  200. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +37 -1
  201. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +11 -7
  202. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +109 -55
  203. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +71 -41
  204. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +6 -0
  205. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +111 -0
  206. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
  207. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +49 -11
  208. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
  209. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +9 -3
  210. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
  211. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +38 -5
  212. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
  213. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +55 -0
  214. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/utils.comp +25 -0
  215. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +75 -20
  216. package/cpp/llama.cpp/ggml/src/ggml-webgpu/CMakeLists.txt +2 -2
  217. package/cpp/llama.cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +807 -412
  218. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +72 -22
  219. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +8 -8
  220. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +1794 -0
  221. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +82 -0
  222. package/cpp/llama.cpp/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  223. package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn-impl.h +97 -0
  224. package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn.cpp +846 -0
  225. package/cpp/llama.cpp/ggml/src/ggml.c +204 -50
  226. package/cpp/llama.cpp/gguf-py/gguf/constants.py +187 -2
  227. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +11 -2
  228. package/cpp/llama.cpp/gguf-py/gguf/quants.py +53 -4
  229. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_convert_endian.py +67 -63
  230. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_new_metadata.py +7 -1
  231. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +120 -16
  232. package/cpp/llama.cpp/gguf-py/gguf/utility.py +5 -1
  233. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +284 -1
  234. package/cpp/llama.cpp/gguf-py/tests/test_quants.py +14 -5
  235. package/cpp/llama.cpp/include/llama.h +53 -114
  236. package/cpp/llama.cpp/models/templates/ByteDance-Seed-OSS.jinja +171 -0
  237. package/cpp/llama.cpp/models/templates/README.md +2 -1
  238. package/cpp/llama.cpp/models/templates/ibm-granite-granite-3.3-2B-Instruct.jinja +59 -0
  239. package/cpp/llama.cpp/models/templates/openai-gpt-oss-120b.jinja +331 -0
  240. package/cpp/llama.cpp/models/templates/unsloth-mistral-Devstral-Small-2507.jinja +105 -0
  241. package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -1
  242. package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +0 -6
  243. package/cpp/llama.cpp/requirements/requirements-pydantic.txt +1 -1
  244. package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
  245. package/cpp/llama.cpp/src/llama-adapter.cpp +68 -4
  246. package/cpp/llama.cpp/src/llama-adapter.h +3 -0
  247. package/cpp/llama.cpp/src/llama-arch.cpp +192 -2
  248. package/cpp/llama.cpp/src/llama-arch.h +18 -0
  249. package/cpp/llama.cpp/src/llama-batch.cpp +2 -2
  250. package/cpp/llama.cpp/src/llama-chat.cpp +47 -6
  251. package/cpp/llama.cpp/src/llama-chat.h +3 -0
  252. package/cpp/llama.cpp/src/llama-context.cpp +61 -252
  253. package/cpp/llama.cpp/src/llama-context.h +10 -15
  254. package/cpp/llama.cpp/src/llama-cparams.h +0 -1
  255. package/cpp/llama.cpp/src/llama-graph.cpp +180 -85
  256. package/cpp/llama.cpp/src/llama-graph.h +90 -51
  257. package/cpp/llama.cpp/src/llama-hparams.cpp +34 -3
  258. package/cpp/llama.cpp/src/llama-hparams.h +21 -6
  259. package/cpp/llama.cpp/src/{llama-kv-cache-unified-iswa.cpp → llama-kv-cache-iswa.cpp} +79 -56
  260. package/cpp/llama.cpp/src/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +30 -28
  261. package/cpp/llama.cpp/src/{llama-kv-cache-unified.cpp → llama-kv-cache.cpp} +240 -632
  262. package/cpp/llama.cpp/src/{llama-kv-cache-unified.h → llama-kv-cache.h} +39 -74
  263. package/cpp/llama.cpp/src/llama-kv-cells.h +21 -21
  264. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +41 -35
  265. package/cpp/llama.cpp/src/llama-memory-hybrid.h +26 -29
  266. package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +13 -9
  267. package/cpp/llama.cpp/src/llama-memory-recurrent.h +10 -14
  268. package/cpp/llama.cpp/src/llama-memory.h +13 -10
  269. package/cpp/llama.cpp/src/llama-model-loader.cpp +2 -0
  270. package/cpp/llama.cpp/src/llama-model-loader.h +3 -2
  271. package/cpp/llama.cpp/src/llama-model.cpp +1959 -419
  272. package/cpp/llama.cpp/src/llama-model.h +28 -4
  273. package/cpp/llama.cpp/src/llama-quant.cpp +40 -4
  274. package/cpp/llama.cpp/src/llama-vocab.cpp +51 -2
  275. package/cpp/llama.cpp/src/llama-vocab.h +1 -0
  276. package/cpp/llama.cpp/vendor/minja/chat-template.hpp +16 -7
  277. package/cpp/llama.cpp/vendor/minja/minja.hpp +47 -12
  278. package/cpp/rn-completion.cpp +3 -27
  279. package/ios/generated/RNLlamaCppSpec/RNLlamaCppSpec.h +30 -0
  280. package/ios/generated/RNLlamaCppSpecJSI.h +49 -4
  281. package/ios/include/chat.h +8 -1
  282. package/ios/include/common/minja/chat-template.hpp +16 -7
  283. package/ios/include/common/minja/minja.hpp +47 -12
  284. package/ios/include/common.h +64 -15
  285. package/ios/include/llama.h +53 -114
  286. package/ios/include/speculative.h +8 -1
  287. package/ios/libs/llama.xcframework/Info.plist +18 -18
  288. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  289. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5557 -5267
  290. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-opt.h +25 -6
  291. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +90 -3
  292. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +53 -114
  293. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  294. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  295. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5520 -5238
  296. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4241 -4014
  297. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +25 -6
  298. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +90 -3
  299. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +53 -114
  300. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  301. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  302. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5519 -5238
  303. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4242 -4016
  304. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +25 -6
  305. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +90 -3
  306. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +53 -114
  307. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +25 -6
  308. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +90 -3
  309. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +53 -114
  310. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  311. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +25 -6
  312. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +90 -3
  313. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +53 -114
  314. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  315. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  316. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  317. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5556 -5267
  318. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +25 -6
  319. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +90 -3
  320. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +53 -114
  321. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  322. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  323. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5519 -5238
  324. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4241 -4014
  325. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +25 -6
  326. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +90 -3
  327. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +53 -114
  328. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  329. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  330. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5553 -5303
  331. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +25 -6
  332. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +90 -3
  333. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +53 -114
  334. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  335. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  336. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5515 -5274
  337. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4238 -4044
  338. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +25 -6
  339. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +90 -3
  340. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +53 -114
  341. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  342. package/lib/module/NativeRNLlamaCpp.js.map +1 -1
  343. package/lib/typescript/src/NativeRNLlamaCpp.d.ts +5 -0
  344. package/lib/typescript/src/NativeRNLlamaCpp.d.ts.map +1 -1
  345. package/package.json +1 -2
  346. package/src/NativeRNLlamaCpp.ts +7 -0
  347. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +0 -56
package/README.md CHANGED
@@ -18,6 +18,9 @@
18
18
  * Chat completion with templates (including Jinja template support)
19
19
  * Embeddings generation
20
20
  * Function/tool calling support
21
+ * **Advanced thinking and reasoning support** for compatible models
22
+ * **Flexible reasoning budget control** (unlimited, disabled, or limited)
23
+ * **Multiple reasoning format support** (none, auto, deepseek, deepseek-legacy)
21
24
 
22
25
  ## What Needs Help
23
26
 
@@ -38,6 +41,8 @@ We welcome contributions, especially in these areas:
38
41
  3. **Tool Support**:
39
42
  * Improving tool calling functionality for complex interactions
40
43
  * Better JSON validation and error handling
44
+ * Enhanced thinking and reasoning model support
45
+ * Advanced reasoning format implementations
41
46
 
42
47
  4. **Testing**:
43
48
  * Automated testing using the example project
@@ -139,7 +144,10 @@ import { initLlama } from '@novastera-oss/llamarn';
139
144
  const context = await initLlama({
140
145
  model: 'path/to/model.gguf',
141
146
  n_ctx: 2048,
142
- n_batch: 512
147
+ n_batch: 512,
148
+ // Optional: Enable thinking and reasoning capabilities
149
+ reasoning_budget: -1, // Unlimited thinking
150
+ reasoning_format: 'auto' // Automatic reasoning format detection
143
151
  });
144
152
 
145
153
  // Generate a completion
@@ -162,7 +170,10 @@ const context = await initLlama({
162
170
  model: 'path/to/model.gguf',
163
171
  n_ctx: 4096,
164
172
  n_batch: 512,
165
- use_jinja: true // Enable Jinja template parsing
173
+ use_jinja: true, // Enable Jinja template parsing
174
+ // Optional: Configure thinking and reasoning
175
+ reasoning_budget: -1, // Enable unlimited thinking
176
+ reasoning_format: 'deepseek' // Use DeepSeek reasoning format
166
177
  });
167
178
 
168
179
  // Chat completion with messages
@@ -189,9 +200,47 @@ const context = await initLlama({
189
200
  model: 'path/to/model.gguf',
190
201
  n_ctx: 2048,
191
202
  n_batch: 512,
192
- use_jinja: true // Enable template handling for tool calls
203
+ use_jinja: true, // Enable template handling for tool calls
204
+ parse_tool_calls: true, // Enable tool call parsing (auto-enabled with use_jinja)
205
+ parallel_tool_calls: false // Disable parallel tool calls for compatibility
206
+ });
207
+ ```
208
+
209
+ ### Thinking and Reasoning Models
210
+
211
+ For models that support reasoning and thinking, you can enable advanced thinking functionality:
212
+
213
+ ```js
214
+ import { initLlama } from '@novastera-oss/llamarn';
215
+
216
+ // Initialize a reasoning model with thinking capabilities
217
+ const context = await initLlama({
218
+ model: 'path/to/reasoning-model.gguf',
219
+ n_ctx: 4096,
220
+ n_batch: 512,
221
+ use_jinja: true,
222
+
223
+ // Thinking and reasoning options
224
+ reasoning_budget: -1, // -1 = unlimited thinking, 0 = disabled, >0 = limited
225
+ reasoning_format: 'deepseek', // Use DeepSeek reasoning format
226
+ thinking_forced_open: true, // Force the model to always output thinking
227
+ parse_tool_calls: true, // Enable tool call parsing
228
+ parallel_tool_calls: false // Disable parallel tool calls for compatibility
229
+ });
230
+
231
+ // Chat completion with thinking enabled
232
+ const result = await context.completion({
233
+ messages: [
234
+ { role: 'system', content: 'You are a helpful assistant. Think through problems step by step.' },
235
+ { role: 'user', content: 'Solve this math problem: What is 15% of 240?' }
236
+ ],
237
+ temperature: 0.7
193
238
  });
194
239
 
240
+ console.log('Response:', result.text);
241
+ // The response may include thinking tags like <think>...</think> depending on the model
242
+ ```
243
+
195
244
  // Create a chat with tool calling
196
245
  const response = await context.completion({
197
246
  messages: [
@@ -260,6 +309,40 @@ const embeddingResponse = await context.embedding({
260
309
  console.log('Embedding:', embeddingResponse.data[0].embedding);
261
310
  ```
262
311
 
312
+ ## Advanced Configuration Options
313
+
314
+ ### Thinking and Reasoning Parameters
315
+
316
+ The library supports advanced thinking and reasoning capabilities for models that support them:
317
+
318
+ - **`reasoning_budget`**: Controls the amount of thinking allowed
319
+ - `-1`: Unlimited thinking (default)
320
+ - `0`: Disabled thinking
321
+ - `>0`: Limited thinking with the specified budget
322
+
323
+ - **`reasoning_format`**: Controls how thinking is parsed and returned
324
+ - `'none'`: Leave thoughts unparsed in message content
325
+ - `'auto'`: Same as `'deepseek'` (default)
326
+ - `'deepseek'`: Extract thinking into `message.reasoning_content`
327
+ - `'deepseek-legacy'`: Extract thinking into `message.reasoning_content`, but leave it inline in `<think>` tags when streaming
328
+
329
+ - **`thinking_forced_open`**: Forces reasoning models to always output thinking
330
+ - `false`: Normal thinking behavior (default)
331
+ - `true`: Always include thinking tags in output
332
+
333
+ - **`parse_tool_calls`**: Enables tool call parsing
334
+ - `true`: Parse and extract tool calls (default)
335
+ - `false`: Disable tool call parsing
336
+ - **Note**: Automatically enabled when `use_jinja` is true
337
+
338
+ - **`parallel_tool_calls`**: Enables multiple tool calls in a single response
339
+ - `false`: Single tool calls only (default, for compatibility)
340
+ - `true`: Allow parallel tool calls (only supported by some models)
341
+
342
+ ### Automatic Tool Call Enhancement
343
+
344
+ When `use_jinja` is enabled, `parse_tool_calls` is automatically enabled because Jinja templates provide better tool calling capabilities. This ensures optimal tool support when using advanced templates.
345
+
263
346
  ## Model Path Handling
264
347
 
265
348
  The module accepts different path formats depending on the platform:
@@ -53,7 +53,7 @@ Pod::Spec.new do |s|
53
53
  # Compiler settings
54
54
  s.pod_target_xcconfig = {
55
55
  "HEADER_SEARCH_PATHS" => "\"$(PODS_TARGET_SRCROOT)/ios/include\" \"$(PODS_TARGET_SRCROOT)/cpp\" \"$(PODS_TARGET_SRCROOT)/ios/generated/RNLlamaCppSpec\" \"$(PODS_TARGET_SRCROOT)/ios/generated\" \"$(PODS_TARGET_SRCROOT)/cpp/llama.cpp\" \"$(PODS_TARGET_SRCROOT)/cpp/llama.cpp/include\" \"$(PODS_TARGET_SRCROOT)/cpp/llama.cpp/ggml/include\" \"$(PODS_TARGET_SRCROOT)/cpp/llama.cpp/common\" \"$(PODS_TARGET_SRCROOT)/cpp/llama.cpp/vendor\" \"$(PODS_ROOT)/boost\" \"$(PODS_ROOT)/Headers/Public/React-bridging\" \"$(PODS_ROOT)/Headers/Public/React\"",
56
- "OTHER_CPLUSPLUSFLAGS" => "-DFOLLY_NO_CONFIG -DFOLLY_MOBILE=1 -DFOLLY_USE_LIBCPP=1 -DLLAMA_METAL -DRCT_NEW_ARCH_ENABLED=1 -DFBJSRT_EXPORTED=1",
56
+ "OTHER_CPLUSPLUSFLAGS" => "-DFOLLY_NO_CONFIG -DFOLLY_MOBILE=1 -DFOLLY_USE_LIBCPP=1 -DFOLLY_CFG_NO_COROUTINES=1 -DLLAMA_METAL -DRCT_NEW_ARCH_ENABLED=1 -DFBJSRT_EXPORTED=1",
57
57
  "CLANG_CXX_LANGUAGE_STANDARD" => "c++17",
58
58
  "GCC_OPTIMIZATION_LEVEL" => "3", # Maximum optimization
59
59
  "SWIFT_OPTIMIZATION_LEVEL" => "-O",
@@ -78,9 +78,17 @@ add_library(
78
78
  ${CPP_DIR}/rn-completion.cpp
79
79
  )
80
80
 
81
- # Suppress unused function warnings for llama.cpp code
82
- target_compile_options(common PRIVATE -Wno-unused-function)
83
- target_compile_options(RNLlamaCpp PRIVATE -Wno-unused-function)
81
+ # Extra warning suppressions for Expo SDK 54 (which treats some warnings as errors) go here; none are currently set
82
+ target_compile_options(common PRIVATE )
83
+
84
+ # Use React Native's compile options function for proper C++ flags and RN_SERIALIZABLE_STATE
85
+ if(ReactAndroid_VERSION_MINOR GREATER_EQUAL 80)
86
+ # Apply React Native's compile options, then suppress unused-function warnings for the RNLlamaCpp target
87
+ target_compile_reactnative_options(RNLlamaCpp PRIVATE)
88
+ target_compile_options(RNLlamaCpp PRIVATE -Wno-unused-function)
89
+ else()
90
+ target_compile_options(RNLlamaCpp PRIVATE -Wno-unused-function)
91
+ endif()
84
92
 
85
93
  # Check if Vulkan backend library is available
86
94
  set(VULKAN_BACKEND_AVAILABLE FALSE)
@@ -18,7 +18,7 @@ namespace facebook::react {
18
18
 
19
19
  #pragma mark - NativeRNLlamaCppLlamaModelParams
20
20
 
21
- template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9, typename P10, typename P11, typename P12, typename P13, typename P14, typename P15, typename P16, typename P17, typename P18, typename P19, typename P20, typename P21, typename P22, typename P23>
21
+ template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9, typename P10, typename P11, typename P12, typename P13, typename P14, typename P15, typename P16, typename P17, typename P18, typename P19, typename P20, typename P21, typename P22, typename P23, typename P24, typename P25, typename P26, typename P27, typename P28>
22
22
  struct NativeRNLlamaCppLlamaModelParams {
23
23
  P0 model;
24
24
  P1 n_ctx;
@@ -42,10 +42,15 @@ struct NativeRNLlamaCppLlamaModelParams {
42
42
  P19 chat_template;
43
43
  P20 use_jinja;
44
44
  P21 verbose;
45
- P22 lora_adapters;
46
- P23 grammar;
45
+ P22 reasoning_budget;
46
+ P23 reasoning_format;
47
+ P24 thinking_forced_open;
48
+ P25 parse_tool_calls;
49
+ P26 parallel_tool_calls;
50
+ P27 lora_adapters;
51
+ P28 grammar;
47
52
  bool operator==(const NativeRNLlamaCppLlamaModelParams &other) const {
48
- return model == other.model && n_ctx == other.n_ctx && n_batch == other.n_batch && n_ubatch == other.n_ubatch && n_threads == other.n_threads && n_keep == other.n_keep && n_gpu_layers == other.n_gpu_layers && use_mmap == other.use_mmap && use_mlock == other.use_mlock && vocab_only == other.vocab_only && embedding == other.embedding && seed == other.seed && rope_freq_base == other.rope_freq_base && rope_freq_scale == other.rope_freq_scale && yarn_ext_factor == other.yarn_ext_factor && yarn_attn_factor == other.yarn_attn_factor && yarn_beta_fast == other.yarn_beta_fast && yarn_beta_slow == other.yarn_beta_slow && logits_all == other.logits_all && chat_template == other.chat_template && use_jinja == other.use_jinja && verbose == other.verbose && lora_adapters == other.lora_adapters && grammar == other.grammar;
53
+ return model == other.model && n_ctx == other.n_ctx && n_batch == other.n_batch && n_ubatch == other.n_ubatch && n_threads == other.n_threads && n_keep == other.n_keep && n_gpu_layers == other.n_gpu_layers && use_mmap == other.use_mmap && use_mlock == other.use_mlock && vocab_only == other.vocab_only && embedding == other.embedding && seed == other.seed && rope_freq_base == other.rope_freq_base && rope_freq_scale == other.rope_freq_scale && yarn_ext_factor == other.yarn_ext_factor && yarn_attn_factor == other.yarn_attn_factor && yarn_beta_fast == other.yarn_beta_fast && yarn_beta_slow == other.yarn_beta_slow && logits_all == other.logits_all && chat_template == other.chat_template && use_jinja == other.use_jinja && verbose == other.verbose && reasoning_budget == other.reasoning_budget && reasoning_format == other.reasoning_format && thinking_forced_open == other.thinking_forced_open && parse_tool_calls == other.parse_tool_calls && parallel_tool_calls == other.parallel_tool_calls && lora_adapters == other.lora_adapters && grammar == other.grammar;
49
54
  }
50
55
  };
51
56
 
@@ -80,6 +85,11 @@ struct NativeRNLlamaCppLlamaModelParamsBridging {
80
85
  bridging::fromJs<decltype(types.chat_template)>(rt, value.getProperty(rt, "chat_template"), jsInvoker),
81
86
  bridging::fromJs<decltype(types.use_jinja)>(rt, value.getProperty(rt, "use_jinja"), jsInvoker),
82
87
  bridging::fromJs<decltype(types.verbose)>(rt, value.getProperty(rt, "verbose"), jsInvoker),
88
+ bridging::fromJs<decltype(types.reasoning_budget)>(rt, value.getProperty(rt, "reasoning_budget"), jsInvoker),
89
+ bridging::fromJs<decltype(types.reasoning_format)>(rt, value.getProperty(rt, "reasoning_format"), jsInvoker),
90
+ bridging::fromJs<decltype(types.thinking_forced_open)>(rt, value.getProperty(rt, "thinking_forced_open"), jsInvoker),
91
+ bridging::fromJs<decltype(types.parse_tool_calls)>(rt, value.getProperty(rt, "parse_tool_calls"), jsInvoker),
92
+ bridging::fromJs<decltype(types.parallel_tool_calls)>(rt, value.getProperty(rt, "parallel_tool_calls"), jsInvoker),
83
93
  bridging::fromJs<decltype(types.lora_adapters)>(rt, value.getProperty(rt, "lora_adapters"), jsInvoker),
84
94
  bridging::fromJs<decltype(types.grammar)>(rt, value.getProperty(rt, "grammar"), jsInvoker)};
85
95
  return result;
@@ -174,6 +184,26 @@ struct NativeRNLlamaCppLlamaModelParamsBridging {
174
184
  return bridging::toJs(rt, value);
175
185
  }
176
186
 
187
+ static double reasoning_budgetToJs(jsi::Runtime &rt, decltype(types.reasoning_budget) value) {
188
+ return bridging::toJs(rt, value);
189
+ }
190
+
191
+ static jsi::String reasoning_formatToJs(jsi::Runtime &rt, decltype(types.reasoning_format) value) {
192
+ return bridging::toJs(rt, value);
193
+ }
194
+
195
+ static bool thinking_forced_openToJs(jsi::Runtime &rt, decltype(types.thinking_forced_open) value) {
196
+ return bridging::toJs(rt, value);
197
+ }
198
+
199
+ static bool parse_tool_callsToJs(jsi::Runtime &rt, decltype(types.parse_tool_calls) value) {
200
+ return bridging::toJs(rt, value);
201
+ }
202
+
203
+ static bool parallel_tool_callsToJs(jsi::Runtime &rt, decltype(types.parallel_tool_calls) value) {
204
+ return bridging::toJs(rt, value);
205
+ }
206
+
177
207
  static jsi::Array lora_adaptersToJs(jsi::Runtime &rt, decltype(types.lora_adapters) value) {
178
208
  return bridging::toJs(rt, value);
179
209
  }
@@ -252,6 +282,21 @@ struct NativeRNLlamaCppLlamaModelParamsBridging {
252
282
  if (value.verbose) {
253
283
  result.setProperty(rt, "verbose", bridging::toJs(rt, value.verbose.value(), jsInvoker));
254
284
  }
285
+ if (value.reasoning_budget) {
286
+ result.setProperty(rt, "reasoning_budget", bridging::toJs(rt, value.reasoning_budget.value(), jsInvoker));
287
+ }
288
+ if (value.reasoning_format) {
289
+ result.setProperty(rt, "reasoning_format", bridging::toJs(rt, value.reasoning_format.value(), jsInvoker));
290
+ }
291
+ if (value.thinking_forced_open) {
292
+ result.setProperty(rt, "thinking_forced_open", bridging::toJs(rt, value.thinking_forced_open.value(), jsInvoker));
293
+ }
294
+ if (value.parse_tool_calls) {
295
+ result.setProperty(rt, "parse_tool_calls", bridging::toJs(rt, value.parse_tool_calls.value(), jsInvoker));
296
+ }
297
+ if (value.parallel_tool_calls) {
298
+ result.setProperty(rt, "parallel_tool_calls", bridging::toJs(rt, value.parallel_tool_calls.value(), jsInvoker));
299
+ }
255
300
  if (value.lora_adapters) {
256
301
  result.setProperty(rt, "lora_adapters", bridging::toJs(rt, value.lora_adapters.value(), jsInvoker));
257
302
  }
@@ -64,8 +64,6 @@ extern "C" {
64
64
 
65
65
  typedef struct llama_memory_i * llama_memory_t;
66
66
 
67
- struct llama_kv_cache; // DEPRECATED (use llama_memory instead)
68
-
69
67
  typedef int32_t llama_pos;
70
68
  typedef int32_t llama_token;
71
69
  typedef int32_t llama_seq_id;
@@ -152,6 +150,7 @@ extern "C" {
152
150
  //LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // removed from gguf files, use Q4_0 and runtime repack
153
151
  LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
154
152
  LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
153
+ LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors
155
154
 
156
155
  LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
157
156
  };
@@ -284,10 +283,11 @@ extern "C" {
284
283
  const struct llama_model_kv_override * kv_overrides;
285
284
 
286
285
  // Keep the booleans together to avoid misalignment during copy-by-value.
287
- bool vocab_only; // only load the vocabulary, no weights
288
- bool use_mmap; // use mmap if possible
289
- bool use_mlock; // force system to keep model in RAM
290
- bool check_tensors; // validate model tensor data
286
+ bool vocab_only; // only load the vocabulary, no weights
287
+ bool use_mmap; // use mmap if possible
288
+ bool use_mlock; // force system to keep model in RAM
289
+ bool check_tensors; // validate model tensor data
290
+ bool use_extra_bufts; // use extra buffer types (used for weight repacking)
291
291
  };
292
292
 
293
293
  // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
@@ -312,7 +312,7 @@ extern "C" {
312
312
  float yarn_beta_fast; // YaRN low correction dim
313
313
  float yarn_beta_slow; // YaRN high correction dim
314
314
  uint32_t yarn_orig_ctx; // YaRN original context size
315
- float defrag_thold; // defragment the KV cache if holes/size > thold, <= 0 disabled (default)
315
+ float defrag_thold; // [DEPRECATED] defragment the KV cache if holes/size > thold, <= 0 disabled (default)
316
316
 
317
317
  ggml_backend_sched_eval_callback cb_eval;
318
318
  void * cb_eval_user_data;
@@ -467,8 +467,6 @@ extern "C" {
467
467
  LLAMA_API llama_memory_t llama_get_memory (const struct llama_context * ctx);
468
468
  LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type
469
469
 
470
- DEPRECATED(LLAMA_API struct llama_kv_cache * llama_get_kv_self(struct llama_context * ctx), "use llama_get_memory instead");
471
-
472
470
  LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
473
471
  LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model);
474
472
 
@@ -537,6 +535,9 @@ extern "C" {
537
535
  // Returns true if the model is recurrent (like Mamba, RWKV, etc.)
538
536
  LLAMA_API bool llama_model_is_recurrent(const struct llama_model * model);
539
537
 
538
+ // Returns true if the model is diffusion-based (like LLaDA, Dream, etc.)
539
+ LLAMA_API bool llama_model_is_diffusion(const struct llama_model * model);
540
+
540
541
  // Returns 0 on success
541
542
  LLAMA_API uint32_t llama_model_quantize(
542
543
  const char * fname_inp,
@@ -552,6 +553,24 @@ extern "C" {
552
553
  struct llama_model * model,
553
554
  const char * path_lora);
554
555
 
556
+ // Functions to access the adapter's GGUF metadata scalar values
557
+ // - The functions return the length of the string on success, or -1 on failure
558
+ // - The output string is always null-terminated and cleared on failure
559
+ // - When retrieving a string, an extra byte must be allocated to account for the null terminator
560
+ // - GGUF array values are not supported by these functions
561
+
562
+ // Get metadata value as a string by key name
563
+ LLAMA_API int32_t llama_adapter_meta_val_str(const struct llama_adapter_lora * adapter, const char * key, char * buf, size_t buf_size);
564
+
565
+ // Get the number of metadata key/value pairs
566
+ LLAMA_API int32_t llama_adapter_meta_count(const struct llama_adapter_lora * adapter);
567
+
568
+ // Get metadata key name by index
569
+ LLAMA_API int32_t llama_adapter_meta_key_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size);
570
+
571
+ // Get metadata value as a string by index
572
+ LLAMA_API int32_t llama_adapter_meta_val_str_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size);
573
+
555
574
  // Manually free a LoRA adapter
556
575
  // Note: loaded adapters will be free when the associated model is deleted
557
576
  LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);
@@ -662,111 +681,6 @@ extern "C" {
662
681
  // Check if the memory supports shifting
663
682
  LLAMA_API bool llama_memory_can_shift(llama_memory_t mem);
664
683
 
665
- //
666
- // KV cache for self-attention (TODO: deprecate in favor of llama_memory)
667
- //
668
-
669
- // Returns the number of tokens in the KV cache (slow, use only for debug)
670
- // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
671
- DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx),
672
- "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
673
-
674
- // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
675
- DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx),
676
- "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
677
-
678
- // Clear the KV cache - both cell info is erased and KV data is zeroed
679
- DEPRECATED(LLAMA_API void llama_kv_self_clear(
680
- struct llama_context * ctx),
681
- "Use llama_memory_clear() instead");
682
-
683
- // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
684
- // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
685
- // seq_id < 0 : match any sequence
686
- // p0 < 0 : [0, p1]
687
- // p1 < 0 : [p0, inf)
688
- DEPRECATED(LLAMA_API bool llama_kv_self_seq_rm(
689
- struct llama_context * ctx,
690
- llama_seq_id seq_id,
691
- llama_pos p0,
692
- llama_pos p1),
693
- "Use llama_memory_seq_rm() instead");
694
-
695
- // Copy all tokens that belong to the specified sequence to another sequence
696
- // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
697
- // p0 < 0 : [0, p1]
698
- // p1 < 0 : [p0, inf)
699
- DEPRECATED(LLAMA_API void llama_kv_self_seq_cp(
700
- struct llama_context * ctx,
701
- llama_seq_id seq_id_src,
702
- llama_seq_id seq_id_dst,
703
- llama_pos p0,
704
- llama_pos p1),
705
- "Use llama_memory_seq_cp() instead");
706
-
707
- // Removes all tokens that do not belong to the specified sequence
708
- DEPRECATED(LLAMA_API void llama_kv_self_seq_keep(
709
- struct llama_context * ctx,
710
- llama_seq_id seq_id),
711
- "Use llama_memory_seq_keep() instead");
712
-
713
- // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
714
- // If the KV cache is RoPEd, the KV data is updated accordingly:
715
- // - lazily on next llama_decode()
716
- // p0 < 0 : [0, p1]
717
- // p1 < 0 : [p0, inf)
718
- DEPRECATED(LLAMA_API void llama_kv_self_seq_add(
719
- struct llama_context * ctx,
720
- llama_seq_id seq_id,
721
- llama_pos p0,
722
- llama_pos p1,
723
- llama_pos delta),
724
- "Use llama_memory_seq_add() instead");
725
-
726
- // Integer division of the positions by factor of `d > 1`
727
- // If the KV cache is RoPEd, the KV data is updated accordingly:
728
- // - lazily on next llama_decode()
729
- // p0 < 0 : [0, p1]
730
- // p1 < 0 : [p0, inf)
731
- DEPRECATED(LLAMA_API void llama_kv_self_seq_div(
732
- struct llama_context * ctx,
733
- llama_seq_id seq_id,
734
- llama_pos p0,
735
- llama_pos p1,
736
- int d),
737
- "Use llama_memory_seq_div() instead");
738
-
739
- // Returns the smallest position present in the KV cache for the specified sequence
740
- // This is typically non-zero only for SWA caches
741
- // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
742
- // Return -1 if the sequence is empty
743
- DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_min(
744
- struct llama_context * ctx,
745
- llama_seq_id seq_id),
746
- "Use llama_memory_seq_pos_min() instead");
747
-
748
- // Returns the largest position present in the KV cache for the specified sequence
749
- // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
750
- // Return -1 if the sequence is empty
751
- DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_max(
752
- struct llama_context * ctx,
753
- llama_seq_id seq_id),
754
- "Use llama_memory_seq_pos_max() instead");
755
-
756
- // Defragment the KV cache
757
- // This will be applied:
758
- // - lazily on next llama_decode()
759
- DEPRECATED(LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx),
760
- "simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'");
761
-
762
- // Check if the context supports KV cache shifting
763
- DEPRECATED(LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx),
764
- "use llama_memory_can_shift() instead");
765
-
766
- // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
767
- DEPRECATED(LLAMA_API void llama_kv_self_update(struct llama_context * ctx),
768
- "simply remove this call, updates are applied lazily on the next llama_decode()");
769
-
770
684
  //
771
685
  // State / sessions
772
686
  //
@@ -865,6 +779,29 @@ extern "C" {
865
779
  size_t n_token_capacity,
866
780
  size_t * n_token_count_out);
867
781
 
782
+ #define LLAMA_STATE_SEQ_FLAGS_SWA_ONLY 1
783
+
784
+ typedef uint32_t llama_state_seq_flags;
785
+
786
+ LLAMA_API size_t llama_state_seq_get_size_ext(
787
+ struct llama_context * ctx,
788
+ llama_seq_id seq_id,
789
+ llama_state_seq_flags flags);
790
+
791
+ LLAMA_API size_t llama_state_seq_get_data_ext(
792
+ struct llama_context * ctx,
793
+ uint8_t * dst,
794
+ size_t size,
795
+ llama_seq_id seq_id,
796
+ llama_state_seq_flags flags);
797
+
798
+ LLAMA_API size_t llama_state_seq_set_data_ext(
799
+ struct llama_context * ctx,
800
+ const uint8_t * src,
801
+ size_t size,
802
+ llama_seq_id dest_seq_id,
803
+ llama_state_seq_flags flags);
804
+
868
805
  //
869
806
  // Decoding
870
807
  //
@@ -1432,6 +1369,8 @@ extern "C" {
1432
1369
 
1433
1370
  ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
1434
1371
  void * get_opt_pars_ud; // userdata for calculating optimizer parameters
1372
+
1373
+ enum ggml_opt_optimizer_type optimizer_type;
1435
1374
  };
1436
1375
 
1437
1376
  LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params);
@@ -948,16 +948,8 @@ jsi::Value LlamaCppModel::embeddingJsi(jsi::Runtime& rt, const jsi::Value* args,
948
948
  throw std::runtime_error("Invalid embedding dimension");
949
949
  }
950
950
 
951
- // For OpenAI compatibility, default to mean pooling
952
- enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_MEAN;
953
- if (options.hasProperty(rt, "pooling") && options.getProperty(rt, "pooling").isString()) {
954
- std::string pooling = options.getProperty(rt, "pooling").getString(rt).utf8(rt);
955
- if (pooling == "last") {
956
- pooling_type = LLAMA_POOLING_TYPE_LAST;
957
- } else if (pooling == "cls" || pooling == "first") {
958
- pooling_type = LLAMA_POOLING_TYPE_CLS;
959
- }
960
- }
951
+ // Note: Pooling is handled automatically by llama_get_embeddings()
952
+ // The function returns the appropriate embedding based on the model's configuration
961
953
 
962
954
  // Get the embeddings
963
955
  std::vector<float> embedding_vec(n_embd);