@novastera-oss/llamarn 0.2.7 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (319)
  1. package/android/src/main/cpp/include/llama.h +8 -3
  2. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  3. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  6. package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
  7. package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
  8. package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
  9. package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
  10. package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
  11. package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
  12. package/android/src/main/jniLibs/x86/libggml.so +0 -0
  13. package/android/src/main/jniLibs/x86/libllama.so +0 -0
  14. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  15. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  16. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  17. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  18. package/cpp/LlamaCppModel.cpp +56 -22
  19. package/cpp/build-info.cpp +2 -2
  20. package/cpp/llama.cpp/CMakeLists.txt +1 -2
  21. package/cpp/llama.cpp/README.md +4 -5
  22. package/cpp/llama.cpp/build-xcframework.sh +1 -1
  23. package/cpp/llama.cpp/common/CMakeLists.txt +4 -5
  24. package/cpp/llama.cpp/common/arg.cpp +24 -0
  25. package/cpp/llama.cpp/common/chat.cpp +37 -20
  26. package/cpp/llama.cpp/common/chat.h +2 -0
  27. package/cpp/llama.cpp/common/common.cpp +3 -0
  28. package/cpp/llama.cpp/common/common.h +5 -0
  29. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
  30. package/cpp/llama.cpp/convert_hf_to_gguf.py +860 -23
  31. package/cpp/llama.cpp/convert_hf_to_gguf_update.py +9 -0
  32. package/cpp/llama.cpp/ggml/CMakeLists.txt +8 -2
  33. package/cpp/llama.cpp/ggml/include/ggml-backend.h +1 -1
  34. package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
  35. package/cpp/llama.cpp/ggml/include/ggml.h +206 -10
  36. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +17 -1
  37. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +0 -8
  38. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +36 -18
  39. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +68 -5
  40. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +1 -1
  41. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +16 -2
  42. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +37 -3
  43. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
  44. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +1027 -1038
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
  49. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
  50. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
  51. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
  52. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
  53. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
  54. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
  55. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
  56. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
  57. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +111 -103
  58. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
  59. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
  60. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +1405 -240
  61. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +8 -0
  62. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
  63. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +56 -40
  64. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +212 -34
  65. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +35 -11
  66. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +187 -54
  67. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +71 -29
  68. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  69. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  70. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  71. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  72. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu +22 -0
  73. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +5 -0
  74. package/cpp/llama.cpp/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
  75. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +4 -1
  76. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +8 -4
  77. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +6 -4
  78. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +14 -12
  79. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +5 -3
  80. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +15 -10
  81. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +12 -6
  82. package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +8 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +269 -110
  84. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
  85. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
  86. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +2 -8
  87. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
  88. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
  89. package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cu +21 -27
  90. package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cu +8 -6
  91. package/cpp/llama.cpp/ggml/src/ggml-cuda/softmax.cu +119 -58
  92. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
  93. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +192 -52
  94. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
  95. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  96. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +97 -0
  97. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +11 -0
  98. package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cu +92 -6
  99. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +14 -5
  100. package/cpp/llama.cpp/ggml/src/ggml-impl.h +125 -183
  101. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -2
  102. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +51 -9
  103. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +394 -80
  104. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +616 -239
  105. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
  106. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +3 -0
  107. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +741 -571
  108. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
  109. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/glu.cl +337 -0
  110. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  111. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
  112. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +95 -0
  113. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +24 -11
  114. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +24 -11
  115. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +24 -11
  116. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +24 -11
  117. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
  118. package/cpp/llama.cpp/ggml/src/ggml-quants.c +6 -6
  119. package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  120. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
  121. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -24
  122. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
  123. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
  124. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +99 -166
  125. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +94 -72
  126. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
  127. package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
  128. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +697 -1098
  129. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +18 -9
  130. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +6 -9
  131. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +104 -62
  132. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
  133. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
  134. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
  135. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +132 -201
  136. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
  137. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +39 -38
  138. package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +131 -0
  139. package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  140. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
  141. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  142. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  143. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
  144. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
  145. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +12 -1
  146. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +767 -292
  147. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
  148. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +58 -7
  149. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +28 -23
  150. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +14 -9
  151. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +38 -32
  152. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +32 -27
  153. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +44 -12
  154. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  155. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  156. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  157. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  158. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +15 -0
  159. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
  160. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +128 -72
  161. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +38 -9
  162. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  163. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +12 -3
  164. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  165. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
  166. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
  167. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
  168. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
  169. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +20 -4
  170. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  171. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
  172. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +23 -3
  173. package/cpp/llama.cpp/ggml/src/ggml.c +449 -72
  174. package/cpp/llama.cpp/ggml/src/gguf.cpp +13 -2
  175. package/cpp/llama.cpp/gguf-py/gguf/constants.py +285 -0
  176. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +27 -0
  177. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +137 -21
  178. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +109 -7
  179. package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
  180. package/cpp/llama.cpp/include/llama.h +8 -43
  181. package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
  182. package/cpp/llama.cpp/src/llama-arch.cpp +265 -3
  183. package/cpp/llama.cpp/src/llama-arch.h +36 -1
  184. package/cpp/llama.cpp/src/llama-batch.cpp +596 -359
  185. package/cpp/llama.cpp/src/llama-batch.h +105 -70
  186. package/cpp/llama.cpp/src/llama-chat.cpp +26 -6
  187. package/cpp/llama.cpp/src/llama-chat.h +1 -0
  188. package/cpp/llama.cpp/src/llama-context.cpp +101 -107
  189. package/cpp/llama.cpp/src/llama-context.h +13 -13
  190. package/cpp/llama.cpp/src/llama-graph.cpp +286 -404
  191. package/cpp/llama.cpp/src/llama-graph.h +78 -79
  192. package/cpp/llama.cpp/src/llama-hparams.cpp +11 -1
  193. package/cpp/llama.cpp/src/llama-hparams.h +11 -0
  194. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +74 -66
  195. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +23 -26
  196. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +312 -157
  197. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +79 -46
  198. package/cpp/llama.cpp/src/llama-kv-cells.h +97 -21
  199. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +73 -69
  200. package/cpp/llama.cpp/src/llama-memory-hybrid.h +19 -22
  201. package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +88 -77
  202. package/cpp/llama.cpp/src/llama-memory-recurrent.h +15 -20
  203. package/cpp/llama.cpp/src/llama-memory.cpp +17 -0
  204. package/cpp/llama.cpp/src/llama-memory.h +21 -22
  205. package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
  206. package/cpp/llama.cpp/src/llama-model.cpp +5301 -2922
  207. package/cpp/llama.cpp/src/llama-model.h +40 -0
  208. package/cpp/llama.cpp/src/llama-quant.cpp +88 -5
  209. package/cpp/llama.cpp/src/llama-vocab.cpp +37 -3
  210. package/cpp/llama.cpp/src/llama-vocab.h +42 -0
  211. package/cpp/rn-utils.h +3 -0
  212. package/ios/include/chat.h +2 -0
  213. package/ios/include/common.h +5 -0
  214. package/ios/include/llama.h +8 -43
  215. package/ios/libs/llama.xcframework/Info.plist +19 -19
  216. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  217. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5059 -4863
  218. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h +1 -1
  219. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  220. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +206 -10
  221. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +8 -43
  222. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  223. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  224. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4834
  225. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3889 -3742
  226. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
  227. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  228. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +206 -10
  229. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -43
  230. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  231. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  232. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4834
  233. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3891 -3744
  234. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-backend.h +1 -1
  235. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
  236. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +206 -10
  237. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +8 -43
  238. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-backend.h +1 -1
  239. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
  240. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +206 -10
  241. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +8 -43
  242. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  243. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-backend.h +1 -1
  244. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
  245. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +206 -10
  246. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +8 -43
  247. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  248. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  249. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  250. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5059 -4863
  251. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-backend.h +1 -1
  252. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  253. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +206 -10
  254. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +8 -43
  255. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  256. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  257. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4834
  258. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3889 -3742
  259. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
  260. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  261. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +206 -10
  262. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -43
  263. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  264. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  265. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5095 -4900
  266. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-backend.h +1 -1
  267. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  268. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +206 -10
  269. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +8 -43
  270. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  271. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  272. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5066 -4871
  273. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3919 -3773
  274. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
  275. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  276. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +206 -10
  277. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -43
  278. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  279. package/package.json +1 -1
  280. package/cpp/llama.cpp/ggml/include/ggml-kompute.h +0 -50
  281. package/cpp/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  282. package/cpp/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  283. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
  284. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
  285. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
  286. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
  287. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
  288. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
  289. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
  290. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
  291. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
  292. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
  293. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
  294. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
  295. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
  296. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
  297. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
  298. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
  299. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
  300. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
  301. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
  302. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
  303. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
  304. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
  305. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
  306. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
  307. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
  308. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
  309. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
  310. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
  311. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
  312. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
  313. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
  314. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
  315. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
  316. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
  317. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
  318. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
  319. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
@@ -390,6 +390,7 @@ extern "C" {
390
390
  void * imatrix; // pointer to importance matrix data
391
391
  void * kv_overrides; // pointer to vector containing overrides
392
392
  void * tensor_types; // pointer to vector containing tensor types
393
+ void * prune_layers; // pointer to vector containing layer indices to prune
393
394
  } llama_model_quantize_params;
394
395
 
395
396
  typedef struct llama_logit_bias {
@@ -943,12 +944,14 @@ extern "C" {
943
944
  // Requires the context to have a memory.
944
945
  // For encode-decoder contexts, processes the batch using the decoder.
945
946
  // Positive return values do not mean a fatal error, but rather a warning.
946
- // Upon non-zero return values, the memory state is restored to the state before this call
947
+ // Upon fatal-error or abort, the ubatches that managed to be processed will remain in the memory state of the context
948
+ // To handle this correctly, query the memory state using llama_memory_seq_pos_min() and llama_memory_seq_pos_max()
949
+ // Upon other return values, the memory state is restored to the state before this call
947
950
  // 0 - success
948
951
  // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
949
- // 2 - aborted
952
+ // 2 - aborted (processed ubatches will remain in the context's memory)
950
953
  // -1 - invalid input batch
951
- // < -1 - error
954
+ // < -1 - fatal error (processed ubatches will remain in the context's memory)
952
955
  LLAMA_API int32_t llama_decode(
953
956
  struct llama_context * ctx,
954
957
  struct llama_batch batch);
@@ -1044,6 +1047,7 @@ extern "C" {
1044
1047
 
1045
1048
  LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
1046
1049
  LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
1050
+ LLAMA_API bool llama_vocab_get_add_sep(const struct llama_vocab * vocab);
1047
1051
 
1048
1052
  LLAMA_API llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab);
1049
1053
  LLAMA_API llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab);
@@ -1087,6 +1091,7 @@ extern "C" {
1087
1091
  /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
1088
1092
  /// @return Returns the number of tokens on success, no more than n_tokens_max
1089
1093
  /// @return Returns a negative number on failure - the number of tokens that would have been returned
1094
+ /// @return Returns INT32_MIN on overflow (e.g., tokenization result size exceeds int32_t limit)
1090
1095
  /// @param add_special Allow to add BOS and EOS tokens if model is configured to do so.
1091
1096
  /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
1092
1097
  /// as plaintext. Does not insert a leading space.
@@ -5,6 +5,7 @@
5
5
  #include <cstdlib>
6
6
  #include <ctime>
7
7
  #include <chrono>
8
+ #include <thread>
8
9
  #include <fstream>
9
10
  #include <iostream>
10
11
  #include <random>
@@ -50,33 +51,60 @@ LlamaCppModel::~LlamaCppModel() {
50
51
  }
51
52
 
52
53
  void LlamaCppModel::release() {
53
- // Cancel any ongoing predictions
54
+ // Signal completion to stop and wait for it to finish gracefully
54
55
  if (is_predicting_) {
55
56
  should_stop_completion_ = true;
56
57
 
57
- // Optionally wait a bit for completion to stop
58
+ // Wait more patiently for completion to stop, with proper backoff
58
59
  int retry = 0;
59
- while (is_predicting_ && retry < 10) {
60
- std::this_thread::sleep_for(std::chrono::milliseconds(10));
60
+ while (is_predicting_ && retry < 100) { // Increased from 10 to 100
61
+ std::this_thread::sleep_for(std::chrono::milliseconds(retry < 50 ? 10 : 50));
61
62
  retry++;
62
63
  }
64
+
65
+ // Force stop if still predicting
66
+ if (is_predicting_) {
67
+ is_predicting_ = false;
68
+ }
63
69
  }
64
70
 
65
- // Clean up our resources
71
+ // Clean up our resources with proper mutex protection
66
72
  if (rn_ctx_) {
73
+ std::lock_guard<std::mutex> lock(rn_ctx_->mutex);
74
+
75
+ // Clear KV cache before freeing context (following server.cpp pattern)
67
76
  if (rn_ctx_->ctx) {
77
+ try {
78
+ llama_memory_clear(llama_get_memory(rn_ctx_->ctx), true);
79
+ } catch (...) {
80
+ // Ignore errors during cache clearing
81
+ }
82
+
68
83
  llama_free(rn_ctx_->ctx);
69
84
  rn_ctx_->ctx = nullptr;
70
85
  }
71
86
 
87
+ // Free model after context (following server.cpp cleanup order)
72
88
  if (rn_ctx_->model) {
73
89
  llama_model_free(rn_ctx_->model);
74
90
  rn_ctx_->model = nullptr;
75
91
  }
76
92
 
93
+ // Clean up additional resources
94
+ rn_ctx_->vocab = nullptr; // This is owned by the model, so just null it
95
+ rn_ctx_->chat_templates.reset(); // Clean up chat templates
96
+ rn_ctx_->lora_adapters.clear(); // Clear LoRA adapters
97
+
98
+ // Reset state flags
99
+ rn_ctx_->model_loaded = false;
100
+
77
101
  // Note: rn_ctx_ itself is owned by the module, so we don't delete it here
78
102
  rn_ctx_ = nullptr;
79
103
  }
104
+
105
+ // Reset our internal state
106
+ should_stop_completion_ = false;
107
+ is_predicting_ = false;
80
108
  }
81
109
 
82
110
  int32_t LlamaCppModel::getVocabSize() const {
@@ -133,6 +161,10 @@ CompletionOptions LlamaCppModel::parseCompletionOptions(jsi::Runtime& rt, const
133
161
  options.min_p = obj.getProperty(rt, "min_p").asNumber();
134
162
  }
135
163
 
164
+ if (obj.hasProperty(rt, "presence_penalty") && !obj.getProperty(rt, "presence_penalty").isUndefined()) {
165
+ options.presence_penalty = obj.getProperty(rt, "presence_penalty").asNumber();
166
+ }
167
+
136
168
  if (obj.hasProperty(rt, "n_predict") && !obj.getProperty(rt, "n_predict").isUndefined()) {
137
169
  options.n_predict = obj.getProperty(rt, "n_predict").asNumber();
138
170
  } else if (obj.hasProperty(rt, "max_tokens") && !obj.getProperty(rt, "max_tokens").isUndefined()) {
@@ -365,13 +397,14 @@ CompletionResult LlamaCppModel::completion(const CompletionOptions& options, std
365
397
  std::lock_guard<std::mutex> lock(rn_ctx_->mutex);
366
398
 
367
399
  // Clear the context KV cache
368
- llama_kv_self_clear(rn_ctx_->ctx);
400
+ llama_memory_clear(llama_get_memory(rn_ctx_->ctx), true);
369
401
 
370
402
  // Store original sampling parameters to restore later
371
403
  float orig_temp = rn_ctx_->params.sampling.temp;
372
404
  float orig_top_p = rn_ctx_->params.sampling.top_p;
373
405
  float orig_top_k = rn_ctx_->params.sampling.top_k;
374
406
  float orig_min_p = rn_ctx_->params.sampling.min_p;
407
+ float orig_presence_penalty = rn_ctx_->params.sampling.penalty_present;
375
408
  int orig_n_predict = rn_ctx_->params.n_predict;
376
409
 
377
410
  // Set sampling parameters from options
@@ -379,6 +412,7 @@ CompletionResult LlamaCppModel::completion(const CompletionOptions& options, std
379
412
  rn_ctx_->params.sampling.top_p = options.top_p;
380
413
  rn_ctx_->params.sampling.top_k = options.top_k;
381
414
  rn_ctx_->params.sampling.min_p = options.min_p;
415
+ rn_ctx_->params.sampling.penalty_present = options.presence_penalty;
382
416
  rn_ctx_->params.n_predict = options.n_predict;
383
417
 
384
418
  // Check for a partial callback
@@ -426,6 +460,7 @@ CompletionResult LlamaCppModel::completion(const CompletionOptions& options, std
426
460
  rn_ctx_->params.sampling.top_p = orig_top_p;
427
461
  rn_ctx_->params.sampling.top_k = orig_top_k;
428
462
  rn_ctx_->params.sampling.min_p = orig_min_p;
463
+ rn_ctx_->params.sampling.penalty_present = orig_presence_penalty;
429
464
  rn_ctx_->params.n_predict = orig_n_predict;
430
465
 
431
466
  return result;
@@ -885,29 +920,28 @@ jsi::Value LlamaCppModel::embeddingJsi(jsi::Runtime& rt, const jsi::Value* args,
885
920
  }
886
921
 
887
922
  // Clear the context KV cache to ensure clean embedding
888
- llama_kv_self_clear(rn_ctx_->ctx);
923
+ llama_memory_clear(llama_get_memory(rn_ctx_->ctx), true);
889
924
 
890
925
  // Enable embedding mode
891
926
  llama_set_embeddings(rn_ctx_->ctx, true);
892
927
 
893
- // Evaluate tokens one by one
928
+ // Create and populate batch using common_batch functions (following server.cpp pattern)
929
+ llama_batch batch = llama_batch_init(tokens.size(), 0, 1);
930
+
931
+ common_batch_clear(batch);
894
932
  for (int i = 0; i < (int)tokens.size(); i++) {
895
- llama_token token = tokens[i];
896
- llama_batch batch = {
897
- /* n_tokens */ 1,
898
- /* token */ &token,
899
- /* embd */ nullptr,
900
- /* pos */ &i,
901
- /* n_seq_id */ nullptr,
902
- /* seq_id */ nullptr,
903
- /* logits */ nullptr
904
- };
905
-
906
- if (llama_decode(rn_ctx_->ctx, batch) != 0) {
907
- throw std::runtime_error("Failed to decode token for embedding");
908
- }
933
+ // For embeddings, we typically need logits for the last token (for pooling)
934
+ bool needs_logits = (i == (int)tokens.size() - 1);
935
+ common_batch_add(batch, tokens[i], i, {0}, needs_logits);
909
936
  }
910
937
 
938
+ if (llama_decode(rn_ctx_->ctx, batch) != 0) {
939
+ llama_batch_free(batch);
940
+ throw std::runtime_error("Failed to decode tokens for embedding");
941
+ }
942
+
943
+ llama_batch_free(batch);
944
+
911
945
  // Get embedding size from the model
912
946
  const int n_embd = llama_model_n_embd(rn_ctx_->model);
913
947
  if (n_embd <= 0) {
@@ -1,4 +1,4 @@
1
- int LLAMA_BUILD_NUMBER = 5709;
2
- char const *LLAMA_COMMIT = "d67341dc";
1
+ int LLAMA_BUILD_NUMBER = 5880;
2
+ char const *LLAMA_COMMIT = "3120413c";
3
3
  char const *LLAMA_COMPILER = "unknown";
4
4
  char const *LLAMA_BUILD_TARGET = "unknown";
@@ -95,7 +95,7 @@ endif()
95
95
  if (NOT DEFINED LLAMA_BUILD_COMMIT)
96
96
  set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
97
97
  endif()
98
- set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
98
+ set(LLAMA_INSTALL_VERSION 0.0.${LLAMA_BUILD_NUMBER})
99
99
 
100
100
  # override ggml options
101
101
  set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
@@ -120,7 +120,6 @@ endfunction()
120
120
 
121
121
  llama_option_depr(FATAL_ERROR LLAMA_CUBLAS GGML_CUDA)
122
122
  llama_option_depr(WARNING LLAMA_CUDA GGML_CUDA)
123
- llama_option_depr(WARNING LLAMA_KOMPUTE GGML_KOMPUTE)
124
123
  llama_option_depr(WARNING LLAMA_METAL GGML_METAL)
125
124
  llama_option_depr(WARNING LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
126
125
  llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE)
@@ -6,9 +6,9 @@
6
6
  [![Release](https://img.shields.io/github/v/release/ggml-org/llama.cpp)](https://github.com/ggml-org/llama.cpp/releases)
7
7
  [![Server](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml)
8
8
 
9
- [Roadmap](https://github.com/users/ggerganov/projects/7) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml)
9
+ [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml) / [ops](https://github.com/ggml-org/llama.cpp/blob/master/docs/ops.md)
10
10
 
11
- Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
11
+ LLM inference in C/C++
12
12
 
13
13
  ## Recent API changes
14
14
 
@@ -17,10 +17,9 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
17
17
 
18
18
  ## Hot topics
19
19
 
20
- - 🔥 Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
21
- - A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141](https://github.com/ggml-org/llama.cpp/pull/13141)), `libllava` will be deprecated
20
+ - Hot PRs: [All](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+) | [Open](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+is%3Aopen)
21
+ - Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
22
22
  - VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
23
- - Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
24
23
  - Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
25
24
  - Introducing GGUF-my-LoRA https://github.com/ggml-org/llama.cpp/discussions/10123
26
25
  - Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggml-org/llama.cpp/discussions/9669
@@ -1,4 +1,4 @@
1
- #!/bin/bash
1
+ #!/usr/bin/env bash
2
2
  #
3
3
  # Options
4
4
  IOS_MIN_OS_VERSION=16.4
@@ -86,8 +86,7 @@ if (LLAMA_CURL)
86
86
  endif()
87
87
  target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
88
88
  include_directories(${CURL_INCLUDE_DIRS})
89
- find_library(CURL_LIBRARY curl REQUIRED)
90
- set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
89
+ set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARIES})
91
90
  endif ()
92
91
 
93
92
  if (LLAMA_LLGUIDANCE)
@@ -112,13 +111,13 @@ if (LLAMA_LLGUIDANCE)
112
111
 
113
112
  ExternalProject_Add(llguidance_ext
114
113
  GIT_REPOSITORY https://github.com/guidance-ai/llguidance
115
- # v0.7.20 (+ fix to build on GCC 15):
116
- GIT_TAG b5b8b64dba11c4e4ee6b1d1450d3a3ae279891e8
114
+ # v1.0.1:
115
+ GIT_TAG d795912fedc7d393de740177ea9ea761e7905774
117
116
  PREFIX ${CMAKE_BINARY_DIR}/llguidance
118
117
  SOURCE_DIR ${LLGUIDANCE_SRC}
119
118
  BUILD_IN_SOURCE TRUE
120
119
  CONFIGURE_COMMAND ""
121
- BUILD_COMMAND cargo build --release
120
+ BUILD_COMMAND cargo build --release --package llguidance
122
121
  INSTALL_COMMAND ""
123
122
  BUILD_BYPRODUCTS ${LLGUIDANCE_PATH}/${LLGUIDANCE_LIB_NAME} ${LLGUIDANCE_PATH}/llguidance.h
124
123
  UPDATE_COMMAND ""
@@ -2706,6 +2706,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2706
2706
  params.embd_sep = value;
2707
2707
  }
2708
2708
  ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
2709
+ add_opt(common_arg(
2710
+ {"--cls-separator"}, "STRING",
2711
+ "separator of classification sequences (default \\t) for example \"<#seq#>\"",
2712
+ [](common_params & params, const std::string & value) {
2713
+ params.cls_sep = value;
2714
+ }
2715
+ ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
2709
2716
  add_opt(common_arg(
2710
2717
  {"--host"}, "HOST",
2711
2718
  string_format("ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: %s)", params.hostname.c_str()),
@@ -2727,6 +2734,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2727
2734
  params.public_path = value;
2728
2735
  }
2729
2736
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
2737
+ add_opt(common_arg(
2738
+ {"--api-prefix"}, "PREFIX",
2739
+ string_format("prefix path the server serves from, without the trailing slash (default: %s)", params.api_prefix.c_str()),
2740
+ [](common_params & params, const std::string & value) {
2741
+ params.api_prefix = value;
2742
+ }
2743
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
2730
2744
  add_opt(common_arg(
2731
2745
  {"--no-webui"},
2732
2746
  string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
@@ -2787,6 +2801,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2787
2801
  params.ssl_file_cert = value;
2788
2802
  }
2789
2803
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
2804
+ add_opt(common_arg(
2805
+ {"--chat-template-kwargs"}, "STRING",
2806
+ string_format("sets additional params for the json template parser"),
2807
+ [](common_params & params, const std::string & value) {
2808
+ auto parsed = json::parse(value);
2809
+ for (const auto & item : parsed.items()) {
2810
+ params.default_template_kwargs[item.key()] = item.value().dump();
2811
+ }
2812
+ }
2813
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
2790
2814
  add_opt(common_arg(
2791
2815
  {"-to", "--timeout"}, "N",
2792
2816
  string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
@@ -17,6 +17,8 @@
17
17
  #include <string>
18
18
  #include <vector>
19
19
 
20
+ using json = nlohmann::ordered_json;
21
+
20
22
  static std::string format_time(const std::chrono::system_clock::time_point & now, const std::string & format) {
21
23
  auto time = std::chrono::system_clock::to_time_t(now);
22
24
  auto local_time = *std::localtime(&time);
@@ -140,6 +142,7 @@ struct templates_params {
140
142
  bool add_generation_prompt = true;
141
143
  bool enable_thinking = true;
142
144
  std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
145
+ json extra_context;
143
146
  };
144
147
 
145
148
  common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
@@ -720,16 +723,23 @@ static void foreach_function(const json & tools, const std::function<void(const
720
723
 
721
724
  static std::string apply(
722
725
  const common_chat_template & tmpl,
723
- const nlohmann::ordered_json & messages,
724
- const nlohmann::ordered_json & tools,
725
- bool add_generation_prompt,
726
- const nlohmann::ordered_json & extra_context = nlohmann::ordered_json())
726
+ const struct templates_params & inputs,
727
+ const std::optional<json> & messages_override = std::nullopt,
728
+ const std::optional<json> & tools_override = std::nullopt,
729
+ const std::optional<json> & additional_context = std::nullopt)
727
730
  {
728
731
  minja::chat_template_inputs tmpl_inputs;
729
- tmpl_inputs.messages = messages;
730
- tmpl_inputs.tools = tools;
731
- tmpl_inputs.add_generation_prompt = add_generation_prompt;
732
- tmpl_inputs.extra_context = extra_context;
732
+ tmpl_inputs.messages = messages_override ? *messages_override : inputs.messages;
733
+ if (tools_override) {
734
+ tmpl_inputs.tools = *tools_override;
735
+ } else {
736
+ tmpl_inputs.tools = inputs.tools.empty() ? json() : inputs.tools;
737
+ }
738
+ tmpl_inputs.add_generation_prompt = inputs.add_generation_prompt;
739
+ tmpl_inputs.extra_context = inputs.extra_context;
740
+ if (additional_context) {
741
+ tmpl_inputs.extra_context.merge_patch(*additional_context);
742
+ }
733
743
  // TODO: add flag to control date/time, if only for testing purposes.
734
744
  // tmpl_inputs.now = std::chrono::system_clock::now();
735
745
 
@@ -828,7 +838,7 @@ static common_chat_params common_chat_params_init_generic(const common_chat_temp
828
838
  inputs.messages,
829
839
  "Respond in JSON format, either with `tool_call` (a request to call tools) or with `response` reply to the user's request");
830
840
 
831
- data.prompt = apply(tmpl, tweaked_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
841
+ data.prompt = apply(tmpl, inputs, /* messages_override= */ tweaked_messages);
832
842
  data.format = COMMON_CHAT_FORMAT_GENERIC;
833
843
  return data;
834
844
  }
@@ -904,7 +914,7 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat
904
914
  data.preserved_tokens = {
905
915
  "[TOOL_CALLS]",
906
916
  };
907
- data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
917
+ data.prompt = apply(tmpl, inputs);
908
918
  data.format = COMMON_CHAT_FORMAT_MISTRAL_NEMO;
909
919
  return data;
910
920
  }
@@ -934,7 +944,7 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_
934
944
  adjusted_messages.push_back(msg);
935
945
  }
936
946
  }
937
- data.prompt = apply(tmpl, adjusted_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {});
947
+ data.prompt = apply(tmpl, inputs, /* messages_override= */ adjusted_messages);
938
948
  data.format = COMMON_CHAT_FORMAT_COMMAND_R7B;
939
949
  if (string_ends_with(data.prompt, "<|START_THINKING|>")) {
940
950
  if (!inputs.enable_thinking) {
@@ -1122,7 +1132,7 @@ static common_chat_params common_chat_params_init_llama_3_x(const common_chat_te
1122
1132
  } else {
1123
1133
  data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
1124
1134
  }
1125
- data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {
1135
+ data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ std::nullopt, json {
1126
1136
  {"date_string", format_time(inputs.now, "%d %b %Y")},
1127
1137
  {"tools_in_user_message", false},
1128
1138
  {"builtin_tools", builtin_tools.empty() ? json() : builtin_tools},
@@ -1187,7 +1197,7 @@ static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool w
1187
1197
 
1188
1198
  static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_template & tmpl, const struct templates_params & inputs) {
1189
1199
  common_chat_params data;
1190
- auto prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
1200
+ auto prompt = apply(tmpl, inputs);
1191
1201
 
1192
1202
  // Hacks to fix the official (broken) prompt.
1193
1203
  // It is advisable to use --chat-template-file models/templates/llama-cpp-deepseek-r1.jinja instead,
@@ -1282,7 +1292,7 @@ static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
1282
1292
  static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
1283
1293
  LOG_DBG("%s\n", __func__);
1284
1294
  common_chat_params data;
1285
- data.prompt = apply(tmpl, inputs.messages, /* tools= */ nullptr, inputs.add_generation_prompt, {
1295
+ data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ json(), json {
1286
1296
  {"datetime", format_time(inputs.now, "%b %d %Y %H:%M:%S GMT")},
1287
1297
  {"functions", json(inputs.tools.empty() ? "" : inputs.tools.dump(2))},
1288
1298
  });
@@ -1338,7 +1348,7 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
1338
1348
  // Using ">>>f1\n", ">>>f2\n"... as trigger words for the grammar
1339
1349
  // If the function is python, we also allow raw python code (if the line after `python\n` doesn't start w/ opening `{`), which the model seems to prefer for multiline code.
1340
1350
  common_chat_params data;
1341
- data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
1351
+ data.prompt = apply(tmpl, inputs);
1342
1352
  data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2;
1343
1353
  if (inputs.tools.is_array() && !inputs.tools.empty()) {
1344
1354
  data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
@@ -1465,7 +1475,7 @@ static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(con
1465
1475
  data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
1466
1476
  }
1467
1477
 
1468
- data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
1478
+ data.prompt = apply(tmpl, inputs);
1469
1479
  // TODO: if (has_raw_python)
1470
1480
  return data;
1471
1481
  }
@@ -1498,14 +1508,15 @@ static void common_chat_parse_functionary_v3_1_llama_3_1(common_chat_msg_parser
1498
1508
  static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct templates_params & inputs) {
1499
1509
  common_chat_params data;
1500
1510
 
1501
- json additional_context = {
1511
+ json extra_context = json {
1502
1512
  {"enable_thinking", inputs.enable_thinking},
1503
1513
  };
1514
+ extra_context.update(inputs.extra_context);
1504
1515
 
1505
- data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, additional_context);
1516
+ data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ std::nullopt, extra_context);
1506
1517
  data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO;
1507
1518
  if (string_ends_with(data.prompt, "<think>\n")) {
1508
- if (!inputs.enable_thinking) {
1519
+ if (!extra_context["enable_thinking"]) {
1509
1520
  data.prompt += "</think>";
1510
1521
  } else {
1511
1522
  data.thinking_forced_open = true;
@@ -1691,7 +1702,7 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
1691
1702
 
1692
1703
  static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
1693
1704
  common_chat_params data;
1694
- data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
1705
+ data.prompt = apply(tmpl, inputs);
1695
1706
  data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
1696
1707
  data.grammar_lazy = false;
1697
1708
  if (!inputs.json_schema.is_null()) {
@@ -1722,6 +1733,12 @@ static common_chat_params common_chat_templates_apply_jinja(
1722
1733
  params.enable_thinking = inputs.enable_thinking;
1723
1734
  params.grammar = inputs.grammar;
1724
1735
  params.now = inputs.now;
1736
+
1737
+ params.extra_context = json::object();
1738
+ for (auto el : inputs.chat_template_kwargs) {
1739
+ params.extra_context[el.first] = json::parse(el.second);
1740
+ }
1741
+
1725
1742
  if (!inputs.json_schema.empty()) {
1726
1743
  params.json_schema = json::parse(inputs.json_schema);
1727
1744
  }
@@ -7,6 +7,7 @@
7
7
  #include <chrono>
8
8
  #include <string>
9
9
  #include <vector>
10
+ #include <map>
10
11
 
11
12
  struct common_chat_templates;
12
13
 
@@ -125,6 +126,7 @@ struct common_chat_templates_inputs {
125
126
  common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
126
127
  bool enable_thinking = true;
127
128
  std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
129
+ std::map<std::string, std::string> chat_template_kwargs;
128
130
  };
129
131
 
130
132
  struct common_chat_params {
@@ -1290,6 +1290,9 @@ std::vector<llama_token> common_tokenize(
1290
1290
  int n_tokens = text.length() + 2 * add_special;
1291
1291
  std::vector<llama_token> result(n_tokens);
1292
1292
  n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
1293
+ if (n_tokens == std::numeric_limits<int32_t>::min()) {
1294
+ throw std::runtime_error("Tokenization failed: input text too large, tokenization result exceeds int32_t limit");
1295
+ }
1293
1296
  if (n_tokens < 0) {
1294
1297
  result.resize(-n_tokens);
1295
1298
  int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
@@ -8,6 +8,7 @@
8
8
  #include <string>
9
9
  #include <string_view>
10
10
  #include <vector>
11
+ #include <map>
11
12
  #include <sstream>
12
13
 
13
14
  #ifdef _WIN32
@@ -358,6 +359,7 @@ struct common_params {
358
359
  int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
359
360
  std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
360
361
  std::string embd_sep = "\n"; // separator of embeddings
362
+ std::string cls_sep = "\t"; // separator of classification sequences
361
363
 
362
364
  // server params
363
365
  int32_t port = 8080; // server listens on this network port
@@ -368,6 +370,7 @@ struct common_params {
368
370
 
369
371
  std::string hostname = "127.0.0.1";
370
372
  std::string public_path = ""; // NOLINT
373
+ std::string api_prefix = ""; // NOLINT
371
374
  std::string chat_template = ""; // NOLINT
372
375
  bool use_jinja = false; // NOLINT
373
376
  bool enable_chat_template = true;
@@ -380,6 +383,8 @@ struct common_params {
380
383
  std::string ssl_file_key = ""; // NOLINT
381
384
  std::string ssl_file_cert = ""; // NOLINT
382
385
 
386
+ std::map<std::string, std::string> default_template_kwargs;
387
+
383
388
  // "advanced" endpoints are disabled by default for better security
384
389
  bool webui = true;
385
390
  bool endpoint_slots = false;