@novastera-oss/llamarn 0.2.9 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (314)
  1. package/android/build.gradle +2 -1
  2. package/android/proguard-rules.pro +12 -0
  3. package/android/src/main/cpp/include/llama.h +15 -47
  4. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  8. package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
  9. package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
  10. package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
  11. package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
  12. package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
  13. package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
  14. package/android/src/main/jniLibs/x86/libggml.so +0 -0
  15. package/android/src/main/jniLibs/x86/libllama.so +0 -0
  16. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  17. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  18. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  19. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  20. package/cpp/build-info.cpp +2 -2
  21. package/cpp/llama.cpp/CMakeLists.txt +0 -1
  22. package/cpp/llama.cpp/CMakePresets.json +11 -0
  23. package/cpp/llama.cpp/CODEOWNERS +1 -0
  24. package/cpp/llama.cpp/README.md +8 -8
  25. package/cpp/llama.cpp/build-xcframework.sh +1 -1
  26. package/cpp/llama.cpp/common/CMakeLists.txt +4 -5
  27. package/cpp/llama.cpp/common/arg.cpp +62 -1
  28. package/cpp/llama.cpp/common/chat.cpp +37 -20
  29. package/cpp/llama.cpp/common/chat.h +2 -0
  30. package/cpp/llama.cpp/common/common.cpp +22 -6
  31. package/cpp/llama.cpp/common/common.h +22 -4
  32. package/cpp/llama.cpp/convert_hf_to_gguf.py +1250 -43
  33. package/cpp/llama.cpp/convert_hf_to_gguf_update.py +21 -13
  34. package/cpp/llama.cpp/ggml/CMakeLists.txt +13 -3
  35. package/cpp/llama.cpp/ggml/cmake/ggml-config.cmake.in +85 -47
  36. package/cpp/llama.cpp/ggml/include/ggml-backend.h +1 -1
  37. package/cpp/llama.cpp/ggml/include/ggml-webgpu.h +19 -0
  38. package/cpp/llama.cpp/ggml/include/ggml.h +173 -10
  39. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +1 -1
  40. package/cpp/llama.cpp/ggml/src/ggml-alloc.c +0 -15
  41. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +7 -8
  42. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +44 -38
  43. package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +3 -1
  44. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +126 -8
  45. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +130 -22
  46. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +138 -18
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +11 -3
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +1 -1
  49. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +28 -1
  50. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +109 -12
  51. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +3 -0
  52. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +88 -10
  53. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +343 -1094
  54. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +1206 -163
  55. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +6 -0
  56. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +0 -1
  57. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +1 -1
  58. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +36 -9
  59. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +142 -9
  60. package/cpp/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +3 -3
  61. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +31 -4
  62. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu +86 -17
  63. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +5 -0
  64. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy-utils.cuh +225 -0
  65. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +41 -301
  66. package/cpp/llama.cpp/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
  67. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +85 -64
  68. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +47 -60
  69. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +29 -42
  70. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +46 -59
  71. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +36 -45
  72. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +38 -45
  73. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +23 -36
  74. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +3 -13
  75. package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +8 -0
  76. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +255 -99
  77. package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cu +1 -1
  78. package/cpp/llama.cpp/ggml/src/ggml-cuda/mma.cuh +111 -3
  79. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +6 -4
  80. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +1152 -695
  81. package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cu +92 -5
  82. package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cuh +2 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cu +21 -27
  84. package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cu +8 -6
  85. package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cu +275 -0
  86. package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cuh +7 -0
  87. package/cpp/llama.cpp/ggml/src/ggml-cuda/softmax.cu +119 -58
  88. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
  89. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +192 -52
  90. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +104 -0
  91. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +13 -0
  92. package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cu +92 -6
  93. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +27 -6
  94. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +2 -2
  95. package/cpp/llama.cpp/ggml/src/ggml-impl.h +80 -0
  96. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -2
  97. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +48 -12
  98. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +572 -106
  99. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +599 -105
  100. package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +18 -4
  101. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +5 -0
  102. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +800 -42
  103. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  104. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  105. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
  106. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/glu.cl +337 -0
  107. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
  108. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
  109. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  110. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +79 -0
  111. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
  112. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +95 -0
  113. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +24 -11
  114. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +24 -11
  115. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +24 -11
  116. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +24 -11
  117. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
  118. package/cpp/llama.cpp/ggml/src/ggml-quants.c +6 -6
  119. package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +4 -4
  120. package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  121. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +693 -1034
  122. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +18 -9
  123. package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +14 -26
  124. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +191 -55
  125. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
  126. package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +8 -9
  127. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +15 -18
  128. package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +131 -0
  129. package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.hpp +8 -0
  130. package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +2 -6
  131. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +991 -307
  132. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +265 -0
  133. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +59 -12
  134. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +1 -1
  135. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
  136. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +1 -1
  137. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +1 -1
  138. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
  139. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +28 -23
  140. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +14 -9
  141. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +38 -32
  142. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +32 -27
  143. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +44 -12
  144. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  145. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
  146. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
  147. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
  148. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +2 -0
  149. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +17 -0
  150. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
  151. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +3 -8
  152. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +128 -72
  153. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +38 -9
  154. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  155. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +18 -3
  156. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
  157. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +1 -4
  158. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
  159. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
  160. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
  161. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rte.comp +5 -0
  162. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
  163. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +20 -4
  164. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  165. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
  166. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +84 -9
  167. package/cpp/llama.cpp/ggml/src/ggml-webgpu/CMakeLists.txt +54 -0
  168. package/cpp/llama.cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +907 -0
  169. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +60 -0
  170. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +35 -0
  171. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  172. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +56 -0
  173. package/cpp/llama.cpp/ggml/src/ggml.c +386 -67
  174. package/cpp/llama.cpp/ggml/src/gguf.cpp +8 -1
  175. package/cpp/llama.cpp/gguf-py/gguf/constants.py +307 -0
  176. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +8 -2
  177. package/cpp/llama.cpp/gguf-py/gguf/metadata.py +4 -0
  178. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_dump.py +24 -1
  179. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +122 -47
  180. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +12 -3
  181. package/cpp/llama.cpp/include/llama.h +15 -47
  182. package/cpp/llama.cpp/models/templates/llama-cpp-rwkv-world.jinja +34 -0
  183. package/cpp/llama.cpp/models/templates/moonshotai-Kimi-K2.jinja +43 -0
  184. package/cpp/llama.cpp/requirements/requirements-all.txt +1 -0
  185. package/cpp/llama.cpp/requirements/requirements-server-bench.txt +5 -0
  186. package/cpp/llama.cpp/src/llama-arch.cpp +316 -3
  187. package/cpp/llama.cpp/src/llama-arch.h +23 -1
  188. package/cpp/llama.cpp/src/llama-batch.cpp +103 -71
  189. package/cpp/llama.cpp/src/llama-batch.h +31 -18
  190. package/cpp/llama.cpp/src/llama-chat.cpp +58 -1
  191. package/cpp/llama.cpp/src/llama-chat.h +3 -0
  192. package/cpp/llama.cpp/src/llama-context.cpp +180 -106
  193. package/cpp/llama.cpp/src/llama-context.h +26 -16
  194. package/cpp/llama.cpp/src/llama-cparams.h +3 -2
  195. package/cpp/llama.cpp/src/llama-graph.cpp +310 -211
  196. package/cpp/llama.cpp/src/llama-graph.h +184 -122
  197. package/cpp/llama.cpp/src/llama-hparams.cpp +47 -1
  198. package/cpp/llama.cpp/src/llama-hparams.h +13 -2
  199. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +38 -22
  200. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +7 -2
  201. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +849 -304
  202. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +143 -47
  203. package/cpp/llama.cpp/src/llama-kv-cells.h +62 -10
  204. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +10 -4
  205. package/cpp/llama.cpp/src/llama-memory-hybrid.h +3 -1
  206. package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +36 -11
  207. package/cpp/llama.cpp/src/llama-memory.cpp +17 -0
  208. package/cpp/llama.cpp/src/llama-memory.h +3 -0
  209. package/cpp/llama.cpp/src/llama-model.cpp +3545 -719
  210. package/cpp/llama.cpp/src/llama-model.h +21 -4
  211. package/cpp/llama.cpp/src/llama-quant.cpp +2 -2
  212. package/cpp/llama.cpp/src/llama-vocab.cpp +376 -10
  213. package/cpp/llama.cpp/src/llama-vocab.h +43 -0
  214. package/cpp/llama.cpp/src/unicode.cpp +207 -0
  215. package/cpp/llama.cpp/src/unicode.h +2 -0
  216. package/ios/include/chat.h +2 -0
  217. package/ios/include/common.h +22 -4
  218. package/ios/include/llama.h +15 -47
  219. package/ios/libs/llama.xcframework/Info.plist +13 -13
  220. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  221. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5267 -4890
  222. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h +1 -1
  223. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +173 -10
  224. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +15 -47
  225. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  226. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  227. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5238 -4861
  228. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4014 -3764
  229. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
  230. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
  231. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +15 -47
  232. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  233. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  234. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5238 -4861
  235. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4016 -3766
  236. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-backend.h +1 -1
  237. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +173 -10
  238. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +15 -47
  239. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-backend.h +1 -1
  240. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +173 -10
  241. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +15 -47
  242. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  243. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-backend.h +1 -1
  244. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +173 -10
  245. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +15 -47
  246. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  247. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  248. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  249. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5267 -4890
  250. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-backend.h +1 -1
  251. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +173 -10
  252. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +15 -47
  253. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  254. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  255. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5238 -4861
  256. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4014 -3764
  257. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
  258. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
  259. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +15 -47
  260. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  261. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  262. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5303 -4926
  263. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-backend.h +1 -1
  264. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +173 -10
  265. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +15 -47
  266. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  267. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  268. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5274 -4897
  269. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4044 -3794
  270. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
  271. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +173 -10
  272. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +15 -47
  273. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  274. package/package.json +4 -4
  275. package/cpp/llama.cpp/ggml/include/ggml-kompute.h +0 -50
  276. package/cpp/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
  277. package/cpp/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
  278. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
  279. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
  280. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
  281. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
  282. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
  283. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
  284. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
  285. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
  286. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
  287. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
  288. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
  289. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
  290. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
  291. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
  292. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
  293. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
  294. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
  295. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
  296. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
  297. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
  298. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
  299. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
  300. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
  301. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
  302. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
  303. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
  304. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
  305. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
  306. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
  307. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
  308. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
  309. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
  310. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
  311. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
  312. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
  313. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
  314. package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
@@ -67,7 +67,8 @@ android {
     minSdkVersion getExtOrIntegerDefault("minSdkVersion")
     targetSdkVersion getExtOrIntegerDefault("targetSdkVersion")

-
+    // Include ProGuard rules for apps that use this library
+    consumerProguardFiles 'proguard-rules.pro'
   }


@@ -0,0 +1,12 @@
+# ProGuard rules for @novastera-oss/llamarn library
+# These rules will be automatically included when apps use this library
+
+# Keep all classes in our package (includes NativeRNLlamaCppSpec, RNLlamaCppPackage, etc.)
+-keep class com.novastera.llamarn.** {
+    *;
+}
+
+# Keep native methods (JNI)
+-keepclassmembers class com.novastera.llamarn.** {
+    native <methods>;
+}
@@ -71,52 +71,13 @@ extern "C" {
71
71
  typedef int32_t llama_seq_id;
72
72
 
73
73
  enum llama_vocab_type {
74
- LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
75
- LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
76
- LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
77
- LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
78
- LLAMA_VOCAB_TYPE_UGM = 4, // T5 tokenizer based on Unigram
79
- LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization
80
- };
81
-
82
- // pre-tokenization types
83
- enum llama_vocab_pre_type {
84
- LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
85
- LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
86
- LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
87
- LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
88
- LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
89
- LLAMA_VOCAB_PRE_TYPE_MPT = 5,
90
- LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
91
- LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
92
- LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
93
- LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
94
- LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10,
95
- LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
96
- LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
97
- LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
98
- LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
99
- LLAMA_VOCAB_PRE_TYPE_PORO = 15,
100
- LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
101
- LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
102
- LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
103
- LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
104
- LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
105
- LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
106
- LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
107
- LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
108
- LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
109
- LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
110
- LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
111
- LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
112
- LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
113
- LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
114
- LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30,
115
- LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
116
- LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
117
- LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
118
- LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
119
- LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
74
+ LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
75
+ LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
76
+ LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
77
+ LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
78
+ LLAMA_VOCAB_TYPE_UGM = 4, // T5 tokenizer based on Unigram
79
+ LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization
80
+ LLAMA_VOCAB_TYPE_PLAMO2 = 6, // PLaMo-2 tokenizer based on Aho-Corasick with dynamic programming
120
81
  };
121
82
 
122
83
  enum llama_rope_type {
@@ -374,6 +335,9 @@ extern "C" {
374
335
  bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
375
336
  // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
376
337
  // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
338
+ bool kv_unified; // use a unified buffer across the input sequences when computing the attention
339
+ // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
340
+ // ref: https://github.com/ggml-org/llama.cpp/pull/14363
377
341
  };
378
342
 
379
343
  // model quantization parameters
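For context (not part of the published diff): a minimal sketch of how a caller could opt into the new unified KV buffer through the public C API. The `kv_unified` field comes from the struct change above; `llama_context_default_params()` and `llama_init_from_model()` are standard llama.h calls, and the context size used here is an arbitrary example value.

```cpp
// Sketch only: enable the unified KV buffer when creating a context.
#include "llama.h"

static llama_context * make_unified_ctx(llama_model * model) {
    llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx      = 4096;  // arbitrary example value
    cparams.kv_unified = true;  // new field: one KV buffer shared across all sequences
    return llama_init_from_model(model, cparams);
}
```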
@@ -764,7 +728,7 @@ extern "C" {
764
728
  // - lazily on next llama_decode()
765
729
  // p0 < 0 : [0, p1]
766
730
  // p1 < 0 : [p0, inf)
767
- DEPRECATED(void llama_kv_self_seq_div(
731
+ DEPRECATED(LLAMA_API void llama_kv_self_seq_div(
768
732
  struct llama_context * ctx,
769
733
  llama_seq_id seq_id,
770
734
  llama_pos p0,
@@ -992,6 +956,7 @@ extern "C" {
992
956
  // in the order they have appeared in the batch.
993
957
  // Rows: number of tokens for which llama_batch.logits[i] != 0
994
958
  // Cols: n_vocab
959
+ // TODO: deprecate in favor of llama_get_logits_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
995
960
  LLAMA_API float * llama_get_logits(struct llama_context * ctx);
996
961
 
997
962
  // Logits for the ith token. For positive indices, Equivalent to:
@@ -1006,6 +971,7 @@ extern "C" {
1006
971
  // in the order they have appeared in the batch.
1007
972
  // shape: [n_outputs*n_embd]
1008
973
  // Otherwise, returns NULL.
974
+ // TODO: deprecate in favor of llama_get_embeddings_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
1009
975
  LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
1010
976
 
1011
977
  // Get the embeddings for the ith token. For positive indices, Equivalent to:
@@ -1044,6 +1010,7 @@ extern "C" {
1044
1010
  LLAMA_API llama_token llama_vocab_sep(const struct llama_vocab * vocab); // sentence separator
1045
1011
  LLAMA_API llama_token llama_vocab_nl (const struct llama_vocab * vocab); // next-line
1046
1012
  LLAMA_API llama_token llama_vocab_pad(const struct llama_vocab * vocab); // padding
1013
+ LLAMA_API llama_token llama_vocab_mask(const struct llama_vocab * vocab); // mask
1047
1014
 
1048
1015
  LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
1049
1016
  LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
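As an aside (not part of the diff): a hedged usage sketch of the new accessor, assuming it mirrors the other special-token getters and returns `LLAMA_TOKEN_NULL` when the model defines no mask token.

```cpp
// Sketch only: query the mask token added by this release.
#include "llama.h"
#include <cstdio>

static void print_mask_token(const llama_vocab * vocab) {
    const llama_token mask = llama_vocab_mask(vocab);
    if (mask == LLAMA_TOKEN_NULL) {
        std::printf("model defines no mask token\n");
    } else {
        std::printf("mask token id: %d\n", mask);
    }
}
```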
@@ -1429,6 +1396,7 @@ extern "C" {
1429
1396
 
1430
1397
  int32_t n_p_eval;
1431
1398
  int32_t n_eval;
1399
+ int32_t n_reused; // number of times a ggml compute graph had been reused
1432
1400
  };
1433
1401
 
1434
1402
  struct llama_perf_sampler_data {
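For illustration (not in the diff): the new counter is surfaced through the existing perf API, so reading it is just one more field on `llama_perf_context_data`. A sketch, assuming the usual `llama_perf_context()` accessor:

```cpp
// Sketch only: report how often the compute graph was reused.
#include "llama.h"
#include <cstdio>

static void log_graph_reuse(llama_context * ctx) {
    const llama_perf_context_data pd = llama_perf_context(ctx);
    std::printf("graphs reused: %d (n_eval: %d)\n", pd.n_reused, pd.n_eval);
}
```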
@@ -1,4 +1,4 @@
-int LLAMA_BUILD_NUMBER = 5770;
-char const *LLAMA_COMMIT = "b25e9277";
+int LLAMA_BUILD_NUMBER = 6000;
+char const *LLAMA_COMMIT = "4762ad73";
 char const *LLAMA_COMPILER = "unknown";
 char const *LLAMA_BUILD_TARGET = "unknown";
@@ -120,7 +120,6 @@ endfunction()

 llama_option_depr(FATAL_ERROR LLAMA_CUBLAS GGML_CUDA)
 llama_option_depr(WARNING LLAMA_CUDA GGML_CUDA)
-llama_option_depr(WARNING LLAMA_KOMPUTE GGML_KOMPUTE)
 llama_option_depr(WARNING LLAMA_METAL GGML_METAL)
 llama_option_depr(WARNING LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
 llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE)
@@ -55,6 +55,17 @@
       "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-apple-clang.cmake"
     }
   },
+  {
+    "name": "x64-linux-gcc", "hidden": true,
+    "cacheVariables": {
+      "CMAKE_C_COMPILER": "gcc",
+      "CMAKE_CXX_COMPILER": "g++"
+    }
+  },
+  { "name": "x64-linux-gcc-debug", "inherits": [ "base", "x64-linux-gcc", "debug" ] },
+  { "name": "x64-linux-gcc-release", "inherits": [ "base", "x64-linux-gcc", "release" ] },
+  { "name": "x64-linux-gcc-reldbg", "inherits": [ "base", "x64-linux-gcc", "reldbg" ] },
+  { "name": "x64-linux-gcc+static-release", "inherits": [ "base", "x64-linux-gcc", "release", "static" ] },

   { "name": "arm64-windows-llvm-debug", "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
   { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] },
@@ -9,3 +9,4 @@
 /ggml/src/ggml-cuda/mmvq.* @JohannesGaessler
 /ggml/src/ggml-opt.cpp @JohannesGaessler
 /ggml/src/gguf.cpp @JohannesGaessler
+/ggml/src/ggml-vulkan/ @0cc4m
@@ -6,9 +6,9 @@
 [![Release](https://img.shields.io/github/v/release/ggml-org/llama.cpp)](https://github.com/ggml-org/llama.cpp/releases)
 [![Server](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml)

-[Roadmap](https://github.com/users/ggerganov/projects/7) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml)
+[Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml) / [ops](https://github.com/ggml-org/llama.cpp/blob/master/docs/ops.md)

-Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
+LLM inference in C/C++

 ## Recent API changes

@@ -17,10 +17,9 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ## Hot topics

-- 🔥 Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
-- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141](https://github.com/ggml-org/llama.cpp/pull/13141)), `libllava` will be deprecated
+- Hot PRs: [All](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+) | [Open](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+is%3Aopen)
+- Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
 - VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
-- Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
 - Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
 - Introducing GGUF-my-LoRA https://github.com/ggml-org/llama.cpp/discussions/10123
 - Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggml-org/llama.cpp/discussions/9669
@@ -134,6 +133,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)
 - [X] [Trillion-7B-preview](https://huggingface.co/trillionlabs/Trillion-7B-preview)
 - [x] [Ling models](https://huggingface.co/collections/inclusionAI/ling-67c51c85b34a7ea0aba94c32)
+- [x] [LFM2 models](https://huggingface.co/collections/LiquidAI/lfm2-686d721927015b2ad73eaa38)

 #### Multimodal

@@ -269,6 +269,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [Vulkan](docs/build.md#vulkan) | GPU |
 | [CANN](docs/build.md#cann) | Ascend NPU |
 | [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
+| [WebGPU [In Progress]](docs/build.md#webgpu) | All |
 | [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |

 ## Obtaining and quantizing models
@@ -434,7 +435,7 @@ To learn more about model quantization, [read this documentation](tools/quantize

 ## [`llama-perplexity`](tools/perplexity)

-#### A tool for measuring the perplexity [^1][^2] (and other quality metrics) of a model over a given text.
+#### A tool for measuring the [perplexity](tools/perplexity/README.md) [^1] (and other quality metrics) of a model over a given text.

 - <details open>
     <summary>Measure the perplexity over a text file</summary>
@@ -457,8 +458,7 @@ To learn more about model quantization, [read this documentation](tools/quantize

   </details>

-[^1]: [tools/perplexity/README.md](./tools/perplexity/README.md)
-[^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)
+[^1]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)

 ## [`llama-bench`](tools/llama-bench)

@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 #
 # Options
 IOS_MIN_OS_VERSION=16.4
@@ -86,8 +86,7 @@ if (LLAMA_CURL)
     endif()
     target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
     include_directories(${CURL_INCLUDE_DIRS})
-    find_library(CURL_LIBRARY curl REQUIRED)
-    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
+    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARIES})
 endif ()

 if (LLAMA_LLGUIDANCE)
@@ -112,13 +111,13 @@ if (LLAMA_LLGUIDANCE)

     ExternalProject_Add(llguidance_ext
         GIT_REPOSITORY https://github.com/guidance-ai/llguidance
-        # v0.7.20 (+ fix to build on GCC 15):
-        GIT_TAG b5b8b64dba11c4e4ee6b1d1450d3a3ae279891e8
+        # v1.0.1:
+        GIT_TAG d795912fedc7d393de740177ea9ea761e7905774
         PREFIX ${CMAKE_BINARY_DIR}/llguidance
         SOURCE_DIR ${LLGUIDANCE_SRC}
         BUILD_IN_SOURCE TRUE
         CONFIGURE_COMMAND ""
-        BUILD_COMMAND cargo build --release
+        BUILD_COMMAND cargo build --release --package llguidance
         INSTALL_COMMAND ""
         BUILD_BYPRODUCTS ${LLGUIDANCE_PATH}/${LLGUIDANCE_LIB_NAME} ${LLGUIDANCE_PATH}/llguidance.h
         UPDATE_COMMAND ""
@@ -1464,6 +1464,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.swa_full = true;
         }
     ).set_env("LLAMA_ARG_SWA_FULL"));
+    add_opt(common_arg(
+        {"--kv-unified", "-kvu"},
+        string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
+            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)", params.kv_unified ? "true" : "false"),
+        [](common_params & params) {
+            params.kv_unified = true;
+        }
+    ).set_env("LLAMA_ARG_KV_SPLIT"));
     add_opt(common_arg(
         {"--no-context-shift"},
         string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
@@ -1604,7 +1612,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.antiprompt.emplace_back(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-sp", "--special"},
         string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"),
@@ -2647,6 +2655,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.i_chunk = value;
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+    add_opt(common_arg(
+        {"--show-statistics"},
+        string_format("show imatrix statistics and then exit (default: %s)", params.show_statistics ? "true" : "false"),
+        [](common_params & params) {
+            params.show_statistics = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
     add_opt(common_arg(
         {"--parse-special"},
         string_format("prase special tokens (chat, tool, etc) (default: %s)", params.parse_special ? "true" : "false"),
@@ -2734,6 +2749,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.public_path = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
+    add_opt(common_arg(
+        {"--api-prefix"}, "PREFIX",
+        string_format("prefix path the server serves from, without the trailing slash (default: %s)", params.api_prefix.c_str()),
+        [](common_params & params, const std::string & value) {
+            params.api_prefix = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
     add_opt(common_arg(
         {"--no-webui"},
         string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
@@ -2794,6 +2816,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.ssl_file_cert = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
+    add_opt(common_arg(
+        {"--chat-template-kwargs"}, "STRING",
+        string_format("sets additional params for the json template parser"),
+        [](common_params & params, const std::string & value) {
+            auto parsed = json::parse(value);
+            for (const auto & item : parsed.items()) {
+                params.default_template_kwargs[item.key()] = item.value().dump();
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
     add_opt(common_arg(
         {"-to", "--timeout"}, "N",
         string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
@@ -3406,5 +3438,34 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}));

+    // diffusion parameters
+    add_opt(common_arg(
+        { "--diffusion-steps" }, "N",
+        string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
+        [](common_params & params, int value) { params.diffusion.steps = value; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-eps" }, "F",
+        string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
+        [](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-algorithm" }, "N",
+        string_format("diffusion algorithm: 0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY (default: %d)",
+            params.diffusion.algorithm),
+        [](common_params & params, int value) { params.diffusion.algorithm = value; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-alg-temp" }, "F",
+        string_format("algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
+        [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-visual" },
+        string_format("enable visual diffusion mode (show progressive generation) (default: %s)",
+            params.diffusion.visual_mode ? "true" : "false"),
+        [](common_params & params) { params.diffusion.visual_mode = true; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+
     return ctx_arg;
 }
@@ -17,6 +17,8 @@
 #include <string>
 #include <vector>

+using json = nlohmann::ordered_json;
+
 static std::string format_time(const std::chrono::system_clock::time_point & now, const std::string & format) {
     auto time = std::chrono::system_clock::to_time_t(now);
     auto local_time = *std::localtime(&time);
@@ -140,6 +142,7 @@ struct templates_params {
     bool add_generation_prompt = true;
     bool enable_thinking = true;
     std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
+    json extra_context;
 };

 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
@@ -720,16 +723,23 @@ static void foreach_function(const json & tools, const std::function<void(const

 static std::string apply(
     const common_chat_template & tmpl,
-    const nlohmann::ordered_json & messages,
-    const nlohmann::ordered_json & tools,
-    bool add_generation_prompt,
-    const nlohmann::ordered_json & extra_context = nlohmann::ordered_json())
+    const struct templates_params & inputs,
+    const std::optional<json> & messages_override = std::nullopt,
+    const std::optional<json> & tools_override = std::nullopt,
+    const std::optional<json> & additional_context = std::nullopt)
 {
     minja::chat_template_inputs tmpl_inputs;
-    tmpl_inputs.messages = messages;
-    tmpl_inputs.tools = tools;
-    tmpl_inputs.add_generation_prompt = add_generation_prompt;
-    tmpl_inputs.extra_context = extra_context;
+    tmpl_inputs.messages = messages_override ? *messages_override : inputs.messages;
+    if (tools_override) {
+        tmpl_inputs.tools = *tools_override;
+    } else {
+        tmpl_inputs.tools = inputs.tools.empty() ? json() : inputs.tools;
+    }
+    tmpl_inputs.add_generation_prompt = inputs.add_generation_prompt;
+    tmpl_inputs.extra_context = inputs.extra_context;
+    if (additional_context) {
+        tmpl_inputs.extra_context.merge_patch(*additional_context);
+    }
     // TODO: add flag to control date/time, if only for testing purposes.
     // tmpl_inputs.now = std::chrono::system_clock::now();

@@ -828,7 +838,7 @@ static common_chat_params common_chat_params_init_generic(const common_chat_temp
         inputs.messages,
         "Respond in JSON format, either with `tool_call` (a request to call tools) or with `response` reply to the user's request");

-    data.prompt = apply(tmpl, tweaked_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+    data.prompt = apply(tmpl, inputs, /* messages_override= */ tweaked_messages);
     data.format = COMMON_CHAT_FORMAT_GENERIC;
     return data;
 }
@@ -904,7 +914,7 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat
     data.preserved_tokens = {
         "[TOOL_CALLS]",
     };
-    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+    data.prompt = apply(tmpl, inputs);
     data.format = COMMON_CHAT_FORMAT_MISTRAL_NEMO;
     return data;
 }
@@ -934,7 +944,7 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_
             adjusted_messages.push_back(msg);
         }
     }
-    data.prompt = apply(tmpl, adjusted_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {});
+    data.prompt = apply(tmpl, inputs, /* messages_override= */ adjusted_messages);
     data.format = COMMON_CHAT_FORMAT_COMMAND_R7B;
     if (string_ends_with(data.prompt, "<|START_THINKING|>")) {
         if (!inputs.enable_thinking) {
@@ -1122,7 +1132,7 @@ static common_chat_params common_chat_params_init_llama_3_x(const common_chat_te
     } else {
         data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
     }
-    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {
+    data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ std::nullopt, json {
         {"date_string", format_time(inputs.now, "%d %b %Y")},
         {"tools_in_user_message", false},
         {"builtin_tools", builtin_tools.empty() ? json() : builtin_tools},
@@ -1187,7 +1197,7 @@ static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool w

 static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
-    auto prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+    auto prompt = apply(tmpl, inputs);

     // Hacks to fix the official (broken) prompt.
     // It is advisable to use --chat-template-file models/templates/llama-cpp-deepseek-r1.jinja instead,
@@ -1282,7 +1292,7 @@ static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
 static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
     LOG_DBG("%s\n", __func__);
     common_chat_params data;
-    data.prompt = apply(tmpl, inputs.messages, /* tools= */ nullptr, inputs.add_generation_prompt, {
+    data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ json(), json {
         {"datetime", format_time(inputs.now, "%b %d %Y %H:%M:%S GMT")},
         {"functions", json(inputs.tools.empty() ? "" : inputs.tools.dump(2))},
     });
@@ -1338,7 +1348,7 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
     // Using ">>>f1\n", ">>>f2\n"... as trigger words for the grammar
     // If the function is python, we also allow raw python code (if the line after `python\n` doesn't start w/ opening `{`), which the model seems to prefer for multiline code.
     common_chat_params data;
-    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+    data.prompt = apply(tmpl, inputs);
     data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2;
     if (inputs.tools.is_array() && !inputs.tools.empty()) {
         data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
@@ -1465,7 +1475,7 @@ static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(con
         data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
     }

-    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+    data.prompt = apply(tmpl, inputs);
     // TODO: if (has_raw_python)
     return data;
 }
@@ -1498,14 +1508,15 @@ static void common_chat_parse_functionary_v3_1_llama_3_1(common_chat_msg_parser
 static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;

-    json additional_context = {
+    json extra_context = json {
         {"enable_thinking", inputs.enable_thinking},
     };
+    extra_context.update(inputs.extra_context);

-    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, additional_context);
+    data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ std::nullopt, extra_context);
     data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO;
     if (string_ends_with(data.prompt, "<think>\n")) {
-        if (!inputs.enable_thinking) {
+        if (!extra_context["enable_thinking"]) {
             data.prompt += "</think>";
         } else {
             data.thinking_forced_open = true;
@@ -1691,7 +1702,7 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {

 static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
-    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+    data.prompt = apply(tmpl, inputs);
     data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
     data.grammar_lazy = false;
     if (!inputs.json_schema.is_null()) {
@@ -1722,6 +1733,12 @@ static common_chat_params common_chat_templates_apply_jinja(
     params.enable_thinking = inputs.enable_thinking;
     params.grammar = inputs.grammar;
     params.now = inputs.now;
+
+    params.extra_context = json::object();
+    for (auto el : inputs.chat_template_kwargs) {
+        params.extra_context[el.first] = json::parse(el.second);
+    }
+
     if (!inputs.json_schema.empty()) {
         params.json_schema = json::parse(inputs.json_schema);
     }
@@ -7,6 +7,7 @@
 #include <chrono>
 #include <string>
 #include <vector>
+#include <map>

 struct common_chat_templates;

@@ -125,6 +126,7 @@ struct common_chat_templates_inputs {
     common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
     bool enable_thinking = true;
     std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
+    std::map<std::string, std::string> chat_template_kwargs;
 };

 struct common_chat_params {
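Not part of the diff, just an illustration of the plumbing above: each entry in the new `chat_template_kwargs` map is a raw JSON string that `common_chat_templates_apply_jinja()` parses into the template's `extra_context`. The `enable_thinking` key mirrors the Hermes-2-Pro handler shown earlier; the include path and call site are assumptions, so treat this as a sketch rather than a documented API.

```cpp
// Sketch only: pass an extra Jinja variable through the new field
// (assumes common/ is on the include path).
#include "chat.h"

static void set_template_kwargs(common_chat_templates_inputs & inputs) {
    // values must be valid JSON; they are json::parse()'d before reaching the template
    inputs.chat_template_kwargs["enable_thinking"] = "false";
}
```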
@@ -448,6 +448,15 @@ void string_replace_all(std::string & s, const std::string & search, const std::
 bool string_ends_with(const std::string_view & str, const std::string_view & suffix) {
     return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
 }
+
+bool string_remove_suffix(std::string & str, const std::string_view & suffix) {
+    bool has_suffix = string_ends_with(str, suffix);
+    if (has_suffix) {
+        str = str.substr(0, str.size() - suffix.size());
+    }
+    return has_suffix;
+}
+
 size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop) {
     if (!str.empty() && !stop.empty()) {
         const char text_last_char = str.back();
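A quick usage sketch (not from the diff) of the new helper: it strips a known suffix in place and reports whether the suffix was present. It assumes the declaration is exposed alongside `string_ends_with()` in common.h; the common.h hunk itself is not shown in this diff.

```cpp
// Sketch only: remove a trailing marker and check whether it was there.
#include <cassert>
#include <string>

#include "common.h"  // assumed to declare string_remove_suffix()

static void demo_remove_suffix() {
    std::string s = "some reasoning</think>";
    const bool removed = string_remove_suffix(s, "</think>");
    assert(removed);
    assert(s == "some reasoning");
}
```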
@@ -1005,15 +1014,21 @@ struct common_init_result common_init_from_params(common_params & params) {
         params.sampling.ignore_eos = false;
     }

-    if (params.sampling.ignore_eos) {
-        for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
-            if (llama_vocab_is_eog(vocab, i)) {
-                LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
-                params.sampling.logit_bias.push_back({i, -INFINITY});
-            }
+    // initialize once
+    for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
+        if (llama_vocab_is_eog(vocab, i)) {
+            LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
+            params.sampling.logit_bias_eog.push_back({i, -INFINITY});
         }
     }

+    if (params.sampling.ignore_eos) {
+        // add EOG biases to the active set of logit biases
+        params.sampling.logit_bias.insert(
+                params.sampling.logit_bias.end(),
+                params.sampling.logit_bias_eog.begin(), params.sampling.logit_bias_eog.end());
+    }
+
     if (params.sampling.penalty_last_n == -1) {
         LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
         params.sampling.penalty_last_n = llama_n_ctx(lctx);
@@ -1157,6 +1172,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.no_perf = params.no_perf;
     cparams.op_offload = !params.no_op_offload;
     cparams.swa_full = params.swa_full;
+    cparams.kv_unified = params.kv_unified;

     cparams.type_k = params.cache_type_k;
     cparams.type_v = params.cache_type_v;