@novastera-oss/llamarn 0.2.6 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (253)
  1. package/android/src/main/cpp/include/llama.h +141 -38
  2. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  3. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  6. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  7. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  8. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  10. package/cpp/LlamaCppModel.cpp +58 -24
  11. package/cpp/LlamaCppModel.h +3 -3
  12. package/cpp/PureCppImpl.cpp +1 -1
  13. package/cpp/PureCppImpl.h +2 -2
  14. package/cpp/build-info.cpp +2 -2
  15. package/cpp/llama.cpp/CMakeLists.txt +15 -4
  16. package/cpp/llama.cpp/Makefile +2 -2
  17. package/cpp/llama.cpp/README.md +32 -13
  18. package/cpp/llama.cpp/common/CMakeLists.txt +10 -20
  19. package/cpp/llama.cpp/common/arg.cpp +37 -6
  20. package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
  21. package/cpp/llama.cpp/common/chat-parser.cpp +5 -0
  22. package/cpp/llama.cpp/common/chat-parser.h +2 -0
  23. package/cpp/llama.cpp/common/chat.cpp +12 -9
  24. package/cpp/llama.cpp/common/chat.h +1 -1
  25. package/cpp/llama.cpp/common/common.cpp +53 -40
  26. package/cpp/llama.cpp/common/common.h +6 -2
  27. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
  28. package/cpp/llama.cpp/common/speculative.cpp +6 -4
  29. package/cpp/llama.cpp/convert_hf_to_gguf.py +215 -76
  30. package/cpp/llama.cpp/ggml/CMakeLists.txt +48 -2
  31. package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
  32. package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
  33. package/cpp/llama.cpp/ggml/include/ggml.h +33 -0
  34. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +64 -13
  35. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
  36. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
  37. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
  38. package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
  39. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +124 -26
  40. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
  41. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
  42. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  43. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
  44. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
  49. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
  50. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
  51. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
  52. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
  53. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
  54. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  55. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +4 -3
  56. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +16 -7
  57. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +93 -104
  58. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +12 -8
  59. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  60. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
  61. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +58 -8
  62. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  63. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +194 -69
  64. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -0
  65. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1158 -0
  66. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  67. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1571 -0
  68. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
  69. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +213 -37
  70. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  71. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
  72. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
  73. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +59 -37
  74. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  75. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  76. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  77. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  78. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +4 -1
  79. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
  80. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +90 -39
  81. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
  82. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
  84. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
  85. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
  86. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
  87. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  88. package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
  89. package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -183
  90. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
  91. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +16 -0
  92. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +260 -49
  93. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +497 -282
  94. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
  95. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
  96. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1078 -468
  97. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  98. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  99. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  100. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  101. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  102. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  103. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
  104. package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
  105. package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
  106. package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +1 -1
  107. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
  108. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +20 -48
  109. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
  110. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
  111. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +117 -165
  112. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +192 -53
  113. package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  114. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
  115. package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
  116. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +99 -159
  117. package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
  118. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +8 -105
  119. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +209 -92
  120. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
  121. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
  122. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
  123. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +158 -203
  124. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
  125. package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
  126. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +24 -20
  127. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
  128. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  129. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  130. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
  131. package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
  132. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
  133. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +36 -28
  134. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +487 -247
  135. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
  136. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  137. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
  138. package/cpp/llama.cpp/ggml/src/ggml.c +69 -19
  139. package/cpp/llama.cpp/ggml/src/gguf.cpp +5 -1
  140. package/cpp/llama.cpp/gguf-py/gguf/constants.py +133 -0
  141. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +25 -1
  142. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +78 -3
  143. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +97 -4
  144. package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
  145. package/cpp/llama.cpp/include/llama.h +141 -38
  146. package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
  147. package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
  148. package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
  149. package/cpp/llama.cpp/src/llama-arch.cpp +150 -3
  150. package/cpp/llama.cpp/src/llama-arch.h +25 -1
  151. package/cpp/llama.cpp/src/llama-batch.cpp +736 -274
  152. package/cpp/llama.cpp/src/llama-batch.h +110 -57
  153. package/cpp/llama.cpp/src/llama-chat.cpp +30 -8
  154. package/cpp/llama.cpp/src/llama-chat.h +1 -0
  155. package/cpp/llama.cpp/src/llama-context.cpp +360 -266
  156. package/cpp/llama.cpp/src/llama-context.h +27 -23
  157. package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
  158. package/cpp/llama.cpp/src/llama-cparams.h +1 -1
  159. package/cpp/llama.cpp/src/llama-graph.cpp +411 -344
  160. package/cpp/llama.cpp/src/llama-graph.h +126 -58
  161. package/cpp/llama.cpp/src/llama-hparams.cpp +10 -2
  162. package/cpp/llama.cpp/src/llama-hparams.h +16 -2
  163. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +103 -73
  164. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +34 -42
  165. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +345 -221
  166. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +75 -50
  167. package/cpp/llama.cpp/src/llama-kv-cells.h +51 -22
  168. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +246 -0
  169. package/cpp/llama.cpp/src/llama-memory-hybrid.h +138 -0
  170. package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.cpp → llama-memory-recurrent.cpp} +302 -317
  171. package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.h → llama-memory-recurrent.h} +60 -68
  172. package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
  173. package/cpp/llama.cpp/src/llama-memory.h +73 -36
  174. package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
  175. package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
  176. package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
  177. package/cpp/llama.cpp/src/llama-model.cpp +1630 -511
  178. package/cpp/llama.cpp/src/llama-model.h +26 -0
  179. package/cpp/llama.cpp/src/llama-quant.cpp +89 -6
  180. package/cpp/llama.cpp/src/llama-vocab.cpp +58 -26
  181. package/cpp/llama.cpp/src/llama-vocab.h +1 -0
  182. package/cpp/llama.cpp/src/llama.cpp +11 -7
  183. package/cpp/llama.cpp/src/unicode.cpp +5 -0
  184. package/cpp/rn-completion.cpp +2 -2
  185. package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
  186. package/cpp/{rn-utils.hpp → rn-utils.h} +3 -0
  187. package/ios/include/chat.h +1 -1
  188. package/ios/include/common.h +6 -2
  189. package/ios/include/llama.h +141 -38
  190. package/ios/libs/llama.xcframework/Info.plist +15 -15
  191. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  192. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
  193. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  194. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +33 -0
  195. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +141 -38
  196. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  197. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  198. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
  199. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
  200. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  201. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  202. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
  203. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  204. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  205. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
  206. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3766 -3624
  207. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
  208. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +33 -0
  209. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +141 -38
  210. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
  211. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +33 -0
  212. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +141 -38
  213. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  214. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
  215. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +33 -0
  216. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +141 -38
  217. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  218. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  219. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  220. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
  221. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  222. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +33 -0
  223. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +141 -38
  224. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  225. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  226. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
  227. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
  228. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  229. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  230. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
  231. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  232. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  233. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4926 -4725
  234. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  235. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +33 -0
  236. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +141 -38
  237. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  238. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  239. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4897 -4746
  240. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3794 -3652
  241. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  242. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  243. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
  244. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  245. package/package.json +1 -2
  246. package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
  247. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  248. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
  249. package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -1
  250. package/cpp/llama.cpp/src/llama-kv-cache.h +0 -44
  251. package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  252. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  253. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
@@ -367,7 +367,7 @@ ifdef LLAMA_SERVER_SSL
367
367
  endif
368
368
 
369
369
  ifndef GGML_NO_CPU_AARCH64
370
- MK_CPPFLAGS += -DGGML_USE_CPU_AARCH64
370
+ MK_CPPFLAGS += -DGGML_USE_CPU_REPACK
371
371
  endif
372
372
 
373
373
  # warnings
@@ -970,7 +970,7 @@ OBJ_GGML = \
970
970
  $(DIR_GGML)/src/ggml-threading.o \
971
971
  $(DIR_GGML)/src/ggml-cpu/ggml-cpu.o \
972
972
  $(DIR_GGML)/src/ggml-cpu/ggml-cpu_cpp.o \
973
- $(DIR_GGML)/src/ggml-cpu/ggml-cpu-aarch64.o \
973
+ $(DIR_GGML)/src/ggml-cpu/repack.o \
974
974
  $(DIR_GGML)/src/ggml-cpu/ggml-cpu-hbm.o \
975
975
  $(DIR_GGML)/src/ggml-cpu/ggml-cpu-quants.o \
976
976
  $(DIR_GGML)/src/ggml-cpu/ggml-cpu-traits.o \
@@ -3,9 +3,10 @@
3
3
  ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)
4
4
 
5
5
  [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
6
+ [![Release](https://img.shields.io/github/v/release/ggml-org/llama.cpp)](https://github.com/ggml-org/llama.cpp/releases)
6
7
  [![Server](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml)
7
8
 
8
- [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggml-org/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml)
9
+ [Roadmap](https://github.com/users/ggerganov/projects/7) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml)
9
10
 
10
11
  Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
11
12
 
@@ -17,7 +18,6 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
17
18
  ## Hot topics
18
19
 
19
20
  - 🔥 Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
20
- - **GGML developer experience survey (organized and reviewed by NVIDIA):** [link](https://forms.gle/Gasw3cRgyhNEnrwK9)
21
21
  - A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141](https://github.com/ggml-org/llama.cpp/pull/13141)), `libllava` will be deprecated
22
22
  - VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
23
23
  - Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
@@ -28,6 +28,30 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
28
28
 
29
29
  ----
30
30
 
31
+ ## Quick start
32
+
33
+ Getting started with llama.cpp is straightforward. Here are several ways to install it on your machine:
34
+
35
+ - Install `llama.cpp` using [brew, nix or winget](docs/install.md)
36
+ - Run with Docker - see our [Docker documentation](docs/docker.md)
37
+ - Download pre-built binaries from the [releases page](https://github.com/ggml-org/llama.cpp/releases)
38
+ - Build from source by cloning this repository - check out [our build guide](docs/build.md)
39
+
40
+ Once installed, you'll need a model to work with. Head to the [Obtaining and quantizing models](#obtaining-and-quantizing-models) section to learn more.
41
+
42
+ Example command:
43
+
44
+ ```sh
45
+ # Use a local model file
46
+ llama-cli -m my_model.gguf
47
+
48
+ # Or download and run a model directly from Hugging Face
49
+ llama-cli -hf ggml-org/gemma-3-1b-it-GGUF
50
+
51
+ # Launch OpenAI-compatible API server
52
+ llama-server -hf ggml-org/gemma-3-1b-it-GGUF
53
+ ```
54
+
31
55
  ## Description
32
56
 
33
57
  The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide
@@ -230,6 +254,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
230
254
 
231
255
  </details>
232
256
 
257
+
233
258
  ## Supported backends
234
259
 
235
260
  | Backend | Target devices |
@@ -246,16 +271,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
246
271
  | [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
247
272
  | [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
248
273
 
249
- ## Building the project
250
-
251
- The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](include/llama.h).
252
- The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server. Possible methods for obtaining the binaries:
253
-
254
- - Clone this repository and build locally, see [how to build](docs/build.md)
255
- - On MacOS or Linux, install `llama.cpp` via [brew, flox or nix](docs/install.md)
256
- - Use a Docker image, see [documentation for Docker](docs/docker.md)
257
- - Download pre-built binaries from [releases](https://github.com/ggml-org/llama.cpp/releases)
258
-
259
274
  ## Obtaining and quantizing models
260
275
 
261
276
  The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](https://huggingface.co/models?library=gguf&sort=trending) compatible with `llama.cpp`:
@@ -263,7 +278,11 @@ The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](htt
263
278
  - [Trending](https://huggingface.co/models?library=gguf&sort=trending)
264
279
  - [LLaMA](https://huggingface.co/models?sort=trending&search=llama+gguf)
265
280
 
266
- You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from [Hugging Face](https://huggingface.co/) or other model hosting sites, such as [ModelScope](https://modelscope.cn/), by using this CLI argument: `-hf <user>/<model>[:quant]`.
281
+ You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from [Hugging Face](https://huggingface.co/) or other model hosting sites, such as [ModelScope](https://modelscope.cn/), by using this CLI argument: `-hf <user>/<model>[:quant]`. For example:
282
+
283
+ ```sh
284
+ llama-cli -hf ggml-org/gemma-3-1b-it-GGUF
285
+ ```
267
286
 
268
287
  By default, the CLI would download from Hugging Face, you can switch to other options with the environment variable `MODEL_ENDPOINT`. For example, you may opt to downloading model checkpoints from ModelScope or other model sharing communities by setting the environment variable, e.g. `MODEL_ENDPOINT=https://www.modelscope.cn/`.
269
288
 
@@ -7,8 +7,8 @@ llama_add_compile_flags()
7
7
  # Build info header
8
8
  #
9
9
 
10
- if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
11
- set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
10
+ if(EXISTS "${PROJECT_SOURCE_DIR}/.git")
11
+ set(GIT_DIR "${PROJECT_SOURCE_DIR}/.git")
12
12
 
13
13
  # Is git submodule
14
14
  if(NOT IS_DIRECTORY "${GIT_DIR}")
@@ -18,36 +18,26 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
18
18
  if (SLASH_POS EQUAL 0)
19
19
  set(GIT_DIR "${REAL_GIT_DIR}")
20
20
  else()
21
- set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${REAL_GIT_DIR}")
21
+ set(GIT_DIR "${PROJECT_SOURCE_DIR}/${REAL_GIT_DIR}")
22
22
  endif()
23
23
  endif()
24
24
 
25
25
  if(EXISTS "${GIT_DIR}/index")
26
- set(GIT_INDEX "${GIT_DIR}/index")
26
+ # For build-info.cpp below
27
+ set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS "${GIT_DIR}/index")
27
28
  else()
28
29
  message(WARNING "Git index not found in git repository.")
29
- set(GIT_INDEX "")
30
30
  endif()
31
31
  else()
32
32
  message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
33
- set(GIT_INDEX "")
34
33
  endif()
35
34
 
36
- # Add a custom command to rebuild build-info.cpp when .git/index changes
37
- add_custom_command(
38
- OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp"
39
- COMMENT "Generating build details from Git"
40
- COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
41
- -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
42
- -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
43
- -DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME} -DCMAKE_SYSTEM_PROCESSOR=${CMAKE_SYSTEM_PROCESSOR}
44
- -P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake"
45
- WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
46
- DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
47
- VERBATIM
48
- )
35
+ set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in")
36
+ set(OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/build-info.cpp")
37
+ configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
38
+
49
39
  set(TARGET build_info)
50
- add_library(${TARGET} OBJECT build-info.cpp)
40
+ add_library(${TARGET} OBJECT ${OUTPUT_FILE})
51
41
  if (BUILD_SHARED_LIBS)
52
42
  set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
53
43
  endif()
@@ -988,10 +988,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
988
988
  params.tensor_buft_overrides.push_back({nullptr, nullptr});
989
989
  }
990
990
 
991
- if (params.reranking && params.embedding) {
992
- throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both");
993
- }
994
-
995
991
  if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) {
996
992
  throw std::runtime_error(string_format(
997
993
  "error: the supplied chat template is not supported: %s%s\n",
@@ -2710,6 +2706,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2710
2706
  params.embd_sep = value;
2711
2707
  }
2712
2708
  ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
2709
+ add_opt(common_arg(
2710
+ {"--cls-separator"}, "STRING",
2711
+ "separator of classification sequences (default \\t) for example \"<#seq#>\"",
2712
+ [](common_params & params, const std::string & value) {
2713
+ params.cls_sep = value;
2714
+ }
2715
+ ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
2713
2716
  add_opt(common_arg(
2714
2717
  {"--host"}, "HOST",
2715
2718
  string_format("ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: %s)", params.hostname.c_str()),
@@ -2747,9 +2750,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2747
2750
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
2748
2751
  add_opt(common_arg(
2749
2752
  {"--reranking", "--rerank"},
2750
- string_format("enable reranking endpoint on server (default: %s)", params.reranking ? "enabled" : "disabled"),
2753
+ string_format("enable reranking endpoint on server (default: %s)", "disabled"),
2751
2754
  [](common_params & params) {
2752
- params.reranking = true;
2755
+ params.embedding = true;
2756
+ params.pooling_type = LLAMA_POOLING_TYPE_RANK;
2753
2757
  }
2754
2758
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING"));
2755
2759
  add_opt(common_arg(
@@ -2869,6 +2873,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2869
2873
  "(default: deepseek)",
2870
2874
  [](common_params & params, const std::string & value) {
2871
2875
  /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
2876
+ else if (value == "deepseek-legacy") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; }
2872
2877
  else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
2873
2878
  else { throw std::invalid_argument("invalid value"); }
2874
2879
  }
@@ -3212,6 +3217,32 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
3212
3217
  params.speculative.model.path = value;
3213
3218
  }
3214
3219
  ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
3220
+ add_opt(common_arg(
3221
+ {"-ctkd", "--cache-type-k-draft"}, "TYPE",
3222
+ string_format(
3223
+ "KV cache data type for K for the draft model\n"
3224
+ "allowed values: %s\n"
3225
+ "(default: %s)",
3226
+ get_all_kv_cache_types().c_str(),
3227
+ ggml_type_name(params.speculative.cache_type_k)
3228
+ ),
3229
+ [](common_params & params, const std::string & value) {
3230
+ params.speculative.cache_type_k = kv_cache_type_from_str(value);
3231
+ }
3232
+ ).set_env("LLAMA_ARG_CACHE_TYPE_K_DRAFT"));
3233
+ add_opt(common_arg(
3234
+ {"-ctvd", "--cache-type-v-draft"}, "TYPE",
3235
+ string_format(
3236
+ "KV cache data type for V for the draft model\n"
3237
+ "allowed values: %s\n"
3238
+ "(default: %s)",
3239
+ get_all_kv_cache_types().c_str(),
3240
+ ggml_type_name(params.speculative.cache_type_v)
3241
+ ),
3242
+ [](common_params & params, const std::string & value) {
3243
+ params.speculative.cache_type_v = kv_cache_type_from_str(value);
3244
+ }
3245
+ ).set_env("LLAMA_ARG_CACHE_TYPE_V_DRAFT"));
3215
3246
 
3216
3247
  add_opt(common_arg(
3217
3248
  {"-mv", "--model-vocoder"}, "FNAME",
@@ -1,4 +1,4 @@
1
- int LLAMA_BUILD_NUMBER = @BUILD_NUMBER@;
2
- char const *LLAMA_COMMIT = "@BUILD_COMMIT@";
1
+ int LLAMA_BUILD_NUMBER = @LLAMA_BUILD_NUMBER@;
2
+ char const *LLAMA_COMMIT = "@LLAMA_BUILD_COMMIT@";
3
3
  char const *LLAMA_COMPILER = "@BUILD_COMPILER@";
4
4
  char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@";
@@ -49,6 +49,7 @@ bool common_chat_msg_parser::add_tool_call(const std::string & name, const std::
49
49
 
50
50
  // LOG_DBG("Tool call arguments:\n\traw: %s\n\tresult: %s\n", arguments.c_str(), tool_call.arguments.c_str());
51
51
  result_.tool_calls.emplace_back(tool_call);
52
+
52
53
  return true;
53
54
  }
54
55
  bool common_chat_msg_parser::add_tool_call(const json & tool_call) {
@@ -378,3 +379,7 @@ std::optional<common_chat_msg_parser::consume_json_result> common_chat_msg_parse
378
379
  /* .is_partial = */ found_healing_marker,
379
380
  };
380
381
  }
382
+
383
+ void common_chat_msg_parser::clear_tools() {
384
+ result_.tool_calls.clear();
385
+ }
@@ -115,4 +115,6 @@ class common_chat_msg_parser {
115
115
  const std::vector<std::vector<std::string>> & args_paths = {},
116
116
  const std::vector<std::vector<std::string>> & content_paths = {}
117
117
  );
118
+
119
+ void clear_tools();
118
120
  };
@@ -82,10 +82,10 @@ json common_chat_msg::to_json_oaicompat() const
82
82
 
83
83
  std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const common_chat_msg & previous_msg, const common_chat_msg & new_msg) {
84
84
  std::vector<common_chat_msg_diff> diffs;
85
- // if (previous_msg.reasoning_content != current.reasoning_content) {
86
- // auto & diff = diffs.emplace_back();
87
- // diff.reasoning_content_delta = string_diff(previous_msg.reasoning_content, current.reasoning_content);
88
- // }
85
+ if (previous_msg.reasoning_content != new_msg.reasoning_content) {
86
+ auto & diff = diffs.emplace_back();
87
+ diff.reasoning_content_delta = string_diff(previous_msg.reasoning_content, new_msg.reasoning_content);
88
+ }
89
89
  if (previous_msg.content != new_msg.content) {
90
90
  auto & diff = diffs.emplace_back();
91
91
  diff.content_delta = string_diff(previous_msg.content, new_msg.content);
@@ -385,9 +385,9 @@ json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & t
385
385
 
386
386
  template <> json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff) {
387
387
  json delta = json::object();
388
- // if (!diff.reasoning_content_delta.empty()) {
389
- // delta["reasoning_content"] = msg.reasoning_content;
390
- // }
388
+ if (!diff.reasoning_content_delta.empty()) {
389
+ delta["reasoning_content"] = diff.reasoning_content_delta;
390
+ }
391
391
  if (!diff.content_delta.empty()) {
392
392
  delta["content"] = diff.content_delta;
393
393
  }
@@ -598,6 +598,7 @@ const char * common_reasoning_format_name(common_reasoning_format format) {
598
598
  switch (format) {
599
599
  case COMMON_REASONING_FORMAT_NONE: return "none";
600
600
  case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
601
+ case COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY: return "deepseek-legacy";
601
602
  default:
602
603
  throw std::runtime_error("Unknown reasoning format");
603
604
  }
@@ -1837,7 +1838,7 @@ static common_chat_params common_chat_templates_apply_legacy(
1837
1838
  if (res < 0) {
1838
1839
  // if the custom "tmpl" is not supported, we throw an error
1839
1840
  // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
1840
- throw std::runtime_error("this custom template is not supported");
1841
+ throw std::runtime_error("this custom template is not supported, try using --jinja");
1841
1842
  }
1842
1843
 
1843
1844
  // if it turns out that our buffer is too small, we resize it
@@ -1920,7 +1921,9 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
1920
1921
  } catch (const common_chat_msg_partial_exception & ex) {
1921
1922
  LOG_DBG("Partial parse: %s\n", ex.what());
1922
1923
  if (!is_partial) {
1923
- throw std::runtime_error(ex.what());
1924
+ builder.clear_tools();
1925
+ builder.move_to(0);
1926
+ common_chat_parse_content_only(builder);
1924
1927
  }
1925
1928
  }
1926
1929
  auto msg = builder.result();
@@ -70,7 +70,7 @@ struct common_chat_msg {
70
70
  };
71
71
 
72
72
  struct common_chat_msg_diff {
73
- // std::string reasoning_content_delta;
73
+ std::string reasoning_content_delta;
74
74
  std::string content_delta;
75
75
  size_t tool_call_index = std::string::npos;
76
76
  common_chat_tool_call tool_call_delta;
@@ -466,7 +466,7 @@ size_t string_find_partial_stop(const std::string_view & str, const std::string_
466
466
 
467
467
  std::string regex_escape(const std::string & s) {
468
468
  static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
469
- return std::regex_replace(s, special_chars, "\\$0");
469
+ return std::regex_replace(s, special_chars, "\\$&");
470
470
  }
471
471
 
472
472
  std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
@@ -706,11 +706,17 @@ bool fs_validate_filename(const std::string & filename) {
706
706
  // disable C++17 deprecation warning for std::codecvt_utf8
707
707
  # pragma clang diagnostic push
708
708
  # pragma clang diagnostic ignored "-Wdeprecated-declarations"
709
+ #elif defined(__GNUC__)
710
+ # pragma GCC diagnostic push
711
+ # pragma GCC diagnostic ignored "-Wdeprecated-declarations"
709
712
  #endif
713
+
710
714
  std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
711
715
 
712
716
  #if defined(__clang__)
713
717
  # pragma clang diagnostic pop
718
+ #elif defined(__GNUC__)
719
+ # pragma GCC diagnostic pop
714
720
  #endif
715
721
 
716
722
  filename_utf32 = converter.from_bytes(filename);
@@ -767,6 +773,9 @@ bool fs_validate_filename(const std::string & filename) {
767
773
  return true;
768
774
  }
769
775
 
776
+ #include <iostream>
777
+
778
+
770
779
  // returns true if successful, false otherwise
771
780
  bool fs_create_directory_with_parents(const std::string & path) {
772
781
  #ifdef _WIN32
@@ -784,9 +793,16 @@ bool fs_create_directory_with_parents(const std::string & path) {
784
793
  // process path from front to back, procedurally creating directories
785
794
  while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
786
795
  const std::wstring subpath = wpath.substr(0, pos_slash);
787
- const wchar_t * test = subpath.c_str();
788
796
 
789
- const bool success = CreateDirectoryW(test, NULL);
797
+ pos_slash += 1;
798
+
799
+ // skip the drive letter, in some systems it can return an access denied error
800
+ if (subpath.length() == 2 && subpath[1] == ':') {
801
+ continue;
802
+ }
803
+
804
+ const bool success = CreateDirectoryW(subpath.c_str(), NULL);
805
+
790
806
  if (!success) {
791
807
  const DWORD error = GetLastError();
792
808
 
@@ -800,8 +816,6 @@ bool fs_create_directory_with_parents(const std::string & path) {
800
816
  return false;
801
817
  }
802
818
  }
803
-
804
- pos_slash += 1;
805
819
  }
806
820
 
807
821
  return true;
@@ -897,34 +911,6 @@ struct common_init_result common_init_from_params(common_params & params) {
897
911
 
898
912
  const llama_vocab * vocab = llama_model_get_vocab(model);
899
913
 
900
- if (params.reranking) {
901
- bool ok = true;
902
-
903
- if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
904
- LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
905
- ok = false;
906
- }
907
-
908
- bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
909
- bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
910
-
911
- if (!has_eos && !has_sep) {
912
- LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
913
- ok = false;
914
- } else if (!has_eos) {
915
- LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
916
- } else if (!has_sep) {
917
- LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
918
- ok = false;
919
- }
920
-
921
- if (!ok) {
922
- llama_model_free(model);
923
-
924
- return iparams;
925
- }
926
- }
927
-
928
914
  auto cparams = common_context_params_to_llama(params);
929
915
 
930
916
  llama_context * lctx = llama_init_from_model(model, cparams);
@@ -934,7 +920,7 @@ struct common_init_result common_init_from_params(common_params & params) {
934
920
  return iparams;
935
921
  }
936
922
 
937
- if (params.ctx_shift && !llama_kv_self_can_shift(lctx)) {
923
+ if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) {
938
924
  LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
939
925
  params.ctx_shift = false;
940
926
  }
@@ -966,6 +952,35 @@ struct common_init_result common_init_from_params(common_params & params) {
966
952
  }
967
953
  }
968
954
 
955
+ if (llama_pooling_type(lctx) == LLAMA_POOLING_TYPE_RANK) {
956
+ bool ok = true;
957
+
958
+ if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
959
+ LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
960
+ ok = false;
961
+ }
962
+
963
+ bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
964
+ bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
965
+
966
+ if (!has_eos && !has_sep) {
967
+ LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
968
+ ok = false;
969
+ } else if (!has_eos) {
970
+ LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
971
+ } else if (!has_sep) {
972
+ LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
973
+ ok = false;
974
+ }
975
+
976
+ if (!ok) {
977
+ llama_free(lctx);
978
+ llama_model_free(model);
979
+
980
+ return iparams;
981
+ }
982
+ }
983
+
969
984
  // load and optionally apply lora adapters
970
985
  for (auto & la : params.lora_adapters) {
971
986
  llama_adapter_lora_ptr lora;
@@ -1041,7 +1056,7 @@ struct common_init_result common_init_from_params(common_params & params) {
1041
1056
  if (llama_model_has_decoder(model)) {
1042
1057
  llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
1043
1058
  }
1044
- llama_kv_self_clear(lctx);
1059
+ llama_memory_clear(llama_get_memory(lctx), true);
1045
1060
  llama_synchronize(lctx);
1046
1061
  llama_perf_context_reset(lctx);
1047
1062
  llama_set_warmup(lctx, false);
@@ -1143,11 +1158,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
1143
1158
  cparams.op_offload = !params.no_op_offload;
1144
1159
  cparams.swa_full = params.swa_full;
1145
1160
 
1146
- if (params.reranking) {
1147
- cparams.embeddings = true;
1148
- cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
1149
- }
1150
-
1151
1161
  cparams.type_k = params.cache_type_k;
1152
1162
  cparams.type_v = params.cache_type_v;
1153
1163
 
@@ -1280,6 +1290,9 @@ std::vector<llama_token> common_tokenize(
1280
1290
  int n_tokens = text.length() + 2 * add_special;
1281
1291
  std::vector<llama_token> result(n_tokens);
1282
1292
  n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
1293
+ if (n_tokens == std::numeric_limits<int32_t>::min()) {
1294
+ throw std::runtime_error("Tokenization failed: input text too large, tokenization result exceeds int32_t limit");
1295
+ }
1283
1296
  if (n_tokens < 0) {
1284
1297
  result.resize(-n_tokens);
1285
1298
  int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
@@ -199,6 +199,9 @@ struct common_params_speculative {
199
199
  float p_split = 0.1f; // speculative decoding split probability
200
200
  float p_min = 0.75f; // minimum speculative decoding probability (greedy)
201
201
 
202
+ ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
203
+ ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
204
+
202
205
  struct cpu_params cpuparams;
203
206
  struct cpu_params cpuparams_batch;
204
207
 
@@ -215,7 +218,8 @@ struct common_params_vocoder {
215
218
 
216
219
  enum common_reasoning_format {
217
220
  COMMON_REASONING_FORMAT_NONE,
218
- COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`
221
+ COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
222
+ COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
219
223
  };
220
224
 
221
225
  struct common_params {
@@ -354,7 +358,7 @@ struct common_params {
354
358
  int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
355
359
  std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
356
360
  std::string embd_sep = "\n"; // separator of embeddings
357
- bool reranking = false; // enable reranking support on server
361
+ std::string cls_sep = "\t"; // separator of classification sequences
358
362
 
359
363
  // server params
360
364
  int32_t port = 8080; // server listens on this network port
@@ -41,49 +41,6 @@ static std::string build_repetition(const std::string & item_rule, int min_items
41
41
  return result;
42
42
  }
43
43
 
44
- /* Minimalistic replacement for std::string_view, which is only available from C++17 onwards */
45
- class string_view {
46
- const std::string & _str;
47
- const size_t _start;
48
- const size_t _end;
49
- public:
50
- string_view(const std::string & str, size_t start = 0, size_t end = std::string::npos) : _str(str), _start(start), _end(end == std::string::npos ? str.length() : end) {}
51
-
52
- size_t size() const {
53
- return _end - _start;
54
- }
55
-
56
- size_t length() const {
57
- return size();
58
- }
59
-
60
- operator std::string() const {
61
- return str();
62
- }
63
-
64
- std::string str() const {
65
- return _str.substr(_start, _end - _start);
66
- }
67
-
68
- string_view substr(size_t pos, size_t len = std::string::npos) const {
69
- return string_view(_str, _start + pos, len == std::string::npos ? _end : _start + pos + len);
70
- }
71
-
72
- char operator[](size_t pos) const {
73
- auto index = _start + pos;
74
- if (index >= _end) {
75
- throw std::out_of_range("string_view index out of range");
76
- }
77
- return _str[_start + pos];
78
- }
79
-
80
- bool operator==(const string_view & other) const {
81
- std::string this_str = *this;
82
- std::string other_str = other;
83
- return this_str == other_str;
84
- }
85
- };
86
-
87
44
  static void _build_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
88
45
  auto has_min = min_value != std::numeric_limits<int>::min();
89
46
  auto has_max = max_value != std::numeric_limits<int>::max();
@@ -112,14 +69,14 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
112
69
  }
113
70
  out << "}";
114
71
  };
115
- std::function<void(const string_view &, const string_view &)> uniform_range =
116
- [&](const string_view & from, const string_view & to) {
72
+ std::function<void(const std::string_view &, const std::string_view &)> uniform_range =
73
+ [&](const std::string_view & from, const std::string_view & to) {
117
74
  size_t i = 0;
118
75
  while (i < from.length() && i < to.length() && from[i] == to[i]) {
119
76
  i++;
120
77
  }
121
78
  if (i > 0) {
122
- out << "\"" << from.substr(0, i).str() << "\"";
79
+ out << "\"" << from.substr(0, i) << "\"";
123
80
  }
124
81
  if (i < from.length() && i < to.length()) {
125
82
  if (i > 0) {
@@ -144,6 +144,8 @@ llama_tokens common_speculative_gen_draft(
144
144
  auto & smpl = spec->smpl;
145
145
  auto & prompt = spec->prompt;
146
146
 
147
+ auto * mem = llama_get_memory(ctx);
148
+
147
149
  int reuse_i = 0;
148
150
  int reuse_n = 0;
149
151
 
@@ -173,7 +175,7 @@ llama_tokens common_speculative_gen_draft(
173
175
  result.reserve(params.n_draft);
174
176
 
175
177
  if (reuse_n == 0) {
176
- llama_kv_self_clear(ctx);
178
+ llama_memory_clear(mem, false);
177
179
 
178
180
  prompt.clear();
179
181
  } else {
@@ -192,14 +194,14 @@ llama_tokens common_speculative_gen_draft(
192
194
  }
193
195
 
194
196
  if (reuse_i > 0) {
195
- llama_kv_self_seq_rm (ctx, 0, 0, reuse_i);
196
- llama_kv_self_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
197
+ llama_memory_seq_rm (mem, 0, 0, reuse_i);
198
+ llama_memory_seq_add(mem, 0, reuse_i, -1, -reuse_i);
197
199
 
198
200
  prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
199
201
  }
200
202
 
201
203
  if (reuse_n < (int) prompt.size()) {
202
- llama_kv_self_seq_rm (ctx, 0, reuse_n, -1);
204
+ llama_memory_seq_rm (mem, 0, reuse_n, -1);
203
205
 
204
206
  prompt.erase(prompt.begin() + reuse_n, prompt.end());
205
207
  }