@novastera-oss/llamarn 0.2.5 → 0.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (225)
  1. package/RNLlamaCpp.podspec +3 -2
  2. package/android/CMakeLists.txt +6 -3
  3. package/android/src/main/cpp/include/llama.h +140 -38
  4. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  8. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  10. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  11. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  12. package/cpp/LlamaCppModel.cpp +48 -67
  13. package/cpp/LlamaCppModel.h +8 -3
  14. package/cpp/PureCppImpl.cpp +1 -1
  15. package/cpp/PureCppImpl.h +2 -2
  16. package/cpp/build-info.cpp +2 -2
  17. package/cpp/llama.cpp/CMakeLists.txt +15 -4
  18. package/cpp/llama.cpp/Makefile +2 -2
  19. package/cpp/llama.cpp/README.md +33 -13
  20. package/cpp/llama.cpp/common/CMakeLists.txt +15 -28
  21. package/cpp/llama.cpp/common/arg.cpp +38 -12
  22. package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
  23. package/cpp/llama.cpp/common/chat-parser.cpp +9 -3
  24. package/cpp/llama.cpp/common/chat-parser.h +4 -1
  25. package/cpp/llama.cpp/common/chat.cpp +16 -13
  26. package/cpp/llama.cpp/common/chat.h +1 -1
  27. package/cpp/llama.cpp/common/common.cpp +52 -40
  28. package/cpp/llama.cpp/common/common.h +5 -2
  29. package/cpp/llama.cpp/common/json-partial.cpp +5 -4
  30. package/cpp/llama.cpp/common/json-partial.h +2 -1
  31. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +2 -1
  32. package/cpp/llama.cpp/common/json-schema-to-grammar.h +4 -4
  33. package/cpp/llama.cpp/common/speculative.cpp +6 -4
  34. package/cpp/llama.cpp/convert_hf_to_gguf.py +128 -84
  35. package/cpp/llama.cpp/ggml/CMakeLists.txt +47 -2
  36. package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
  37. package/cpp/llama.cpp/ggml/include/ggml.h +1 -3
  38. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +49 -13
  39. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
  40. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +10 -5
  41. package/cpp/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +3 -3
  42. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
  43. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
  44. package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +93 -24
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +1 -1
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  49. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4113 -0
  50. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2174 -0
  51. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2638 -0
  52. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2731 -0
  53. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2068 -0
  54. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +396 -0
  55. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1299 -0
  56. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1480 -0
  57. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4310 -0
  58. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +59 -3206
  59. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  60. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +1 -1
  61. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +7 -4
  62. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +33 -2
  63. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +8 -8
  64. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  65. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
  66. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +56 -7
  67. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  68. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +2 -2
  69. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1157 -0
  70. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  71. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1555 -0
  72. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
  73. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +2 -4
  74. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  75. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +6 -8
  76. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +5 -2
  77. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +25 -16
  78. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
  79. package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
  80. package/cpp/llama.cpp/ggml/src/ggml-impl.h +2 -0
  81. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
  82. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +33 -8
  83. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +135 -100
  84. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
  85. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +908 -3
  86. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  87. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  88. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  89. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  90. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  91. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  92. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
  93. package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
  94. package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
  95. package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +3 -3
  96. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +19 -24
  97. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +21 -2
  98. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +121 -4
  99. package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  100. package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
  101. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +2 -96
  102. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +164 -46
  103. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +32 -8
  104. package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
  105. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +118 -11
  106. package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
  107. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +26 -29
  108. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +432 -248
  109. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -12
  110. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  111. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
  112. package/cpp/llama.cpp/ggml/src/ggml.c +9 -8
  113. package/cpp/llama.cpp/ggml/src/ggml.cpp +26 -0
  114. package/cpp/llama.cpp/ggml/src/gguf.cpp +19 -2
  115. package/cpp/llama.cpp/gguf-py/gguf/constants.py +57 -0
  116. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +4 -1
  117. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +14 -3
  118. package/cpp/llama.cpp/include/llama.h +140 -38
  119. package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
  120. package/cpp/llama.cpp/src/CMakeLists.txt +4 -1
  121. package/cpp/llama.cpp/src/llama-arch.cpp +95 -3
  122. package/cpp/llama.cpp/src/llama-arch.h +7 -1
  123. package/cpp/llama.cpp/src/llama-batch.cpp +289 -31
  124. package/cpp/llama.cpp/src/llama-batch.h +47 -17
  125. package/cpp/llama.cpp/src/llama-chat.cpp +19 -2
  126. package/cpp/llama.cpp/src/llama-chat.h +1 -0
  127. package/cpp/llama.cpp/src/llama-context.cpp +488 -313
  128. package/cpp/llama.cpp/src/llama-context.h +38 -17
  129. package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
  130. package/cpp/llama.cpp/src/llama-cparams.h +1 -1
  131. package/cpp/llama.cpp/src/llama-graph.cpp +275 -152
  132. package/cpp/llama.cpp/src/llama-graph.h +109 -52
  133. package/cpp/llama.cpp/src/llama-hparams.cpp +6 -2
  134. package/cpp/llama.cpp/src/llama-hparams.h +8 -2
  135. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +281 -0
  136. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +133 -0
  137. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +1835 -0
  138. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +308 -0
  139. package/cpp/llama.cpp/src/llama-kv-cells.h +53 -17
  140. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +247 -0
  141. package/cpp/llama.cpp/src/llama-memory-hybrid.h +143 -0
  142. package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +1116 -0
  143. package/cpp/llama.cpp/src/llama-memory-recurrent.h +188 -0
  144. package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
  145. package/cpp/llama.cpp/src/llama-memory.h +89 -4
  146. package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
  147. package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
  148. package/cpp/llama.cpp/src/llama-model.cpp +735 -143
  149. package/cpp/llama.cpp/src/llama-model.h +4 -0
  150. package/cpp/llama.cpp/src/llama-quant.cpp +2 -1
  151. package/cpp/llama.cpp/src/llama-vocab.cpp +39 -25
  152. package/cpp/llama.cpp/src/llama.cpp +11 -7
  153. package/cpp/llama.cpp/src/unicode.cpp +5 -0
  154. package/cpp/llama.cpp/vendor/cpp-httplib/httplib.h +10518 -0
  155. package/cpp/llama.cpp/vendor/miniaudio/miniaudio.h +93468 -0
  156. package/cpp/llama.cpp/{common → vendor}/minja/chat-template.hpp +1 -1
  157. package/cpp/llama.cpp/{common → vendor}/minja/minja.hpp +1 -1
  158. package/cpp/llama.cpp/{common → vendor/nlohmann}/json.hpp +3027 -2267
  159. package/cpp/llama.cpp/vendor/nlohmann/json_fwd.hpp +187 -0
  160. package/cpp/llama.cpp/vendor/stb/stb_image.h +7988 -0
  161. package/cpp/rn-completion.cpp +65 -10
  162. package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
  163. package/cpp/{rn-utils.hpp → rn-utils.h} +8 -1
  164. package/ios/include/chat.h +1 -1
  165. package/ios/include/common/minja/chat-template.hpp +1 -1
  166. package/ios/include/common/minja/minja.hpp +1 -1
  167. package/ios/include/common.h +5 -2
  168. package/ios/include/json-schema-to-grammar.h +4 -4
  169. package/ios/include/llama.h +140 -38
  170. package/ios/include/{common → nlohmann}/json.hpp +3027 -2267
  171. package/ios/libs/llama.xcframework/Info.plist +20 -20
  172. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  173. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4863 -4617
  174. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +1 -3
  175. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +140 -38
  176. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  177. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  178. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4638
  179. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3742 -3557
  180. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
  181. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +140 -38
  182. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  183. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  184. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4638
  185. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3744 -3559
  186. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +1 -3
  187. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +140 -38
  188. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +1 -3
  189. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +140 -38
  190. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  191. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +1 -3
  192. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +140 -38
  193. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  194. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  195. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  196. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4863 -4616
  197. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +1 -3
  198. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +140 -38
  199. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  200. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  201. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4637
  202. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3742 -3556
  203. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
  204. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +140 -38
  205. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  206. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  207. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4900 -4653
  208. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +1 -3
  209. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +140 -38
  210. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  211. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  212. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4871 -4674
  213. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3773 -3587
  214. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
  215. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +140 -38
  216. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  217. package/package.json +1 -2
  218. package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
  219. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  220. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
  221. package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -2747
  222. package/cpp/llama.cpp/src/llama-kv-cache.h +0 -502
  223. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  224. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  225. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
@@ -203,6 +203,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
203
203
 
204
204
  DWORD p = NORMAL_PRIORITY_CLASS;
205
205
  switch (prio) {
206
+ case GGML_SCHED_PRIO_LOW: p = BELOW_NORMAL_PRIORITY_CLASS; break;
206
207
  case GGML_SCHED_PRIO_NORMAL: p = NORMAL_PRIORITY_CLASS; break;
207
208
  case GGML_SCHED_PRIO_MEDIUM: p = ABOVE_NORMAL_PRIORITY_CLASS; break;
208
209
  case GGML_SCHED_PRIO_HIGH: p = HIGH_PRIORITY_CLASS; break;
@@ -228,6 +229,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
228
229
 
229
230
  int p = 0;
230
231
  switch (prio) {
232
+ case GGML_SCHED_PRIO_LOW: p = 5; break;
231
233
  case GGML_SCHED_PRIO_NORMAL: p = 0; break;
232
234
  case GGML_SCHED_PRIO_MEDIUM: p = -5; break;
233
235
  case GGML_SCHED_PRIO_HIGH: p = -10; break;
@@ -464,7 +466,7 @@ size_t string_find_partial_stop(const std::string_view & str, const std::string_
464
466
 
465
467
  std::string regex_escape(const std::string & s) {
466
468
  static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
467
- return std::regex_replace(s, special_chars, "\\$0");
469
+ return std::regex_replace(s, special_chars, "\\$&");
468
470
  }
469
471
 
470
472
  std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
@@ -704,11 +706,17 @@ bool fs_validate_filename(const std::string & filename) {
704
706
  // disable C++17 deprecation warning for std::codecvt_utf8
705
707
  # pragma clang diagnostic push
706
708
  # pragma clang diagnostic ignored "-Wdeprecated-declarations"
709
+ #elif defined(__GNUC__)
710
+ # pragma GCC diagnostic push
711
+ # pragma GCC diagnostic ignored "-Wdeprecated-declarations"
707
712
  #endif
713
+
708
714
  std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
709
715
 
710
716
  #if defined(__clang__)
711
717
  # pragma clang diagnostic pop
718
+ #elif defined(__GNUC__)
719
+ # pragma GCC diagnostic pop
712
720
  #endif
713
721
 
714
722
  filename_utf32 = converter.from_bytes(filename);
@@ -765,6 +773,9 @@ bool fs_validate_filename(const std::string & filename) {
765
773
  return true;
766
774
  }
767
775
 
776
+ #include <iostream>
777
+
778
+
768
779
  // returns true if successful, false otherwise
769
780
  bool fs_create_directory_with_parents(const std::string & path) {
770
781
  #ifdef _WIN32
@@ -782,9 +793,16 @@ bool fs_create_directory_with_parents(const std::string & path) {
782
793
  // process path from front to back, procedurally creating directories
783
794
  while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
784
795
  const std::wstring subpath = wpath.substr(0, pos_slash);
785
- const wchar_t * test = subpath.c_str();
786
796
 
787
- const bool success = CreateDirectoryW(test, NULL);
797
+ pos_slash += 1;
798
+
799
+ // skip the drive letter, in some systems it can return an access denied error
800
+ if (subpath.length() == 2 && subpath[1] == ':') {
801
+ continue;
802
+ }
803
+
804
+ const bool success = CreateDirectoryW(subpath.c_str(), NULL);
805
+
788
806
  if (!success) {
789
807
  const DWORD error = GetLastError();
790
808
 
@@ -798,8 +816,6 @@ bool fs_create_directory_with_parents(const std::string & path) {
798
816
  return false;
799
817
  }
800
818
  }
801
-
802
- pos_slash += 1;
803
819
  }
804
820
 
805
821
  return true;
@@ -895,34 +911,6 @@ struct common_init_result common_init_from_params(common_params & params) {
895
911
 
896
912
  const llama_vocab * vocab = llama_model_get_vocab(model);
897
913
 
898
- if (params.reranking) {
899
- bool ok = true;
900
-
901
- if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
902
- LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
903
- ok = false;
904
- }
905
-
906
- bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
907
- bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
908
-
909
- if (!has_eos && !has_sep) {
910
- LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
911
- ok = false;
912
- } else if (!has_eos) {
913
- LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
914
- } else if (!has_sep) {
915
- LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
916
- ok = false;
917
- }
918
-
919
- if (!ok) {
920
- llama_model_free(model);
921
-
922
- return iparams;
923
- }
924
- }
925
-
926
914
  auto cparams = common_context_params_to_llama(params);
927
915
 
928
916
  llama_context * lctx = llama_init_from_model(model, cparams);
@@ -932,7 +920,7 @@ struct common_init_result common_init_from_params(common_params & params) {
932
920
  return iparams;
933
921
  }
934
922
 
935
- if (params.ctx_shift && !llama_kv_self_can_shift(lctx)) {
923
+ if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) {
936
924
  LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
937
925
  params.ctx_shift = false;
938
926
  }
@@ -964,6 +952,35 @@ struct common_init_result common_init_from_params(common_params & params) {
964
952
  }
965
953
  }
966
954
 
955
+ if (llama_pooling_type(lctx) == LLAMA_POOLING_TYPE_RANK) {
956
+ bool ok = true;
957
+
958
+ if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
959
+ LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
960
+ ok = false;
961
+ }
962
+
963
+ bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
964
+ bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
965
+
966
+ if (!has_eos && !has_sep) {
967
+ LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
968
+ ok = false;
969
+ } else if (!has_eos) {
970
+ LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
971
+ } else if (!has_sep) {
972
+ LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
973
+ ok = false;
974
+ }
975
+
976
+ if (!ok) {
977
+ llama_free(lctx);
978
+ llama_model_free(model);
979
+
980
+ return iparams;
981
+ }
982
+ }
983
+
967
984
  // load and optionally apply lora adapters
968
985
  for (auto & la : params.lora_adapters) {
969
986
  llama_adapter_lora_ptr lora;
@@ -1039,7 +1056,7 @@ struct common_init_result common_init_from_params(common_params & params) {
1039
1056
  if (llama_model_has_decoder(model)) {
1040
1057
  llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
1041
1058
  }
1042
- llama_kv_self_clear(lctx);
1059
+ llama_memory_clear(llama_get_memory(lctx), true);
1043
1060
  llama_synchronize(lctx);
1044
1061
  llama_perf_context_reset(lctx);
1045
1062
  llama_set_warmup(lctx, false);
@@ -1141,11 +1158,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
1141
1158
  cparams.op_offload = !params.no_op_offload;
1142
1159
  cparams.swa_full = params.swa_full;
1143
1160
 
1144
- if (params.reranking) {
1145
- cparams.embeddings = true;
1146
- cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
1147
- }
1148
-
1149
1161
  cparams.type_k = params.cache_type_k;
1150
1162
  cparams.type_v = params.cache_type_v;
1151
1163
 
@@ -199,6 +199,9 @@ struct common_params_speculative {
199
199
  float p_split = 0.1f; // speculative decoding split probability
200
200
  float p_min = 0.75f; // minimum speculative decoding probability (greedy)
201
201
 
202
+ ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
203
+ ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
204
+
202
205
  struct cpu_params cpuparams;
203
206
  struct cpu_params cpuparams_batch;
204
207
 
@@ -215,7 +218,8 @@ struct common_params_vocoder {
215
218
 
216
219
  enum common_reasoning_format {
217
220
  COMMON_REASONING_FORMAT_NONE,
218
- COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`
221
+ COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
222
+ COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
219
223
  };
220
224
 
221
225
  struct common_params {
@@ -354,7 +358,6 @@ struct common_params {
354
358
  int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
355
359
  std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
356
360
  std::string embd_sep = "\n"; // separator of embeddings
357
- bool reranking = false; // enable reranking support on server
358
361
 
359
362
  // server params
360
363
  int32_t port = 8080; // server listens on this network port
@@ -1,9 +1,10 @@
1
- #include <json-partial.h>
2
- #include "ggml.h"
1
+ #include "json-partial.h"
2
+
3
3
  #include "log.h"
4
- #include <string>
5
4
 
6
- #include <json.hpp>
5
+ #include <nlohmann/json.hpp>
6
+
7
+ #include <string>
7
8
 
8
9
  using json = nlohmann::ordered_json;
9
10
 
@@ -1,5 +1,6 @@
1
1
  #pragma once
2
- #include <json.hpp>
2
+
3
+ #include <nlohmann/json.hpp>
3
4
 
4
5
  // Healing marker (empty if the JSON was fully parsed / wasn't healed).
5
6
  struct common_healing_marker {
@@ -1,8 +1,9 @@
1
1
  #include "json-schema-to-grammar.h"
2
2
  #include "common.h"
3
3
 
4
+ #include <nlohmann/json.hpp>
5
+
4
6
  #include <algorithm>
5
- #include <fstream>
6
7
  #include <map>
7
8
  #include <regex>
8
9
  #include <sstream>
@@ -1,9 +1,9 @@
1
1
  #pragma once
2
2
 
3
- #include "ggml.h"
4
- // Change JSON_ASSERT from assert() to GGML_ASSERT:
5
- #define JSON_ASSERT GGML_ASSERT
6
- #include "json.hpp"
3
+ #include <nlohmann/json_fwd.hpp>
4
+
5
+ #include <functional>
6
+ #include <string>
7
7
 
8
8
  std::string json_schema_to_grammar(const nlohmann::ordered_json & schema,
9
9
  bool force_gbnf = false);
@@ -144,6 +144,8 @@ llama_tokens common_speculative_gen_draft(
144
144
  auto & smpl = spec->smpl;
145
145
  auto & prompt = spec->prompt;
146
146
 
147
+ auto * mem = llama_get_memory(ctx);
148
+
147
149
  int reuse_i = 0;
148
150
  int reuse_n = 0;
149
151
 
@@ -173,7 +175,7 @@ llama_tokens common_speculative_gen_draft(
173
175
  result.reserve(params.n_draft);
174
176
 
175
177
  if (reuse_n == 0) {
176
- llama_kv_self_clear(ctx);
178
+ llama_memory_clear(mem, false);
177
179
 
178
180
  prompt.clear();
179
181
  } else {
@@ -192,14 +194,14 @@ llama_tokens common_speculative_gen_draft(
192
194
  }
193
195
 
194
196
  if (reuse_i > 0) {
195
- llama_kv_self_seq_rm (ctx, 0, 0, reuse_i);
196
- llama_kv_self_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
197
+ llama_memory_seq_rm (mem, 0, 0, reuse_i);
198
+ llama_memory_seq_add(mem, 0, reuse_i, -1, -reuse_i);
197
199
 
198
200
  prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
199
201
  }
200
202
 
201
203
  if (reuse_n < (int) prompt.size()) {
202
- llama_kv_self_seq_rm (ctx, 0, reuse_n, -1);
204
+ llama_memory_seq_rm (mem, 0, reuse_n, -1);
203
205
 
204
206
  prompt.erase(prompt.begin() + reuse_n, prompt.end());
205
207
  }