@novastera-oss/llamarn 0.2.6 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (253)
  1. package/android/src/main/cpp/include/llama.h +141 -38
  2. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  3. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  6. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  7. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  8. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  10. package/cpp/LlamaCppModel.cpp +58 -24
  11. package/cpp/LlamaCppModel.h +3 -3
  12. package/cpp/PureCppImpl.cpp +1 -1
  13. package/cpp/PureCppImpl.h +2 -2
  14. package/cpp/build-info.cpp +2 -2
  15. package/cpp/llama.cpp/CMakeLists.txt +15 -4
  16. package/cpp/llama.cpp/Makefile +2 -2
  17. package/cpp/llama.cpp/README.md +32 -13
  18. package/cpp/llama.cpp/common/CMakeLists.txt +10 -20
  19. package/cpp/llama.cpp/common/arg.cpp +37 -6
  20. package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
  21. package/cpp/llama.cpp/common/chat-parser.cpp +5 -0
  22. package/cpp/llama.cpp/common/chat-parser.h +2 -0
  23. package/cpp/llama.cpp/common/chat.cpp +12 -9
  24. package/cpp/llama.cpp/common/chat.h +1 -1
  25. package/cpp/llama.cpp/common/common.cpp +53 -40
  26. package/cpp/llama.cpp/common/common.h +6 -2
  27. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
  28. package/cpp/llama.cpp/common/speculative.cpp +6 -4
  29. package/cpp/llama.cpp/convert_hf_to_gguf.py +215 -76
  30. package/cpp/llama.cpp/ggml/CMakeLists.txt +48 -2
  31. package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
  32. package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
  33. package/cpp/llama.cpp/ggml/include/ggml.h +33 -0
  34. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +64 -13
  35. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
  36. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
  37. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
  38. package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
  39. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +124 -26
  40. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
  41. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
  42. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  43. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
  44. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
  49. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
  50. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
  51. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
  52. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
  53. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
  54. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  55. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +4 -3
  56. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +16 -7
  57. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +93 -104
  58. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +12 -8
  59. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  60. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
  61. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +58 -8
  62. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  63. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +194 -69
  64. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -0
  65. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1158 -0
  66. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  67. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1571 -0
  68. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
  69. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +213 -37
  70. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  71. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
  72. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
  73. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +59 -37
  74. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  75. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  76. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  77. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  78. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +4 -1
  79. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
  80. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +90 -39
  81. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
  82. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
  84. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
  85. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
  86. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
  87. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  88. package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
  89. package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -183
  90. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
  91. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +16 -0
  92. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +260 -49
  93. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +497 -282
  94. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
  95. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
  96. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1078 -468
  97. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  98. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  99. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  100. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  101. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  102. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  103. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
  104. package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
  105. package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
  106. package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +1 -1
  107. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
  108. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +20 -48
  109. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
  110. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
  111. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +117 -165
  112. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +192 -53
  113. package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  114. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
  115. package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
  116. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +99 -159
  117. package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
  118. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +8 -105
  119. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +209 -92
  120. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
  121. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
  122. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
  123. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +158 -203
  124. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
  125. package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
  126. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +24 -20
  127. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
  128. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  129. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  130. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
  131. package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
  132. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
  133. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +36 -28
  134. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +487 -247
  135. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
  136. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  137. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
  138. package/cpp/llama.cpp/ggml/src/ggml.c +69 -19
  139. package/cpp/llama.cpp/ggml/src/gguf.cpp +5 -1
  140. package/cpp/llama.cpp/gguf-py/gguf/constants.py +133 -0
  141. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +25 -1
  142. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +78 -3
  143. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +97 -4
  144. package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
  145. package/cpp/llama.cpp/include/llama.h +141 -38
  146. package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
  147. package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
  148. package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
  149. package/cpp/llama.cpp/src/llama-arch.cpp +150 -3
  150. package/cpp/llama.cpp/src/llama-arch.h +25 -1
  151. package/cpp/llama.cpp/src/llama-batch.cpp +736 -274
  152. package/cpp/llama.cpp/src/llama-batch.h +110 -57
  153. package/cpp/llama.cpp/src/llama-chat.cpp +30 -8
  154. package/cpp/llama.cpp/src/llama-chat.h +1 -0
  155. package/cpp/llama.cpp/src/llama-context.cpp +360 -266
  156. package/cpp/llama.cpp/src/llama-context.h +27 -23
  157. package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
  158. package/cpp/llama.cpp/src/llama-cparams.h +1 -1
  159. package/cpp/llama.cpp/src/llama-graph.cpp +411 -344
  160. package/cpp/llama.cpp/src/llama-graph.h +126 -58
  161. package/cpp/llama.cpp/src/llama-hparams.cpp +10 -2
  162. package/cpp/llama.cpp/src/llama-hparams.h +16 -2
  163. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +103 -73
  164. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +34 -42
  165. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +345 -221
  166. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +75 -50
  167. package/cpp/llama.cpp/src/llama-kv-cells.h +51 -22
  168. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +246 -0
  169. package/cpp/llama.cpp/src/llama-memory-hybrid.h +138 -0
  170. package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.cpp → llama-memory-recurrent.cpp} +302 -317
  171. package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.h → llama-memory-recurrent.h} +60 -68
  172. package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
  173. package/cpp/llama.cpp/src/llama-memory.h +73 -36
  174. package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
  175. package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
  176. package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
  177. package/cpp/llama.cpp/src/llama-model.cpp +1630 -511
  178. package/cpp/llama.cpp/src/llama-model.h +26 -0
  179. package/cpp/llama.cpp/src/llama-quant.cpp +89 -6
  180. package/cpp/llama.cpp/src/llama-vocab.cpp +58 -26
  181. package/cpp/llama.cpp/src/llama-vocab.h +1 -0
  182. package/cpp/llama.cpp/src/llama.cpp +11 -7
  183. package/cpp/llama.cpp/src/unicode.cpp +5 -0
  184. package/cpp/rn-completion.cpp +2 -2
  185. package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
  186. package/cpp/{rn-utils.hpp → rn-utils.h} +3 -0
  187. package/ios/include/chat.h +1 -1
  188. package/ios/include/common.h +6 -2
  189. package/ios/include/llama.h +141 -38
  190. package/ios/libs/llama.xcframework/Info.plist +15 -15
  191. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  192. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
  193. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  194. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +33 -0
  195. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +141 -38
  196. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  197. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  198. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
  199. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
  200. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  201. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  202. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
  203. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  204. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  205. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
  206. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3766 -3624
  207. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
  208. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +33 -0
  209. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +141 -38
  210. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
  211. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +33 -0
  212. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +141 -38
  213. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  214. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
  215. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +33 -0
  216. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +141 -38
  217. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  218. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  219. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  220. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
  221. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  222. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +33 -0
  223. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +141 -38
  224. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  225. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  226. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
  227. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
  228. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  229. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  230. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
  231. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  232. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  233. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4926 -4725
  234. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  235. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +33 -0
  236. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +141 -38
  237. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  238. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  239. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4897 -4746
  240. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3794 -3652
  241. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  242. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  243. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
  244. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  245. package/package.json +1 -2
  246. package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
  247. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  248. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
  249. package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -1
  250. package/cpp/llama.cpp/src/llama-kv-cache.h +0 -44
  251. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  252. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  253. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
@@ -73,6 +73,7 @@ enum llm_type {
73
73
  LLM_TYPE_40B,
74
74
  LLM_TYPE_65B,
75
75
  LLM_TYPE_70B,
76
+ LLM_TYPE_142B,
76
77
  LLM_TYPE_236B,
77
78
  LLM_TYPE_290B,
78
79
  LLM_TYPE_314B,
@@ -94,6 +95,8 @@ enum llm_type {
94
95
  LLM_TYPE_17B_128E, // llama4 Maverick
95
96
  LLM_TYPE_30B_A3B,
96
97
  LLM_TYPE_235B_A22B,
98
+ LLM_TYPE_E2B,
99
+ LLM_TYPE_E4B,
97
100
  };
98
101
 
99
102
  std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type);
@@ -315,6 +318,19 @@ struct llama_layer {
315
318
  struct ggml_tensor * ffn_up_scale = nullptr;
316
319
  struct ggml_tensor * ffn_down_scale = nullptr;
317
320
 
321
+ // altup & laurel
322
+ struct ggml_tensor * per_layer_inp_gate = nullptr;
323
+ struct ggml_tensor * per_layer_proj = nullptr;
324
+ struct ggml_tensor * per_layer_post_norm = nullptr;
325
+ struct ggml_tensor * altup_correct_coef = nullptr;
326
+ struct ggml_tensor * altup_correct_scale = nullptr;
327
+ struct ggml_tensor * altup_predict_coef = nullptr;
328
+ struct ggml_tensor * altup_router = nullptr;
329
+ struct ggml_tensor * altup_router_norm = nullptr;
330
+ struct ggml_tensor * laurel_l = nullptr;
331
+ struct ggml_tensor * laurel_r = nullptr;
332
+ struct ggml_tensor * laurel_post_norm = nullptr;
333
+
318
334
  struct llama_layer_posnet posnet;
319
335
 
320
336
  struct llama_layer_convnext convnext;
@@ -329,6 +345,9 @@ struct llama_model {
329
345
  llama_hparams hparams = {};
330
346
  llama_vocab vocab;
331
347
 
348
+ // for classifier models
349
+ std::vector<std::string> classifier_labels;
350
+
332
351
  struct ggml_tensor * tok_embd = nullptr;
333
352
  struct ggml_tensor * type_embd = nullptr;
334
353
  struct ggml_tensor * pos_embd = nullptr;
@@ -350,6 +369,13 @@ struct llama_model {
350
369
  struct ggml_tensor * conv1d = nullptr;
351
370
  struct ggml_tensor * conv1d_b = nullptr;
352
371
 
372
+ // gemma3n altup
373
+ struct ggml_tensor * tok_embd_per_layer = nullptr;
374
+ struct ggml_tensor * altup_proj = nullptr;
375
+ struct ggml_tensor * altup_unembd_proj = nullptr;
376
+ struct ggml_tensor * per_layer_model_proj = nullptr;
377
+ struct ggml_tensor * per_layer_proj_norm = nullptr;
378
+
353
379
  std::vector<llama_layer> layers;
354
380
 
355
381
  llama_model_params params;
@@ -1,5 +1,4 @@
1
1
  #include "llama-quant.h"
2
-
3
2
  #include "llama-impl.h"
4
3
  #include "llama-model.h"
5
4
  #include "llama-model-loader.h"
@@ -27,6 +26,56 @@ static void zeros(std::ofstream & file, size_t n) {
27
26
  }
28
27
  }
29
28
 
29
+ static std::string remap_layer(const std::string & orig_name, const std::vector<int> & prune, std::map<int, std::string> & mapped, int & next_id) {
30
+ if (prune.empty()) {
31
+ return orig_name;
32
+ }
33
+
34
+ static const std::regex pattern(R"(blk\.(\d+)\.)");
35
+ if (std::smatch match; std::regex_search(orig_name, match, pattern)) {
36
+ const int blk = std::stoi(match[1]);
37
+ std::string new_name = orig_name;
38
+
39
+ if (mapped.count(blk)) {
40
+ // Already mapped, do nothing
41
+ } else if (std::find(prune.begin(), prune.end(), blk) != prune.end()) {
42
+ mapped[blk] = "";
43
+ } else if (blk < prune.front()) {
44
+ mapped[blk] = std::to_string(blk);
45
+ next_id = blk + 1;
46
+ } else {
47
+ mapped[blk] = std::to_string(next_id);
48
+ ++next_id;
49
+ }
50
+
51
+ return mapped[blk].empty() ? mapped[blk] : new_name.replace(match.position(1), match.length(1), mapped[blk]);
52
+ }
53
+
54
+ return orig_name;
55
+ }
56
+
57
+ static std::string remap_imatrix (const std::string & orig_name, const std::map<int, std::string> & mapped) {
58
+ if (mapped.empty()) {
59
+ return orig_name;
60
+ }
61
+
62
+ static const std::regex pattern(R"(blk\.(\d+)\.)");
63
+ if (std::smatch match; std::regex_search(orig_name, match, pattern)) {
64
+ const std::string blk(match[1]);
65
+ std::string new_name = orig_name;
66
+
67
+ for (const auto & p : mapped) {
68
+ if (p.second == blk) {
69
+ LLAMA_LOG_DEBUG("(blk.%d imatrix) ", p.first);
70
+ return new_name.replace(match.position(1), match.length(1), std::to_string(p.first));
71
+ }
72
+ }
73
+ GGML_ABORT("\n%s: imatrix mapping error for %s\n", __func__, orig_name.c_str());
74
+ }
75
+
76
+ return orig_name;
77
+ }
78
+
30
79
  struct quantize_state_impl {
31
80
  const llama_model & model;
32
81
  const llama_model_quantize_params * params;
@@ -174,7 +223,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
174
223
  new_type = GGML_TYPE_Q6_K;
175
224
  }
176
225
  }
177
- } else if (name == "token_embd.weight") {
226
+ } else if (name == "token_embd.weight" || name == "per_layer_token_embd.weight") {
178
227
  if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
179
228
  new_type = qs.params->token_embedding_type;
180
229
  } else {
@@ -568,6 +617,11 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
568
617
  const size_t align = GGUF_DEFAULT_ALIGNMENT;
569
618
  gguf_context_ptr ctx_out { gguf_init_empty() };
570
619
 
620
+ std::vector<int> prune_list = {};
621
+ if (params->prune_layers) {
622
+ prune_list = *static_cast<const std::vector<int> *>(params->prune_layers);
623
+ }
624
+
571
625
  // copy the KV pairs from the input file
572
626
  gguf_set_kv (ctx_out.get(), ml.meta.get());
573
627
  gguf_set_val_u32(ctx_out.get(), "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
@@ -585,7 +639,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
585
639
  if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
586
640
  gguf_set_val_f32(ctx_out.get(), o.key, o.val_f64);
587
641
  } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
588
- gguf_set_val_i32(ctx_out.get(), o.key, o.val_i64);
642
+ // Setting type to UINT32. See https://github.com/ggml-org/llama.cpp/pull/14182 for context
643
+ gguf_set_val_u32(ctx_out.get(), o.key, (uint32_t)abs(o.val_i64));
589
644
  } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
590
645
  gguf_set_val_bool(ctx_out.get(), o.key, o.val_bool);
591
646
  } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) {
@@ -596,12 +651,32 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
596
651
  }
597
652
  }
598
653
 
654
+ std::map<int, std::string> mapped;
655
+ int blk_id = 0;
656
+ int pruned_attention_w = 0;
657
+
599
658
  // make a list of weights
600
659
  std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
601
660
  tensors.reserve(ml.weights_map.size());
602
661
  for (const auto & it : ml.weights_map) {
662
+ const std::string remapped_name(remap_layer(it.first, prune_list, mapped, blk_id));
663
+ if (remapped_name.empty()) {
664
+ if (it.first.find("attn_v.weight") != std::string::npos ||
665
+ it.first.find("attn_qkv.weight") != std::string::npos ||
666
+ it.first.find("attn_kv_b.weight") != std::string::npos) {
667
+ pruned_attention_w++;
668
+ }
669
+ LLAMA_LOG_DEBUG("%s: pruning tensor %s\n", __func__, it.first.c_str());
670
+ continue;
671
+ } else if (remapped_name != it.first) {
672
+ ggml_set_name(it.second.tensor, remapped_name.c_str());
673
+ LLAMA_LOG_DEBUG("%s: tensor %s remapped to %s\n", __func__, it.first.c_str(), ggml_get_name(it.second.tensor));
674
+ }
603
675
  tensors.push_back(&it.second);
604
676
  }
677
+ if (!prune_list.empty()) {
678
+ gguf_set_val_u32(ctx_out.get(), ml.llm_kv(LLM_KV_BLOCK_COUNT).c_str(), blk_id);
679
+ }
605
680
 
606
681
  // keep_split requires that the weights are sorted by split index
607
682
  if (params->keep_split) {
@@ -639,7 +714,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
639
714
  if (llama_model_has_encoder(&model)) {
640
715
  n_attn_layer *= 3;
641
716
  }
642
- GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
717
+ GGML_ASSERT((qs.n_attention_wv == n_attn_layer - pruned_attention_w) && "n_attention_wv is unexpected");
643
718
  }
644
719
 
645
720
  size_t total_size_org = 0;
@@ -680,7 +755,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
680
755
  for (size_t i = 0; i < ctx_outs.size(); ++i) {
681
756
  gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i);
682
757
  gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split);
683
- gguf_set_val_i32(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), ml.n_tensors);
758
+ gguf_set_val_i32(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), (int32_t)tensors.size());
684
759
  }
685
760
  }
686
761
 
@@ -755,6 +830,13 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
755
830
  // NOTE: can't use LLM_TN here because the layer number is not known
756
831
  quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
757
832
 
833
+ // these are very small (e.g. 4x4)
834
+ quantize &= name.find("altup") == std::string::npos;
835
+ quantize &= name.find("laurel") == std::string::npos;
836
+
837
+ // these are not too big so keep them as it is
838
+ quantize &= name.find("per_layer_model_proj") == std::string::npos;
839
+
758
840
  // do not quantize positional embeddings and token types (BERT)
759
841
  quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
760
842
  quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
@@ -831,7 +913,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
831
913
 
832
914
  const float * imatrix = nullptr;
833
915
  if (imatrix_data) {
834
- auto it = imatrix_data->find(tensor->name);
916
+ auto it = imatrix_data->find(remap_imatrix(tensor->name, mapped));
835
917
  if (it == imatrix_data->end()) {
836
918
  LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
837
919
  } else {
@@ -946,6 +1028,7 @@ llama_model_quantize_params llama_model_quantize_default_params() {
946
1028
  /*.imatrix =*/ nullptr,
947
1029
  /*.kv_overrides =*/ nullptr,
948
1030
  /*.tensor_type =*/ nullptr,
1031
+ /*.prune_layers =*/ nullptr
949
1032
  };
950
1033
 
951
1034
  return result;
@@ -9,16 +9,16 @@
9
9
 
10
10
  #include <algorithm>
11
11
  #include <cassert>
12
+ #include <cctype>
12
13
  #include <cfloat>
13
- #include <climits>
14
14
  #include <cstdarg>
15
15
  #include <cstring>
16
16
  #include <forward_list>
17
+ #include <limits>
17
18
  #include <map>
18
19
  #include <queue>
19
20
  #include <set>
20
21
  #include <unordered_map>
21
- #include <cctype>
22
22
 
23
23
  //
24
24
  // helpers
@@ -1269,6 +1269,7 @@ struct llama_vocab::impl {
1269
1269
  bool add_space_prefix = false;
1270
1270
  bool add_bos = false;
1271
1271
  bool add_eos = false;
1272
+ bool add_sep = false;
1272
1273
  bool ignore_merges = false;
1273
1274
  bool clean_spaces = false; // clean_up_tokenization_spaces
1274
1275
  bool remove_extra_whitespaces = false;
@@ -1421,6 +1422,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1421
1422
  special_sep_id = 102;
1422
1423
  special_pad_id = 0;
1423
1424
  special_mask_id = 103;
1425
+
1426
+ add_sep = true;
1424
1427
  } else if (tokenizer_model == "gpt2") {
1425
1428
  type = LLAMA_VOCAB_TYPE_BPE;
1426
1429
 
@@ -1550,12 +1553,15 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1550
1553
  tokenizer_pre == "jina-es" ||
1551
1554
  tokenizer_pre == "jina-de" ||
1552
1555
  tokenizer_pre == "gigachat" ||
1553
- tokenizer_pre == "jina-v1-en" ||
1554
1556
  tokenizer_pre == "jina-v2-es" ||
1555
- tokenizer_pre == "jina-v2-de" ||
1557
+ tokenizer_pre == "jina-v2-de") {
1558
+ pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
1559
+ } else if (
1560
+ tokenizer_pre == "jina-v1-en" ||
1556
1561
  tokenizer_pre == "jina-v2-code" ||
1557
1562
  tokenizer_pre == "roberta-bpe") {
1558
1563
  pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
1564
+ add_sep = true;
1559
1565
  } else if (
1560
1566
  tokenizer_pre == "refact") {
1561
1567
  pre_type = LLAMA_VOCAB_PRE_TYPE_REFACT;
@@ -1665,6 +1671,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1665
1671
  clean_spaces = true;
1666
1672
  add_bos = true;
1667
1673
  add_eos = false;
1674
+ add_sep = true;
1668
1675
  } else if (type == LLAMA_VOCAB_TYPE_UGM) {
1669
1676
  pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
1670
1677
  add_bos = false;
@@ -1801,7 +1808,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1801
1808
  }
1802
1809
  }
1803
1810
 
1804
- // Handle add_bos and add_eos
1811
+ // Handle add_bos, add_eos and add_sep
1805
1812
  {
1806
1813
  bool temp = true;
1807
1814
 
@@ -1811,6 +1818,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1811
1818
  if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
1812
1819
  add_eos = temp;
1813
1820
  }
1821
+ if (ml.get_key(LLM_KV_TOKENIZER_ADD_SEP, temp, false)) {
1822
+ add_sep = temp;
1823
+ }
1814
1824
  }
1815
1825
 
1816
1826
  // auto-detect special tokens by text
@@ -1987,6 +1997,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1987
1997
  || t.first == "<|eom_id|>"
1988
1998
  || t.first == "<EOT>"
1989
1999
  || t.first == "_<EOT>"
2000
+ || t.first == "<|end_of_text|>"
1990
2001
  ) {
1991
2002
  special_eog_ids.insert(t.second);
1992
2003
  if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -2059,9 +2070,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
2059
2070
  //NOTE: Per token attributes are missing from the GGUF file.
2060
2071
  //TODO: Extract attributes from GGUF file.
2061
2072
  {
2062
- auto _contains_any = [] (const std::string & str, const std::vector<std::string> & substrs) -> bool {
2073
+ auto _contains_any = [] (const std::string & str, const std::vector<std::string_view> & substrs) -> bool {
2063
2074
  for (const auto & substr : substrs) {
2064
- if (str.find(substr) < std::string::npos) {
2075
+ if (str.find(substr) != std::string::npos) {
2065
2076
  return true;
2066
2077
  }
2067
2078
  }
@@ -2098,7 +2109,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
2098
2109
  || _contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})
2099
2110
  || _contains_any(general_arch, {"nomic-bert-moe"})
2100
2111
  ) {
2101
- _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
2112
+ if (token_to_id.count("<mask>") == 0) {
2113
+ LLAMA_LOG_WARN("%s: Mask token is missing in vocab, please reconvert model!\n", __func__);
2114
+ } else {
2115
+ _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
2116
+ }
2102
2117
  } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
2103
2118
  for (auto id : cache_special_tokens) {
2104
2119
  _set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);
@@ -2568,6 +2583,10 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t
2568
2583
  // copy piece chars to output text buffer
2569
2584
  // skip up to 'lstrip' leading spaces before copying
2570
2585
  auto _try_copy = [=] (const char * token, size_t size) -> int32_t {
2586
+ if (size >= static_cast<size_t>(std::numeric_limits<int32_t>::max())) {
2587
+ GGML_ABORT("invalid token size: %zu exceeds int32_t limit", size);
2588
+ }
2589
+
2571
2590
  for (int32_t i = 0; i < lstrip && size && *token == ' '; ++i) {
2572
2591
  token++;
2573
2592
  size--;
@@ -2764,26 +2783,26 @@ void llama_vocab::impl::print_info() const {
2764
2783
  LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (uint32_t) bpe_ranks.size());
2765
2784
 
2766
2785
  // special tokens
2767
- if (special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, special_bos_id, id_to_token[special_bos_id].text.c_str() ); }
2768
- if (special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, special_eos_id, id_to_token[special_eos_id].text.c_str() ); }
2769
- if (special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, special_eot_id, id_to_token[special_eot_id].text.c_str() ); }
2770
- if (special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, special_eom_id, id_to_token[special_eom_id].text.c_str() ); }
2771
- if (special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, special_unk_id, id_to_token[special_unk_id].text.c_str() ); }
2772
- if (special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, special_sep_id, id_to_token[special_sep_id].text.c_str() ); }
2773
- if (special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, special_pad_id, id_to_token[special_pad_id].text.c_str() ); }
2774
- if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, special_mask_id, id_to_token[special_mask_id].text.c_str() ); }
2775
-
2776
- if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, linefeed_id, id_to_token[linefeed_id].text.c_str() ); }
2777
-
2778
- if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, special_fim_pre_id, id_to_token[special_fim_pre_id].text.c_str() ); }
2779
- if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, special_fim_suf_id, id_to_token[special_fim_suf_id].text.c_str() ); }
2780
- if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, special_fim_mid_id, id_to_token[special_fim_mid_id].text.c_str() ); }
2781
- if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, special_fim_pad_id, id_to_token[special_fim_pad_id].text.c_str() ); }
2782
- if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, special_fim_rep_id, id_to_token[special_fim_rep_id].text.c_str() ); }
2783
- if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, special_fim_sep_id, id_to_token[special_fim_sep_id].text.c_str() ); }
2786
+ if (special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, special_bos_id, id_to_token.at(special_bos_id).text.c_str() ); }
2787
+ if (special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, special_eos_id, id_to_token.at(special_eos_id).text.c_str() ); }
2788
+ if (special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, special_eot_id, id_to_token.at(special_eot_id).text.c_str() ); }
2789
+ if (special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, special_eom_id, id_to_token.at(special_eom_id).text.c_str() ); }
2790
+ if (special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, special_unk_id, id_to_token.at(special_unk_id).text.c_str() ); }
2791
+ if (special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, special_sep_id, id_to_token.at(special_sep_id).text.c_str() ); }
2792
+ if (special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, special_pad_id, id_to_token.at(special_pad_id).text.c_str() ); }
2793
+ if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, special_mask_id, id_to_token.at(special_mask_id).text.c_str() ); }
2794
+
2795
+ if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, linefeed_id, id_to_token.at(linefeed_id).text.c_str() ); }
2796
+
2797
+ if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, special_fim_pre_id, id_to_token.at(special_fim_pre_id).text.c_str() ); }
2798
+ if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, special_fim_suf_id, id_to_token.at(special_fim_suf_id).text.c_str() ); }
2799
+ if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, special_fim_mid_id, id_to_token.at(special_fim_mid_id).text.c_str() ); }
2800
+ if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, special_fim_pad_id, id_to_token.at(special_fim_pad_id).text.c_str() ); }
2801
+ if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, special_fim_rep_id, id_to_token.at(special_fim_rep_id).text.c_str() ); }
2802
+ if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, special_fim_sep_id, id_to_token.at(special_fim_sep_id).text.c_str() ); }
2784
2803
 
2785
2804
  for (const auto & id : special_eog_ids) {
2786
- LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, id_to_token[id].text.c_str() );
2805
+ LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, id_to_token.at(id).text.c_str() );
2787
2806
  }
2788
2807
 
2789
2808
  LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, max_token_len);
@@ -2991,6 +3010,10 @@ bool llama_vocab::get_add_eos() const {
2991
3010
  return pimpl->add_eos;
2992
3011
  }
2993
3012
 
3013
+ bool llama_vocab::get_add_sep() const {
3014
+ return pimpl->add_sep;
3015
+ }
3016
+
2994
3017
  bool llama_vocab::get_ignore_merges() const {
2995
3018
  return pimpl->ignore_merges;
2996
3019
  }
@@ -3051,6 +3074,11 @@ int32_t llama_vocab::tokenize(
3051
3074
  bool add_special,
3052
3075
  bool parse_special) const {
3053
3076
  auto res = tokenize(std::string(text, text_len), add_special, parse_special);
3077
+ if (res.size() >= static_cast<size_t>(std::numeric_limits<int32_t>::max())) {
3078
+ LLAMA_LOG_ERROR("%s: tokenization result size %zu exceeds int32_t limit\n", __func__, res.size());
3079
+ return std::numeric_limits<int32_t>::min();
3080
+ }
3081
+
3054
3082
  if (n_tokens_max < (int) res.size()) {
3055
3083
  // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
3056
3084
  return -((int) res.size());
@@ -3182,6 +3210,10 @@ bool llama_vocab_get_add_eos(const struct llama_vocab * vocab) {
3182
3210
  return vocab->get_add_eos();
3183
3211
  }
3184
3212
 
3213
+ bool llama_vocab_get_add_sep(const struct llama_vocab * vocab) {
3214
+ return vocab->get_add_sep();
3215
+ }
3216
+
3185
3217
  llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab) {
3186
3218
  return vocab->token_fim_pre();
3187
3219
  }
@@ -74,6 +74,7 @@ struct llama_vocab {
74
74
  bool get_add_space_prefix () const;
75
75
  bool get_add_bos () const;
76
76
  bool get_add_eos () const;
77
+ bool get_add_sep () const;
77
78
  bool get_ignore_merges () const;
78
79
  bool get_clean_spaces () const;
79
80
  bool get_remove_extra_whitespaces () const;
@@ -198,14 +198,18 @@ static struct llama_model * llama_model_load_from_file_impl(
198
198
 
199
199
  // if using single GPU mode, remove all except the main GPU
200
200
  if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
201
- if (params.main_gpu < 0 || params.main_gpu >= (int)model->devices.size()) {
202
- LLAMA_LOG_ERROR("%s: invalid value for main_gpu: %d (available devices: %d)\n", __func__, params.main_gpu, (int)model->devices.size());
203
- llama_model_free(model);
204
- return nullptr;
201
+ if (params.main_gpu < 0) {
202
+ model->devices.clear();
203
+ } else {
204
+ if (params.main_gpu >= (int)model->devices.size()) {
205
+ LLAMA_LOG_ERROR("%s: invalid value for main_gpu: %d (available devices: %zu)\n", __func__, params.main_gpu, model->devices.size());
206
+ llama_model_free(model);
207
+ return nullptr;
208
+ }
209
+ ggml_backend_dev_t main_gpu = model->devices[params.main_gpu];
210
+ model->devices.clear();
211
+ model->devices.push_back(main_gpu);
205
212
  }
206
- ggml_backend_dev_t main_gpu = model->devices[params.main_gpu];
207
- model->devices.clear();
208
- model->devices.push_back(main_gpu);
209
213
  }
210
214
 
211
215
  for (auto * dev : model->devices) {
@@ -204,12 +204,17 @@ static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
204
204
  // disable C++17 deprecation warning for std::codecvt_utf8
205
205
  # pragma clang diagnostic push
206
206
  # pragma clang diagnostic ignored "-Wdeprecated-declarations"
207
+ #elif defined(__GNUC__)
208
+ # pragma GCC diagnostic push
209
+ # pragma GCC diagnostic ignored "-Wdeprecated-declarations"
207
210
  #endif
208
211
 
209
212
  std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
210
213
 
211
214
  #if defined(__clang__)
212
215
  # pragma clang diagnostic pop
216
+ #elif defined(__GNUC__)
217
+ # pragma GCC diagnostic pop
213
218
  #endif
214
219
 
215
220
  return conv.from_bytes(s);
@@ -1,4 +1,4 @@
1
- #include "rn-llama.hpp"
1
+ #include "rn-llama.h"
2
2
  // Suppress unused function warnings from llama.cpp headers
3
3
  #pragma GCC diagnostic push
4
4
  #pragma GCC diagnostic ignored "-Wunused-function"
@@ -7,7 +7,7 @@
7
7
  #include "llama.h"
8
8
  #include "sampling.h"
9
9
  #pragma GCC diagnostic pop
10
- #include "rn-utils.hpp"
10
+ #include "rn-utils.h"
11
11
 
12
12
  #include <string>
13
13
  #include <vector>
@@ -10,7 +10,7 @@
10
10
  #include "json-schema-to-grammar.h"
11
11
  #pragma GCC diagnostic pop
12
12
 
13
- #include "rn-utils.hpp"
13
+ #include "rn-utils.h"
14
14
 
15
15
  #include <functional>
16
16
  #include <mutex>
@@ -54,6 +54,7 @@ struct CompletionOptions {
54
54
  float top_p = 0.9f;
55
55
  float top_k = 40.0f;
56
56
  float min_p = 0.05f;
57
+ float presence_penalty = 0.0f; // for reducing repetitions (0-2 range)
57
58
  int n_keep = 0;
58
59
  int n_probs = 0; // for log probabilities
59
60
  bool post_sampling_probs = false;
@@ -77,6 +78,7 @@ struct CompletionOptions {
77
78
  {"top_p", top_p},
78
79
  {"top_k", top_k},
79
80
  {"min_p", min_p},
81
+ {"presence_penalty", presence_penalty},
80
82
  {"n_predict", n_predict},
81
83
  {"n_keep", n_keep},
82
84
  {"n_probs", n_probs},
@@ -147,6 +149,7 @@ struct CompletionOptions {
147
149
  data["top_p"] = top_p;
148
150
  data["max_tokens"] = n_predict;
149
151
  data["stream"] = stream;
152
+ data["presence_penalty"] = presence_penalty;
150
153
 
151
154
  if (seed >= 0) {
152
155
  data["seed"] = seed;
@@ -70,7 +70,7 @@ struct common_chat_msg {
70
70
  };
71
71
 
72
72
  struct common_chat_msg_diff {
73
- // std::string reasoning_content_delta;
73
+ std::string reasoning_content_delta;
74
74
  std::string content_delta;
75
75
  size_t tool_call_index = std::string::npos;
76
76
  common_chat_tool_call tool_call_delta;
@@ -199,6 +199,9 @@ struct common_params_speculative {
199
199
  float p_split = 0.1f; // speculative decoding split probability
200
200
  float p_min = 0.75f; // minimum speculative decoding probability (greedy)
201
201
 
202
+ ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
203
+ ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
204
+
202
205
  struct cpu_params cpuparams;
203
206
  struct cpu_params cpuparams_batch;
204
207
 
@@ -215,7 +218,8 @@ struct common_params_vocoder {
215
218
 
216
219
  enum common_reasoning_format {
217
220
  COMMON_REASONING_FORMAT_NONE,
218
- COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`
221
+ COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
222
+ COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
219
223
  };
220
224
 
221
225
  struct common_params {
@@ -354,7 +358,7 @@ struct common_params {
354
358
  int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
355
359
  std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
356
360
  std::string embd_sep = "\n"; // separator of embeddings
357
- bool reranking = false; // enable reranking support on server
361
+ std::string cls_sep = "\t"; // separator of classification sequences
358
362
 
359
363
  // server params
360
364
  int32_t port = 8080; // server listens on this network port