cui-llama.rn 1.7.4 → 1.7.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (276)
  1. package/README.md +217 -17
  2. package/android/src/main/CMakeLists.txt +34 -15
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +79 -5
  4. package/android/src/main/java/com/rnllama/RNLlama.java +237 -0
  5. package/android/src/main/jni.cpp +213 -14
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  11. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  14. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +35 -0
  15. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +34 -0
  16. package/cpp/README.md +1 -1
  17. package/cpp/chat-parser.cpp +385 -0
  18. package/cpp/chat-parser.h +120 -0
  19. package/cpp/chat.cpp +726 -596
  20. package/cpp/chat.h +71 -6
  21. package/cpp/common.cpp +56 -38
  22. package/cpp/common.h +9 -3
  23. package/cpp/ggml-backend-reg.cpp +5 -0
  24. package/cpp/ggml-backend.cpp +10 -2
  25. package/cpp/ggml-common.h +4 -0
  26. package/cpp/ggml-cpu/amx/amx.cpp +1 -1
  27. package/cpp/ggml-cpu/amx/mmq.cpp +11 -10
  28. package/cpp/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  29. package/cpp/ggml-cpu/arch/arm/quants.c +4114 -0
  30. package/cpp/ggml-cpu/arch/arm/repack.cpp +2163 -0
  31. package/cpp/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  32. package/cpp/ggml-cpu/arch/x86/quants.c +4311 -0
  33. package/cpp/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
  34. package/cpp/ggml-cpu/arch-fallback.h +184 -0
  35. package/cpp/ggml-cpu/common.h +4 -3
  36. package/cpp/ggml-cpu/ggml-cpu-impl.h +21 -16
  37. package/cpp/ggml-cpu/ggml-cpu.c +123 -104
  38. package/cpp/ggml-cpu/ggml-cpu.cpp +11 -8
  39. package/cpp/ggml-cpu/ops.cpp +330 -148
  40. package/cpp/ggml-cpu/ops.h +1 -0
  41. package/cpp/ggml-cpu/quants.c +1158 -0
  42. package/cpp/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  43. package/cpp/ggml-cpu/repack.cpp +1571 -0
  44. package/cpp/ggml-cpu/repack.h +98 -0
  45. package/cpp/ggml-cpu/simd-mappings.h +330 -38
  46. package/cpp/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  47. package/cpp/ggml-cpu/vec.cpp +87 -18
  48. package/cpp/ggml-cpu/vec.h +249 -94
  49. package/cpp/ggml-cpu.h +1 -0
  50. package/cpp/ggml-impl.h +63 -183
  51. package/cpp/ggml-llama-sim.metallib +0 -0
  52. package/cpp/ggml-llama.metallib +0 -0
  53. package/cpp/ggml-metal.m +152 -45
  54. package/cpp/ggml-quants.c +0 -2
  55. package/cpp/ggml.c +61 -21
  56. package/cpp/ggml.h +22 -3
  57. package/cpp/gguf.cpp +24 -3
  58. package/cpp/json-partial.cpp +256 -0
  59. package/cpp/json-partial.h +38 -0
  60. package/cpp/json-schema-to-grammar.cpp +5 -47
  61. package/cpp/json-schema-to-grammar.h +4 -4
  62. package/cpp/llama-arch.cpp +153 -3
  63. package/cpp/llama-arch.h +27 -1
  64. package/cpp/llama-batch.cpp +741 -272
  65. package/cpp/llama-batch.h +112 -54
  66. package/cpp/llama-chat.cpp +30 -8
  67. package/cpp/llama-chat.h +1 -0
  68. package/cpp/llama-context.cpp +524 -339
  69. package/cpp/llama-context.h +38 -17
  70. package/cpp/llama-cparams.cpp +4 -0
  71. package/cpp/llama-cparams.h +2 -0
  72. package/cpp/llama-grammar.cpp +12 -2
  73. package/cpp/llama-graph.cpp +431 -356
  74. package/cpp/llama-graph.h +126 -58
  75. package/cpp/llama-hparams.cpp +10 -2
  76. package/cpp/llama-hparams.h +19 -2
  77. package/cpp/llama-kv-cache-unified-iswa.cpp +279 -0
  78. package/cpp/llama-kv-cache-unified-iswa.h +128 -0
  79. package/cpp/llama-kv-cache-unified.cpp +1841 -0
  80. package/cpp/llama-kv-cache-unified.h +303 -0
  81. package/cpp/llama-kv-cells.h +439 -0
  82. package/cpp/llama-memory-hybrid.cpp +246 -0
  83. package/cpp/llama-memory-hybrid.h +138 -0
  84. package/cpp/llama-memory-recurrent.cpp +1112 -0
  85. package/cpp/llama-memory-recurrent.h +183 -0
  86. package/cpp/llama-memory.cpp +41 -0
  87. package/cpp/llama-memory.h +86 -5
  88. package/cpp/llama-mmap.cpp +1 -1
  89. package/cpp/llama-model-loader.cpp +42 -17
  90. package/cpp/llama-model-saver.cpp +1 -0
  91. package/cpp/llama-model.cpp +1639 -513
  92. package/cpp/llama-model.h +26 -0
  93. package/cpp/llama-sampling.cpp +2 -2
  94. package/cpp/llama-vocab.cpp +65 -28
  95. package/cpp/llama-vocab.h +1 -0
  96. package/cpp/llama.cpp +11 -7
  97. package/cpp/llama.h +150 -42
  98. package/cpp/minja/chat-template.hpp +1 -1
  99. package/cpp/minja/minja.hpp +1 -1
  100. package/cpp/{json.hpp → nlohmann/json.hpp} +3027 -2267
  101. package/cpp/nlohmann/json_fwd.hpp +187 -0
  102. package/cpp/regex-partial.cpp +204 -0
  103. package/cpp/regex-partial.h +56 -0
  104. package/cpp/rn-llama.cpp +646 -35
  105. package/cpp/rn-llama.h +32 -1
  106. package/cpp/rn-tts.h +39 -0
  107. package/cpp/sampling.cpp +7 -8
  108. package/cpp/tools/mtmd/clip-impl.h +5 -0
  109. package/cpp/tools/mtmd/clip.cpp +572 -436
  110. package/cpp/tools/mtmd/clip.h +14 -4
  111. package/cpp/tools/mtmd/mtmd-audio.cpp +0 -86
  112. package/cpp/tools/mtmd/mtmd-audio.h +2 -17
  113. package/cpp/tools/mtmd/mtmd-helper.cpp +175 -12
  114. package/cpp/tools/mtmd/mtmd-helper.h +91 -0
  115. package/cpp/tools/mtmd/mtmd.cpp +368 -248
  116. package/cpp/tools/mtmd/mtmd.h +6 -70
  117. package/cpp/unicode.cpp +5 -0
  118. package/ios/CMakeLists.txt +26 -6
  119. package/ios/RNLlama.h +1 -1
  120. package/ios/RNLlama.mm +153 -3
  121. package/ios/RNLlamaContext.h +9 -1
  122. package/ios/RNLlamaContext.mm +112 -9
  123. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
  124. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +71 -6
  125. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +9 -3
  126. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
  127. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
  128. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
  129. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +22 -3
  130. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-partial.h +38 -0
  131. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  132. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
  133. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
  134. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
  135. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +38 -17
  136. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
  137. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
  138. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
  139. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  140. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  141. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  142. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  143. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  144. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
  145. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +26 -0
  146. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
  147. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +150 -42
  148. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  149. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
  150. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/{json.hpp → nlohmann/json.hpp} +3027 -2267
  151. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  152. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
  153. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
  154. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
  155. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  156. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
  157. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
  158. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
  159. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
  160. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
  161. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
  162. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
  163. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
  164. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
  165. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  166. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
  167. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
  168. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
  169. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
  170. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
  171. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
  172. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
  173. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  174. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  175. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  176. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  177. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  178. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
  179. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
  180. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
  181. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
  182. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  183. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
  184. package/ios/rnllama.xcframework/{tvos-arm64/rnllama.framework/Headers → ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
  185. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  186. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
  187. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
  188. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
  189. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  190. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  191. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
  192. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +71 -6
  193. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +9 -3
  194. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
  195. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
  196. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
  197. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +22 -3
  198. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-partial.h +38 -0
  199. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  200. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
  201. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
  202. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
  203. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +38 -17
  204. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
  205. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
  206. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
  207. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  208. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  209. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  210. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  211. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  212. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
  213. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +26 -0
  214. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
  215. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +150 -42
  216. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  217. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
  218. package/ios/rnllama.xcframework/{ios-arm64_x86_64-simulator/rnllama.framework/Headers → tvos-arm64/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
  219. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  220. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
  221. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
  222. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
  223. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  224. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
  225. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
  226. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
  227. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
  228. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
  229. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
  230. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
  231. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
  232. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
  233. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  234. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
  235. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
  236. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
  237. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
  238. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
  239. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
  240. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
  241. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  242. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  243. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  244. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  245. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  246. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
  247. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
  248. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
  249. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
  250. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  251. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
  252. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json.hpp +25526 -0
  253. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  254. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
  255. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
  256. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
  257. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  258. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  259. package/jest/mock.js +24 -0
  260. package/package.json +1 -1
  261. package/src/NativeRNLlama.ts +46 -2
  262. package/src/index.ts +105 -1
  263. package/cpp/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  264. package/cpp/ggml-cpu/ggml-cpu-quants.c +0 -13326
  265. package/cpp/ggml-cpu/sgemm.cpp +0 -3544
  266. package/cpp/ggml-cpu/sgemm.h +0 -14
  267. package/cpp/llama-kv-cache.cpp +0 -2827
  268. package/cpp/llama-kv-cache.h +0 -515
  269. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  270. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  271. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  272. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json.hpp +0 -24766
  273. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  274. /package/cpp/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
  275. /package/cpp/tools/mtmd/{miniaudio.h → miniaudio/miniaudio.h} +0 -0
  276. /package/cpp/tools/mtmd/{stb_image.h → stb/stb_image.h} +0 -0
package/cpp/chat.h CHANGED
@@ -3,6 +3,7 @@
 #pragma once
 
 #include "common.h"
+#include <functional>
 #include <chrono>
 #include <string>
 #include <vector>
@@ -21,11 +22,19 @@ struct common_chat_tool_call {
     std::string name;
     std::string arguments;
     std::string id;
+
+    bool operator==(const common_chat_tool_call & other) const {
+        return name == other.name && arguments == other.arguments && id == other.id;
+    }
 };
 
 struct common_chat_msg_content_part {
     std::string type;
     std::string text;
+
+    bool operator==(const common_chat_msg_content_part & other) const {
+        return type == other.type && text == other.text;
+    }
 };
 
 struct common_chat_msg {
@@ -36,6 +45,51 @@ struct common_chat_msg {
     std::string reasoning_content;
     std::string tool_name;
     std::string tool_call_id;
+
+    template <class T> T to_json_oaicompat() const;
+
+    bool empty() const {
+        return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() && tool_name.empty() && tool_call_id.empty();
+    }
+    void ensure_tool_call_ids_set(std::vector<std::string> & ids_cache, const std::function<std::string()> & gen_tool_call_id) {
+        for (auto i = 0u; i < tool_calls.size(); i++) {
+            if (ids_cache.size() <= i) {
+                auto id = tool_calls[i].id;
+                if (id.empty()) {
+                    id = gen_tool_call_id();
+                }
+                ids_cache.push_back(id);
+            }
+            tool_calls[i].id = ids_cache[i];
+        }
+    }
+    bool operator==(const common_chat_msg & other) const {
+        return role == other.role
+            && content == other.content
+            && content_parts == other.content_parts
+            && tool_calls == other.tool_calls
+            && reasoning_content == other.reasoning_content
+            && tool_name == other.tool_name
+            && tool_call_id == other.tool_call_id;
+    }
+    bool operator!=(const common_chat_msg & other) const {
+        return !(*this == other);
+    }
+};
+
+struct common_chat_msg_diff {
+    std::string reasoning_content_delta;
+    std::string content_delta;
+    size_t tool_call_index = std::string::npos;
+    common_chat_tool_call tool_call_delta;
+
+    static std::vector<common_chat_msg_diff> compute_diffs(const common_chat_msg & previous_msg, const common_chat_msg & new_msg);
+
+    bool operator==(const common_chat_msg_diff & other) const {
+        return content_delta == other.content_delta
+            && tool_call_index == other.tool_call_index
+            && tool_call_delta == other.tool_call_delta;
+    }
 };
 
 struct common_chat_tool {
@@ -57,14 +111,11 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_LLAMA_3_X,
     COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
     COMMON_CHAT_FORMAT_DEEPSEEK_R1,
-    COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING,
     COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
     COMMON_CHAT_FORMAT_HERMES_2_PRO,
-    COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING,
     COMMON_CHAT_FORMAT_COMMAND_R7B,
-    COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING,
 
     COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
 };
@@ -79,7 +130,8 @@ struct common_chat_templates_inputs {
     std::vector<common_chat_tool> tools;
     common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
     bool parallel_tool_calls = false;
-    bool extract_reasoning = true;
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
+    bool enable_thinking = true;
     std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
 };
 
@@ -88,11 +140,21 @@ struct common_chat_params {
     std::string prompt;
     std::string grammar;
     bool grammar_lazy = false;
+    bool thinking_forced_open = false;
     std::vector<common_grammar_trigger> grammar_triggers;
     std::vector<std::string> preserved_tokens;
     std::vector<std::string> additional_stops;
 };
 
+struct common_chat_syntax {
+    common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
+    // Whether reasoning_content should be inlined in the content (e.g. for reasoning_format=deepseek in stream mode)
+    bool reasoning_in_content = false;
+    bool thinking_forced_open = false;
+    bool parse_tool_calls = true;
+};
+
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
 bool common_chat_verify_template(const std::string & tmpl, bool use_jinja);
 
@@ -129,8 +191,9 @@ std::string common_chat_format_example(
     const struct common_chat_templates * tmpls,
     bool use_jinja);
 
-std::string common_chat_format_name(common_chat_format format);
-common_chat_msg common_chat_parse( const std::string & input, common_chat_format format);
+const char* common_chat_format_name(common_chat_format format);
+const char* common_reasoning_format_name(common_reasoning_format format);
+common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
 
 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
 
@@ -143,3 +206,5 @@ template <class T> T common_chat_msgs_to_json_oaicompat(const std::vector<common
 // T can be std::string containing JSON or nlohmann::ordered_json
 template <class T> std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const T & tools);
 template <class T> T common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools);
+
+template <class T> T common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff);
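
Note: the chat.h changes above replace the one-shot common_chat_parse(input, format) with a partial-aware parser configured through common_chat_syntax, and add common_chat_msg_diff::compute_diffs for computing streaming deltas. A minimal sketch of how a caller might drive the new API, assuming a token loop that accumulates the raw model output (the surrounding loop and the format choice are illustrative, not from the diff):

    #include "chat.h"

    void on_new_token(const std::string & accumulated_output, common_chat_msg & last_msg) {
        common_chat_syntax syntax;
        syntax.format           = COMMON_CHAT_FORMAT_DEEPSEEK_R1;  // illustrative choice
        syntax.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;

        // is_partial = true: generation is still in flight, so the parser
        // must tolerate truncated tags / truncated JSON
        common_chat_msg msg = common_chat_parse(accumulated_output, /* is_partial */ true, syntax);

        for (const auto & diff : common_chat_msg_diff::compute_diffs(last_msg, msg)) {
            // diff.content_delta / diff.reasoning_content_delta / diff.tool_call_delta
            // map naturally onto OAI-compatible streaming chunks
        }
        last_msg = msg;
    }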
package/cpp/common.cpp CHANGED
@@ -210,6 +210,7 @@ bool set_process_priority(enum lm_ggml_sched_priority prio) {
 
     DWORD p = NORMAL_PRIORITY_CLASS;
     switch (prio) {
+        case LM_GGML_SCHED_PRIO_LOW:    p = BELOW_NORMAL_PRIORITY_CLASS; break;
        case LM_GGML_SCHED_PRIO_NORMAL: p = NORMAL_PRIORITY_CLASS; break;
        case LM_GGML_SCHED_PRIO_MEDIUM: p = ABOVE_NORMAL_PRIORITY_CLASS; break;
        case LM_GGML_SCHED_PRIO_HIGH:   p = HIGH_PRIORITY_CLASS; break;
@@ -235,6 +236,7 @@ bool set_process_priority(enum lm_ggml_sched_priority prio) {
 
     int p = 0;
     switch (prio) {
+        case LM_GGML_SCHED_PRIO_LOW:    p = 5; break;
        case LM_GGML_SCHED_PRIO_NORMAL: p = 0; break;
        case LM_GGML_SCHED_PRIO_MEDIUM: p = -5; break;
        case LM_GGML_SCHED_PRIO_HIGH:   p = -10; break;
@@ -471,7 +473,7 @@ size_t string_find_partial_stop(const std::string_view & str, const std::string_
 
 std::string regex_escape(const std::string & s) {
     static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
-    return std::regex_replace(s, special_chars, "\\$0");
+    return std::regex_replace(s, special_chars, "\\$&");
 }
 
 std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
@@ -711,11 +713,17 @@ bool fs_validate_filename(const std::string & filename) {
     // disable C++17 deprecation warning for std::codecvt_utf8
 #    pragma clang diagnostic push
 #    pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#elif defined(__GNUC__)
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 #endif
+
     std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
 
 #if defined(__clang__)
 #    pragma clang diagnostic pop
+#elif defined(__GNUC__)
+#    pragma GCC diagnostic pop
 #endif
 
     filename_utf32 = converter.from_bytes(filename);
@@ -772,6 +780,9 @@ bool fs_validate_filename(const std::string & filename) {
     return true;
 }
 
+#include <iostream>
+
+
 // returns true if successful, false otherwise
 bool fs_create_directory_with_parents(const std::string & path) {
 #ifdef _WIN32
@@ -789,9 +800,16 @@ bool fs_create_directory_with_parents(const std::string & path) {
     // process path from front to back, procedurally creating directories
     while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
         const std::wstring subpath = wpath.substr(0, pos_slash);
-        const wchar_t * test = subpath.c_str();
 
-        const bool success = CreateDirectoryW(test, NULL);
+        pos_slash += 1;
+
+        // skip the drive letter, in some systems it can return an access denied error
+        if (subpath.length() == 2 && subpath[1] == ':') {
+            continue;
+        }
+
+        const bool success = CreateDirectoryW(subpath.c_str(), NULL);
+
        if (!success) {
            const DWORD error = GetLastError();
 
@@ -805,8 +823,6 @@ bool fs_create_directory_with_parents(const std::string & path) {
                return false;
            }
        }
-
-        pos_slash += 1;
    }
 
    return true;
@@ -856,7 +872,7 @@ std::string fs_get_cache_directory() {
    if (getenv("LLAMA_CACHE")) {
        cache_directory = std::getenv("LLAMA_CACHE");
    } else {
-#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)
+#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX) || defined(__OpenBSD__)
        if (std::getenv("XDG_CACHE_HOME")) {
            cache_directory = std::getenv("XDG_CACHE_HOME");
        } else {
@@ -902,31 +918,6 @@ struct common_init_result common_init_from_params(common_params & params) {
 
    const llama_vocab * vocab = llama_model_get_vocab(model);
 
-    if (params.reranking) {
-        bool ok = true;
-
-        if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
-            ok = false;
-        }
-
-        if (llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: vocab does not have an EOS token, reranking will not work\n", __func__);
-            ok = false;
-        }
-
-        if (llama_vocab_sep(vocab) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
-            ok = false;
-        }
-
-        if (!ok) {
-            llama_model_free(model);
-
-            return iparams;
-        }
-    }
-
    auto cparams = common_context_params_to_llama(params);
 
    llama_context * lctx = llama_init_from_model(model, cparams);
@@ -936,7 +927,7 @@ struct common_init_result common_init_from_params(common_params & params) {
        return iparams;
    }
 
-    if (params.ctx_shift && !llama_kv_self_can_shift(lctx)) {
+    if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) {
        LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
        params.ctx_shift = false;
    }
@@ -968,6 +959,35 @@ struct common_init_result common_init_from_params(common_params & params) {
        }
    }
 
+    if (llama_pooling_type(lctx) == LLAMA_POOLING_TYPE_RANK) {
+        bool ok = true;
+
+        if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
+            ok = false;
+        }
+
+        bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
+        bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
+
+        if (!has_eos && !has_sep) {
+            LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
+            ok = false;
+        } else if (!has_eos) {
+            LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
+        } else if (!has_sep) {
+            LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
+            ok = false;
+        }
+
+        if (!ok) {
+            llama_free(lctx);
+            llama_model_free(model);
+
+            return iparams;
+        }
+    }
+
    // load and optionally apply lora adapters
    for (auto & la : params.lora_adapters) {
        llama_adapter_lora_ptr lora;
@@ -1043,7 +1063,7 @@ struct common_init_result common_init_from_params(common_params & params) {
        if (llama_model_has_decoder(model)) {
            llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
        }
-        llama_kv_self_clear(lctx);
+        llama_memory_clear(llama_get_memory(lctx), true);
        llama_synchronize(lctx);
        llama_perf_context_reset(lctx);
        llama_set_warmup(lctx, false);
@@ -1145,11 +1165,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
    cparams.op_offload = !params.no_op_offload;
    cparams.swa_full = params.swa_full;
 
-    if (params.reranking) {
-        cparams.embeddings = true;
-        cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
-    }
-
    cparams.type_k = params.cache_type_k;
    cparams.type_v = params.cache_type_v;
 
@@ -1282,6 +1297,9 @@ std::vector<llama_token> common_tokenize(
    int n_tokens = text.length() + 2 * add_special;
    std::vector<llama_token> result(n_tokens);
    n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+    if (n_tokens == std::numeric_limits<int32_t>::min()) {
+        throw std::runtime_error("Tokenization failed: input text too large, tokenization result exceeds int32_t limit");
+    }
    if (n_tokens < 0) {
        result.resize(-n_tokens);
        int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
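
Note: the regex_escape change above ("\\$0" -> "\\$&") fixes a real bug: in std::regex_replace's ECMAScript format strings, "$&" denotes the whole match, while "$0" has no special meaning and is copied literally. A standalone check (illustrative only, not part of the package):

    #include <iostream>
    #include <regex>
    #include <string>

    int main() {
        static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
        const std::string s = "a+b";
        std::cout << std::regex_replace(s, special_chars, "\\$0") << '\n'; // "a\$0b" - old behaviour, wrong
        std::cout << std::regex_replace(s, special_chars, "\\$&") << '\n'; // "a\+b"  - new behaviour, correct
    }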
package/cpp/common.h CHANGED
@@ -126,7 +126,7 @@ enum common_grammar_trigger_type {
    COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN,
    COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
-    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
+    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
 };
 
 struct common_grammar_trigger {
@@ -210,6 +210,9 @@ struct common_params_speculative {
    float p_split = 0.1f; // speculative decoding split probability
    float p_min = 0.75f;  // minimum speculative decoding probability (greedy)
 
+    lm_ggml_type cache_type_k = LM_GGML_TYPE_F16; // KV cache data type for the K
+    lm_ggml_type cache_type_v = LM_GGML_TYPE_F16; // KV cache data type for the V
+
    struct cpu_params cpuparams;
    struct cpu_params cpuparams_batch;
 
@@ -226,7 +229,8 @@ struct common_params_vocoder {
 
 enum common_reasoning_format {
    COMMON_REASONING_FORMAT_NONE,
-    COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`
+    COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
+    COMMON_REASONING_FORMAT_DEEPSEEK,        // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
 };
 
 struct common_params {
@@ -306,6 +310,7 @@ struct common_params {
    int32_t verbosity = 0;
    int32_t control_vector_layer_start = -1; // layer range for control vector
    int32_t control_vector_layer_end = -1;   // layer range for control vector
+    bool offline = false;
 
    int32_t ppl_stride = 0;      // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
    int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
@@ -368,7 +373,7 @@ struct common_params {
    int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
    std::string embd_out = "";  // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
    std::string embd_sep = "\n"; // separator of embeddings
-    bool reranking = false; // enable reranking support on server
+    std::string cls_sep = "\t"; // separator of classification sequences
 
    // server params
    int32_t port = 8080; // server listens on this network port
@@ -383,6 +388,7 @@ struct common_params {
    bool use_jinja = false; // NOLINT
    bool enable_chat_template = true;
    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+    int reasoning_budget = -1;
    bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
 
    std::vector<std::string> api_keys;
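
Note: a hedged illustration of the new common_params fields above; the field names come from the diff, while the `speculative` member and the exact semantics of `offline` / `reasoning_budget` are assumptions based on upstream llama.cpp:

    #include "common.h"

    static common_params make_params() {
        common_params params;
        params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; // extract reasoning_content, incl. streaming deltas
        params.reasoning_budget = -1;                               // assumed: -1 = no cap on thinking tokens
        params.offline          = true;                             // assumed: avoid network access (e.g. model downloads)
        params.speculative.cache_type_k = LM_GGML_TYPE_F16;         // KV cache types for the draft model
        params.speculative.cache_type_v = LM_GGML_TYPE_F16;
        return params;
    }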
package/cpp/ggml-backend-reg.cpp CHANGED
@@ -69,6 +69,9 @@
 #if defined(__clang__)
 #    pragma clang diagnostic push
 #    pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#elif defined(__GNUC__)
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 #endif
 
 namespace fs = std::filesystem;
@@ -91,6 +94,8 @@ static std::string path_str(const fs::path & path) {
 
 #if defined(__clang__)
 #    pragma clang diagnostic pop
+#elif defined(__GNUC__)
+#    pragma GCC diagnostic pop
 #endif
 
 #ifdef _WIN32
package/cpp/ggml-backend.cpp CHANGED
@@ -1340,7 +1340,10 @@ static bool lm_ggml_backend_sched_alloc_splits(lm_ggml_backend_sched_t sched) {
    // allocate graph
    if (backend_ids_changed || !lm_ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
        // the re-allocation may cause the split inputs to be moved to a different address
-        lm_ggml_backend_sched_synchronize(sched);
+        // synchronize without lm_ggml_backend_sched_synchronize to avoid changing cur_copy
+        for (int i = 0; i < sched->n_backends; i++) {
+            lm_ggml_backend_synchronize(sched->backends[i]);
+        }
 #ifndef NDEBUG
        LM_GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
 #endif
@@ -1564,7 +1567,6 @@ bool lm_ggml_backend_sched_alloc_graph(lm_ggml_backend_sched_t sched, struct lm_
 
    lm_ggml_backend_sched_split_graph(sched, graph);
 
-
    if (!lm_ggml_backend_sched_alloc_splits(sched)) {
        return false;
    }
@@ -1598,6 +1600,12 @@ void lm_ggml_backend_sched_synchronize(lm_ggml_backend_sched_t sched) {
    for (int i = 0; i < sched->n_backends; i++) {
        lm_ggml_backend_synchronize(sched->backends[i]);
    }
+    if (!sched->is_alloc) {
+        // if the graph is not already allocated, always use copy 0 after a synchronization
+        // this ensures that during generation the same copy is used every time,
+        // which avoids changes in the graph that could cause CUDA or other graphs to be disabled
+        sched->cur_copy = 0;
+    }
 }
 
 void lm_ggml_backend_sched_set_eval_callback(lm_ggml_backend_sched_t sched, lm_ggml_backend_sched_eval_callback callback, void * user_data) {
package/cpp/ggml-common.h CHANGED
@@ -1074,6 +1074,10 @@ LM_GGML_TABLE_BEGIN(uint32_t, iq3s_grid, 512)
    0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
 LM_GGML_TABLE_END()
 
+LM_GGML_TABLE_BEGIN(int8_t, kvalues_iq4nl, 16)
+    -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
+LM_GGML_TABLE_END()
+
 #define NGRID_IQ1S 2048
 #define IQ1S_DELTA 0.125f
 #define IQ1M_DELTA 0.125f
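
Note: kvalues_iq4nl is the 16-entry non-linear codebook used by the IQ4_NL / IQ4_XS quantization formats: each 4-bit index selects an int8_t codebook value, which is then multiplied by the block scale. A simplified sketch of decoding one packed byte (real kernels operate on whole blocks of packed nibbles):

    #include <cstdint>

    static const int8_t kvalues_iq4nl[16] = {
        -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
    };

    // dequantize the two 4-bit indices packed in one byte, given block scale d
    inline void dequant_iq4nl_pair(uint8_t packed, float d, float & lo, float & hi) {
        lo = d * kvalues_iq4nl[packed & 0x0F]; // low nibble
        hi = d * kvalues_iq4nl[packed >> 4];   // high nibble
    }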
package/cpp/ggml-cpu/amx/amx.cpp CHANGED
@@ -5,7 +5,7 @@
 #include "ggml-backend.h"
 #include "ggml-impl.h"
 #include "ggml-cpu.h"
-#include "ggml-cpu-traits.h"
+#include "traits.h"
 
 #if defined(__gnu_linux__)
 #include <sys/syscall.h>
package/cpp/ggml-cpu/amx/mmq.cpp CHANGED
@@ -8,7 +8,8 @@
 #include "mmq.h"
 #include "ggml-impl.h"
 #include "ggml-cpu-impl.h"
-#include "ggml-cpu-quants.h"
+#include "simd-mappings.h"
+#include "quants.h"
 #include "ggml-quants.h"
 #include <algorithm>
 #include <type_traits>
@@ -453,7 +454,7 @@ void quantize_row_q8_K_vnni(const float * RESTRICT x, void * RESTRICT vy, int64_
 
        // Quantize these floats
        const float iscale = 127.f / amax;
-        y[i].d = LM_GGML_FP32_TO_FP16(1 / iscale);
+        y[i].d = LM_GGML_CPU_FP32_TO_FP16(1 / iscale);
        const float id = ( amax != 0.0f ) ? iscale : 0.f;
        const __m512 vscale = _mm512_set1_ps(id);
 
@@ -1090,7 +1091,7 @@ struct acc_C<block_q8_0, block_q4_0, is_acc> {
        const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset)));
 
        for (int m = 0; m < nr; ++m) {
-            const __m512 vd1 = _mm512_set1_ps(LM_GGML_FP16_TO_FP32(A[m * lda].d));
+            const __m512 vd1 = _mm512_set1_ps(LM_GGML_CPU_FP16_TO_FP32(A[m * lda].d));
            const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
 
            __m512 vsum;
@@ -1113,8 +1114,8 @@ struct acc_C<block_q8_1, block_q4_1, is_acc> {
        const __m512 vm0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset + TILE_N * sizeof(lm_ggml_half))));
 
        for (int m = 0; m < nr; ++m) {
-            const __m512 vd1 = _mm512_set1_ps(LM_GGML_FP16_TO_FP32(A[m * lda].d));
-            const __m512 vs1 = _mm512_set1_ps(LM_GGML_FP16_TO_FP32(A[m * lda].s));
+            const __m512 vd1 = _mm512_set1_ps(LM_GGML_CPU_FP16_TO_FP32(A[m * lda].d));
+            const __m512 vs1 = _mm512_set1_ps(LM_GGML_CPU_FP16_TO_FP32(A[m * lda].s));
            const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
 
            __m512 vsum;
@@ -1137,7 +1138,7 @@ struct acc_C<block_q8_0, block_q8_0, is_acc> {
        const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset)));
 
        for (int m = 0; m < nr; ++m) {
-            const __m512 vd1 = _mm512_set1_ps(LM_GGML_FP16_TO_FP32(A[m * lda].d));
+            const __m512 vd1 = _mm512_set1_ps(LM_GGML_CPU_FP16_TO_FP32(A[m * lda].d));
            const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
 
            __m512 vsum;
@@ -1437,7 +1438,7 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q4_0, float, BLOCK_M, BLOCK_N, BLO
                va[k] = _mm512_set1_epi32(a_ptr[k]);
                vcomp = _mm512_dpbusd_epi32(vcomp, off, va[k]);
            }
-            vd1 = _mm512_set1_ps(LM_GGML_FP16_TO_FP32(A[0 * KB + i].d));
+            vd1 = _mm512_set1_ps(LM_GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
        }
 
        // load b
@@ -1498,8 +1499,8 @@ struct tinygemm_kernel_vnni<block_q8_1, block_q4_1, float, 1, BLOCK_N, BLOCK_K>
            for (int k = 0; k < 8; ++k) {
                va[k] = _mm512_set1_epi32(a_ptr[k]);
            }
-            vd1 = _mm512_set1_ps(LM_GGML_FP16_TO_FP32(A[0 * KB + i].d));
-            vs1 = _mm512_set1_ps(LM_GGML_FP16_TO_FP32(A[0 * KB + i].s));
+            vd1 = _mm512_set1_ps(LM_GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
+            vs1 = _mm512_set1_ps(LM_GGML_CPU_FP16_TO_FP32(A[0 * KB + i].s));
        }
 
        // load b
@@ -1571,7 +1572,7 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q8_0, float, BLOCK_M, BLOCK_N, BLO
                va[k] = _mm512_set1_epi32(a_ptr[k]);
                va[k] = _mm512_add_epi8(va[k], off);
            }
-            vd1 = _mm512_set1_ps(LM_GGML_FP16_TO_FP32(A[0 * KB + i].d));
+            vd1 = _mm512_set1_ps(LM_GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
        }
 
        // load b
package/cpp/ggml-cpu/arch/arm/cpu-feats.cpp ADDED
@@ -0,0 +1,94 @@
+#include "ggml-backend-impl.h"
+
+#if defined(__aarch64__)
+
+#if defined(__linux__)
+#include <sys/auxv.h>
+#elif defined(__APPLE__)
+#include <sys/sysctl.h>
+#endif
+
+#if !defined(HWCAP2_I8MM)
+#define HWCAP2_I8MM (1 << 13)
+#endif
+
+#if !defined(HWCAP2_SME)
+#define HWCAP2_SME (1 << 23)
+#endif
+
+struct aarch64_features {
+    // has_neon not needed, aarch64 has NEON guaranteed
+    bool has_dotprod = false;
+    bool has_fp16_va = false;
+    bool has_sve = false;
+    bool has_sve2 = false;
+    bool has_i8mm = false;
+    bool has_sme = false;
+
+    aarch64_features() {
+#if defined(__linux__)
+        uint32_t hwcap = getauxval(AT_HWCAP);
+        uint32_t hwcap2 = getauxval(AT_HWCAP2);
+
+        has_dotprod = !!(hwcap & HWCAP_ASIMDDP);
+        has_fp16_va = !!(hwcap & HWCAP_FPHP);
+        has_sve = !!(hwcap & HWCAP_SVE);
+        has_sve2 = !!(hwcap2 & HWCAP2_SVE2);
+        has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
+        has_sme = !!(hwcap2 & HWCAP2_SME);
+#elif defined(__APPLE__)
+        int oldp = 0;
+        size_t size = sizeof(oldp);
+
+        if (sysctlbyname("hw.optional.arm.FEAT_DotProd", &oldp, &size, NULL, 0) == 0) {
+            has_dotprod = static_cast<bool>(oldp);
+        }
+
+        if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) == 0) {
+            has_i8mm = static_cast<bool>(oldp);
+        }
+
+        if (sysctlbyname("hw.optional.arm.FEAT_SME", &oldp, &size, NULL, 0) == 0) {
+            has_sme = static_cast<bool>(oldp);
+        }
+
+        // Apple apparently does not implement SVE yet
+#endif
+    }
+};
+
+static int lm_ggml_backend_cpu_aarch64_score() {
+    int score = 1;
+    aarch64_features af;
+
+#ifdef LM_GGML_USE_DOTPROD
+    if (!af.has_dotprod) { return 0; }
+    score += 1<<1;
+#endif
+#ifdef LM_GGML_USE_FP16_VECTOR_ARITHMETIC
+    if (!af.has_fp16_va) { return 0; }
+    score += 1<<2;
+#endif
+#ifdef LM_GGML_USE_SVE
+    if (!af.has_sve) { return 0; }
+    score += 1<<3;
+#endif
+#ifdef LM_GGML_USE_MATMUL_INT8
+    if (!af.has_i8mm) { return 0; }
+    score += 1<<4;
+#endif
+#ifdef LM_GGML_USE_SVE2
+    if (!af.has_sve2) { return 0; }
+    score += 1<<5;
+#endif
+#ifdef LM_GGML_USE_SME
+    if (!af.has_sme) { return 0; }
+    score += 1<<6;
+#endif
+
+    return score;
+}
+
+LM_GGML_BACKEND_DL_SCORE_IMPL(lm_ggml_backend_cpu_aarch64_score)
+
+#endif // defined(__aarch64__)
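
Note: the score function above returns 0 when the build requires a feature the running CPU lacks (so that variant is rejected outright), and otherwise sums power-of-two weights, so a build compiled for more matching features always outranks a lesser one. This is presumably how the per-feature Android libraries in the file list (librnllama_v8_2_dotprod.so, librnllama_v8_2_dotprod_i8mm.so, ...) can be ranked at load time. A small standalone example mirroring the weighting (hypothetical helper, not in the package):

    #include <cstdio>

    // score a variant that optionally requires dotprod / i8mm,
    // on a CPU described by (cpu_dotprod, cpu_i8mm)
    static int score_variant(bool needs_dotprod, bool needs_i8mm,
                             bool cpu_dotprod, bool cpu_i8mm) {
        int score = 1;
        if (needs_dotprod) { if (!cpu_dotprod) return 0; score += 1 << 1; }
        if (needs_i8mm)    { if (!cpu_i8mm)    return 0; score += 1 << 4; }
        return score;
    }

    int main() {
        // CPU with dotprod but without i8mm:
        std::printf("%d\n", score_variant(false, false, true, false)); // 1 (baseline build)
        std::printf("%d\n", score_variant(true,  false, true, false)); // 3 (preferred)
        std::printf("%d\n", score_variant(true,  true,  true, false)); // 0 (rejected)
    }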