cui-llama.rn 1.7.4 → 1.7.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (276)
  1. package/README.md +217 -17
  2. package/android/src/main/CMakeLists.txt +34 -15
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +79 -5
  4. package/android/src/main/java/com/rnllama/RNLlama.java +237 -0
  5. package/android/src/main/jni.cpp +213 -14
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  11. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  14. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +35 -0
  15. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +34 -0
  16. package/cpp/README.md +1 -1
  17. package/cpp/chat-parser.cpp +385 -0
  18. package/cpp/chat-parser.h +120 -0
  19. package/cpp/chat.cpp +726 -596
  20. package/cpp/chat.h +71 -6
  21. package/cpp/common.cpp +56 -38
  22. package/cpp/common.h +9 -3
  23. package/cpp/ggml-backend-reg.cpp +5 -0
  24. package/cpp/ggml-backend.cpp +10 -2
  25. package/cpp/ggml-common.h +4 -0
  26. package/cpp/ggml-cpu/amx/amx.cpp +1 -1
  27. package/cpp/ggml-cpu/amx/mmq.cpp +11 -10
  28. package/cpp/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  29. package/cpp/ggml-cpu/arch/arm/quants.c +4114 -0
  30. package/cpp/ggml-cpu/arch/arm/repack.cpp +2163 -0
  31. package/cpp/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  32. package/cpp/ggml-cpu/arch/x86/quants.c +4311 -0
  33. package/cpp/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
  34. package/cpp/ggml-cpu/arch-fallback.h +184 -0
  35. package/cpp/ggml-cpu/common.h +4 -3
  36. package/cpp/ggml-cpu/ggml-cpu-impl.h +21 -16
  37. package/cpp/ggml-cpu/ggml-cpu.c +123 -104
  38. package/cpp/ggml-cpu/ggml-cpu.cpp +11 -8
  39. package/cpp/ggml-cpu/ops.cpp +330 -148
  40. package/cpp/ggml-cpu/ops.h +1 -0
  41. package/cpp/ggml-cpu/quants.c +1158 -0
  42. package/cpp/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  43. package/cpp/ggml-cpu/repack.cpp +1571 -0
  44. package/cpp/ggml-cpu/repack.h +98 -0
  45. package/cpp/ggml-cpu/simd-mappings.h +330 -38
  46. package/cpp/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  47. package/cpp/ggml-cpu/vec.cpp +87 -18
  48. package/cpp/ggml-cpu/vec.h +249 -94
  49. package/cpp/ggml-cpu.h +1 -0
  50. package/cpp/ggml-impl.h +63 -183
  51. package/cpp/ggml-llama-sim.metallib +0 -0
  52. package/cpp/ggml-llama.metallib +0 -0
  53. package/cpp/ggml-metal.m +152 -45
  54. package/cpp/ggml-quants.c +0 -2
  55. package/cpp/ggml.c +61 -21
  56. package/cpp/ggml.h +22 -3
  57. package/cpp/gguf.cpp +24 -3
  58. package/cpp/json-partial.cpp +256 -0
  59. package/cpp/json-partial.h +38 -0
  60. package/cpp/json-schema-to-grammar.cpp +5 -47
  61. package/cpp/json-schema-to-grammar.h +4 -4
  62. package/cpp/llama-arch.cpp +153 -3
  63. package/cpp/llama-arch.h +27 -1
  64. package/cpp/llama-batch.cpp +741 -272
  65. package/cpp/llama-batch.h +112 -54
  66. package/cpp/llama-chat.cpp +30 -8
  67. package/cpp/llama-chat.h +1 -0
  68. package/cpp/llama-context.cpp +524 -339
  69. package/cpp/llama-context.h +38 -17
  70. package/cpp/llama-cparams.cpp +4 -0
  71. package/cpp/llama-cparams.h +2 -0
  72. package/cpp/llama-grammar.cpp +12 -2
  73. package/cpp/llama-graph.cpp +431 -356
  74. package/cpp/llama-graph.h +126 -58
  75. package/cpp/llama-hparams.cpp +10 -2
  76. package/cpp/llama-hparams.h +19 -2
  77. package/cpp/llama-kv-cache-unified-iswa.cpp +279 -0
  78. package/cpp/llama-kv-cache-unified-iswa.h +128 -0
  79. package/cpp/llama-kv-cache-unified.cpp +1841 -0
  80. package/cpp/llama-kv-cache-unified.h +303 -0
  81. package/cpp/llama-kv-cells.h +439 -0
  82. package/cpp/llama-memory-hybrid.cpp +246 -0
  83. package/cpp/llama-memory-hybrid.h +138 -0
  84. package/cpp/llama-memory-recurrent.cpp +1112 -0
  85. package/cpp/llama-memory-recurrent.h +183 -0
  86. package/cpp/llama-memory.cpp +41 -0
  87. package/cpp/llama-memory.h +86 -5
  88. package/cpp/llama-mmap.cpp +1 -1
  89. package/cpp/llama-model-loader.cpp +42 -17
  90. package/cpp/llama-model-saver.cpp +1 -0
  91. package/cpp/llama-model.cpp +1639 -513
  92. package/cpp/llama-model.h +26 -0
  93. package/cpp/llama-sampling.cpp +2 -2
  94. package/cpp/llama-vocab.cpp +65 -28
  95. package/cpp/llama-vocab.h +1 -0
  96. package/cpp/llama.cpp +11 -7
  97. package/cpp/llama.h +150 -42
  98. package/cpp/minja/chat-template.hpp +1 -1
  99. package/cpp/minja/minja.hpp +1 -1
  100. package/cpp/{json.hpp → nlohmann/json.hpp} +3027 -2267
  101. package/cpp/nlohmann/json_fwd.hpp +187 -0
  102. package/cpp/regex-partial.cpp +204 -0
  103. package/cpp/regex-partial.h +56 -0
  104. package/cpp/rn-llama.cpp +646 -35
  105. package/cpp/rn-llama.h +32 -1
  106. package/cpp/rn-tts.h +39 -0
  107. package/cpp/sampling.cpp +7 -8
  108. package/cpp/tools/mtmd/clip-impl.h +5 -0
  109. package/cpp/tools/mtmd/clip.cpp +572 -436
  110. package/cpp/tools/mtmd/clip.h +14 -4
  111. package/cpp/tools/mtmd/mtmd-audio.cpp +0 -86
  112. package/cpp/tools/mtmd/mtmd-audio.h +2 -17
  113. package/cpp/tools/mtmd/mtmd-helper.cpp +175 -12
  114. package/cpp/tools/mtmd/mtmd-helper.h +91 -0
  115. package/cpp/tools/mtmd/mtmd.cpp +368 -248
  116. package/cpp/tools/mtmd/mtmd.h +6 -70
  117. package/cpp/unicode.cpp +5 -0
  118. package/ios/CMakeLists.txt +26 -6
  119. package/ios/RNLlama.h +1 -1
  120. package/ios/RNLlama.mm +153 -3
  121. package/ios/RNLlamaContext.h +9 -1
  122. package/ios/RNLlamaContext.mm +112 -9
  123. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
  124. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +71 -6
  125. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +9 -3
  126. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
  127. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
  128. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
  129. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +22 -3
  130. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-partial.h +38 -0
  131. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  132. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
  133. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
  134. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
  135. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +38 -17
  136. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
  137. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
  138. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
  139. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  140. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  141. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  142. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  143. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  144. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
  145. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +26 -0
  146. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
  147. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +150 -42
  148. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  149. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
  150. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/{json.hpp → nlohmann/json.hpp} +3027 -2267
  151. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  152. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
  153. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
  154. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
  155. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  156. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
  157. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
  158. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
  159. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
  160. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
  161. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
  162. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
  163. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
  164. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
  165. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  166. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
  167. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
  168. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
  169. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
  170. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
  171. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
  172. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
  173. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  174. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  175. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  176. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  177. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  178. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
  179. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
  180. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
  181. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
  182. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  183. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
  184. package/ios/rnllama.xcframework/{tvos-arm64/rnllama.framework/Headers → ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
  185. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  186. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
  187. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
  188. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
  189. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  190. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  191. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
  192. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +71 -6
  193. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +9 -3
  194. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
  195. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
  196. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
  197. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +22 -3
  198. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-partial.h +38 -0
  199. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  200. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
  201. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
  202. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
  203. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +38 -17
  204. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
  205. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
  206. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
  207. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  208. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  209. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  210. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  211. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  212. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
  213. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +26 -0
  214. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
  215. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +150 -42
  216. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  217. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
  218. package/ios/rnllama.xcframework/{ios-arm64_x86_64-simulator/rnllama.framework/Headers → tvos-arm64/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
  219. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  220. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
  221. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
  222. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
  223. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  224. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
  225. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
  226. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
  227. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
  228. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
  229. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
  230. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
  231. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
  232. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
  233. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  234. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
  235. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
  236. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
  237. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
  238. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
  239. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
  240. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
  241. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  242. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  243. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  244. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  245. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  246. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
  247. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
  248. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
  249. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
  250. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  251. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
  252. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json.hpp +25526 -0
  253. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  254. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
  255. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
  256. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
  257. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  258. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  259. package/jest/mock.js +24 -0
  260. package/package.json +1 -1
  261. package/src/NativeRNLlama.ts +46 -2
  262. package/src/index.ts +105 -1
  263. package/cpp/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  264. package/cpp/ggml-cpu/ggml-cpu-quants.c +0 -13326
  265. package/cpp/ggml-cpu/sgemm.cpp +0 -3544
  266. package/cpp/ggml-cpu/sgemm.h +0 -14
  267. package/cpp/llama-kv-cache.cpp +0 -2827
  268. package/cpp/llama-kv-cache.h +0 -515
  269. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  270. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  271. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  272. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json.hpp +0 -24766
  273. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  274. /package/cpp/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
  275. /package/cpp/tools/mtmd/{miniaudio.h → miniaudio/miniaudio.h} +0 -0
  276. /package/cpp/tools/mtmd/{stb_image.h → stb/stb_image.h} +0 -0
@@ -0,0 +1,120 @@
1
+ #pragma once
2
+
3
+ #include "chat.h"
4
+ #include "json-partial.h"
5
+ #include "regex-partial.h"
6
+
7
+ #include "nlohmann/json.hpp"
8
+
9
+ #include <optional>
10
+ #include <string>
11
+ #include <vector>
12
+
13
+ class common_chat_msg_partial_exception : public std::runtime_error {
14
+ public:
15
+ common_chat_msg_partial_exception(const std::string & message) : std::runtime_error(message) {}
16
+ };
17
+
18
+ class common_chat_msg_parser {
19
+ std::string input_;
20
+ bool is_partial_;
21
+ common_chat_syntax syntax_;
22
+ std::string healing_marker_;
23
+
24
+ size_t pos_ = 0;
25
+ common_chat_msg result_;
26
+
27
+ public:
28
+ common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
29
+ const std::string & input() const { return input_; }
30
+ size_t pos() const { return pos_; }
31
+ const std::string & healing_marker() const { return healing_marker_; }
32
+ const bool & is_partial() const { return is_partial_; }
33
+ const common_chat_msg & result() const { return result_; }
34
+ const common_chat_syntax & syntax() const { return syntax_; }
35
+
36
+ void move_to(size_t pos) {
37
+ if (pos > input_.size()) {
38
+ throw std::runtime_error("Invalid position!");
39
+ }
40
+ pos_ = pos;
41
+ }
42
+ void move_back(size_t n) {
43
+ if (pos_ < n) {
44
+ throw std::runtime_error("Can't move back that far!");
45
+ }
46
+ pos_ -= n;
47
+ }
48
+
49
+ // Get the substring of the input at the given range
50
+ std::string str(const common_string_range & rng) const;
51
+
52
+ // Appends to the result.content field
53
+ void add_content(const std::string & content);
54
+
55
+ // Appends to the result.reasoning_content field
56
+ void add_reasoning_content(const std::string & reasoning_content);
57
+
58
+ // Adds a tool call to the result. If the tool call is too incomplete (e.g. name empty), it won't add anything.
59
+ bool add_tool_call(const std::string & name, const std::string & id, const std::string & arguments);
60
+
61
+ // Adds a tool call using the "name", "id" and "arguments" fields of the json object
62
+ bool add_tool_call(const nlohmann::ordered_json & tool_call);
63
+
64
+ // Adds an array of tool calls using their "name", "id" and "arguments" fields.
65
+ bool add_tool_calls(const nlohmann::ordered_json & arr);
66
+
67
+ void finish();
68
+
69
+ bool consume_spaces();
70
+
71
+ void consume_literal(const std::string & literal);
72
+
73
+ bool try_parse_reasoning(const std::string & start_think, const std::string & end_think);
74
+
75
+ std::string consume_rest();
76
+
77
+ struct find_regex_result {
78
+ std::string prelude;
79
+ std::vector<common_string_range> groups;
80
+ };
81
+
82
+ std::optional<find_regex_result> try_find_regex(const common_regex & regex, size_t from = std::string::npos, bool add_prelude_to_content = true);
83
+
84
+ bool try_consume_literal(const std::string & literal);
85
+
86
+ std::optional<find_regex_result> try_find_literal(const std::string & literal);
87
+
88
+ find_regex_result consume_regex(const common_regex & regex);
89
+
90
+ std::optional<find_regex_result> try_consume_regex(const common_regex & regex);
91
+
92
+ std::optional<common_json> try_consume_json();
93
+ common_json consume_json();
94
+
95
+ struct consume_json_result {
96
+ nlohmann::ordered_json value;
97
+ bool is_partial;
98
+ };
99
+
100
+ /*
101
+ Consume (possibly partial) json and converts specific subtrees to (possibly truncated) JSON strings.
102
+
103
+ By default, object keys can't be truncated, nor can string values (their corresponding key is removed,
104
+ e.g. `{"foo": "bar", "baz": "b` -> `{"foo": "bar"}`
105
+
106
+ But one can allow subpaths to be kept truncated, and possibly json-dumped to truncated json strings
107
+ - with `content_paths={{"foo"}}` -> `{"foo": "b` -> {"foo": "b"}`
108
+ - with `args_paths={{"foo"}}` -> `{"foo": {"b` -> `{"foo": "{b"}`
109
+ */
110
+ consume_json_result consume_json_with_dumped_args(
111
+ const std::vector<std::vector<std::string>> & args_paths = {},
112
+ const std::vector<std::vector<std::string>> & content_paths = {}
113
+ );
114
+ std::optional<consume_json_result> try_consume_json_with_dumped_args(
115
+ const std::vector<std::vector<std::string>> & args_paths = {},
116
+ const std::vector<std::vector<std::string>> & content_paths = {}
117
+ );
118
+
119
+ void clear_tools();
120
+ };
@@ -3,6 +3,7 @@
3
3
  #pragma once
4
4
 
5
5
  #include "common.h"
6
+ #include <functional>
6
7
  #include <chrono>
7
8
  #include <string>
8
9
  #include <vector>
@@ -21,11 +22,19 @@ struct common_chat_tool_call {
21
22
  std::string name;
22
23
  std::string arguments;
23
24
  std::string id;
25
+
26
+ bool operator==(const common_chat_tool_call & other) const {
27
+ return name == other.name && arguments == other.arguments && id == other.id;
28
+ }
24
29
  };
25
30
 
26
31
  struct common_chat_msg_content_part {
27
32
  std::string type;
28
33
  std::string text;
34
+
35
+ bool operator==(const common_chat_msg_content_part & other) const {
36
+ return type == other.type && text == other.text;
37
+ }
29
38
  };
30
39
 
31
40
  struct common_chat_msg {
@@ -36,6 +45,51 @@ struct common_chat_msg {
36
45
  std::string reasoning_content;
37
46
  std::string tool_name;
38
47
  std::string tool_call_id;
48
+
49
+ template <class T> T to_json_oaicompat() const;
50
+
51
+ bool empty() const {
52
+ return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() && tool_name.empty() && tool_call_id.empty();
53
+ }
54
+ void ensure_tool_call_ids_set(std::vector<std::string> & ids_cache, const std::function<std::string()> & gen_tool_call_id) {
55
+ for (auto i = 0u; i < tool_calls.size(); i++) {
56
+ if (ids_cache.size() <= i) {
57
+ auto id = tool_calls[i].id;
58
+ if (id.empty()) {
59
+ id = gen_tool_call_id();
60
+ }
61
+ ids_cache.push_back(id);
62
+ }
63
+ tool_calls[i].id = ids_cache[i];
64
+ }
65
+ }
66
+ bool operator==(const common_chat_msg & other) const {
67
+ return role == other.role
68
+ && content == other.content
69
+ && content_parts == other.content_parts
70
+ && tool_calls == other.tool_calls
71
+ && reasoning_content == other.reasoning_content
72
+ && tool_name == other.tool_name
73
+ && tool_call_id == other.tool_call_id;
74
+ }
75
+ bool operator!=(const common_chat_msg & other) const {
76
+ return !(*this == other);
77
+ }
78
+ };
79
+
80
+ struct common_chat_msg_diff {
81
+ std::string reasoning_content_delta;
82
+ std::string content_delta;
83
+ size_t tool_call_index = std::string::npos;
84
+ common_chat_tool_call tool_call_delta;
85
+
86
+ static std::vector<common_chat_msg_diff> compute_diffs(const common_chat_msg & previous_msg, const common_chat_msg & new_msg);
87
+
88
+ bool operator==(const common_chat_msg_diff & other) const {
89
+ return content_delta == other.content_delta
90
+ && tool_call_index == other.tool_call_index
91
+ && tool_call_delta == other.tool_call_delta;
92
+ }
39
93
  };
40
94
 
41
95
  struct common_chat_tool {
@@ -57,14 +111,11 @@ enum common_chat_format {
57
111
  COMMON_CHAT_FORMAT_LLAMA_3_X,
58
112
  COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
59
113
  COMMON_CHAT_FORMAT_DEEPSEEK_R1,
60
- COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING,
61
114
  COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
62
115
  COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
63
116
  COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
64
117
  COMMON_CHAT_FORMAT_HERMES_2_PRO,
65
- COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING,
66
118
  COMMON_CHAT_FORMAT_COMMAND_R7B,
67
- COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING,
68
119
 
69
120
  COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
70
121
  };
@@ -79,7 +130,8 @@ struct common_chat_templates_inputs {
79
130
  std::vector<common_chat_tool> tools;
80
131
  common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
81
132
  bool parallel_tool_calls = false;
82
- bool extract_reasoning = true;
133
+ common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
134
+ bool enable_thinking = true;
83
135
  std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
84
136
  };
85
137
 
@@ -88,11 +140,21 @@ struct common_chat_params {
88
140
  std::string prompt;
89
141
  std::string grammar;
90
142
  bool grammar_lazy = false;
143
+ bool thinking_forced_open = false;
91
144
  std::vector<common_grammar_trigger> grammar_triggers;
92
145
  std::vector<std::string> preserved_tokens;
93
146
  std::vector<std::string> additional_stops;
94
147
  };
95
148
 
149
+ struct common_chat_syntax {
150
+ common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
151
+ common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
152
+ // Whether reasoning_content should be inlined in the content (e.g. for reasoning_format=deepseek in stream mode)
153
+ bool reasoning_in_content = false;
154
+ bool thinking_forced_open = false;
155
+ bool parse_tool_calls = true;
156
+ };
157
+
96
158
  // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
97
159
  bool common_chat_verify_template(const std::string & tmpl, bool use_jinja);
98
160
 
@@ -129,8 +191,9 @@ std::string common_chat_format_example(
129
191
  const struct common_chat_templates * tmpls,
130
192
  bool use_jinja);
131
193
 
132
- std::string common_chat_format_name(common_chat_format format);
133
- common_chat_msg common_chat_parse( const std::string & input, common_chat_format format);
194
+ const char* common_chat_format_name(common_chat_format format);
195
+ const char* common_reasoning_format_name(common_reasoning_format format);
196
+ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
134
197
 
135
198
  common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
136
199
 
@@ -143,3 +206,5 @@ template <class T> T common_chat_msgs_to_json_oaicompat(const std::vector<common
143
206
  // T can be std::string containing JSON or nlohmann::ordered_json
144
207
  template <class T> std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const T & tools);
145
208
  template <class T> T common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools);
209
+
210
+ template <class T> T common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff);
@@ -126,7 +126,7 @@ enum common_grammar_trigger_type {
126
126
  COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN,
127
127
  COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
128
128
  COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
129
- COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
129
+ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
130
130
  };
131
131
 
132
132
  struct common_grammar_trigger {
@@ -210,6 +210,9 @@ struct common_params_speculative {
210
210
  float p_split = 0.1f; // speculative decoding split probability
211
211
  float p_min = 0.75f; // minimum speculative decoding probability (greedy)
212
212
 
213
+ lm_ggml_type cache_type_k = LM_GGML_TYPE_F16; // KV cache data type for the K
214
+ lm_ggml_type cache_type_v = LM_GGML_TYPE_F16; // KV cache data type for the V
215
+
213
216
  struct cpu_params cpuparams;
214
217
  struct cpu_params cpuparams_batch;
215
218
 
@@ -226,7 +229,8 @@ struct common_params_vocoder {
226
229
 
227
230
  enum common_reasoning_format {
228
231
  COMMON_REASONING_FORMAT_NONE,
229
- COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`
232
+ COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
233
+ COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
230
234
  };
231
235
 
232
236
  struct common_params {
@@ -306,6 +310,7 @@ struct common_params {
306
310
  int32_t verbosity = 0;
307
311
  int32_t control_vector_layer_start = -1; // layer range for control vector
308
312
  int32_t control_vector_layer_end = -1; // layer range for control vector
313
+ bool offline = false;
309
314
 
310
315
  int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
311
316
  int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
@@ -368,7 +373,7 @@ struct common_params {
368
373
  int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
369
374
  std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
370
375
  std::string embd_sep = "\n"; // separator of embeddings
371
- bool reranking = false; // enable reranking support on server
376
+ std::string cls_sep = "\t"; // separator of classification sequences
372
377
 
373
378
  // server params
374
379
  int32_t port = 8080; // server listens on this network port
@@ -383,6 +388,7 @@ struct common_params {
383
388
  bool use_jinja = false; // NOLINT
384
389
  bool enable_chat_template = true;
385
390
  common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
391
+ int reasoning_budget = -1;
386
392
  bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
387
393
 
388
394
  std::vector<std::string> api_keys;
@@ -1074,6 +1074,10 @@ LM_GGML_TABLE_BEGIN(uint32_t, iq3s_grid, 512)
1074
1074
  0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
1075
1075
  LM_GGML_TABLE_END()
1076
1076
 
1077
+ LM_GGML_TABLE_BEGIN(int8_t, kvalues_iq4nl, 16)
1078
+ -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
1079
+ LM_GGML_TABLE_END()
1080
+
1077
1081
  #define NGRID_IQ1S 2048
1078
1082
  #define IQ1S_DELTA 0.125f
1079
1083
  #define IQ1M_DELTA 0.125f
@@ -101,6 +101,7 @@ extern "C" {
101
101
  LM_GGML_BACKEND_API int lm_ggml_cpu_has_riscv_v (void);
102
102
  LM_GGML_BACKEND_API int lm_ggml_cpu_has_vsx (void);
103
103
  LM_GGML_BACKEND_API int lm_ggml_cpu_has_vxe (void);
104
+ LM_GGML_BACKEND_API int lm_ggml_cpu_has_nnpa (void);
104
105
  LM_GGML_BACKEND_API int lm_ggml_cpu_has_wasm_simd (void);
105
106
  LM_GGML_BACKEND_API int lm_ggml_cpu_has_llamafile (void);
106
107
 
@@ -32,6 +32,8 @@
32
32
  extern "C" {
33
33
  #endif
34
34
 
35
+ void lm_ggml_print_backtrace(void);
36
+
35
37
  #ifndef MIN
36
38
  # define MIN(a, b) ((a) < (b) ? (a) : (b))
37
39
  #endif
@@ -315,203 +317,81 @@ struct lm_ggml_cgraph lm_ggml_graph_view(struct lm_ggml_cgraph * cgraph, int i0,
315
317
  LM_GGML_API void * lm_ggml_aligned_malloc(size_t size);
316
318
  LM_GGML_API void lm_ggml_aligned_free(void * ptr, size_t size);
317
319
 
318
- // FP16 to FP32 conversion
319
-
320
- // 16-bit float
321
- // on Arm, we use __fp16
322
- // on x86, we use uint16_t
323
- //
324
- // for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616
325
- // for MUSA compilers , we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843
326
- //
327
- #if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
328
- #define LM_GGML_COMPUTE_FP16_TO_FP32(x) lm_ggml_compute_fp16_to_fp32(x)
329
- #define LM_GGML_COMPUTE_FP32_TO_FP16(x) lm_ggml_compute_fp32_to_fp16(x)
330
-
331
- #define LM_GGML_FP16_TO_FP32(x) lm_ggml_compute_fp16_to_fp32(x)
332
-
333
- static inline float lm_ggml_compute_fp16_to_fp32(lm_ggml_fp16_t h) {
334
- __fp16 tmp;
335
- memcpy(&tmp, &h, sizeof(lm_ggml_fp16_t));
336
- return (float)tmp;
337
- }
338
-
339
- static inline lm_ggml_fp16_t lm_ggml_compute_fp32_to_fp16(float f) {
340
- lm_ggml_fp16_t res;
341
- __fp16 tmp = f;
342
- memcpy(&res, &tmp, sizeof(lm_ggml_fp16_t));
343
- return res;
344
- }
345
-
346
- #elif defined(__F16C__)
347
-
348
- #ifdef _MSC_VER
349
- #define LM_GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
350
- #define LM_GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
351
- #else
352
- #define LM_GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
353
- #define LM_GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
354
- #endif
355
-
356
- #elif defined(__POWER9_VECTOR__)
357
-
358
- #define LM_GGML_COMPUTE_FP16_TO_FP32(x) lm_ggml_compute_fp16_to_fp32(x)
359
- #define LM_GGML_COMPUTE_FP32_TO_FP16(x) lm_ggml_compute_fp32_to_fp16(x)
360
- /* the inline asm below is about 12% faster than the lookup method */
361
- #define LM_GGML_FP16_TO_FP32(x) LM_GGML_COMPUTE_FP16_TO_FP32(x)
362
- #define LM_GGML_FP32_TO_FP16(x) LM_GGML_COMPUTE_FP32_TO_FP16(x)
363
-
364
- static inline float lm_ggml_compute_fp16_to_fp32(lm_ggml_fp16_t h) {
365
- float f;
366
- double d;
367
- __asm__(
368
- "mtfprd %0,%2\n"
369
- "xscvhpdp %0,%0\n"
370
- "frsp %1,%0\n" :
371
- /* temp */ "=d"(d),
372
- /* out */ "=f"(f):
373
- /* in */ "r"(h));
374
- return f;
375
- }
376
-
377
- static inline lm_ggml_fp16_t lm_ggml_compute_fp32_to_fp16(float f) {
378
- double d;
379
- lm_ggml_fp16_t r;
380
- __asm__( /* xscvdphp can work on double or single precision */
381
- "xscvdphp %0,%2\n"
382
- "mffprd %1,%0\n" :
383
- /* temp */ "=d"(d),
384
- /* out */ "=r"(r):
385
- /* in */ "f"(f));
386
- return r;
387
- }
388
-
389
- #elif defined(__riscv) && defined(LM_GGML_RV_ZFH)
320
+ // FP16 <-> FP32
321
+ // ref: https://github.com/Maratyszcza/FP16
390
322
 
391
- static inline float lm_ggml_compute_fp16_to_fp32(lm_ggml_fp16_t h) {
392
- float f;
393
- __asm__(
394
- "fmv.h.x %[f], %[h]\n\t"
395
- "fcvt.s.h %[f], %[f]"
396
- : [f] "=&f" (f)
397
- : [h] "r" (h)
398
- );
399
- return f;
400
- }
323
+ static inline float fp32_from_bits(uint32_t w) {
324
+ union {
325
+ uint32_t as_bits;
326
+ float as_value;
327
+ } fp32;
328
+ fp32.as_bits = w;
329
+ return fp32.as_value;
330
+ }
401
331
 
402
- static inline lm_ggml_fp16_t lm_ggml_compute_fp32_to_fp16(float f) {
403
- lm_ggml_fp16_t res;
404
- __asm__(
405
- "fcvt.h.s %[f], %[f]\n\t"
406
- "fmv.x.h %[h], %[f]"
407
- : [h] "=&r" (res)
408
- : [f] "f" (f)
409
- );
410
- return res;
411
- }
332
+ static inline uint32_t fp32_to_bits(float f) {
333
+ union {
334
+ float as_value;
335
+ uint32_t as_bits;
336
+ } fp32;
337
+ fp32.as_value = f;
338
+ return fp32.as_bits;
339
+ }
412
340
 
413
- #define LM_GGML_COMPUTE_FP16_TO_FP32(x) lm_ggml_compute_fp16_to_fp32(x)
414
- #define LM_GGML_COMPUTE_FP32_TO_FP16(x) lm_ggml_compute_fp32_to_fp16(x)
415
- #define LM_GGML_FP16_TO_FP32(x) LM_GGML_COMPUTE_FP16_TO_FP32(x)
416
- #define LM_GGML_FP32_TO_FP16(x) LM_GGML_COMPUTE_FP32_TO_FP16(x)
341
+ static inline float lm_ggml_compute_fp16_to_fp32(lm_ggml_fp16_t h) {
342
+ const uint32_t w = (uint32_t) h << 16;
343
+ const uint32_t sign = w & UINT32_C(0x80000000);
344
+ const uint32_t two_w = w + w;
417
345
 
346
+ const uint32_t exp_offset = UINT32_C(0xE0) << 23;
347
+ #if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L)
348
+ const float exp_scale = 0x1.0p-112f;
418
349
  #else
350
+ const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
351
+ #endif
352
+ const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
419
353
 
420
- // FP16 <-> FP32
421
- // ref: https://github.com/Maratyszcza/FP16
422
-
423
- static inline float fp32_from_bits(uint32_t w) {
424
- union {
425
- uint32_t as_bits;
426
- float as_value;
427
- } fp32;
428
- fp32.as_bits = w;
429
- return fp32.as_value;
430
- }
431
-
432
- static inline uint32_t fp32_to_bits(float f) {
433
- union {
434
- float as_value;
435
- uint32_t as_bits;
436
- } fp32;
437
- fp32.as_value = f;
438
- return fp32.as_bits;
439
- }
440
-
441
- static inline float lm_ggml_compute_fp16_to_fp32(lm_ggml_fp16_t h) {
442
- const uint32_t w = (uint32_t) h << 16;
443
- const uint32_t sign = w & UINT32_C(0x80000000);
444
- const uint32_t two_w = w + w;
445
-
446
- const uint32_t exp_offset = UINT32_C(0xE0) << 23;
447
- #if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L)
448
- const float exp_scale = 0x1.0p-112f;
449
- #else
450
- const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
451
- #endif
452
- const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
453
-
454
- const uint32_t magic_mask = UINT32_C(126) << 23;
455
- const float magic_bias = 0.5f;
456
- const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
457
-
458
- const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
459
- const uint32_t result = sign |
460
- (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
461
- return fp32_from_bits(result);
462
- }
354
+ const uint32_t magic_mask = UINT32_C(126) << 23;
355
+ const float magic_bias = 0.5f;
356
+ const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
463
357
 
464
- static inline lm_ggml_fp16_t lm_ggml_compute_fp32_to_fp16(float f) {
465
- #if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L)
466
- const float scale_to_inf = 0x1.0p+112f;
467
- const float scale_to_zero = 0x1.0p-110f;
468
- #else
469
- const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
470
- const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
471
- #endif
472
- float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
473
-
474
- const uint32_t w = fp32_to_bits(f);
475
- const uint32_t shl1_w = w + w;
476
- const uint32_t sign = w & UINT32_C(0x80000000);
477
- uint32_t bias = shl1_w & UINT32_C(0xFF000000);
478
- if (bias < UINT32_C(0x71000000)) {
479
- bias = UINT32_C(0x71000000);
480
- }
358
+ const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
359
+ const uint32_t result = sign |
360
+ (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
361
+ return fp32_from_bits(result);
362
+ }
481
363
 
482
- base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
483
- const uint32_t bits = fp32_to_bits(base);
484
- const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
485
- const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
486
- const uint32_t nonsign = exp_bits + mantissa_bits;
487
- return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
364
+ static inline lm_ggml_fp16_t lm_ggml_compute_fp32_to_fp16(float f) {
365
+ #if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L)
366
+ const float scale_to_inf = 0x1.0p+112f;
367
+ const float scale_to_zero = 0x1.0p-110f;
368
+ #else
369
+ const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
370
+ const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
371
+ #endif
372
+ float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
373
+
374
+ const uint32_t w = fp32_to_bits(f);
375
+ const uint32_t shl1_w = w + w;
376
+ const uint32_t sign = w & UINT32_C(0x80000000);
377
+ uint32_t bias = shl1_w & UINT32_C(0xFF000000);
378
+ if (bias < UINT32_C(0x71000000)) {
379
+ bias = UINT32_C(0x71000000);
488
380
  }
489
381
 
490
- #define LM_GGML_COMPUTE_FP16_TO_FP32(x) lm_ggml_compute_fp16_to_fp32(x)
491
- #define LM_GGML_COMPUTE_FP32_TO_FP16(x) lm_ggml_compute_fp32_to_fp16(x)
492
-
493
- #endif // defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
494
-
495
- // precomputed f32 table for f16 (256 KB)
496
- // defined in ggml.c, initialized in lm_ggml_init()
497
- LM_GGML_API float lm_ggml_table_f32_f16[1 << 16];
498
-
499
- // On ARM NEON, it's quicker to directly convert x -> x instead of calling into lm_ggml_lookup_fp16_to_fp32,
500
- // so we define LM_GGML_FP16_TO_FP32 and LM_GGML_FP32_TO_FP16 elsewhere for NEON.
501
- // This is also true for POWER9.
502
- #if !defined(LM_GGML_FP16_TO_FP32)
503
- inline static float lm_ggml_lookup_fp16_to_fp32(lm_ggml_fp16_t f) {
504
- uint16_t s;
505
- memcpy(&s, &f, sizeof(uint16_t));
506
- return lm_ggml_table_f32_f16[s];
382
+ base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
383
+ const uint32_t bits = fp32_to_bits(base);
384
+ const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
385
+ const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
386
+ const uint32_t nonsign = exp_bits + mantissa_bits;
387
+ return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
507
388
  }
508
389
 
509
- #define LM_GGML_FP16_TO_FP32(x) lm_ggml_lookup_fp16_to_fp32(x)
510
- #endif
390
+ #define LM_GGML_COMPUTE_FP16_TO_FP32(x) lm_ggml_compute_fp16_to_fp32(x)
391
+ #define LM_GGML_COMPUTE_FP32_TO_FP16(x) lm_ggml_compute_fp32_to_fp16(x)
511
392
 
512
- #if !defined(LM_GGML_FP32_TO_FP16)
393
+ #define LM_GGML_FP16_TO_FP32(x) LM_GGML_COMPUTE_FP16_TO_FP32(x)
513
394
  #define LM_GGML_FP32_TO_FP16(x) LM_GGML_COMPUTE_FP32_TO_FP16(x)
514
- #endif
515
395
 
516
396
  /**
517
397
  * Converts brain16 to float32.
@@ -490,6 +490,7 @@ extern "C" {
490
490
  LM_GGML_OP_UPSCALE, // nearest interpolate
491
491
  LM_GGML_OP_PAD,
492
492
  LM_GGML_OP_PAD_REFLECT_1D,
493
+ LM_GGML_OP_ROLL,
493
494
  LM_GGML_OP_ARANGE,
494
495
  LM_GGML_OP_TIMESTEP_EMBEDDING,
495
496
  LM_GGML_OP_ARGSORT,
@@ -936,6 +937,15 @@ extern "C" {
936
937
  struct lm_ggml_tensor * a,
937
938
  struct lm_ggml_tensor * b);
938
939
 
940
+ // repeat a to the specified shape
941
+ LM_GGML_API struct lm_ggml_tensor * lm_ggml_repeat_4d(
942
+ struct lm_ggml_context * ctx,
943
+ struct lm_ggml_tensor * a,
944
+ int64_t ne0,
945
+ int64_t ne1,
946
+ int64_t ne2,
947
+ int64_t ne3);
948
+
939
949
  // sums repetitions in a into shape of b
940
950
  LM_GGML_API struct lm_ggml_tensor * lm_ggml_repeat_back(
941
951
  struct lm_ggml_context * ctx,
@@ -1793,6 +1803,17 @@ extern "C" {
1793
1803
  int p0,
1794
1804
  int p1);
1795
1805
 
1806
+ // Move tensor elements by an offset given for each dimension. Elements that
1807
+ // are shifted beyond the last position are wrapped around to the beginning.
1808
+ LM_GGML_API struct lm_ggml_tensor * lm_ggml_roll(
1809
+ struct lm_ggml_context * ctx,
1810
+ struct lm_ggml_tensor * a,
1811
+ int shift0,
1812
+ int shift1,
1813
+ int shift2,
1814
+ int shift3);
1815
+
1816
+
1796
1817
  // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
1797
1818
  // timesteps: [N,]
1798
1819
  // return: [N, dim]
@@ -2087,9 +2108,6 @@ extern "C" {
2087
2108
  LM_GGML_API struct lm_ggml_tensor * lm_ggml_graph_get_grad (const struct lm_ggml_cgraph * cgraph, const struct lm_ggml_tensor * node);
2088
2109
  LM_GGML_API struct lm_ggml_tensor * lm_ggml_graph_get_grad_acc(const struct lm_ggml_cgraph * cgraph, const struct lm_ggml_tensor * node);
2089
2110
 
2090
- LM_GGML_API void lm_ggml_graph_export(const struct lm_ggml_cgraph * cgraph, const char * fname);
2091
- LM_GGML_API struct lm_ggml_cgraph * lm_ggml_graph_import(const char * fname, struct lm_ggml_context ** ctx_data, struct lm_ggml_context ** ctx_eval);
2092
-
2093
2111
  // print info and performance information for the graph
2094
2112
  LM_GGML_API void lm_ggml_graph_print(const struct lm_ggml_cgraph * cgraph);
2095
2113
 
@@ -2173,6 +2191,7 @@ extern "C" {
2173
2191
 
2174
2192
  // scheduling priorities
2175
2193
  enum lm_ggml_sched_priority {
2194
+ LM_GGML_SCHED_PRIO_LOW = -1,
2176
2195
  LM_GGML_SCHED_PRIO_NORMAL,
2177
2196
  LM_GGML_SCHED_PRIO_MEDIUM,
2178
2197
  LM_GGML_SCHED_PRIO_HIGH,