cui-llama.rn 1.7.4 → 1.7.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (276)
  1. package/README.md +217 -17
  2. package/android/src/main/CMakeLists.txt +34 -15
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +79 -5
  4. package/android/src/main/java/com/rnllama/RNLlama.java +237 -0
  5. package/android/src/main/jni.cpp +213 -14
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  11. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  14. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +35 -0
  15. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +34 -0
  16. package/cpp/README.md +1 -1
  17. package/cpp/chat-parser.cpp +385 -0
  18. package/cpp/chat-parser.h +120 -0
  19. package/cpp/chat.cpp +726 -596
  20. package/cpp/chat.h +71 -6
  21. package/cpp/common.cpp +56 -38
  22. package/cpp/common.h +9 -3
  23. package/cpp/ggml-backend-reg.cpp +5 -0
  24. package/cpp/ggml-backend.cpp +10 -2
  25. package/cpp/ggml-common.h +4 -0
  26. package/cpp/ggml-cpu/amx/amx.cpp +1 -1
  27. package/cpp/ggml-cpu/amx/mmq.cpp +11 -10
  28. package/cpp/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  29. package/cpp/ggml-cpu/arch/arm/quants.c +4114 -0
  30. package/cpp/ggml-cpu/arch/arm/repack.cpp +2163 -0
  31. package/cpp/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  32. package/cpp/ggml-cpu/arch/x86/quants.c +4311 -0
  33. package/cpp/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
  34. package/cpp/ggml-cpu/arch-fallback.h +184 -0
  35. package/cpp/ggml-cpu/common.h +4 -3
  36. package/cpp/ggml-cpu/ggml-cpu-impl.h +21 -16
  37. package/cpp/ggml-cpu/ggml-cpu.c +123 -104
  38. package/cpp/ggml-cpu/ggml-cpu.cpp +11 -8
  39. package/cpp/ggml-cpu/ops.cpp +330 -148
  40. package/cpp/ggml-cpu/ops.h +1 -0
  41. package/cpp/ggml-cpu/quants.c +1158 -0
  42. package/cpp/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  43. package/cpp/ggml-cpu/repack.cpp +1571 -0
  44. package/cpp/ggml-cpu/repack.h +98 -0
  45. package/cpp/ggml-cpu/simd-mappings.h +330 -38
  46. package/cpp/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  47. package/cpp/ggml-cpu/vec.cpp +87 -18
  48. package/cpp/ggml-cpu/vec.h +249 -94
  49. package/cpp/ggml-cpu.h +1 -0
  50. package/cpp/ggml-impl.h +63 -183
  51. package/cpp/ggml-llama-sim.metallib +0 -0
  52. package/cpp/ggml-llama.metallib +0 -0
  53. package/cpp/ggml-metal.m +152 -45
  54. package/cpp/ggml-quants.c +0 -2
  55. package/cpp/ggml.c +61 -21
  56. package/cpp/ggml.h +22 -3
  57. package/cpp/gguf.cpp +24 -3
  58. package/cpp/json-partial.cpp +256 -0
  59. package/cpp/json-partial.h +38 -0
  60. package/cpp/json-schema-to-grammar.cpp +5 -47
  61. package/cpp/json-schema-to-grammar.h +4 -4
  62. package/cpp/llama-arch.cpp +153 -3
  63. package/cpp/llama-arch.h +27 -1
  64. package/cpp/llama-batch.cpp +741 -272
  65. package/cpp/llama-batch.h +112 -54
  66. package/cpp/llama-chat.cpp +30 -8
  67. package/cpp/llama-chat.h +1 -0
  68. package/cpp/llama-context.cpp +524 -339
  69. package/cpp/llama-context.h +38 -17
  70. package/cpp/llama-cparams.cpp +4 -0
  71. package/cpp/llama-cparams.h +2 -0
  72. package/cpp/llama-grammar.cpp +12 -2
  73. package/cpp/llama-graph.cpp +431 -356
  74. package/cpp/llama-graph.h +126 -58
  75. package/cpp/llama-hparams.cpp +10 -2
  76. package/cpp/llama-hparams.h +19 -2
  77. package/cpp/llama-kv-cache-unified-iswa.cpp +279 -0
  78. package/cpp/llama-kv-cache-unified-iswa.h +128 -0
  79. package/cpp/llama-kv-cache-unified.cpp +1841 -0
  80. package/cpp/llama-kv-cache-unified.h +303 -0
  81. package/cpp/llama-kv-cells.h +439 -0
  82. package/cpp/llama-memory-hybrid.cpp +246 -0
  83. package/cpp/llama-memory-hybrid.h +138 -0
  84. package/cpp/llama-memory-recurrent.cpp +1112 -0
  85. package/cpp/llama-memory-recurrent.h +183 -0
  86. package/cpp/llama-memory.cpp +41 -0
  87. package/cpp/llama-memory.h +86 -5
  88. package/cpp/llama-mmap.cpp +1 -1
  89. package/cpp/llama-model-loader.cpp +42 -17
  90. package/cpp/llama-model-saver.cpp +1 -0
  91. package/cpp/llama-model.cpp +1639 -513
  92. package/cpp/llama-model.h +26 -0
  93. package/cpp/llama-sampling.cpp +2 -2
  94. package/cpp/llama-vocab.cpp +65 -28
  95. package/cpp/llama-vocab.h +1 -0
  96. package/cpp/llama.cpp +11 -7
  97. package/cpp/llama.h +150 -42
  98. package/cpp/minja/chat-template.hpp +1 -1
  99. package/cpp/minja/minja.hpp +1 -1
  100. package/cpp/{json.hpp → nlohmann/json.hpp} +3027 -2267
  101. package/cpp/nlohmann/json_fwd.hpp +187 -0
  102. package/cpp/regex-partial.cpp +204 -0
  103. package/cpp/regex-partial.h +56 -0
  104. package/cpp/rn-llama.cpp +646 -35
  105. package/cpp/rn-llama.h +32 -1
  106. package/cpp/rn-tts.h +39 -0
  107. package/cpp/sampling.cpp +7 -8
  108. package/cpp/tools/mtmd/clip-impl.h +5 -0
  109. package/cpp/tools/mtmd/clip.cpp +572 -436
  110. package/cpp/tools/mtmd/clip.h +14 -4
  111. package/cpp/tools/mtmd/mtmd-audio.cpp +0 -86
  112. package/cpp/tools/mtmd/mtmd-audio.h +2 -17
  113. package/cpp/tools/mtmd/mtmd-helper.cpp +175 -12
  114. package/cpp/tools/mtmd/mtmd-helper.h +91 -0
  115. package/cpp/tools/mtmd/mtmd.cpp +368 -248
  116. package/cpp/tools/mtmd/mtmd.h +6 -70
  117. package/cpp/unicode.cpp +5 -0
  118. package/ios/CMakeLists.txt +26 -6
  119. package/ios/RNLlama.h +1 -1
  120. package/ios/RNLlama.mm +153 -3
  121. package/ios/RNLlamaContext.h +9 -1
  122. package/ios/RNLlamaContext.mm +112 -9
  123. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
  124. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +71 -6
  125. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +9 -3
  126. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
  127. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
  128. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
  129. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +22 -3
  130. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-partial.h +38 -0
  131. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  132. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
  133. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
  134. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
  135. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +38 -17
  136. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
  137. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
  138. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
  139. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  140. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  141. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  142. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  143. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  144. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
  145. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +26 -0
  146. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
  147. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +150 -42
  148. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  149. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
  150. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/{json.hpp → nlohmann/json.hpp} +3027 -2267
  151. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  152. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
  153. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
  154. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
  155. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  156. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
  157. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
  158. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
  159. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
  160. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
  161. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
  162. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
  163. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
  164. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
  165. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  166. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
  167. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
  168. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
  169. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
  170. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
  171. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
  172. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
  173. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  174. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  175. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  176. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  177. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  178. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
  179. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
  180. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
  181. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
  182. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  183. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
  184. package/ios/rnllama.xcframework/{tvos-arm64/rnllama.framework/Headers → ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
  185. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  186. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
  187. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
  188. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
  189. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  190. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  191. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
  192. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +71 -6
  193. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +9 -3
  194. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
  195. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
  196. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
  197. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +22 -3
  198. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-partial.h +38 -0
  199. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  200. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
  201. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
  202. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
  203. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +38 -17
  204. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
  205. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
  206. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
  207. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  208. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  209. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  210. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  211. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  212. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
  213. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +26 -0
  214. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
  215. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +150 -42
  216. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  217. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
  218. package/ios/rnllama.xcframework/{ios-arm64_x86_64-simulator/rnllama.framework/Headers → tvos-arm64/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
  219. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  220. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
  221. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
  222. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
  223. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  224. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
  225. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
  226. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
  227. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
  228. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
  229. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
  230. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
  231. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
  232. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
  233. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  234. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
  235. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
  236. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
  237. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
  238. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
  239. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
  240. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
  241. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  242. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  243. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  244. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  245. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  246. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
  247. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
  248. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
  249. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
  250. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  251. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
  252. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json.hpp +25526 -0
  253. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  254. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
  255. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
  256. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
  257. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  258. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  259. package/jest/mock.js +24 -0
  260. package/package.json +1 -1
  261. package/src/NativeRNLlama.ts +46 -2
  262. package/src/index.ts +105 -1
  263. package/cpp/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  264. package/cpp/ggml-cpu/ggml-cpu-quants.c +0 -13326
  265. package/cpp/ggml-cpu/sgemm.cpp +0 -3544
  266. package/cpp/ggml-cpu/sgemm.h +0 -14
  267. package/cpp/llama-kv-cache.cpp +0 -2827
  268. package/cpp/llama-kv-cache.h +0 -515
  269. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  270. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  271. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  272. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json.hpp +0 -24766
  273. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  274. /package/cpp/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
  275. /package/cpp/tools/mtmd/{miniaudio.h → miniaudio/miniaudio.h} +0 -0
  276. /package/cpp/tools/mtmd/{stb_image.h → stb/stb_image.h} +0 -0
package/cpp/tools/mtmd/mtmd.h CHANGED
@@ -3,7 +3,6 @@
 
   #include "ggml.h"
   #include "llama.h"
- #include "clip.h"
 
   #include <stddef.h>
   #include <stdint.h>
@@ -109,6 +108,10 @@ MTMD_API bool mtmd_support_vision(mtmd_context * ctx);
   // whether the current model supports audio input
   MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
 
+ // get audio bitrate in Hz, for example 16000 for Whisper
+ // return -1 if audio is not supported
+ MTMD_API int mtmd_get_audio_bitrate(mtmd_context * ctx);
+
   // mtmd_bitmap
   //
   // if bitmap is image:
@@ -203,79 +206,12 @@ MTMD_API int32_t mtmd_encode_chunk(mtmd_context * ctx,
                                      const mtmd_input_chunk * chunk);
 
   // get output embeddings from the last encode pass
+ // the reading size (in bytes) is equal to:
+ // llama_model_n_embd(model) * mtmd_input_chunk_get_n_tokens(chunk) * sizeof(float)
   MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
 
   /////////////////////////////////////////
 
- //
- // Helper functions (can be implemented based on other functions)
- //
- // Please note that these helpers are not guaranteed to be stable.
- // BREAKING CHANGES are expected.
- //
-
- // helper function to construct a mtmd_bitmap from a file
- // it calls mtmd_helper_bitmap_init_from_buf() internally
- // returns nullptr on failure
- // this function is thread-safe
- MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(const char * fname);
-
- // helper function to construct a mtmd_bitmap from a buffer containing a file
- // supported formats:
- //   image: formats supported by stb_image: jpg, png, bmp, gif, etc.
- //   audio: formats supported by miniaudio: wav, mp3, flac
- // note: audio files will be auto-detected based on magic bytes
- // returns nullptr on failure
- // this function is thread-safe
- MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len);
-
- // helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
- MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks);
-
- // helper to count the total position of tokens from a list of chunks, useful to keep track of n_past
- // normally, n_pos is equal to n_tokens, but for M-RoPE it is different
- MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks);
-
- // helper function that automatically:
- // 1. run llama_decode() on text chunks
- // 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
- // if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error
- // otherwise, returns 0 on success
- // this function is NOT thread-safe
- MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
-                                          struct llama_context * lctx,
-                                          const mtmd_input_chunks * chunks,
-                                          llama_pos n_past,
-                                          llama_seq_id seq_id,
-                                          int32_t n_batch,
-                                          bool logits_last,
-                                          llama_pos * new_n_past);
-
- // works like mtmd_helper_eval_chunks(), but only for a single chunk
- // this function is NOT thread-safe
- MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
-                                                struct llama_context * lctx,
-                                                const mtmd_input_chunk * chunk,
-                                                llama_pos n_past,
-                                                llama_seq_id seq_id,
-                                                int32_t n_batch,
-                                                bool logits_last,
-                                                llama_pos * new_n_past);
-
- // helper function to decode an image whose embeddings have already been calculated
- // this helper will handle batching and pre/post decoding setup (for ex. gemma 3 requires non-causal attention)
- // ret 0 on success, -1 on chunk not being a valid image chunk, 1 on decode failure
- MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx,
-                                                 struct llama_context * lctx,
-                                                 const mtmd_input_chunk * chunk,
-                                                 float * encoded_embd,
-                                                 llama_pos n_past,
-                                                 llama_seq_id seq_id,
-                                                 int32_t n_batch,
-                                                 llama_pos * new_n_past);
-
- /////////////////////////////////////////
-
   // test function, to be used in test-mtmd-c-api.c
   MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void);
 
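Note: the new size comment on mtmd_get_output_embd() pins down exactly how many floats are safe to read after an encode pass. Below is a minimal C++ sketch of a caller using it together with the new mtmd_get_audio_bitrate(); it is illustrative only, and ctx, model, and chunk are assumed to come from the usual mtmd/llama setup, which is not shown.

    #include "mtmd.h"
    #include "llama.h"
    #include <cstdio>
    #include <vector>

    void read_embd_example(mtmd_context * ctx, const llama_model * model, const mtmd_input_chunk * chunk) {
        // new in this release: sample rate in Hz (e.g. 16000 for Whisper), -1 if audio is unsupported
        int bitrate = mtmd_get_audio_bitrate(ctx);
        if (bitrate < 0) {
            std::printf("audio input not supported by this model\n");
        }
        if (mtmd_encode_chunk(ctx, chunk) != 0) {
            return; // encode failed
        }
        // per the header comment: n_embd * n_tokens floats
        const size_t n_floats = (size_t) llama_model_n_embd(model) * mtmd_input_chunk_get_n_tokens(chunk);
        float * embd = mtmd_get_output_embd(ctx);
        std::vector<float> copy(embd, embd + n_floats); // safe to read exactly this much
    }
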
package/cpp/unicode.cpp CHANGED
@@ -204,12 +204,17 @@ static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
   // disable C++17 deprecation warning for std::codecvt_utf8
   # pragma clang diagnostic push
   # pragma clang diagnostic ignored "-Wdeprecated-declarations"
+ #elif defined(__GNUC__)
+ # pragma GCC diagnostic push
+ # pragma GCC diagnostic ignored "-Wdeprecated-declarations"
   #endif
 
   std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
 
   #if defined(__clang__)
   # pragma clang diagnostic pop
+ #elif defined(__GNUC__)
+ # pragma GCC diagnostic pop
   #endif
 
   return conv.from_bytes(s);
package/ios/CMakeLists.txt CHANGED
@@ -24,8 +24,19 @@ add_definitions(
     -DLM_GGML_METAL_USE_BF16
   )
 
+ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64;x86_64")
+     add_definitions(-DLM_GGML_CPU_GENERIC)
+ endif ()
+
   set(SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../cpp)
 
+ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64")
+     set(SOURCE_FILES_ARCH
+         ${SOURCE_DIR}/ggml-cpu/arch/arm/quants.c
+         ${SOURCE_DIR}/ggml-cpu/arch/arm/repack.cpp
+     )
+ endif ()
+
   # Define public headers
   set(PUBLIC_HEADERS
       ${SOURCE_DIR}/rn-llama.h
@@ -44,12 +55,11 @@ add_library(rnllama SHARED
       ${SOURCE_DIR}/ggml-cpu/amx/mmq.cpp
       ${SOURCE_DIR}/ggml-cpu/ggml-cpu.c
      ${SOURCE_DIR}/ggml-cpu/ggml-cpu.cpp
-     ${SOURCE_DIR}/ggml-cpu/ggml-cpu-aarch64.cpp
-     ${SOURCE_DIR}/ggml-cpu/ggml-cpu-quants.c
-     ${SOURCE_DIR}/ggml-cpu/ggml-cpu-traits.cpp
+     ${SOURCE_DIR}/ggml-cpu/quants.c
+     ${SOURCE_DIR}/ggml-cpu/traits.cpp
+     ${SOURCE_DIR}/ggml-cpu/repack.cpp
       ${SOURCE_DIR}/ggml-cpu/unary-ops.cpp
       ${SOURCE_DIR}/ggml-cpu/binary-ops.cpp
-     ${SOURCE_DIR}/ggml-cpu/sgemm.cpp
       ${SOURCE_DIR}/ggml-cpu/vec.cpp
       ${SOURCE_DIR}/ggml-cpu/ops.cpp
       ${SOURCE_DIR}/ggml-metal.m
@@ -65,7 +75,6 @@ add_library(rnllama SHARED
       ${SOURCE_DIR}/llama-adapter.cpp
       ${SOURCE_DIR}/llama-chat.cpp
       ${SOURCE_DIR}/llama-context.cpp
-     ${SOURCE_DIR}/llama-kv-cache.cpp
       ${SOURCE_DIR}/llama-arch.cpp
       ${SOURCE_DIR}/llama-batch.cpp
       ${SOURCE_DIR}/llama-cparams.cpp
@@ -75,6 +84,10 @@ add_library(rnllama SHARED
       ${SOURCE_DIR}/llama-model-loader.cpp
       ${SOURCE_DIR}/llama-model-saver.cpp
       ${SOURCE_DIR}/llama-mmap.cpp
+     ${SOURCE_DIR}/llama-kv-cache-unified.cpp
+     ${SOURCE_DIR}/llama-kv-cache-unified-iswa.cpp
+     ${SOURCE_DIR}/llama-memory-hybrid.cpp
+     ${SOURCE_DIR}/llama-memory-recurrent.cpp
       ${SOURCE_DIR}/llama-vocab.cpp
       ${SOURCE_DIR}/llama-memory.cpp
       ${SOURCE_DIR}/llama-io.cpp
@@ -87,13 +100,18 @@ add_library(rnllama SHARED
       ${SOURCE_DIR}/json-schema-to-grammar.cpp
       ${SOURCE_DIR}/minja/minja.hpp
       ${SOURCE_DIR}/minja/chat-template.hpp
-     ${SOURCE_DIR}/json.hpp
+     ${SOURCE_DIR}/nlohmann/json.hpp
+     ${SOURCE_DIR}/nlohmann/json_fwd.hpp
+     ${SOURCE_DIR}/chat-parser.cpp
+     ${SOURCE_DIR}/json-partial.cpp
+     ${SOURCE_DIR}/regex-partial.cpp
       # Multimodal support
       ${SOURCE_DIR}/tools/mtmd/mtmd.cpp
       ${SOURCE_DIR}/tools/mtmd/mtmd-audio.cpp
       ${SOURCE_DIR}/tools/mtmd/clip.cpp
       ${SOURCE_DIR}/tools/mtmd/mtmd-helper.cpp
       ${SOURCE_DIR}/rn-llama.cpp
+     ${SOURCE_FILES_ARCH}
   )
 
   # Setup include directories
@@ -102,6 +120,8 @@ target_include_directories(rnllama
       $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../cpp>
       $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../cpp/ggml-cpu>
       $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../cpp/tools/mtmd>
+     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../cpp/minja>
+     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../cpp/nlohmann>
       $<INSTALL_INTERFACE:include>
   )
 
package/ios/RNLlama.h CHANGED
@@ -4,7 +4,7 @@
   #if RNLLAMA_BUILD_FROM_SOURCE
   #import "json.hpp"
   #else
- #import <rnllama/json.hpp>
+ #import <rnllama/nlohmann/json.hpp>
   #endif
 
   // TODO: Use RNLlamaSpec (Need to refactor NSDictionary usage)
package/ios/RNLlama.mm CHANGED
@@ -102,13 +102,21 @@ RCT_EXPORT_METHOD(getFormattedChat:(double)contextId
     if ([params[@"jinja"] boolValue]) {
       NSString *jsonSchema = params[@"json_schema"];
       NSString *tools = params[@"tools"];
-      bool parallelToolCalls = [params[@"parallel_tool_calls"] boolValue];
+      BOOL parallelToolCalls = [params[@"parallel_tool_calls"] boolValue];
       NSString *toolChoice = params[@"tool_choice"];
-      resolve([context getFormattedChatWithJinja:messages withChatTemplate:chatTemplate withJsonSchema:jsonSchema withTools:tools withParallelToolCalls:parallelToolCalls withToolChoice:toolChoice]);
+      BOOL enableThinking = [params[@"enable_thinking"] boolValue];
+      resolve([context getFormattedChatWithJinja:messages
+                                withChatTemplate:chatTemplate
+                                  withJsonSchema:jsonSchema
+                                       withTools:tools
+                           withParallelToolCalls:parallelToolCalls
+                                  withToolChoice:toolChoice
+                              withEnableThinking:enableThinking
+      ]);
     } else {
       resolve([context getFormattedChat:messages withChatTemplate:chatTemplate]);
     }
-  } catch (const nlohmann::json_abi_v3_11_3::detail::parse_error& e) {
+  } catch (const nlohmann::json_abi_v3_12_0::detail::parse_error& e) {
     NSString *errorMessage = [NSString stringWithUTF8String:e.what()];
     reject(@"llama_error", [NSString stringWithFormat:@"JSON parse error in getFormattedChat: %@", errorMessage], nil);
   } catch (const std::exception& e) { // catch cpp exceptions
@@ -297,6 +305,25 @@ RCT_EXPORT_METHOD(embedding:(double)contextId
     }
   }
 
+ RCT_EXPORT_METHOD(rerank:(double)contextId
+                   query:(NSString *)query
+                   documents:(NSArray<NSString *> *)documents
+                   params:(NSDictionary *)params
+                   resolver:(RCTPromiseResolveBlock)resolve
+                   rejecter:(RCTPromiseRejectBlock)reject) {
+   RNLlamaContext *context = llamaContexts[[NSNumber numberWithDouble:contextId]];
+   if (context == nil) {
+     reject(@"context_not_found", @"Context not found", nil);
+     return;
+   }
+   @try {
+     NSArray *result = [context rerank:query documents:documents params:params];
+     resolve(result);
+   } @catch (NSException *exception) {
+     reject(@"rerank_error", exception.reason, nil);
+   }
+ }
+
  RCT_EXPORT_METHOD(bench:(double)contextId
                    pp:(int)pp
                    tg:(int)tg
@@ -434,6 +461,129 @@ RCT_EXPORT_METHOD(releaseMultimodal:(double)contextId
     resolve(nil);
   }
 
+ RCT_EXPORT_METHOD(initVocoder:(double)contextId
+                   withVocoderModelPath:(NSString *)vocoderModelPath
+                   withResolver:(RCTPromiseResolveBlock)resolve
+                   withRejecter:(RCTPromiseRejectBlock)reject)
+ {
+   RNLlamaContext *context = llamaContexts[[NSNumber numberWithDouble:contextId]];
+   if (context == nil) {
+     reject(@"llama_error", @"Context not found", nil);
+     return;
+   }
+   if ([context isPredicting]) {
+     reject(@"llama_error", @"Context is busy", nil);
+     return;
+   }
+
+   @try {
+     bool success = [context initVocoder:vocoderModelPath];
+     resolve(@(success));
+   } @catch (NSException *exception) {
+     reject(@"llama_cpp_error", exception.reason, nil);
+   }
+ }
+
+ RCT_EXPORT_METHOD(isVocoderEnabled:(double)contextId
+                   withResolver:(RCTPromiseResolveBlock)resolve
+                   withRejecter:(RCTPromiseRejectBlock)reject)
+ {
+   RNLlamaContext *context = llamaContexts[[NSNumber numberWithDouble:contextId]];
+   if (context == nil) {
+     reject(@"llama_error", @"Context not found", nil);
+     return;
+   }
+
+   resolve(@([context isVocoderEnabled]));
+ }
+
+ RCT_EXPORT_METHOD(getFormattedAudioCompletion:(double)contextId
+                   withSpeakerJsonStr:(NSString *)speakerJsonStr
+                   withTextToSpeak:(NSString *)textToSpeak
+                   withResolver:(RCTPromiseResolveBlock)resolve
+                   withRejecter:(RCTPromiseRejectBlock)reject)
+ {
+   RNLlamaContext *context = llamaContexts[[NSNumber numberWithDouble:contextId]];
+   if (context == nil) {
+     reject(@"llama_error", @"Context not found", nil);
+     return;
+   }
+
+   if (![context isVocoderEnabled]) {
+     reject(@"llama_error", @"Vocoder is not enabled", nil);
+     return;
+   }
+
+   @try {
+     NSString *result = [context getFormattedAudioCompletion:speakerJsonStr textToSpeak:textToSpeak];
+     resolve(result);
+   } @catch (NSException *exception) {
+     reject(@"llama_cpp_error", exception.reason, nil);
+   }
+ }
+
+ RCT_EXPORT_METHOD(getAudioCompletionGuideTokens:(double)contextId
+                   withTextToSpeak:(NSString *)textToSpeak
+                   withResolver:(RCTPromiseResolveBlock)resolve
+                   withRejecter:(RCTPromiseRejectBlock)reject)
+ {
+   RNLlamaContext *context = llamaContexts[[NSNumber numberWithDouble:contextId]];
+   if (context == nil) {
+     reject(@"llama_error", @"Context not found", nil);
+     return;
+   }
+
+   if (![context isVocoderEnabled]) {
+     reject(@"llama_error", @"Vocoder is not enabled", nil);
+     return;
+   }
+
+   @try {
+     NSArray *guideTokens = [context getAudioCompletionGuideTokens:textToSpeak];
+     resolve(guideTokens);
+   } @catch (NSException *exception) {
+     reject(@"llama_cpp_error", exception.reason, nil);
+   }
+ }
+
+ RCT_EXPORT_METHOD(decodeAudioTokens:(double)contextId
+                   withTokens:(NSArray *)tokens
+                   withResolver:(RCTPromiseResolveBlock)resolve
+                   withRejecter:(RCTPromiseRejectBlock)reject)
+ {
+   RNLlamaContext *context = llamaContexts[[NSNumber numberWithDouble:contextId]];
+   if (context == nil) {
+     reject(@"llama_error", @"Context not found", nil);
+     return;
+   }
+
+   if (![context isVocoderEnabled]) {
+     reject(@"llama_error", @"Vocoder is not enabled", nil);
+     return;
+   }
+
+   @try {
+     NSArray *audioData = [context decodeAudioTokens:tokens];
+     resolve(audioData);
+   } @catch (NSException *exception) {
+     reject(@"llama_cpp_error", exception.reason, nil);
+   }
+ }
+
+ RCT_EXPORT_METHOD(releaseVocoder:(double)contextId
+                   withResolver:(RCTPromiseResolveBlock)resolve
+                   withRejecter:(RCTPromiseRejectBlock)reject)
+ {
+   RNLlamaContext *context = llamaContexts[[NSNumber numberWithDouble:contextId]];
+   if (context == nil) {
+     reject(@"llama_error", @"Context not found", nil);
+     return;
+   }
+
+   [context releaseVocoder];
+   resolve(nil);
+ }
+
  RCT_EXPORT_METHOD(releaseContext:(double)contextId
                    withResolver:(RCTPromiseResolveBlock)resolve
                    withRejecter:(RCTPromiseRejectBlock)reject)
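Note: taken together, the vocoder methods above enable a full on-device TTS round trip. Below is a hedged C++ sketch of the equivalent flow at the rn-llama.h level; the context type name (rnllama::llama_rn_context), the vocoder path, and the completion step are assumptions — only the vocoder/guide-token calls themselves appear in this diff.

    #include "rn-llama.h"
    #include <string>
    #include <vector>

    std::vector<float> tts_example(rnllama::llama_rn_context * llama,
                                   const std::string & speaker_json,
                                   const std::string & text) {
        if (!llama->initVocoder("/path/to/vocoder.gguf")) {  // hypothetical path
            return {};
        }
        // 1. build the prompt the TTS model expects for this speaker/text
        std::string prompt = llama->getFormattedAudioCompletion(speaker_json, text);
        // 2. guide tokens keep generation aligned with the input text
        llama->setGuideTokens(llama->getAudioCompletionGuideTokens(text));
        // 3. run a normal completion on `prompt` (omitted); generated audio
        //    tokens accumulate in llama->audio_tokens, as the completion
        //    result handler in RNLlamaContext.mm below shows
        // 4. decode the accumulated audio tokens into PCM samples
        std::vector<float> pcm = llama->decodeAudioTokens(llama->audio_tokens);
        llama->releaseVocoder();
        return pcm;
    }
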
package/ios/RNLlamaContext.h CHANGED
@@ -43,12 +43,14 @@
  - (NSDictionary *)tokenize:(NSString *)text imagePaths:(NSArray *)imagePaths;
  - (NSString *)detokenize:(NSArray *)tokens;
  - (NSDictionary *)embedding:(NSString *)text params:(NSDictionary *)params;
+ - (NSArray *)rerank:(NSString *)query documents:(NSArray<NSString *> *)documents params:(NSDictionary *)params;
  - (NSDictionary *)getFormattedChatWithJinja:(NSString *)messages
                             withChatTemplate:(NSString *)chatTemplate
                               withJsonSchema:(NSString *)jsonSchema
                                    withTools:(NSString *)tools
                        withParallelToolCalls:(BOOL)parallelToolCalls
-                              withToolChoice:(NSString *)toolChoice;
+                              withToolChoice:(NSString *)toolChoice
+                          withEnableThinking:(BOOL)enableThinking;
  - (NSString *)getFormattedChat:(NSString *)messages withChatTemplate:(NSString *)chatTemplate;
  - (NSDictionary *)loadSession:(NSString *)path;
  - (int)saveSession:(NSString *)path size:(int)size;
@@ -56,6 +58,12 @@
  - (void)applyLoraAdapters:(NSArray *)loraAdapters;
  - (void)removeLoraAdapters;
  - (NSArray *)getLoadedLoraAdapters;
+ - (bool)initVocoder:(NSString *)vocoderModelPath;
+ - (bool)isVocoderEnabled;
+ - (NSString *)getFormattedAudioCompletion:(NSString *)speakerJsonStr textToSpeak:(NSString *)textToSpeak;
+ - (NSArray *)getAudioCompletionGuideTokens:(NSString *)textToSpeak;
+ - (NSArray *)decodeAudioTokens:(NSArray *)tokens;
+ - (void)releaseVocoder;
  - (void)invalidate;
 
  @end
package/ios/RNLlamaContext.mm CHANGED
@@ -90,13 +90,6 @@
     NSLog(@"chatTemplate: %@", chatTemplate);
   }
 
- NSString *reasoningFormat = params[@"reasoning_format"];
- if (reasoningFormat && [reasoningFormat isEqualToString:@"deepseek"]) {
-   defaultParams.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
- } else {
-   defaultParams.reasoning_format = COMMON_REASONING_FORMAT_NONE;
- }
-
   if (params[@"n_ctx"]) defaultParams.n_ctx = [params[@"n_ctx"] intValue];
   if (params[@"use_mlock"]) defaultParams.use_mlock = [params[@"use_mlock"]boolValue];
 
@@ -362,6 +355,7 @@
                                    withTools:(NSString *)tools
                        withParallelToolCalls:(BOOL)parallelToolCalls
                               withToolChoice:(NSString *)toolChoice
+                          withEnableThinking:(BOOL)enableThinking
  {
    auto tmpl_str = chatTemplate == nil ? "" : [chatTemplate UTF8String];
 
@@ -372,7 +366,8 @@
      jsonSchema == nil ? "" : [jsonSchema UTF8String],
      tools == nil ? "" : [tools UTF8String],
      parallelToolCalls,
-     toolChoice == nil ? "" : [toolChoice UTF8String]
+     toolChoice == nil ? "" : [toolChoice UTF8String],
+     enableThinking
    );
    result[@"prompt"] = [NSString stringWithUTF8String:chatParams.prompt.c_str()];
    result[@"chat_format"] = @(static_cast<int>(chatParams.format));
@@ -386,6 +381,7 @@
        @"token": @(trigger.token),
      }];
    }
+   result[@"thinking_forced_open"] = @(chatParams.thinking_forced_open);
    result[@"grammar_triggers"] = grammar_triggers;
    NSMutableArray *preserved_tokens = [[NSMutableArray alloc] init];
    for (const auto & token : chatParams.preserved_tokens) {
@@ -581,6 +577,16 @@
      }
    }
 
+   if (params[@"guide_tokens"] && [params[@"guide_tokens"] isKindOfClass:[NSArray class]]) {
+     NSArray *guide_tokens_array = params[@"guide_tokens"];
+     std::vector<llama_token> guide_tokens;
+     guide_tokens.reserve([guide_tokens_array count]);
+     for (NSNumber *token_num in guide_tokens_array) {
+       guide_tokens.push_back([token_num intValue]);
+     }
+     llama->setGuideTokens(guide_tokens);
+   }
+
    if (!llama->initSampling()) {
      @throw [NSException exceptionWithName:@"LlamaException" reason:@"Failed to initialize sampling" userInfo:nil];
    }
@@ -604,6 +610,9 @@
    } catch (const std::exception &e) {
      llama->endCompletion();
      @throw [NSException exceptionWithName:@"LlamaException" reason:[NSString stringWithUTF8String:e.what()] userInfo:nil];
+   } catch (const std::runtime_error& e) {
+     llama->endCompletion();
+     @throw [NSException exceptionWithName:@"LlamaException" reason:[NSString stringWithUTF8String:e.what()] userInfo:nil];
    }
 
    if (llama->context_full) {
@@ -680,7 +689,20 @@
    if (!llama->is_interrupted) {
      try {
        auto chat_format = params[@"chat_format"] ? [params[@"chat_format"] intValue] : COMMON_CHAT_FORMAT_CONTENT_ONLY;
-       common_chat_msg message = common_chat_parse(llama->generated_text, static_cast<common_chat_format>(chat_format));
+       common_chat_syntax chat_syntax;
+       chat_syntax.format = static_cast<common_chat_format>(chat_format);
+
+       NSString *reasoningFormat = params[@"reasoning_format"];
+       if (reasoningFormat && [reasoningFormat isEqualToString:@"deepseek"]) {
+         chat_syntax.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+       } else if (reasoningFormat && [reasoningFormat isEqualToString:@"deepseek-legacy"]) {
+         chat_syntax.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY;
+       } else {
+         chat_syntax.reasoning_format = COMMON_REASONING_FORMAT_NONE;
+       }
+       chat_syntax.thinking_forced_open = [params[@"thinking_forced_open"] boolValue];
+
+       common_chat_msg message = common_chat_parse(llama->generated_text, false, chat_syntax);
        if (!message.reasoning_content.empty()) {
          reasoningContent = [NSString stringWithUTF8String:message.reasoning_content.c_str()];
        }
@@ -716,6 +738,15 @@
    result[@"stopped_limit"] = @(llama->stopped_limit);
    result[@"stopping_word"] = [NSString stringWithUTF8String:llama->stopping_word.c_str()];
    result[@"tokens_cached"] = @(llama->n_past);
+
+   if (llama->isVocoderEnabled() && !llama->audio_tokens.empty()) {
+     NSMutableArray *audioTokens = [[NSMutableArray alloc] init];
+     for (llama_token token : llama->audio_tokens) {
+       [audioTokens addObject:@(token)];
+     }
+     result[@"audio_tokens"] = audioTokens;
+   }
+
    result[@"timings"] = @{
      @"prompt_n": @(timings.n_p_eval),
      @"prompt_ms": @(timings.t_p_eval_ms),
@@ -775,6 +806,8 @@
      return result;
    } catch (const std::exception &e) {
      @throw [NSException exceptionWithName:@"LlamaException" reason:[NSString stringWithUTF8String:e.what()] userInfo:nil];
+   } catch (const std::runtime_error& e) {
+     @throw [NSException exceptionWithName:@"LlamaException" reason:[NSString stringWithUTF8String:e.what()] userInfo:nil];
    }
  }
 
@@ -817,6 +850,9 @@
    } catch (const std::exception &e) {
      llama->endCompletion();
      @throw [NSException exceptionWithName:@"LlamaException" reason:[NSString stringWithUTF8String:e.what()] userInfo:nil];
+   } catch (const std::runtime_error& e) {
+     llama->endCompletion();
+     @throw [NSException exceptionWithName:@"LlamaException" reason:[NSString stringWithUTF8String:e.what()] userInfo:nil];
    }
    llama->doCompletion();
 
@@ -838,6 +874,34 @@
    return resultDict;
  }
 
+ - (NSArray *)rerank:(NSString *)query documents:(NSArray<NSString *> *)documents params:(NSDictionary *)params {
+   // Convert NSArray to std::vector
+   std::vector<std::string> documentsVector;
+   for (NSString *doc in documents) {
+     documentsVector.push_back(std::string([doc UTF8String]));
+   }
+
+   NSMutableArray *resultArray = [[NSMutableArray alloc] init];
+
+   try {
+     std::vector<float> scores = llama->rerank(std::string([query UTF8String]), documentsVector);
+
+     // Create result array with score and index
+     for (size_t i = 0; i < scores.size(); i++) {
+       NSMutableDictionary *item = [[NSMutableDictionary alloc] init];
+       item[@"score"] = @(scores[i]);
+       item[@"index"] = @((int)i);
+       [resultArray addObject:item];
+     }
+   } catch (const std::exception &e) {
+     @throw [NSException exceptionWithName:@"LlamaException" reason:[NSString stringWithUTF8String:e.what()] userInfo:nil];
+   } catch (const std::runtime_error& e) {
+     @throw [NSException exceptionWithName:@"LlamaException" reason:[NSString stringWithUTF8String:e.what()] userInfo:nil];
+   }
+
+   return resultArray;
+ }
+
  - (NSDictionary *)loadSession:(NSString *)path {
    if (!path || [path length] == 0) {
      @throw [NSException exceptionWithName:@"LlamaException" reason:@"Session path is empty" userInfo:nil];
@@ -920,6 +984,45 @@
    return result;
  }
 
+ - (bool)initVocoder:(NSString *)vocoderModelPath {
+   return llama->initVocoder([vocoderModelPath UTF8String]);
+ }
+
+ - (bool)isVocoderEnabled {
+   return llama->isVocoderEnabled();
+ }
+
+ - (NSString *)getFormattedAudioCompletion:(NSString *)speakerJsonStr textToSpeak:(NSString *)textToSpeak {
+   std::string speakerStr = speakerJsonStr ? [speakerJsonStr UTF8String] : "";
+   return [NSString stringWithUTF8String:llama->getFormattedAudioCompletion(speakerStr, [textToSpeak UTF8String]).c_str()];
+ }
+
+ - (NSArray *)getAudioCompletionGuideTokens:(NSString *)textToSpeak {
+   std::vector<llama_token> guide_tokens = llama->getAudioCompletionGuideTokens([textToSpeak UTF8String]);
+   NSMutableArray *result = [[NSMutableArray alloc] init];
+   for (llama_token token : guide_tokens) {
+     [result addObject:@(token)];
+   }
+   return result;
+ }
+
+ - (NSArray *)decodeAudioTokens:(NSArray *)tokens {
+   std::vector<llama_token> token_vector;
+   for (NSNumber *token in tokens) {
+     token_vector.push_back([token intValue]);
+   }
+   std::vector<float> audio_data = llama->decodeAudioTokens(token_vector);
+   NSMutableArray *result = [[NSMutableArray alloc] init];
+   for (float sample : audio_data) {
+     [result addObject:@(sample)];
+   }
+   return result;
+ }
+
+ - (void)releaseVocoder {
+   llama->releaseVocoder();
+ }
+
  - (void)invalidate {
    delete llama;
    // llama_backend_free();
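Note: below is a hedged C++ sketch of how the native rerank call wrapped above can be used to order documents by relevance. The rerank signature matches the llama->rerank(...) call in this diff; the context type name (rnllama::llama_rn_context) is an assumption.

    #include "rn-llama.h"
    #include <algorithm>
    #include <numeric>
    #include <string>
    #include <vector>

    std::vector<size_t> rank_documents(rnllama::llama_rn_context * llama,
                                       const std::string & query,
                                       const std::vector<std::string> & docs) {
        std::vector<float> scores = llama->rerank(query, docs); // one score per document
        std::vector<size_t> order(scores.size());
        std::iota(order.begin(), order.end(), 0);
        // highest-scoring documents first, mirroring the {score, index}
        // pairs the Objective-C wrapper returns to JS
        std::sort(order.begin(), order.end(),
                  [&](size_t a, size_t b) { return scores[a] > scores[b]; });
        return order;
    }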