cui-llama.rn 1.6.0 → 1.7.0

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
Files changed (285)
  1. package/README.md +35 -7
  2. package/android/src/main/CMakeLists.txt +22 -11
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +42 -6
  4. package/android/src/main/java/com/rnllama/RNLlama.java +139 -4
  5. package/android/src/main/jni.cpp +173 -18
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  11. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  14. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +24 -4
  15. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +22 -2
  16. package/cpp/LICENSE +21 -0
  17. package/cpp/chat.cpp +129 -107
  18. package/cpp/chat.h +2 -0
  19. package/cpp/common.cpp +58 -78
  20. package/cpp/common.h +29 -21
  21. package/cpp/ggml-alloc.c +4 -1
  22. package/cpp/ggml-backend.cpp +9 -5
  23. package/cpp/ggml-backend.h +4 -4
  24. package/cpp/ggml-cpp.h +1 -1
  25. package/cpp/ggml-cpu/amx/amx.cpp +221 -0
  26. package/cpp/ggml-cpu/amx/amx.h +8 -0
  27. package/cpp/ggml-cpu/amx/common.h +91 -0
  28. package/cpp/ggml-cpu/amx/mmq.cpp +2511 -0
  29. package/cpp/ggml-cpu/amx/mmq.h +10 -0
  30. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/binary-ops.h +1 -1
  31. package/cpp/ggml-cpu/common.h +72 -0
  32. package/cpp/{ggml-cpu-aarch64.cpp → ggml-cpu/ggml-cpu-aarch64.cpp} +809 -103
  33. package/cpp/{ggml-cpu-quants.c → ggml-cpu/ggml-cpu-quants.c} +306 -6
  34. package/cpp/{ggml-cpu.c → ggml-cpu/ggml-cpu.c} +114 -55
  35. package/cpp/{ggml-cpu.cpp → ggml-cpu/ggml-cpu.cpp} +32 -16
  36. package/cpp/{ops.cpp → ggml-cpu/ops.cpp} +353 -173
  37. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/ops.h +2 -20
  38. package/cpp/{sgemm.cpp → ggml-cpu/sgemm.cpp} +501 -0
  39. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/simd-mappings.h +7 -3
  40. package/{ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers → cpp/ggml-cpu}/unary-ops.h +1 -1
  41. package/cpp/{vec.cpp → ggml-cpu/vec.cpp} +0 -6
  42. package/{ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers → cpp/ggml-cpu}/vec.h +16 -0
  43. package/cpp/ggml-cpu.h +5 -0
  44. package/cpp/ggml-impl.h +16 -9
  45. package/cpp/ggml-llama-sim.metallib +0 -0
  46. package/cpp/ggml-llama.metallib +0 -0
  47. package/cpp/ggml-metal-impl.h +36 -11
  48. package/cpp/ggml-metal.m +810 -176
  49. package/cpp/ggml-opt.cpp +373 -190
  50. package/cpp/ggml-opt.h +49 -28
  51. package/cpp/ggml-quants.c +0 -6
  52. package/cpp/ggml.c +227 -282
  53. package/cpp/ggml.h +82 -101
  54. package/cpp/gguf.cpp +33 -33
  55. package/cpp/json-schema-to-grammar.cpp +3 -0
  56. package/cpp/llama-adapter.cpp +6 -0
  57. package/cpp/llama-arch.cpp +49 -17
  58. package/cpp/llama-arch.h +9 -0
  59. package/cpp/llama-batch.cpp +8 -2
  60. package/cpp/llama-batch.h +2 -1
  61. package/cpp/llama-chat.cpp +39 -16
  62. package/cpp/llama-chat.h +4 -2
  63. package/cpp/llama-context.cpp +440 -611
  64. package/cpp/llama-context.h +44 -33
  65. package/cpp/llama-cparams.h +1 -0
  66. package/cpp/llama-graph.cpp +214 -291
  67. package/cpp/llama-graph.h +69 -21
  68. package/cpp/llama-hparams.cpp +17 -1
  69. package/cpp/llama-hparams.h +39 -5
  70. package/cpp/llama-kv-cache.cpp +2067 -620
  71. package/cpp/llama-kv-cache.h +410 -108
  72. package/cpp/llama-memory.h +12 -1
  73. package/cpp/llama-model-loader.cpp +24 -15
  74. package/cpp/llama-model-saver.cpp +281 -0
  75. package/cpp/llama-model-saver.h +37 -0
  76. package/cpp/llama-model.cpp +1089 -359
  77. package/cpp/llama-model.h +19 -3
  78. package/cpp/llama-sampling.cpp +20 -7
  79. package/cpp/llama-vocab.cpp +54 -9
  80. package/cpp/llama-vocab.h +6 -0
  81. package/cpp/llama.cpp +14 -0
  82. package/cpp/llama.h +86 -142
  83. package/cpp/minja/chat-template.hpp +9 -5
  84. package/cpp/minja/minja.hpp +69 -36
  85. package/cpp/rn-llama.cpp +602 -190
  86. package/cpp/rn-llama.h +34 -8
  87. package/cpp/sampling.cpp +57 -50
  88. package/cpp/tools/mtmd/clip-impl.h +462 -0
  89. package/cpp/tools/mtmd/clip.cpp +4024 -0
  90. package/cpp/tools/mtmd/clip.h +101 -0
  91. package/cpp/tools/mtmd/miniaudio.h +93468 -0
  92. package/cpp/tools/mtmd/mtmd-audio.cpp +855 -0
  93. package/cpp/tools/mtmd/mtmd-audio.h +62 -0
  94. package/cpp/tools/mtmd/mtmd-helper.cpp +297 -0
  95. package/cpp/tools/mtmd/mtmd.cpp +942 -0
  96. package/cpp/tools/mtmd/mtmd.h +362 -0
  97. package/cpp/tools/mtmd/stb_image.h +7988 -0
  98. package/ios/CMakeLists.txt +20 -10
  99. package/ios/RNLlama.h +6 -0
  100. package/ios/RNLlama.mm +82 -3
  101. package/ios/RNLlamaContext.h +5 -1
  102. package/ios/RNLlamaContext.mm +131 -38
  103. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +2 -0
  104. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +29 -21
  105. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-backend.h +4 -4
  106. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpp.h +1 -1
  107. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +5 -0
  108. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +16 -9
  109. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  110. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-opt.h +49 -28
  111. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +82 -101
  112. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +9 -0
  113. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +2 -1
  114. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +4 -2
  115. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +44 -33
  116. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +1 -0
  117. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +69 -21
  118. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +39 -5
  119. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +410 -108
  120. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +12 -1
  121. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model-saver.h +37 -0
  122. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +19 -3
  123. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +6 -0
  124. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +86 -142
  125. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  126. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +69 -36
  127. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +34 -8
  128. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Info.plist +0 -0
  129. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  130. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
  131. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +2 -0
  132. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +29 -21
  133. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend.h +4 -4
  134. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +1 -1
  135. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +5 -0
  136. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +16 -9
  137. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  138. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-opt.h +49 -28
  139. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +82 -101
  140. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +9 -0
  141. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +2 -1
  142. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +4 -2
  143. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +44 -33
  144. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +1 -0
  145. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +69 -21
  146. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +39 -5
  147. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +410 -108
  148. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +12 -1
  149. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model-saver.h +37 -0
  150. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +19 -3
  151. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +6 -0
  152. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +86 -142
  153. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  154. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +69 -36
  155. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +34 -8
  156. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Info.plist +0 -0
  157. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/_CodeSignature/CodeResources +1 -1
  158. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  159. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  160. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +2 -0
  161. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +29 -21
  162. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-backend.h +4 -4
  163. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpp.h +1 -1
  164. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +5 -0
  165. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +16 -9
  166. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  167. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-opt.h +49 -28
  168. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +82 -101
  169. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +9 -0
  170. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +2 -1
  171. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +4 -2
  172. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +44 -33
  173. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +1 -0
  174. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +69 -21
  175. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +39 -5
  176. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +410 -108
  177. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +12 -1
  178. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model-saver.h +37 -0
  179. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +19 -3
  180. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +6 -0
  181. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +86 -142
  182. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  183. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +69 -36
  184. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +34 -8
  185. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Info.plist +0 -0
  186. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  187. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
  188. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +2 -0
  189. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +29 -21
  190. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend.h +4 -4
  191. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +1 -1
  192. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +5 -0
  193. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +16 -9
  194. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  195. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-opt.h +49 -28
  196. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +82 -101
  197. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +9 -0
  198. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +2 -1
  199. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +4 -2
  200. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +44 -33
  201. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +1 -0
  202. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +69 -21
  203. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +39 -5
  204. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +410 -108
  205. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +12 -1
  206. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model-saver.h +37 -0
  207. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +19 -3
  208. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +6 -0
  209. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +86 -142
  210. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  211. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +69 -36
  212. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +34 -8
  213. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Info.plist +0 -0
  214. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/_CodeSignature/CodeResources +1 -1
  215. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  216. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  217. package/jest/mock.js +33 -7
  218. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  219. package/lib/commonjs/index.js +153 -21
  220. package/lib/commonjs/index.js.map +1 -1
  221. package/lib/module/NativeRNLlama.js.map +1 -1
  222. package/lib/module/index.js +152 -20
  223. package/lib/module/index.js.map +1 -1
  224. package/lib/typescript/NativeRNLlama.d.ts +54 -4
  225. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  226. package/lib/typescript/index.d.ts +72 -6
  227. package/lib/typescript/index.d.ts.map +1 -1
  228. package/package.json +1 -1
  229. package/src/NativeRNLlama.ts +72 -4
  230. package/src/index.ts +212 -38
  231. package/cpp/binary-ops.h +0 -16
  232. package/cpp/ops.h +0 -128
  233. package/cpp/simd-mappings.h +0 -888
  234. package/cpp/unary-ops.h +0 -28
  235. package/cpp/vec.h +0 -802
  236. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/binary-ops.h +0 -16
  237. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  238. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  239. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  240. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  241. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ops.h +0 -128
  242. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/sgemm.h +0 -14
  243. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/simd-mappings.h +0 -888
  244. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/vec.h +0 -802
  245. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  246. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  247. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  248. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  249. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +0 -14
  250. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +0 -28
  251. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +0 -802
  252. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/binary-ops.h +0 -16
  253. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  254. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  255. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  256. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  257. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ops.h +0 -128
  258. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/sgemm.h +0 -14
  259. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/simd-mappings.h +0 -888
  260. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/unary-ops.h +0 -28
  261. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/binary-ops.h +0 -16
  262. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  263. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  264. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  265. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  266. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ops.h +0 -128
  267. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +0 -14
  268. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/simd-mappings.h +0 -888
  269. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +0 -28
  270. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +0 -802
  271. package/lib/commonjs/chat.js +0 -37
  272. package/lib/commonjs/chat.js.map +0 -1
  273. package/lib/module/chat.js +0 -33
  274. package/lib/module/chat.js.map +0 -1
  275. package/lib/typescript/chat.d.ts +0 -10
  276. package/lib/typescript/chat.d.ts.map +0 -1
  277. package/src/chat.ts +0 -44
  278. /package/cpp/{binary-ops.cpp → ggml-cpu/binary-ops.cpp} +0 -0
  279. /package/cpp/{ggml-cpu-aarch64.h → ggml-cpu/ggml-cpu-aarch64.h} +0 -0
  280. /package/cpp/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -0
  281. /package/cpp/{ggml-cpu-quants.h → ggml-cpu/ggml-cpu-quants.h} +0 -0
  282. /package/cpp/{ggml-cpu-traits.cpp → ggml-cpu/ggml-cpu-traits.cpp} +0 -0
  283. /package/cpp/{ggml-cpu-traits.h → ggml-cpu/ggml-cpu-traits.h} +0 -0
  284. /package/cpp/{sgemm.h → ggml-cpu/sgemm.h} +0 -0
  285. /package/cpp/{unary-ops.cpp → ggml-cpu/unary-ops.cpp} +0 -0
@@ -40,15 +40,18 @@ add_library(rnllama SHARED
  ${SOURCE_DIR}/ggml-alloc.c
  ${SOURCE_DIR}/ggml-backend.cpp
  ${SOURCE_DIR}/ggml-backend-reg.cpp
- ${SOURCE_DIR}/ggml-cpu.c
- ${SOURCE_DIR}/ggml-cpu.cpp
- ${SOURCE_DIR}/ops.cpp
- ${SOURCE_DIR}/unary-ops.cpp
- ${SOURCE_DIR}/binary-ops.cpp
- ${SOURCE_DIR}/vec.cpp
- ${SOURCE_DIR}/ggml-cpu-aarch64.cpp
- ${SOURCE_DIR}/ggml-cpu-quants.c
- ${SOURCE_DIR}/ggml-cpu-traits.cpp
+ ${SOURCE_DIR}/ggml-cpu/amx/amx.cpp
+ ${SOURCE_DIR}/ggml-cpu/amx/mmq.cpp
+ ${SOURCE_DIR}/ggml-cpu/ggml-cpu.c
+ ${SOURCE_DIR}/ggml-cpu/ggml-cpu.cpp
+ ${SOURCE_DIR}/ggml-cpu/ggml-cpu-aarch64.cpp
+ ${SOURCE_DIR}/ggml-cpu/ggml-cpu-quants.c
+ ${SOURCE_DIR}/ggml-cpu/ggml-cpu-traits.cpp
+ ${SOURCE_DIR}/ggml-cpu/unary-ops.cpp
+ ${SOURCE_DIR}/ggml-cpu/binary-ops.cpp
+ ${SOURCE_DIR}/ggml-cpu/sgemm.cpp
+ ${SOURCE_DIR}/ggml-cpu/vec.cpp
+ ${SOURCE_DIR}/ggml-cpu/ops.cpp
  ${SOURCE_DIR}/ggml-metal.m
  ${SOURCE_DIR}/ggml-opt.cpp
  ${SOURCE_DIR}/ggml-threading.cpp
@@ -70,6 +73,7 @@ add_library(rnllama SHARED
  ${SOURCE_DIR}/llama.cpp
  ${SOURCE_DIR}/llama-model.cpp
  ${SOURCE_DIR}/llama-model-loader.cpp
+ ${SOURCE_DIR}/llama-model-saver.cpp
  ${SOURCE_DIR}/llama-mmap.cpp
  ${SOURCE_DIR}/llama-vocab.cpp
  ${SOURCE_DIR}/llama-memory.cpp
@@ -78,13 +82,17 @@ add_library(rnllama SHARED
  ${SOURCE_DIR}/sampling.cpp
  ${SOURCE_DIR}/unicode-data.cpp
  ${SOURCE_DIR}/unicode.cpp
- ${SOURCE_DIR}/sgemm.cpp
  ${SOURCE_DIR}/common.cpp
  ${SOURCE_DIR}/chat.cpp
  ${SOURCE_DIR}/json-schema-to-grammar.cpp
  ${SOURCE_DIR}/minja/minja.hpp
  ${SOURCE_DIR}/minja/chat-template.hpp
  ${SOURCE_DIR}/json.hpp
+ # Multimodal support
+ ${SOURCE_DIR}/tools/mtmd/mtmd.cpp
+ ${SOURCE_DIR}/tools/mtmd/mtmd-audio.cpp
+ ${SOURCE_DIR}/tools/mtmd/clip.cpp
+ ${SOURCE_DIR}/tools/mtmd/mtmd-helper.cpp
  ${SOURCE_DIR}/rn-llama.cpp
  )

@@ -92,6 +100,8 @@ add_library(rnllama SHARED
  target_include_directories(rnllama
  PUBLIC
  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../cpp>
+ $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../cpp/ggml-cpu>
+ $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../cpp/tools/mtmd>
  $<INSTALL_INTERFACE:include>
  )

package/ios/RNLlama.h CHANGED
@@ -1,6 +1,12 @@
  #import <React/RCTEventEmitter.h>
  #import <React/RCTBridgeModule.h>

+ #if RNLLAMA_BUILD_FROM_SOURCE
+ #import "json.hpp"
+ #else
+ #import <rnllama/json.hpp>
+ #endif
+
  // TODO: Use RNLlamaSpec (Need to refactor NSDictionary usage)
  @interface RNLlama : RCTEventEmitter <RCTBridgeModule>

package/ios/RNLlama.mm CHANGED
@@ -108,8 +108,13 @@ RCT_EXPORT_METHOD(getFormattedChat:(double)contextId
  } else {
  resolve([context getFormattedChat:messages withChatTemplate:chatTemplate]);
  }
+ } catch (const nlohmann::json_abi_v3_11_3::detail::parse_error& e) {
+ NSString *errorMessage = [NSString stringWithUTF8String:e.what()];
+ reject(@"llama_error", [NSString stringWithFormat:@"JSON parse error in getFormattedChat: %@", errorMessage], nil);
  } catch (const std::exception& e) { // catch cpp exceptions
  reject(@"llama_error", [NSString stringWithUTF8String:e.what()], nil);
+ } catch (...) {
+ reject(@"llama_error", @"Unknown error in getFormattedChat", nil);
  }
  }

@@ -229,6 +234,7 @@ RCT_EXPORT_METHOD(stopCompletion:(double)contextId

  RCT_EXPORT_METHOD(tokenizeASync:(double)contextId
  text:(NSString *)text
+ imagePaths:(NSArray *)imagePaths
  withResolver:(RCTPromiseResolveBlock)resolve
  withRejecter:(RCTPromiseRejectBlock)reject)
  {
@@ -237,9 +243,13 @@ RCT_EXPORT_METHOD(tokenizeASync:(double)contextId
  reject(@"llama_error", @"Context not found", nil);
  return;
  }
- NSMutableArray *tokens = [context tokenize:text];
- resolve(@{ @"tokens": tokens });
- [tokens release];
+ @try {
+ NSMutableDictionary *result = [context tokenize:text imagePaths:imagePaths];
+ resolve(result);
+ [result release];
+ } @catch (NSException *exception) {
+ reject(@"llama_error", exception.reason, nil);
+ }
  }

  RCT_EXPORT_BLOCKING_SYNCHRONOUS_METHOD(tokenizeSync:(double)contextId
@@ -355,6 +365,75 @@ RCT_EXPORT_METHOD(getLoadedLoraAdapters:(double)contextId
  resolve([context getLoadedLoraAdapters]);
  }

+ RCT_EXPORT_METHOD(initMultimodal:(double)contextId
+ withParams:(NSDictionary *)params
+ withResolver:(RCTPromiseResolveBlock)resolve
+ withRejecter:(RCTPromiseRejectBlock)reject)
+ {
+ RNLlamaContext *context = llamaContexts[[NSNumber numberWithDouble:contextId]];
+ if (context == nil) {
+ reject(@"llama_error", @"Context not found", nil);
+ return;
+ }
+ if ([context isPredicting]) {
+ reject(@"llama_error", @"Context is busy", nil);
+ return;
+ }
+
+ @try {
+ bool success = [context initMultimodal:params];
+ resolve(@(success));
+ } @catch (NSException *exception) {
+ reject(@"llama_cpp_error", exception.reason, nil);
+ }
+ }
+
+ RCT_EXPORT_METHOD(isMultimodalEnabled:(double)contextId
+ withResolver:(RCTPromiseResolveBlock)resolve
+ withRejecter:(RCTPromiseRejectBlock)reject)
+ {
+ RNLlamaContext *context = llamaContexts[[NSNumber numberWithDouble:contextId]];
+ if (context == nil) {
+ reject(@"llama_error", @"Context not found", nil);
+ return;
+ }
+
+ resolve(@([context isMultimodalEnabled]));
+ }
+
+ RCT_EXPORT_METHOD(getMultimodalSupport:(double)contextId
+ withResolver:(RCTPromiseResolveBlock)resolve
+ withRejecter:(RCTPromiseRejectBlock)reject)
+ {
+ RNLlamaContext *context = llamaContexts[[NSNumber numberWithDouble:contextId]];
+ if (context == nil) {
+ reject(@"llama_error", @"Context not found", nil);
+ return;
+ }
+
+ if (![context isMultimodalEnabled]) {
+ reject(@"llama_error", @"Multimodal is not enabled", nil);
+ return;
+ }
+
+ NSDictionary *multimodalSupport = [context getMultimodalSupport];
+ resolve(multimodalSupport);
+ }
+
+ RCT_EXPORT_METHOD(releaseMultimodal:(double)contextId
+ withResolver:(RCTPromiseResolveBlock)resolve
+ withRejecter:(RCTPromiseRejectBlock)reject)
+ {
+ RNLlamaContext *context = llamaContexts[[NSNumber numberWithDouble:contextId]];
+ if (context == nil) {
+ reject(@"llama_error", @"Context not found", nil);
+ return;
+ }
+
+ [context releaseMultimodal];
+ resolve(nil);
+ }
+
  RCT_EXPORT_METHOD(releaseContext:(double)contextId
  withResolver:(RCTPromiseResolveBlock)resolve
  withRejecter:(RCTPromiseRejectBlock)reject)
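The four native methods above (initMultimodal, isMultimodalEnabled, getMultimodalSupport, releaseMultimodal) are the iOS half of the new multimodal surface; they are presumably mirrored on the Android side (RNLlama.java, jni.cpp) and exposed through the TypeScript wrapper (src/index.ts, +212 lines). A rough TypeScript sketch of how an app might drive them — method and option names are taken from the native signatures above, not from the TS source, so treat them as assumptions:

```ts
import { initLlama } from 'cui-llama.rn'

async function setupVision(modelPath: string, mmprojPath: string) {
  // Context options and method names mirror the native methods above;
  // treat them as assumptions rather than the published API.
  const context = await initLlama({ model: modelPath, n_ctx: 4096 })

  // Maps to initMultimodal:withParams: ({ path, use_gpu }); resolves to a boolean
  const ok = await context.initMultimodal({ path: mmprojPath, use_gpu: true })
  if (!ok) throw new Error('Failed to load mmproj')

  // Maps to getMultimodalSupport:, which resolves { vision, audio }
  const support = await context.getMultimodalSupport()
  console.log('vision:', support.vision, 'audio:', support.audio)

  // Maps to releaseMultimodal:; release() then frees the context itself
  await context.releaseMultimodal()
  await context.release()
}
```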
@@ -34,9 +34,13 @@
  - (NSDictionary *)modelInfo;
  - (bool)isModelLoaded;
  - (bool)isPredicting;
+ - (bool)initMultimodal:(NSDictionary *)params;
+ - (NSDictionary *)getMultimodalSupport;
+ - (bool)isMultimodalEnabled;
+ - (void)releaseMultimodal;
  - (NSDictionary *)completion:(NSDictionary *)params onToken:(void (^)(NSMutableDictionary *tokenResult))onToken;
  - (void)stopCompletion;
- - (NSArray *)tokenize:(NSString *)text;
+ - (NSDictionary *)tokenize:(NSString *)text imagePaths:(NSArray *)imagePaths;
  - (NSString *)detokenize:(NSArray *)tokens;
  - (NSDictionary *)embedding:(NSString *)text params:(NSDictionary *)params;
  - (NSDictionary *)getFormattedChatWithJinja:(NSString *)messages
@@ -82,7 +82,7 @@
  BOOL isAsset = [params[@"is_model_asset"] boolValue];
  NSString *path = modelPath;
  if (isAsset) path = [[NSBundle mainBundle] pathForResource:modelPath ofType:nil];
- defaultParams.model = {[path UTF8String]};
+ defaultParams.model.path = [path UTF8String];

  NSString *chatTemplate = params[@"chat_template"];
  if (chatTemplate) {
@@ -106,37 +106,27 @@
  NSString *reasonNoMetal = @"";
  defaultParams.n_gpu_layers = 0;
  #ifdef LM_GGML_USE_METAL
- // Check ggml-metal availability
- NSError * error = nil;
  id<MTLDevice> device = MTLCreateSystemDefaultDevice();
- id<MTLLibrary> library = [device
- newLibraryWithSource:@"#include <metal_stdlib>\n"
- "using namespace metal;"
- "typedef matrix<bfloat, 4, 4> bfloat4x4;"
- "kernel void test() { simd_sum(0); }"
- options:nil
- error:&error
- ];
- if (error) {
- reasonNoMetal = [error localizedDescription];
+
+ // Check ggml-metal availability
+ BOOL supportsGgmlMetal = [device supportsFamily:MTLGPUFamilyApple7];
+ if (@available(iOS 16.0, tvOS 16.0, *)) {
+ supportsGgmlMetal = supportsGgmlMetal && [device supportsFamily:MTLGPUFamilyMetal3];
+ }
+ if (!supportsGgmlMetal) {
+ reasonNoMetal = @"Metal is not supported in this device";
  skipGpuDevices = true;
- } else {
- id<MTLFunction> kernel = [library newFunctionWithName:@"test"];
- id<MTLComputePipelineState> pipeline = [device newComputePipelineStateWithFunction:kernel error:&error];
- if (pipeline == nil) {
- reasonNoMetal = [error localizedDescription];
- skipGpuDevices = true;
- } else {
+ }
+
  #if TARGET_OS_SIMULATOR
- // Use the backend, but no layers because not supported fully on simulator
- defaultParams.n_gpu_layers = 0;
- isMetalEnabled = true;
+ // Use the backend, but no layers because not supported fully on simulator
+ defaultParams.n_gpu_layers = 0;
+ isMetalEnabled = true;
  #else
- defaultParams.n_gpu_layers = [params[@"n_gpu_layers"] intValue];
- isMetalEnabled = true;
+ defaultParams.n_gpu_layers = [params[@"n_gpu_layers"] intValue];
+ isMetalEnabled = true;
  #endif
- }
- }
+
  device = nil;
  #else
  reasonNoMetal = @"Metal is not enabled in this build";
@@ -158,6 +148,8 @@
  }
  if (cpu_devs.size() > 0) {
  defaultParams.devices = cpu_devs;
+ defaultParams.n_gpu_layers = 0;
+ isMetalEnabled = false;
  }
  }

@@ -184,6 +176,8 @@

  if (params[@"flash_attn"] && [params[@"flash_attn"] boolValue]) defaultParams.flash_attn = true;

+ if (params[@"ctx_shift"]) defaultParams.ctx_shift = [params[@"ctx_shift"] boolValue];
+
  if (params[@"cache_type_k"]) defaultParams.cache_type_k = rnllama::kv_cache_type_from_str([params[@"cache_type_k"] UTF8String]);
  if (params[@"cache_type_v"]) defaultParams.cache_type_v = rnllama::kv_cache_type_from_str([params[@"cache_type_v"] UTF8String]);

@@ -338,6 +332,30 @@
  return llama->is_predicting;
  }

+ - (bool)initMultimodal:(NSDictionary *)params {
+ NSString *mmproj_path = params[@"path"];
+ BOOL use_gpu = params[@"use_gpu"] ? [params[@"use_gpu"] boolValue] : true;
+ return llama->initMultimodal([mmproj_path UTF8String], use_gpu);
+ }
+
+ - (NSDictionary *)getMultimodalSupport {
+ if (!is_model_loaded) return nil;
+ return @{
+ @"vision": @(llama->isMultimodalSupportVision()),
+ @"audio": @(llama->isMultimodalSupportAudio())
+ };
+ }
+
+ - (bool)isMultimodalEnabled {
+ if (!is_model_loaded) return false;
+ return llama->isMultimodalEnabled();
+ }
+
+ - (void)releaseMultimodal {
+ if (!is_model_loaded) return;
+ llama->releaseMultimodal();
+ }
+
  - (NSDictionary *)getFormattedChatWithJinja:(NSString *)messages
  withChatTemplate:(NSString *)chatTemplate
  withJsonSchema:(NSString *)jsonSchema
@@ -566,8 +584,32 @@
  if (!llama->initSampling()) {
  @throw [NSException exceptionWithName:@"LlamaException" reason:@"Failed to initialize sampling" userInfo:nil];
  }
+
  llama->beginCompletion();
- llama->loadPrompt();
+ try {
+ // Use the unified loadPrompt function with image paths if available
+ NSArray *imagePaths = params[@"media_paths"];
+ if (imagePaths && [imagePaths count] > 0) {
+ // Multiple image paths
+ std::vector<std::string> media_paths_vector;
+ for (NSString *path in imagePaths) {
+ if ([path isKindOfClass:[NSString class]]) {
+ media_paths_vector.push_back([path UTF8String]);
+ }
+ }
+ llama->loadPrompt(media_paths_vector);
+ } else {
+ llama->loadPrompt({});
+ }
+ } catch (const std::exception &e) {
+ llama->endCompletion();
+ @throw [NSException exceptionWithName:@"LlamaException" reason:[NSString stringWithUTF8String:e.what()] userInfo:nil];
+ }
+
+ if (llama->context_full) {
+ llama->endCompletion();
+ @throw [NSException exceptionWithName:@"LlamaException" reason:@"Context is full" userInfo:nil];
+ }

  size_t sent_count = 0;
  size_t sent_token_probs_index = 0;
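The completion path now forwards params[@"media_paths"] into the unified loadPrompt() and aborts with "Context is full" when llama->context_full is set; the completion result (see the hunk below) also gains a context_full flag. A hedged TypeScript sketch of the corresponding JS call — the exact wrapper signature in src/index.ts is not shown in this excerpt:

```ts
// Assumes `context` came from initLlama() and initMultimodal() as sketched above.
async function describeImage(context: any, imagePath: string) {
  const result = await context.completion(
    {
      messages: [{ role: 'user', content: 'Describe this image.' }],
      media_paths: [imagePath], // forwarded to the native media_paths handling above
      n_predict: 256,
    },
    (data: any) => {
      console.log(data.token) // streaming callback, one partial token at a time
    },
  )

  // New in this release: the result reports whether the context window filled up.
  if (result.context_full) {
    console.warn('Prompt plus generation exceeded the context window')
  }
  return result.text
}
```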
@@ -628,7 +670,7 @@
  }

  llama_perf_context_print(llama->ctx);
- llama->is_predicting = false;
+ llama->endCompletion();

  const auto timings = llama_perf_context(llama->ctx);

@@ -655,7 +697,7 @@
  }];
  }
  } catch (const std::exception &e) {
- // NSLog(@"Error parsing tool calls: %s", e.what());
+ } catch (...) {
  }
  }

@@ -668,6 +710,7 @@
  result[@"tokens_predicted"] = @(llama->num_tokens_predicted);
  result[@"tokens_evaluated"] = @(llama->num_prompt_tokens);
  result[@"truncated"] = @(llama->truncated);
+ result[@"context_full"] = @(llama->context_full);
  result[@"stopped_eos"] = @(llama->stopped_eos);
  result[@"stopped_word"] = @(llama->stopped_word);
  result[@"stopped_limit"] = @(llama->stopped_limit);
@@ -691,13 +734,48 @@
  llama->is_interrupted = true;
  }

- - (NSArray *)tokenize:(NSString *)text {
- const std::vector<llama_token> toks = common_tokenize(llama->ctx, [text UTF8String], false);
- NSMutableArray *result = [[NSMutableArray alloc] init];
- for (llama_token tok : toks) {
- [result addObject:@(tok)];
+ - (NSDictionary *)tokenize:(NSString *)text imagePaths:(NSArray *)imagePaths {
+ std::vector<std::string> media_paths_vector;
+ if (imagePaths && [imagePaths count] > 0) {
+ for (NSString *path in imagePaths) {
+ if ([path isKindOfClass:[NSString class]]) {
+ media_paths_vector.push_back([path UTF8String]);
+ }
+ }
+ }
+ try {
+ rnllama::llama_rn_tokenize_result tokenize_result = llama->tokenize([text UTF8String], media_paths_vector);
+
+ NSMutableDictionary *result = [[NSMutableDictionary alloc] init];
+
+ result[@"tokens"] = [NSMutableArray arrayWithCapacity:tokenize_result.tokens.size()];
+ for (llama_token tok : tokenize_result.tokens) {
+ [result[@"tokens"] addObject:@(tok)];
+ }
+ result[@"has_media"] = @(tokenize_result.has_media);
+
+ NSMutableArray *bitmap_hashes = [[NSMutableArray alloc] init];
+ for (std::string hash : tokenize_result.bitmap_hashes) {
+ [bitmap_hashes addObject:[NSString stringWithUTF8String:hash.c_str()]];
+ }
+ result[@"bitmap_hashes"] = bitmap_hashes;
+
+ NSMutableArray *chunk_pos = [[NSMutableArray alloc] init];
+ for (int pos : tokenize_result.chunk_pos) {
+ [chunk_pos addObject:@(pos)];
+ }
+ result[@"chunk_pos"] = chunk_pos;
+
+ NSMutableArray *chunk_pos_media = [[NSMutableArray alloc] init];
+ for (int pos : tokenize_result.chunk_pos_media) {
+ [chunk_pos_media addObject:@(pos)];
+ }
+ result[@"chunk_pos_media"] = chunk_pos_media;
+
+ return result;
+ } catch (const std::exception &e) {
+ @throw [NSException exceptionWithName:@"LlamaException" reason:[NSString stringWithUTF8String:e.what()] userInfo:nil];
  }
- return result;
  }

  - (NSString *)detokenize:(NSArray *)tokens {
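tokenize now returns a dictionary instead of a flat token array: tokens plus has_media, bitmap_hashes, chunk_pos, and chunk_pos_media. A hedged TypeScript sketch of the result shape and a hypothetical call — field meanings are inferred from the keys built above, and the real types live in src/NativeRNLlama.ts, which this excerpt does not show:

```ts
// Result shape inferred from the NSDictionary keys built in the native method above.
interface TokenizeResult {
  tokens: number[]
  has_media: boolean
  bitmap_hashes: string[]
  chunk_pos: number[]
  chunk_pos_media: number[]
}

// Hypothetical call; the real wrapper signature lives in src/index.ts.
async function inspectPrompt(context: any, text: string, mediaPaths: string[]) {
  const res: TokenizeResult = await context.tokenize(text, { media_paths: mediaPaths })
  console.log(`${res.tokens.length} tokens, media attached: ${res.has_media}`)
  return res
}
```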
@@ -734,7 +812,12 @@
  @throw [NSException exceptionWithName:@"LlamaException" reason:@"Failed to initialize sampling" userInfo:nil];
  }
  llama->beginCompletion();
- llama->loadPrompt();
+ try {
+ llama->loadPrompt({});
+ } catch (const std::exception &e) {
+ llama->endCompletion();
+ @throw [NSException exceptionWithName:@"LlamaException" reason:[NSString stringWithUTF8String:e.what()] userInfo:nil];
+ }
  llama->doCompletion();

  std::vector<float> result = llama->getEmbedding(embdParams);
@@ -751,7 +834,7 @@
  }
  resultDict[@"prompt_tokens"] = promptTokens;

- llama->is_predicting = false;
+ llama->endCompletion();
  return resultDict;
  }

@@ -769,6 +852,11 @@
  @throw [NSException exceptionWithName:@"LlamaException" reason:@"Failed to load session" userInfo:nil];
  }
  llama->embd.resize(n_token_count_out);
+ // Find LLAMA_TOKEN_NULL in the tokens and resize the array to the index of the null token
+ auto null_token_iter = std::find(llama->embd.begin(), llama->embd.end(), LLAMA_TOKEN_NULL);
+ if (null_token_iter != llama->embd.end()) {
+ llama->embd.resize(std::distance(llama->embd.begin(), null_token_iter));
+ }
  const std::string text = rnllama::tokens_to_str(llama->ctx, llama->embd.cbegin(), llama->embd.cend());
  return @{
  @"tokens_loaded": @(n_token_count_out),
@@ -781,6 +869,11 @@
  @throw [NSException exceptionWithName:@"LlamaException" reason:@"Session path is empty" userInfo:nil];
  }
  std::vector<llama_token> session_tokens = llama->embd;
+ // Find LLAMA_TOKEN_NULL in the tokens and resize the array to the index of the null token
+ auto null_token_iter = std::find(session_tokens.begin(), session_tokens.end(), LLAMA_TOKEN_NULL);
+ if (null_token_iter != session_tokens.end()) {
+ session_tokens.resize(std::distance(session_tokens.begin(), null_token_iter));
+ }
  int default_size = session_tokens.size();
  int save_size = size > 0 && size <= default_size ? size : default_size;
  if (!llama_state_save_file(llama->ctx, [path UTF8String], session_tokens.data(), save_size)) {
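Both session methods above now truncate the token buffer at the first LLAMA_TOKEN_NULL before detokenizing or writing it to disk. A hedged TypeScript sketch of session persistence from JS — saveSession/loadSession follow llama.rn's public API and may differ slightly in this fork:

```ts
// Hypothetical usage; the native code above trims the token list at the
// first LLAMA_TOKEN_NULL before saving or detokenizing it.
async function persistSession(context: any, sessionPath: string) {
  const tokensSaved = await context.saveSession(sessionPath, { tokenSize: -1 })
  console.log('saved tokens:', tokensSaved)

  const { tokens_loaded, prompt } = await context.loadSession(sessionPath)
  console.log('restored', tokens_loaded, 'tokens; prompt preview:', prompt.slice(0, 80))
}
```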
@@ -3,6 +3,7 @@
  #pragma once

  #include "common.h"
+ #include <chrono>
  #include <string>
  #include <vector>
  #include "minja/chat-template.hpp"
@@ -79,6 +80,7 @@ struct common_chat_templates_inputs {
  common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
  bool parallel_tool_calls = false;
  bool extract_reasoning = true;
+ std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
  };

  struct common_chat_params {
@@ -6,6 +6,7 @@

  #include <set>
  #include <string>
+ #include <string_view>
  #include <vector>
  #include <sstream>

@@ -77,7 +78,6 @@ enum llama_example {
  LLAMA_EXAMPLE_COMMON,
  LLAMA_EXAMPLE_SPECULATIVE,
  LLAMA_EXAMPLE_MAIN,
- LLAMA_EXAMPLE_INFILL,
  LLAMA_EXAMPLE_EMBEDDING,
  LLAMA_EXAMPLE_PERPLEXITY,
  LLAMA_EXAMPLE_RETRIEVAL,
@@ -87,7 +87,7 @@ enum llama_example {
  LLAMA_EXAMPLE_SERVER,
  LLAMA_EXAMPLE_CVECTOR_GENERATOR,
  LLAMA_EXAMPLE_EXPORT_LORA,
- LLAMA_EXAMPLE_LLAVA,
+ LLAMA_EXAMPLE_MTMD,
  LLAMA_EXAMPLE_LOOKUP,
  LLAMA_EXAMPLE_PARALLEL,
  LLAMA_EXAMPLE_TTS,
@@ -107,6 +107,7 @@ enum common_sampler_type {
  COMMON_SAMPLER_TYPE_XTC = 8,
  COMMON_SAMPLER_TYPE_INFILL = 9,
  COMMON_SAMPLER_TYPE_PENALTIES = 10,
+ COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
  };

  // dimensionality reduction methods, used by cvector-generator
@@ -172,6 +173,7 @@ struct common_params_sampling {
  std::vector<enum common_sampler_type> samplers = {
  COMMON_SAMPLER_TYPE_PENALTIES,
  COMMON_SAMPLER_TYPE_DRY,
+ COMMON_SAMPLER_TYPE_TOP_N_SIGMA,
  COMMON_SAMPLER_TYPE_TOP_K,
  COMMON_SAMPLER_TYPE_TYPICAL_P,
  COMMON_SAMPLER_TYPE_TOP_P,
@@ -336,17 +338,17 @@ struct common_params {
  bool flash_attn = false; // flash attention
  bool no_perf = false; // disable performance metrics
  bool ctx_shift = true; // context shift on inifinite text generation
+ bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)

  bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
- bool logits_all = false; // return logits for all tokens in the batch
  bool use_mmap = true; // use mmap for faster loads
  bool use_mlock = false; // use mlock to keep model in memory
  bool verbose_prompt = false; // print prompt tokens before generation
  bool display_prompt = true; // print prompt before generation
- bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
  bool no_kv_offload = false; // disable KV offloading
  bool warmup = true; // warmup run
  bool check_tensors = false; // validate tensor data
+ bool no_op_offload = false; // globally disable offload host tensor operations to device

  bool single_turn = false; // single turn chat conversation

@@ -355,8 +357,10 @@ struct common_params {

  common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;

- // multimodal models (see examples/llava)
+ // multimodal models (see tools/mtmd)
  struct common_params_model mmproj;
+ bool mmproj_use_gpu = true; // use GPU for multimodal model
+ bool no_mmproj = false; // explicitly disable multimodal model
  std::vector<std::string> image; // path to image file(s)

  // embedding
@@ -379,6 +383,7 @@ struct common_params {
  bool use_jinja = false; // NOLINT
  bool enable_chat_template = true;
  common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+ bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response

  std::vector<std::string> api_keys;

@@ -422,13 +427,14 @@ struct common_params {

  bool process_output = false; // collect data for the output tensor
  bool compute_ppl = true; // whether to compute perplexity
+ bool parse_special = false; // whether to parse special tokens during imatrix tokenization

  // cvector-generator params
  int n_pca_batch = 100;
  int n_pca_iterations = 1000;
  dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
- std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
- std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
+ std::string cvector_positive_file = "tools/cvector-generator/positive.txt";
+ std::string cvector_negative_file = "tools/cvector-generator/negative.txt";

  bool spm_infill = false; // suffix/prefix/middle pattern for infill

@@ -437,6 +443,11 @@ struct common_params {

  // common params
  std::string out_file; // output filename for all example programs
+ // optional callback for model loading progress and cancellation:
+ // called with a progress value between 0.0 and 1.0.
+ // return false from callback to abort model loading or true to continue
+ llama_progress_callback load_progress_callback = NULL;
+ void * load_progress_callback_user_data = NULL;
  };

  // call once at the start of a program if it uses libcommon
@@ -514,10 +525,9 @@ static bool string_starts_with(const std::string & str,
  return str.rfind(prefix, 0) == 0;
  }

- static bool string_ends_with(const std::string & str,
- const std::string & suffix) { // While we wait for C++20's std::string::ends_with...
- return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
- }
+ // While we wait for C++20's std::string::ends_with...
+ bool string_ends_with(const std::string_view & str, const std::string_view & suffix);
+ size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop);

  bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
  void string_process_escapes(std::string & input);
@@ -558,6 +568,8 @@ struct lm_ggml_threadpool_params lm_ggml_threadpool_params_from_cpu_params(const
  // clear LoRA adapters from context, then apply new list of adapters
  void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);

+ std::string get_model_endpoint();
+
  //
  // Batch utils
  //
@@ -624,16 +636,6 @@ std::string common_detokenize(
  const std::vector<llama_token> & tokens,
  bool special = true);

- //
- // KV cache utils
- //
-
- // Dump the KV cache view with the number of sequences per cell.
- void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
-
- // Dump the KV cache view showing individual sequences in each cell (long output).
- void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
-
  //
  // Embedding utils
  //
@@ -675,3 +677,9 @@ const char * const LLM_KV_SPLIT_COUNT = "split.count";
  const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";

  }
+
+ //
+ // training utils
+ //
+
+ lm_ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);