@novastera-oss/llamarn 0.1.5-beta.3 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (274)
  1. package/README.md +80 -14
  2. package/RNLlamaCpp.podspec +10 -3
  3. package/android/CMakeLists.txt +8 -0
  4. package/android/src/main/cpp/include/llama.h +62 -125
  5. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  10. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  11. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  13. package/cpp/LlamaCppModel.cpp +148 -5
  14. package/cpp/LlamaCppModel.h +11 -2
  15. package/cpp/PureCppImpl.cpp +3 -3
  16. package/cpp/PureCppImpl.h +3 -0
  17. package/cpp/build-info.cpp +2 -2
  18. package/cpp/llama.cpp/README.md +11 -3
  19. package/cpp/llama.cpp/build-xcframework.sh +1 -0
  20. package/cpp/llama.cpp/common/CMakeLists.txt +8 -2
  21. package/cpp/llama.cpp/common/arg.cpp +153 -113
  22. package/cpp/llama.cpp/common/chat-parser.cpp +379 -0
  23. package/cpp/llama.cpp/common/chat-parser.h +117 -0
  24. package/cpp/llama.cpp/common/chat.cpp +847 -699
  25. package/cpp/llama.cpp/common/chat.h +73 -6
  26. package/cpp/llama.cpp/common/common.cpp +50 -82
  27. package/cpp/llama.cpp/common/common.h +21 -17
  28. package/cpp/llama.cpp/common/json-partial.cpp +255 -0
  29. package/cpp/llama.cpp/common/json-partial.h +37 -0
  30. package/cpp/llama.cpp/common/minja/chat-template.hpp +9 -5
  31. package/cpp/llama.cpp/common/minja/minja.hpp +69 -36
  32. package/cpp/llama.cpp/common/regex-partial.cpp +204 -0
  33. package/cpp/llama.cpp/common/regex-partial.h +56 -0
  34. package/cpp/llama.cpp/common/sampling.cpp +7 -8
  35. package/cpp/llama.cpp/convert_hf_to_gguf.py +453 -118
  36. package/cpp/llama.cpp/convert_hf_to_gguf_update.py +120 -68
  37. package/cpp/llama.cpp/ggml/CMakeLists.txt +2 -1
  38. package/cpp/llama.cpp/ggml/cmake/common.cmake +25 -0
  39. package/cpp/llama.cpp/ggml/include/ggml-opt.h +49 -28
  40. package/cpp/llama.cpp/ggml/include/ggml.h +26 -7
  41. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +16 -10
  42. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +4 -1
  43. package/cpp/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +1 -0
  44. package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +2 -0
  45. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +604 -0
  46. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +42 -0
  47. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +54 -2
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +50 -51
  49. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -2
  50. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +5 -9
  51. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +779 -19
  52. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +22 -0
  53. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
  54. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
  55. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
  56. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +322 -100
  57. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +117 -1
  58. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +85 -16
  59. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +220 -49
  60. package/cpp/llama.cpp/ggml/src/ggml-cuda/acc.cu +40 -26
  61. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +1 -1
  62. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +11 -1
  63. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +15 -7
  64. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +266 -64
  65. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +49 -4
  66. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +48 -4
  67. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +2 -1
  68. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +5 -1
  69. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +2 -0
  70. package/cpp/llama.cpp/ggml/src/ggml-cuda/quantize.cu +7 -6
  71. package/cpp/llama.cpp/ggml/src/ggml-cuda/sum.cu +1 -1
  72. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +10 -0
  73. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +2 -0
  74. package/cpp/llama.cpp/ggml/src/ggml-impl.h +1 -1
  75. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +4 -0
  76. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +99 -17
  77. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +200 -2
  78. package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
  79. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cu +112 -0
  80. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +12 -0
  81. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +6 -0
  82. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +972 -178
  83. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  84. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/div.cl +72 -0
  85. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
  86. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  87. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
  88. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
  89. package/cpp/llama.cpp/ggml/src/ggml-opt.cpp +373 -190
  90. package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
  91. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -10
  92. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +101 -5
  93. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +31 -33
  94. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +1 -0
  95. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +29 -2
  96. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +4 -5
  97. package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  98. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +9 -1
  99. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +84 -72
  100. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +2 -0
  101. package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  102. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +1 -3
  103. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +324 -129
  104. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +1 -0
  105. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +31 -2
  106. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +95 -68
  107. package/cpp/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +1 -0
  108. package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +22 -0
  109. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -2
  110. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +1 -4
  111. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +2 -3
  112. package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +69 -43
  113. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +2 -14
  114. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -91
  115. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +432 -181
  116. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +17 -0
  117. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
  118. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +6 -152
  119. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +162 -0
  120. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +360 -0
  121. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +2 -118
  122. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +1 -1
  123. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +12 -1
  124. package/cpp/llama.cpp/ggml/src/ggml.c +107 -36
  125. package/cpp/llama.cpp/ggml/src/gguf.cpp +33 -33
  126. package/cpp/llama.cpp/gguf-py/gguf/constants.py +100 -15
  127. package/cpp/llama.cpp/gguf-py/gguf/gguf_reader.py +1 -1
  128. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +44 -12
  129. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_editor_gui.py +21 -10
  130. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_new_metadata.py +5 -2
  131. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +128 -31
  132. package/cpp/llama.cpp/gguf-py/gguf/utility.py +1 -1
  133. package/cpp/llama.cpp/gguf-py/pyproject.toml +1 -1
  134. package/cpp/llama.cpp/include/llama.h +62 -125
  135. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +1 -1
  136. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +1 -1
  137. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.inp +1 -1
  138. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.out +1 -1
  139. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +1 -1
  140. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +1 -1
  141. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +1 -1
  142. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +1 -1
  143. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.inp +1 -1
  144. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.out +1 -1
  145. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +1 -1
  146. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +1 -1
  147. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +1 -1
  148. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +1 -1
  149. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +1 -1
  150. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +1 -1
  151. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.inp +1 -1
  152. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.out +1 -1
  153. package/cpp/llama.cpp/models/ggml-vocab-nomic-bert-moe.gguf +0 -0
  154. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +1 -1
  155. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.out +1 -1
  156. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +1 -1
  157. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.out +1 -1
  158. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.inp +1 -1
  159. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.out +1 -1
  160. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +1 -1
  161. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.out +1 -1
  162. package/cpp/llama.cpp/models/templates/Qwen-QwQ-32B.jinja +62 -0
  163. package/cpp/llama.cpp/models/templates/Qwen-Qwen3-0.6B.jinja +85 -0
  164. package/cpp/llama.cpp/models/templates/README.md +2 -0
  165. package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
  166. package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
  167. package/cpp/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  168. package/cpp/llama.cpp/requirements/requirements-gguf_editor_gui.txt +1 -1
  169. package/cpp/llama.cpp/src/CMakeLists.txt +2 -0
  170. package/cpp/llama.cpp/src/llama-arch.cpp +6 -0
  171. package/cpp/llama.cpp/src/llama-arch.h +2 -0
  172. package/cpp/llama.cpp/src/llama-batch.cpp +3 -1
  173. package/cpp/llama.cpp/src/llama-context.cpp +340 -123
  174. package/cpp/llama.cpp/src/llama-context.h +30 -0
  175. package/cpp/llama.cpp/src/llama-cparams.cpp +4 -0
  176. package/cpp/llama.cpp/src/llama-cparams.h +2 -0
  177. package/cpp/llama.cpp/src/llama-grammar.cpp +12 -2
  178. package/cpp/llama.cpp/src/llama-graph.cpp +157 -247
  179. package/cpp/llama.cpp/src/llama-graph.h +52 -7
  180. package/cpp/llama.cpp/src/llama-hparams.cpp +17 -1
  181. package/cpp/llama.cpp/src/llama-hparams.h +37 -5
  182. package/cpp/llama.cpp/src/llama-kv-cache.cpp +742 -481
  183. package/cpp/llama.cpp/src/llama-kv-cache.h +196 -99
  184. package/cpp/llama.cpp/src/llama-kv-cells.h +379 -0
  185. package/cpp/llama.cpp/src/llama-memory.h +4 -3
  186. package/cpp/llama.cpp/src/llama-model-loader.cpp +22 -17
  187. package/cpp/llama.cpp/src/llama-model-saver.cpp +281 -0
  188. package/cpp/llama.cpp/src/llama-model-saver.h +37 -0
  189. package/cpp/llama.cpp/src/llama-model.cpp +529 -172
  190. package/cpp/llama.cpp/src/llama-model.h +6 -1
  191. package/cpp/llama.cpp/src/llama-quant.cpp +15 -13
  192. package/cpp/llama.cpp/src/llama-sampling.cpp +2 -2
  193. package/cpp/llama.cpp/src/llama-vocab.cpp +35 -8
  194. package/cpp/llama.cpp/src/llama-vocab.h +6 -0
  195. package/cpp/llama.cpp/src/llama.cpp +14 -0
  196. package/cpp/rn-completion.cpp +4 -2
  197. package/ios/include/chat.h +73 -6
  198. package/ios/include/common/minja/chat-template.hpp +9 -5
  199. package/ios/include/common/minja/minja.hpp +69 -36
  200. package/ios/include/common.h +21 -17
  201. package/ios/include/llama.h +62 -125
  202. package/ios/libs/llama.xcframework/Info.plist +19 -19
  203. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  204. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4617 -4487
  205. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-opt.h +237 -0
  206. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +26 -7
  207. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +62 -125
  208. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  209. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  210. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4638 -4508
  211. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3557 -3435
  212. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
  213. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
  214. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
  215. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  216. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  217. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4638 -4508
  218. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3559 -3437
  219. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +237 -0
  220. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +26 -7
  221. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +62 -125
  222. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +237 -0
  223. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +26 -7
  224. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +62 -125
  225. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  226. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +237 -0
  227. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +26 -7
  228. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +62 -125
  229. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  230. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  231. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  232. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4616 -4487
  233. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +237 -0
  234. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +26 -7
  235. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +62 -125
  236. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  237. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  238. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4637 -4508
  239. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3556 -3435
  240. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
  241. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
  242. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
  243. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  244. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  245. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4653 -4523
  246. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +237 -0
  247. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +26 -7
  248. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +62 -125
  249. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  250. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  251. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4674 -4544
  252. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3587 -3465
  253. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
  254. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
  255. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
  256. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  257. package/lib/module/NativeRNLlamaCpp.js.map +1 -1
  258. package/lib/typescript/src/NativeRNLlamaCpp.d.ts +3 -0
  259. package/lib/typescript/src/NativeRNLlamaCpp.d.ts.map +1 -1
  260. package/package.json +2 -1
  261. package/src/NativeRNLlamaCpp.ts +1 -0
  262. package/cpp/llama.cpp/common/stb_image.h +0 -7988
  263. package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +0 -112
  264. package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.out +0 -46
  265. package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +0 -112
  266. package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +0 -46
  267. package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +0 -112
  268. package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +0 -46
  269. package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.inp +0 -112
  270. package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.out +0 -46
  271. package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +0 -112
  272. package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.out +0 -46
  273. package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +0 -112
  274. package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +0 -46
package/README.md CHANGED
@@ -2,8 +2,6 @@
2
2
 
3
3
  > ⚠️ **WORK IN PROGRESS**: This package is currently under active development. Community help and feedback are greatly appreciated, especially in the areas mentioned in What Needs Help.
4
4
 
5
- A React Native wrapper for llama.cpp focused on providing a simple, reliable way to run LLMs on mobile devices. This project was inspired by and builds upon the excellent work of [llama.rn](https://github.com/mybigday/llama.rn).
6
-
7
5
  ## Goals
8
6
 
9
7
  * Provide a thin, reliable wrapper around llama.cpp for React Native
@@ -25,11 +23,12 @@ A React Native wrapper for llama.cpp focused on providing a simple, reliable way
25
23
 
26
24
  We welcome contributions, especially in these areas:
27
25
 
28
- 1. **Android GPU Testing**:
29
- * Testing on a variety of Android devices with different GPUs
30
- * Verifying OpenCL and Vulkan acceleration on real hardware
31
- * Performance profiling and optimization for mobile GPUs
32
- * Adding automated GPU detection tests
26
+ 1. **Android GPU Testing and Detection**:
27
+ * Development of reliable GPU detection mechanism in React Native
28
+ * Implementation of proper backend initialization verification
29
+ * Creation of robust testing framework for GPU availability
30
+ * Integration of OpenCL and Vulkan acceleration once detection is stable
31
+ * Performance benchmarking and optimization for mobile GPUs
33
32
 
34
33
  2. **CI Improvements**:
35
34
  * Adding automated Android GPU tests to CI pipeline
@@ -62,6 +61,73 @@ If you're interested in helping with any of these areas, please check our Contri
62
61
  npm install @novastera-oss/llamarn
63
62
  ```
64
63
 
64
+ ## Developer Setup
65
+
66
+ If you're contributing to the library or running the example project, follow these setup steps:
67
+
68
+ ### Prerequisites
69
+
70
+ 1. Clone the repository and navigate to the project directory
71
+ 2. Ensure you have React Native development environment set up for your target platform(s)
72
+
73
+ ### Initial Setup
74
+
75
+ ```sh
76
+ # Install dependencies
77
+ npm install
78
+
79
+ # Optional if you already had previous version of llamacpp
80
+ npm run clean-llama
81
+
82
+ # Initialize llama.cpp submodule and dependencies
83
+ npm run setup-llama-cpp
84
+ ```
85
+
86
+ ### Android Development
87
+
88
+ 1. Build the native Android libraries:
89
+ ```sh
90
+ # Build the external native libraries for Android
91
+ ./scripts/build_android_external.sh
92
+ ```
93
+
94
+ 2. Run the example project:
95
+ ```sh
96
+ cd example
97
+ npm run android
98
+ ```
99
+
100
+ ### iOS Development
101
+
102
+ 1. Navigate to the example project and install iOS dependencies:
103
+ ```sh
104
+ cd example
105
+ cd ios
106
+
107
+ # Install CocoaPods dependencies
108
+ bundle exec pod install
109
+
110
+ # Or if not using Bundler:
111
+ # pod install
112
+
113
+ cd ..
114
+ ```
115
+
116
+ 2. Run the example project:
117
+ ```sh
118
+ npm run ios
119
+ ```
120
+
121
+ ### Development Notes
122
+
123
+ - **Android**: The `build_android_external.sh` script compiles llama.cpp for Android architectures and sets up the necessary native libraries. This step is required before running the Android example.
124
+
125
+ - **iOS**: The iOS setup uses CocoaPods to manage native dependencies. The prebuilt llama.cpp framework is included in the repository.
126
+
127
+ - **Troubleshooting**: If you encounter build issues, try cleaning your build cache:
128
+ - Android: `cd android && ./gradlew clean`
129
+ - iOS: `cd example/ios && rm -rf build && rm Podfile.lock && pod install`
130
+
65
131
  ## Basic Usage
66
132
 
67
133
  ### Simple Completion
@@ -216,20 +282,20 @@ The module accepts different path formats depending on the platform:
216
282
 
217
283
  ## About
218
284
 
219
- Part of [Novastera's](https://novastera.com) suite of privacy-focused solutions, this package enables on-device LLM inference with no data leaving the user's device. We're committed to helping developers build AI-powered applications that respect user privacy.
285
+ This library is currently being used in [Novastera's](https://novastera.com) mobile application, demonstrating its capabilities in production environments. We're committed to enabling on-device LLM inference with no data leaving the user's device, helping developers build AI-powered applications that respect user privacy.
220
286
 
221
287
  ## License
222
288
 
223
- Apache 2.0 © [Novastera](https://novastera.com)
289
+ Apache 2.0
224
290
 
225
291
  ## Acknowledgments
226
292
 
227
- We extend our heartfelt gratitude to the following projects and communities that made this library possible:
293
+ We acknowledge the following projects and communities that have contributed to the development of this library:
228
294
 
229
- * **[mybigday/llama.rn](https://github.com/mybigday/llama.rn)** - A pioneering React Native binding for llama.cpp that demonstrated the feasibility of on-device LLM inference in mobile applications.
295
+ * **[mybigday/llama.rn](https://github.com/mybigday/llama.rn)** - A foundational React Native binding for llama.cpp that demonstrated the viability of on-device LLM inference in mobile applications.
230
296
 
231
- * **[Zach-Dean-Attractions-io/react-native-pure-cpp-turbo-module-library](https://github.com/Zach-Dean-Attractions-io/react-native-pure-cpp-turbo-module-library)** - A good resource that enabled us to successfully build the Android Turbo Module when the official React Native documentation proved insufficient. This repository provided the missing pieces for proper C++ integration on Android.
297
+ * **[ggml-org/llama.cpp](https://github.com/ggml-org/llama.cpp)** - The core C++ library that enables efficient LLM inference, serving as the foundation for this project.
232
298
 
233
- * **[ggml-org/llama.cpp](https://github.com/ggml-org/llama.cpp)** - The underlying C++ library that powers all llama.cpp-based projects, providing efficient LLM inference capabilities.
299
+ * The test implementation of the Android Turbo Module ([react-native-pure-cpp-turbo-module-library](https://github.com/Zach-Dean-Attractions-io/react-native-pure-cpp-turbo-module-library)) provided valuable insights for our C++ integration.
234
300
 
235
- Without these contributions to the open-source community, this project would not have been possible. We're proud to continue building upon this foundation and contributing back to the ecosystem.
301
+ These projects have significantly contributed to the open-source ecosystem, and we are committed to building upon their work while maintaining the same spirit of collaboration and innovation.
@@ -27,11 +27,17 @@ Pod::Spec.new do |s|
27
27
  # If these are compiled directly, their paths need to be relative to the podspec (e.g., "cpp/llama.cpp/common/common.{h,cpp}")
28
28
  "cpp/llama.cpp/common/common.{h,cpp}",
29
29
  "cpp/llama.cpp/common/log.{h,cpp}",
30
+ "cpp/llama.cpp/common/arg.{h,cpp}",
30
31
  "cpp/llama.cpp/common/sampling.{h,cpp}",
31
32
  "cpp/llama.cpp/common/chat.{h,cpp}",
33
+ "cpp/llama.cpp/common/chat-parser.{h,cpp}",
34
+ "cpp/llama.cpp/common/regex-partial.{h,cpp}",
35
+ "cpp/llama.cpp/common/console.{h,cpp}",
36
+ "cpp/llama.cpp/common/json-partial.{h,cpp}",
32
37
  "cpp/llama.cpp/common/ngram-cache.{h,cpp}",
33
38
  "cpp/llama.cpp/common/json-schema-to-grammar.{h,cpp}",
34
39
  "cpp/llama.cpp/common/speculative.{h,cpp}",
40
+ "cpp/llama.cpp/common/llguidance.{h,cpp}",
35
41
  "cpp/llama.cpp/common/*.hpp",
36
42
  "cpp/llama.cpp/common/minja/*.hpp"
37
43
 
@@ -52,16 +58,17 @@ Pod::Spec.new do |s|
52
58
  "SWIFT_OPTIMIZATION_LEVEL" => "-O",
53
59
  "ENABLE_BITCODE" => "NO",
54
60
  "DEFINES_MODULE" => "YES",
55
- "OTHER_LDFLAGS" => "$(inherited)",
61
+ "OTHER_LDFLAGS" => "$(inherited) -framework Accelerate -framework Foundation -framework Metal -framework MetalKit",
56
62
  # These preprocessor macros ensure TurboModule registration works correctly
57
63
  "GCC_PREPROCESSOR_DEFINITIONS" => ["$(inherited)", "RCT_NEW_ARCH_ENABLED=1",
58
64
  "__STDC_FORMAT_MACROS=1", # For format macros in C++
59
65
  "LLAMA_SHARED=1"] # For llama shared symbols
60
66
  }
61
67
 
62
- # Add user_target_xcconfig to propagate linker flags
68
+ # Add user_target_xcconfig to propagate linker flags and fix framework issues
63
69
  s.user_target_xcconfig = {
64
- "OTHER_LDFLAGS" => "$(inherited)"
70
+ "OTHER_LDFLAGS" => "$(inherited) -framework Accelerate -framework Foundation -framework Metal -framework MetalKit",
71
+ "FRAMEWORK_SEARCH_PATHS" => "$(inherited) $(PLATFORM_DIR)/Developer/Library/Frameworks"
65
72
  }
66
73
 
67
74
  # Install dependencies for Turbo Modules
@@ -57,7 +57,15 @@ add_library(
57
57
  ${CPP_DIR}/llama.cpp/common/common.cpp
58
58
  ${CPP_DIR}/llama.cpp/common/sampling.cpp
59
59
  ${CPP_DIR}/llama.cpp/common/chat.cpp
60
+ ${CPP_DIR}/llama.cpp/common/chat-parser.cpp
61
+ ${CPP_DIR}/llama.cpp/common/regex-partial.cpp
62
+ ${CPP_DIR}/llama.cpp/common/arg.cpp
63
+ ${CPP_DIR}/llama.cpp/common/console.cpp
64
+ ${CPP_DIR}/llama.cpp/common/json-partial.cpp
65
+ ${CPP_DIR}/llama.cpp/common/ngram-cache.cpp
60
66
  ${CPP_DIR}/llama.cpp/common/json-schema-to-grammar.cpp
67
+ ${CPP_DIR}/llama.cpp/common/speculative.cpp
68
+ ${CPP_DIR}/llama.cpp/common/llguidance.cpp
61
69
  )
62
70
 
63
71
  add_library(
@@ -4,6 +4,7 @@
4
4
  #include "ggml.h"
5
5
  #include "ggml-cpu.h"
6
6
  #include "ggml-backend.h"
7
+ #include "ggml-opt.h"
7
8
 
8
9
  #include <stddef.h>
9
10
  #include <stdint.h>
@@ -344,7 +345,7 @@ extern "C" {
344
345
  float yarn_beta_fast; // YaRN low correction dim
345
346
  float yarn_beta_slow; // YaRN high correction dim
346
347
  uint32_t yarn_orig_ctx; // YaRN original context size
347
- float defrag_thold; // defragment the KV cache if holes/size > thold, < 0 disabled (default)
348
+ float defrag_thold; // defragment the KV cache if holes/size > thold, <= 0 disabled (default)
348
349
 
349
350
  ggml_backend_sched_eval_callback cb_eval;
350
351
  void * cb_eval_user_data;
@@ -360,10 +361,11 @@ extern "C" {
360
361
 
361
362
  // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
362
363
  bool embeddings; // if true, extract embeddings (together with logits)
363
- bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
364
- bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
365
- bool no_perf; // whether to measure performance timings
366
- bool op_offload; // whether to offload host tensor operations to device
364
+ bool offload_kqv; // offload the KQV ops (including the KV cache) to GPU
365
+ bool flash_attn; // use flash attention [EXPERIMENTAL]
366
+ bool no_perf; // measure performance timings
367
+ bool op_offload; // offload host tensor operations to device
368
+ bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
367
369
  };
368
370
 
369
371
  // model quantization parameters
@@ -445,6 +447,10 @@ extern "C" {
445
447
  size_t n_paths,
446
448
  struct llama_model_params params);
447
449
 
450
+ LLAMA_API void llama_model_save_to_file(
451
+ const struct llama_model * model,
452
+ const char * path_model);
453
+
448
454
  DEPRECATED(LLAMA_API void llama_free_model(struct llama_model * model),
449
455
  "use llama_model_free instead");
450
456
 
@@ -465,6 +471,7 @@ extern "C" {
465
471
  LLAMA_API int64_t llama_time_us(void);
466
472
 
467
473
  LLAMA_API size_t llama_max_devices(void);
474
+ LLAMA_API size_t llama_max_parallel_sequences(void);
468
475
 
469
476
  LLAMA_API bool llama_supports_mmap (void);
470
477
  LLAMA_API bool llama_supports_mlock (void);
@@ -602,71 +609,14 @@ extern "C" {
602
609
  // KV cache
603
610
  //
604
611
 
605
- // TODO: start using struct llama_kv_cache
606
-
607
- // Information associated with an individual cell in the KV cache view.
608
- struct llama_kv_cache_view_cell {
609
- // The position for this cell. Takes KV cache shifts into account.
610
- // May be negative if the cell is not populated.
611
- llama_pos pos;
612
- };
613
-
614
- // An updateable view of the KV cache.
615
- struct llama_kv_cache_view {
616
- // Number of KV cache cells. This will be the same as the context size.
617
- int32_t n_cells;
618
-
619
- // Maximum number of sequences that can exist in a cell. It's not an error
620
- // if there are more sequences in a cell than this value, however they will
621
- // not be visible in the view cells_sequences.
622
- int32_t n_seq_max;
623
-
624
- // Number of tokens in the cache. For example, if there are two populated
625
- // cells, the first with 1 sequence id in it and the second with 2 sequence
626
- // ids then you'll have 3 tokens.
627
- int32_t token_count;
628
-
629
- // Number of populated cache cells.
630
- int32_t used_cells;
631
-
632
- // Maximum contiguous empty slots in the cache.
633
- int32_t max_contiguous;
634
-
635
- // Index to the start of the max_contiguous slot range. Can be negative
636
- // when cache is full.
637
- int32_t max_contiguous_idx;
638
-
639
- // Information for an individual cell.
640
- struct llama_kv_cache_view_cell * cells;
641
-
642
- // The sequences for each cell. There will be n_seq_max items per cell.
643
- llama_seq_id * cells_sequences;
644
- };
645
-
646
- // Create an empty KV cache view. (use only for debugging purposes)
647
- LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max);
648
-
649
- // Free a KV cache view. (use only for debugging purposes)
650
- LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
651
-
652
- // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
653
- // TODO: change signature to llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_context * ctx)
654
- LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
655
-
656
- ///
657
-
658
612
  // Returns the number of tokens in the KV cache (slow, use only for debug)
659
613
  // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
660
- LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx);
661
-
662
- DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx),
663
- "use llama_kv_self_n_tokens instead");
614
+ DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx),
615
+ "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
664
616
 
665
617
  // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
666
- LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx);
667
-
668
- DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx),
669
- "use llama_kv_self_used_cells instead");
618
+ DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx),
619
+ "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
670
620
 
671
621
  // Clear the KV cache - both cell info is erased and KV data is zeroed
672
622
  LLAMA_API void llama_kv_self_clear(
@@ -725,10 +675,18 @@ extern "C" {
725
675
  llama_pos p1,
726
676
  int d);
727
677
 
678
+ // Returns the smallest position present in the KV cache for the specified sequence
679
+ // This is typically non-zero only for SWA caches
680
+ // Return -1 if the sequence is empty
681
+ LLAMA_API llama_pos llama_kv_self_seq_pos_min(
682
+ struct llama_context * ctx,
683
+ llama_seq_id seq_id);
684
+
728
685
  // Returns the largest position present in the KV cache for the specified sequence
686
+ // Return -1 if the sequence is empty
729
687
  LLAMA_API llama_pos llama_kv_self_seq_pos_max(
730
688
  struct llama_context * ctx,
731
- llama_seq_id seq_id);
689
+ llama_seq_id seq_id);
732
690
 
733
691
  // Defragment the KV cache
734
692
  // This will be applied:
@@ -742,61 +700,6 @@ extern "C" {
742
700
  // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
743
701
  LLAMA_API void llama_kv_self_update(struct llama_context * ctx);
744
702
 
745
- DEPRECATED(LLAMA_API void llama_kv_cache_clear(
746
- struct llama_context * ctx),
747
- "use llama_kv_self_clear instead");
748
-
749
- DEPRECATED(LLAMA_API bool llama_kv_cache_seq_rm(
750
- struct llama_context * ctx,
751
- llama_seq_id seq_id,
752
- llama_pos p0,
753
- llama_pos p1),
754
- "use llama_kv_self_seq_rm instead");
755
-
756
- DEPRECATED(LLAMA_API void llama_kv_cache_seq_cp(
757
- struct llama_context * ctx,
758
- llama_seq_id seq_id_src,
759
- llama_seq_id seq_id_dst,
760
- llama_pos p0,
761
- llama_pos p1),
762
- "use llama_kv_self_seq_cp instead");
763
-
764
- DEPRECATED(LLAMA_API void llama_kv_cache_seq_keep(
765
- struct llama_context * ctx,
766
- llama_seq_id seq_id),
767
- "use llama_kv_self_seq_keep instead");
768
-
769
- DEPRECATED(LLAMA_API void llama_kv_cache_seq_add(
770
- struct llama_context * ctx,
771
- llama_seq_id seq_id,
772
- llama_pos p0,
773
- llama_pos p1,
774
- llama_pos delta),
775
- "use llama_kv_self_seq_add instead");
776
-
777
- DEPRECATED(LLAMA_API void llama_kv_cache_seq_div(
778
- struct llama_context * ctx,
779
- llama_seq_id seq_id,
780
- llama_pos p0,
781
- llama_pos p1,
782
- int d),
783
- "use llama_kv_self_seq_div instead");
784
-
785
- DEPRECATED(LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
786
- struct llama_context * ctx,
787
- llama_seq_id seq_id),
788
- "use llama_kv_self_seq_pos_max instead");
789
-
790
- DEPRECATED(LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx),
791
- "use llama_kv_self_defrag instead");
792
-
793
- DEPRECATED(LLAMA_API bool llama_kv_cache_can_shift(const struct llama_context * ctx),
794
- "use llama_kv_self_can_shift instead");
795
-
796
- DEPRECATED(LLAMA_API void llama_kv_cache_update(struct llama_context * ctx),
797
- "use llama_kv_self_update instead");
798
-
799
-
800
703
  //
801
704
  // State / sessions
802
705
  //
@@ -938,9 +841,12 @@ extern "C" {
938
841
  // Requires KV cache.
939
842
  // For encode-decoder contexts, processes the batch using the decoder.
940
843
  // Positive return values does not mean a fatal error, but rather a warning.
941
- // 0 - success
942
- // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
943
- // < 0 - error. the KV cache state is restored to the state before this call
844
+ // Upon non-zero return values, the KV cache state is restored to the state before this call
845
+ // 0 - success
846
+ // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
847
+ // 2 - aborted
848
+ // -1 - invalid input batch
849
+ // < -1 - error
944
850
  LLAMA_API int32_t llama_decode(
945
851
  struct llama_context * ctx,
946
852
  struct llama_batch batch);
@@ -1433,6 +1339,37 @@ extern "C" {
1433
1339
  LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
1434
1340
  LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain);
1435
1341
 
1342
+ //
1343
+ // training
1344
+ //
1345
+
1346
+ // function that returns whether or not a given tensor contains trainable parameters
1347
+ typedef bool (*llama_opt_param_filter)(const struct ggml_tensor * tensor, void * userdata);
1348
+
1349
+ // always returns true
1350
+ LLAMA_API bool llama_opt_param_filter_all(const struct ggml_tensor * tensor, void * userdata);
1351
+
1352
+ struct llama_opt_params {
1353
+ uint32_t n_ctx_train; // assumed context size post training, use context size specified in llama_context if 0
1354
+
1355
+ llama_opt_param_filter param_filter; // callback for determining which tensors contain trainable parameters
1356
+ void * param_filter_ud; // userdata for determining which tensors contain trainable parameters
1357
+
1358
+ ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
1359
+ void * get_opt_pars_ud; // userdata for calculating optimizer parameters
1360
+ };
1361
+
1362
+ LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params);
1363
+
1364
+ LLAMA_API void llama_opt_epoch(
1365
+ struct llama_context * lctx,
1366
+ ggml_opt_dataset_t dataset,
1367
+ ggml_opt_result_t result_train,
1368
+ ggml_opt_result_t result_eval,
1369
+ int64_t idata_split,
1370
+ ggml_opt_epoch_callback callback_train,
1371
+ ggml_opt_epoch_callback callback_eval);
1372
+
1436
1373
  #ifdef __cplusplus
1437
1374
  }
1438
1375
  #endif
@@ -35,8 +35,8 @@
35
35
 
36
36
  namespace facebook::react {
37
37
 
38
- LlamaCppModel::LlamaCppModel(rn_llama_context* rn_ctx)
39
- : rn_ctx_(rn_ctx), should_stop_completion_(false), is_predicting_(false) {
38
+ LlamaCppModel::LlamaCppModel(rn_llama_context* rn_ctx, std::shared_ptr<CallInvoker> jsInvoker)
39
+ : rn_ctx_(rn_ctx), should_stop_completion_(false), is_predicting_(false), jsInvoker_(jsInvoker) {
40
40
  initHelpers();
41
41
  }
42
42
 
@@ -435,11 +435,18 @@ CompletionResult LlamaCppModel::completion(const CompletionOptions& options, std
435
435
  rn_ctx_->params.n_predict = options.n_predict;
436
436
 
437
437
  // Check for a partial callback
438
- auto callback_adapter = [&partialCallback, runtime](const std::string& token, bool is_done) -> bool {
438
+ auto callback_adapter = [&partialCallback, runtime, this](const std::string& token, bool is_done) -> bool {
439
+ // Check for stop condition first
440
+ if (should_stop_completion_) {
441
+ return false; // Signal to stop completion
442
+ }
443
+
439
444
  if (partialCallback && runtime && !is_done) {
440
445
  partialCallback(*runtime, token.c_str());
441
446
  }
442
- return true;
447
+
448
+ // Return true to continue, false to stop
449
+ return !should_stop_completion_;
443
450
  };
444
451
 
445
452
  // Run the completion based on whether we have messages or prompt
@@ -546,7 +553,7 @@ jsi::Value LlamaCppModel::jsonToJsi(jsi::Runtime& rt, const json& j) {
546
553
  return jsi::Value::undefined();
547
554
  }
548
555
 
549
- // JSI method for completions
556
+ // JSI method for completions (synchronous - kept for compatibility)
550
557
  jsi::Value LlamaCppModel::completionJsi(jsi::Runtime& rt, const jsi::Value* args, size_t count) {
551
558
  if (count < 1 || !args[0].isObject()) {
552
559
  throw jsi::JSError(rt, "completion requires an options object");
@@ -581,6 +588,116 @@ jsi::Value LlamaCppModel::completionJsi(jsi::Runtime& rt, const jsi::Value* args
581
588
  }
582
589
  }
583
590
 
591
+ // JSI method for async completions (recommended approach)
592
+ jsi::Value LlamaCppModel::completionAsyncJsi(jsi::Runtime& rt, const jsi::Value* args, size_t count) {
593
+ if (count < 1 || !args[0].isObject()) {
594
+ throw jsi::JSError(rt, "completion requires an options object");
595
+ }
596
+
597
+ if (!jsInvoker_) {
598
+ // Fallback to synchronous if no CallInvoker available
599
+ return completionJsi(rt, args, count);
600
+ }
601
+
602
+ // Parse options and callback on the current thread
603
+ CompletionOptions options;
604
+ std::shared_ptr<jsi::Function> callbackFn = nullptr;
605
+
606
+ try {
607
+ options = parseCompletionOptions(rt, args[0].getObject(rt));
608
+
609
+ if (count > 1 && args[1].isObject() && args[1].getObject(rt).isFunction(rt)) {
610
+ callbackFn = std::make_shared<jsi::Function>(args[1].getObject(rt).getFunction(rt));
611
+ options.stream = true;
612
+ }
613
+ } catch (const std::exception& e) {
614
+ throw jsi::JSError(rt, e.what());
615
+ }
616
+
617
+ // Create Promise constructor
618
+ auto Promise = rt.global().getPropertyAsFunction(rt, "Promise");
619
+
620
+ auto executor = jsi::Function::createFromHostFunction(
621
+ rt,
622
+ jsi::PropNameID::forAscii(rt, "executor"),
623
+ 2,
624
+ [this, options, callbackFn](jsi::Runtime& runtime, const jsi::Value& thisValue, const jsi::Value* args, size_t count) -> jsi::Value {
625
+
626
+ auto resolve = std::make_shared<jsi::Function>(args[0].asObject(runtime).asFunction(runtime));
627
+ auto reject = std::make_shared<jsi::Function>(args[1].asObject(runtime).asFunction(runtime));
628
+
629
+ // Create shared references to runtime and invoker for thread safety
630
+ auto runtimePtr = &runtime;
631
+ auto invoker = jsInvoker_;
632
+ auto selfPtr = shared_from_this(); // This requires LlamaCppModel to inherit from std::enable_shared_from_this
633
+
634
+ // Launch background thread for completion
635
+ std::thread([selfPtr, options, callbackFn, resolve, reject, runtimePtr, invoker]() {
636
+ try {
637
+ // Create callback that schedules token updates on JS thread
638
+ std::function<void(jsi::Runtime&, const char*)> partialCallback = nullptr;
639
+
640
+ if (callbackFn && invoker) {
641
+ partialCallback = [callbackFn, invoker, runtimePtr](jsi::Runtime& rt, const char* token) {
642
+ std::string tokenCopy(token);
643
+ invoker->invokeAsync([callbackFn, tokenCopy, runtimePtr]() {
644
+ try {
645
+ jsi::Object data(*runtimePtr);
646
+ data.setProperty(*runtimePtr, "token", jsi::String::createFromUtf8(*runtimePtr, tokenCopy));
647
+ callbackFn->call(*runtimePtr, data);
648
+ } catch (...) {
649
+ // Ignore callback errors
650
+ }
651
+ });
652
+ };
653
+ }
654
+
655
+ // Run completion
656
+ CompletionResult result = selfPtr->completion(options, partialCallback, runtimePtr);
657
+
658
+ // Schedule success callback on JS thread
659
+ invoker->invokeAsync([selfPtr, resolve, result, runtimePtr]() {
660
+ try {
661
+ jsi::Object jsResult = selfPtr->completionResultToJsi(*runtimePtr, result);
662
+ resolve->call(*runtimePtr, jsResult);
663
+ } catch (const std::exception& e) {
664
+ // If conversion fails, create a simple error response
665
+ jsi::Object errorObj(*runtimePtr);
666
+ errorObj.setProperty(*runtimePtr, "error", jsi::String::createFromUtf8(*runtimePtr, e.what()));
667
+ resolve->call(*runtimePtr, errorObj);
668
+ }
669
+ });
670
+
671
+ } catch (const std::exception& e) {
672
+ // Schedule error callback on JS thread
673
+ std::string errorMsg(e.what());
674
+ invoker->invokeAsync([reject, errorMsg, runtimePtr]() {
675
+ try {
676
+ reject->call(*runtimePtr, jsi::String::createFromUtf8(*runtimePtr, errorMsg));
677
+ } catch (...) {
678
+ // Ignore rejection errors
679
+ }
680
+ });
681
+ }
682
+ }).detach();
683
+
684
+ return jsi::Value::undefined();
685
+ }
686
+ );
687
+
688
+ return Promise.callAsConstructor(rt, std::move(executor));
689
+ }
690
+
691
+ // JSI method for stopping completion
692
+ jsi::Value LlamaCppModel::stopCompletionJsi(jsi::Runtime& rt, const jsi::Value* args, size_t count) {
693
+ try {
694
+ setShouldStopCompletion(true);
695
+ return jsi::Value(true);
696
+ } catch (const std::exception& e) {
697
+ throw jsi::JSError(rt, e.what());
698
+ }
699
+ }
700
+
584
701
  jsi::Value LlamaCppModel::tokenizeJsi(jsi::Runtime& rt, const jsi::Value* args, size_t count) {
585
702
  if (count < 1 || !args[0].isObject()) {
586
703
  throw jsi::JSError(rt, "tokenize requires an options object with 'content' field");
@@ -930,12 +1047,36 @@ jsi::Value LlamaCppModel::get(jsi::Runtime& rt, const jsi::PropNameID& name) {
930
1047
  });
931
1048
  }
932
1049
  else if (nameStr == "completion") {
1050
+ // Use async completion as the default to provide better UX
1051
+ if (jsInvoker_) {
1052
+ return jsi::Function::createFromHostFunction(
1053
+ rt, name, 2,
1054
+ [this](jsi::Runtime& runtime, const jsi::Value& thisValue, const jsi::Value* args, size_t count) {
1055
+ return this->completionAsyncJsi(runtime, args, count);
1056
+ });
1057
+ } else {
1058
+ // Fallback to sync completion if no CallInvoker
1059
+ return jsi::Function::createFromHostFunction(
1060
+ rt, name, 2,
1061
+ [this](jsi::Runtime& runtime, const jsi::Value& thisValue, const jsi::Value* args, size_t count) {
1062
+ return this->completionJsi(runtime, args, count);
1063
+ });
1064
+ }
1065
+ }
1066
+ else if (nameStr == "completionSync") {
933
1067
  return jsi::Function::createFromHostFunction(
934
1068
  rt, name, 2,
935
1069
  [this](jsi::Runtime& runtime, const jsi::Value& thisValue, const jsi::Value* args, size_t count) {
936
1070
  return this->completionJsi(runtime, args, count);
937
1071
  });
938
1072
  }
1073
+ else if (nameStr == "stopCompletion") {
1074
+ return jsi::Function::createFromHostFunction(
1075
+ rt, name, 0,
1076
+ [this](jsi::Runtime& runtime, const jsi::Value& thisValue, const jsi::Value* args, size_t count) {
1077
+ return this->stopCompletionJsi(runtime, args, count);
1078
+ });
1079
+ }
939
1080
  else if (nameStr == "embedding") {
940
1081
  return jsi::Function::createFromHostFunction(
941
1082
  rt, name, 1,
@@ -973,6 +1114,8 @@ std::vector<jsi::PropNameID> LlamaCppModel::getPropertyNames(jsi::Runtime& rt) {
973
1114
  result.push_back(jsi::PropNameID::forAscii(rt, "tokenize"));
974
1115
  result.push_back(jsi::PropNameID::forAscii(rt, "detokenize"));
975
1116
  result.push_back(jsi::PropNameID::forAscii(rt, "completion"));
1117
+ result.push_back(jsi::PropNameID::forAscii(rt, "completionSync"));
1118
+ result.push_back(jsi::PropNameID::forAscii(rt, "stopCompletion"));
976
1119
  result.push_back(jsi::PropNameID::forAscii(rt, "embedding"));
977
1120
  result.push_back(jsi::PropNameID::forAscii(rt, "release"));
978
1121
  result.push_back(jsi::PropNameID::forAscii(rt, "n_vocab"));