cui-llama.rn 1.5.0 → 1.6.0

This diff reflects the changes between publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (309)
  1. package/LICENSE +20 -20
  2. package/README.md +317 -319
  3. package/android/build.gradle +116 -116
  4. package/android/gradle.properties +5 -5
  5. package/android/src/main/AndroidManifest.xml +4 -4
  6. package/android/src/main/CMakeLists.txt +124 -124
  7. package/android/src/main/java/com/rnllama/LlamaContext.java +645 -645
  8. package/android/src/main/java/com/rnllama/RNLlama.java +695 -695
  9. package/android/src/main/java/com/rnllama/RNLlamaPackage.java +48 -48
  10. package/android/src/main/jni-utils.h +100 -100
  11. package/android/src/main/jni.cpp +1263 -1263
  12. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  13. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  14. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  15. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  16. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  17. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  18. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  19. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  20. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +135 -135
  21. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +136 -136
  22. package/cpp/README.md +4 -4
  23. package/cpp/ggml-llama-sim.metallib +0 -0
  24. package/cpp/ggml-llama.metallib +0 -0
  25. package/cpp/ggml-metal-impl.h +597 -597
  26. package/cpp/ggml-metal.m +4 -0
  27. package/cpp/ggml.h +1 -1
  28. package/cpp/rn-llama.cpp +873 -873
  29. package/cpp/rn-llama.h +138 -138
  30. package/cpp/sampling.h +107 -107
  31. package/cpp/unicode-data.cpp +7034 -7034
  32. package/cpp/unicode-data.h +20 -20
  33. package/cpp/unicode.cpp +849 -849
  34. package/cpp/unicode.h +66 -66
  35. package/ios/CMakeLists.txt +116 -108
  36. package/ios/RNLlama.h +7 -7
  37. package/ios/RNLlama.mm +418 -405
  38. package/ios/RNLlamaContext.h +57 -57
  39. package/ios/RNLlamaContext.mm +835 -835
  40. package/ios/rnllama.xcframework/Info.plist +74 -74
  41. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/binary-ops.h +16 -0
  42. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +143 -0
  43. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +677 -0
  44. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/cpu-common.h +72 -0
  45. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-alloc.h +76 -0
  46. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-backend-impl.h +255 -0
  47. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-backend.h +354 -0
  48. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-common.h +1857 -0
  49. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpp.h +39 -0
  50. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +8 -0
  51. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +512 -0
  52. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +63 -0
  53. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +38 -0
  54. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +138 -0
  55. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +594 -0
  56. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-metal-impl.h +597 -0
  57. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-metal.h +66 -0
  58. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-opt.h +216 -0
  59. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-quants.h +100 -0
  60. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-threading.h +14 -0
  61. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +2222 -0
  62. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/gguf.h +202 -0
  63. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +21 -0
  64. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json.hpp +24766 -0
  65. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-adapter.h +76 -0
  66. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +428 -0
  67. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +88 -0
  68. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +56 -0
  69. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +265 -0
  70. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +38 -0
  71. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cpp.h +30 -0
  72. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-grammar.h +173 -0
  73. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +592 -0
  74. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +156 -0
  75. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-impl.h +61 -0
  76. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-io.h +35 -0
  77. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +213 -0
  78. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +21 -0
  79. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-mmap.h +68 -0
  80. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model-loader.h +169 -0
  81. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +409 -0
  82. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-sampling.h +32 -0
  83. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +125 -0
  84. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +1434 -0
  85. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/log.h +132 -0
  86. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +537 -0
  87. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +2941 -0
  88. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ops.h +128 -0
  89. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +138 -0
  90. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/sampling.h +107 -0
  91. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/sgemm.h +14 -0
  92. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/simd-mappings.h +888 -0
  93. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/speculative.h +28 -0
  94. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/unary-ops.h +28 -0
  95. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/unicode-data.h +20 -0
  96. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/unicode.h +66 -0
  97. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/vec.h +802 -0
  98. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Info.plist +0 -0
  99. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  100. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
  101. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/binary-ops.h +16 -0
  102. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +143 -0
  103. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +677 -0
  104. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/cpu-common.h +72 -0
  105. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-alloc.h +76 -0
  106. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend-impl.h +255 -0
  107. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend.h +354 -0
  108. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +1857 -0
  109. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +39 -0
  110. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +8 -0
  111. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +512 -0
  112. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +63 -0
  113. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +38 -0
  114. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +138 -0
  115. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +594 -0
  116. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal-impl.h +597 -0
  117. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal.h +66 -0
  118. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-opt.h +216 -0
  119. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-quants.h +100 -0
  120. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-threading.h +14 -0
  121. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +2222 -0
  122. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/gguf.h +202 -0
  123. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +21 -0
  124. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json.hpp +24766 -0
  125. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-adapter.h +76 -0
  126. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +428 -0
  127. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +88 -0
  128. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +56 -0
  129. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +265 -0
  130. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +38 -0
  131. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cpp.h +30 -0
  132. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-grammar.h +173 -0
  133. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +592 -0
  134. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +156 -0
  135. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-impl.h +61 -0
  136. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-io.h +35 -0
  137. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +213 -0
  138. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +21 -0
  139. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-mmap.h +68 -0
  140. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model-loader.h +169 -0
  141. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +409 -0
  142. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-sampling.h +32 -0
  143. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +125 -0
  144. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +1434 -0
  145. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/log.h +132 -0
  146. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +537 -0
  147. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +2941 -0
  148. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ops.h +128 -0
  149. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +138 -0
  150. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/sampling.h +107 -0
  151. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +14 -0
  152. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/simd-mappings.h +888 -0
  153. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/speculative.h +28 -0
  154. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +28 -0
  155. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/unicode-data.h +20 -0
  156. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/unicode.h +66 -0
  157. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +802 -0
  158. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Info.plist +0 -0
  159. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/_CodeSignature/CodeResources +101 -0
  160. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  161. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  162. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/binary-ops.h +16 -0
  163. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +143 -0
  164. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +677 -0
  165. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/cpu-common.h +72 -0
  166. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-alloc.h +76 -0
  167. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-backend-impl.h +255 -0
  168. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-backend.h +354 -0
  169. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-common.h +1857 -0
  170. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpp.h +39 -0
  171. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +8 -0
  172. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +512 -0
  173. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +63 -0
  174. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +38 -0
  175. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +138 -0
  176. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +594 -0
  177. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-metal-impl.h +597 -0
  178. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-metal.h +66 -0
  179. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-opt.h +216 -0
  180. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-quants.h +100 -0
  181. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-threading.h +14 -0
  182. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +2222 -0
  183. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/gguf.h +202 -0
  184. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +21 -0
  185. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json.hpp +24766 -0
  186. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-adapter.h +76 -0
  187. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +428 -0
  188. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +88 -0
  189. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +56 -0
  190. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +265 -0
  191. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +38 -0
  192. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cpp.h +30 -0
  193. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-grammar.h +173 -0
  194. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +592 -0
  195. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +156 -0
  196. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-impl.h +61 -0
  197. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-io.h +35 -0
  198. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +213 -0
  199. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +21 -0
  200. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-mmap.h +68 -0
  201. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model-loader.h +169 -0
  202. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +409 -0
  203. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-sampling.h +32 -0
  204. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +125 -0
  205. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +1434 -0
  206. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/log.h +132 -0
  207. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +537 -0
  208. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +2941 -0
  209. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ops.h +128 -0
  210. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +138 -0
  211. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/sampling.h +107 -0
  212. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/sgemm.h +14 -0
  213. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/simd-mappings.h +888 -0
  214. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/speculative.h +28 -0
  215. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/unary-ops.h +28 -0
  216. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/unicode-data.h +20 -0
  217. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/unicode.h +66 -0
  218. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/vec.h +802 -0
  219. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Info.plist +0 -0
  220. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  221. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
  222. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/binary-ops.h +16 -0
  223. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +143 -0
  224. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +677 -0
  225. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/cpu-common.h +72 -0
  226. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-alloc.h +76 -0
  227. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend-impl.h +255 -0
  228. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend.h +354 -0
  229. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +1857 -0
  230. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +39 -0
  231. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +8 -0
  232. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +512 -0
  233. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +63 -0
  234. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +38 -0
  235. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +138 -0
  236. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +594 -0
  237. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal-impl.h +597 -0
  238. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal.h +66 -0
  239. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-opt.h +216 -0
  240. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-quants.h +100 -0
  241. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-threading.h +14 -0
  242. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +2222 -0
  243. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/gguf.h +202 -0
  244. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +21 -0
  245. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json.hpp +24766 -0
  246. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-adapter.h +76 -0
  247. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +428 -0
  248. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +88 -0
  249. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +56 -0
  250. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +265 -0
  251. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +38 -0
  252. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cpp.h +30 -0
  253. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-grammar.h +173 -0
  254. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +592 -0
  255. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +156 -0
  256. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-impl.h +61 -0
  257. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-io.h +35 -0
  258. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +213 -0
  259. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +21 -0
  260. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-mmap.h +68 -0
  261. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model-loader.h +169 -0
  262. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +409 -0
  263. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-sampling.h +32 -0
  264. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +125 -0
  265. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +1434 -0
  266. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/log.h +132 -0
  267. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +537 -0
  268. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +2941 -0
  269. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ops.h +128 -0
  270. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +138 -0
  271. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/sampling.h +107 -0
  272. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +14 -0
  273. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/simd-mappings.h +888 -0
  274. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/speculative.h +28 -0
  275. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +28 -0
  276. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/unicode-data.h +20 -0
  277. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/unicode.h +66 -0
  278. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +802 -0
  279. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Info.plist +0 -0
  280. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/_CodeSignature/CodeResources +101 -0
  281. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  282. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  283. package/jest/mock.js +203 -203
  284. package/lib/commonjs/NativeRNLlama.js +1 -2
  285. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  286. package/lib/commonjs/chat.js.map +1 -1
  287. package/lib/commonjs/grammar.js +12 -31
  288. package/lib/commonjs/grammar.js.map +1 -1
  289. package/lib/commonjs/index.js +47 -47
  290. package/lib/commonjs/index.js.map +1 -1
  291. package/lib/commonjs/package.json +1 -0
  292. package/lib/module/NativeRNLlama.js +2 -0
  293. package/lib/module/NativeRNLlama.js.map +1 -1
  294. package/lib/module/chat.js +2 -0
  295. package/lib/module/chat.js.map +1 -1
  296. package/lib/module/grammar.js +14 -31
  297. package/lib/module/grammar.js.map +1 -1
  298. package/lib/module/index.js +47 -45
  299. package/lib/module/index.js.map +1 -1
  300. package/lib/module/package.json +1 -0
  301. package/lib/typescript/NativeRNLlama.d.ts +6 -4
  302. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  303. package/lib/typescript/index.d.ts.map +1 -1
  304. package/llama-rn.podspec +48 -48
  305. package/package.json +233 -233
  306. package/src/NativeRNLlama.ts +426 -426
  307. package/src/chat.ts +44 -44
  308. package/src/grammar.ts +854 -854
  309. package/src/index.ts +495 -487
package/ios/RNLlamaContext.mm
@@ -1,835 +1,835 @@
- #import "RNLlamaContext.h"
- #import <Metal/Metal.h>
-
- @implementation RNLlamaContext
-
- + (void)toggleNativeLog:(BOOL)enabled onEmitLog:(void (^)(NSString *level, NSString *text))onEmitLog {
- if (enabled) {
- void (^copiedBlock)(NSString *, NSString *) = [onEmitLog copy];
- llama_log_set([](lm_ggml_log_level level, const char * text, void * data) {
- llama_log_callback_default(level, text, data);
- NSString *levelStr = @"";
- if (level == LM_GGML_LOG_LEVEL_ERROR) {
- levelStr = @"error";
- } else if (level == LM_GGML_LOG_LEVEL_INFO) {
- levelStr = @"info";
- } else if (level == LM_GGML_LOG_LEVEL_WARN) {
- levelStr = @"warn";
- }
-
- NSString *textStr = [NSString stringWithUTF8String:text];
- // NOTE: Convert to UTF-8 string may fail
- if (!textStr) {
- return;
- }
- void (^block)(NSString *, NSString *) = (__bridge void (^)(NSString *, NSString *))(data);
- block(levelStr, textStr);
- }, copiedBlock);
- } else {
- llama_log_set(llama_log_callback_default, nullptr);
- }
- }
-
- + (NSDictionary *)modelInfo:(NSString *)path skip:(NSArray *)skip {
- struct lm_gguf_init_params params = {
- /*.no_alloc = */ false,
- /*.ctx = */ NULL,
- };
-
- struct lm_gguf_context * ctx = lm_gguf_init_from_file([path UTF8String], params);
-
- if (!ctx) {
- NSLog(@"%s: failed to load '%s'\n", __func__, [path UTF8String]);
- return @{};
- }
-
- NSMutableDictionary *info = [[NSMutableDictionary alloc] init];
-
- info[@"version"] = @(lm_gguf_get_version(ctx));
- info[@"alignment"] = @(lm_gguf_get_alignment(ctx));
- info[@"data_offset"] = @(lm_gguf_get_data_offset(ctx));
-
- // kv
- {
- const int n_kv = lm_gguf_get_n_kv(ctx);
-
- for (int i = 0; i < n_kv; ++i) {
- const char * key = lm_gguf_get_key(ctx, i);
-
- if (skip && [skip containsObject:[NSString stringWithUTF8String:key]]) {
- continue;
- }
- const std::string value = lm_gguf_kv_to_str(ctx, i);
- info[[NSString stringWithUTF8String:key]] = [NSString stringWithUTF8String:value.c_str()];
- }
- }
-
- lm_gguf_free(ctx);
-
- return info;
- }
-
- + (instancetype)initWithParams:(NSDictionary *)params onProgress:(void (^)(unsigned int progress))onProgress {
- // llama_backend_init(false);
- common_params defaultParams;
-
- if (params[@"vocab_only"]) {
- defaultParams.vocab_only = [params[@"vocab_only"] boolValue];
- defaultParams.warmup = false;
- }
-
- NSString *modelPath = params[@"model"];
- BOOL isAsset = [params[@"is_model_asset"] boolValue];
- NSString *path = modelPath;
- if (isAsset) path = [[NSBundle mainBundle] pathForResource:modelPath ofType:nil];
- defaultParams.model = [path UTF8String];
-
- NSString *chatTemplate = params[@"chat_template"];
- if (chatTemplate) {
- defaultParams.chat_template = [chatTemplate UTF8String];
- NSLog(@"chatTemplate: %@", chatTemplate);
- }
-
- NSString *reasoningFormat = params[@"reasoning_format"];
- if (reasoningFormat && [reasoningFormat isEqualToString:@"deepseek"]) {
- defaultParams.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
- } else {
- defaultParams.reasoning_format = COMMON_REASONING_FORMAT_NONE;
- }
-
- if (params[@"n_ctx"]) defaultParams.n_ctx = [params[@"n_ctx"] intValue];
- if (params[@"use_mlock"]) defaultParams.use_mlock = [params[@"use_mlock"]boolValue];
-
- BOOL skipGpuDevices = params[@"no_gpu_devices"] && [params[@"no_gpu_devices"] boolValue];
-
- BOOL isMetalEnabled = false;
- NSString *reasonNoMetal = @"";
- defaultParams.n_gpu_layers = 0;
- #ifdef LM_GGML_USE_METAL
- // Check ggml-metal availability
- NSError * error = nil;
- id<MTLDevice> device = MTLCreateSystemDefaultDevice();
- id<MTLLibrary> library = [device
- newLibraryWithSource:@"#include <metal_stdlib>\n"
- "using namespace metal;"
- "typedef matrix<bfloat, 4, 4> bfloat4x4;"
- "kernel void test() { simd_sum(0); }"
- options:nil
- error:&error
- ];
- if (error) {
- reasonNoMetal = [error localizedDescription];
- skipGpuDevices = true;
- } else {
- id<MTLFunction> kernel = [library newFunctionWithName:@"test"];
- id<MTLComputePipelineState> pipeline = [device newComputePipelineStateWithFunction:kernel error:&error];
- if (pipeline == nil) {
- reasonNoMetal = [error localizedDescription];
- skipGpuDevices = true;
- } else {
- #if TARGET_OS_SIMULATOR
- // Use the backend, but no layers because not supported fully on simulator
- defaultParams.n_gpu_layers = 0;
- isMetalEnabled = true;
- #else
- defaultParams.n_gpu_layers = [params[@"n_gpu_layers"] intValue];
- isMetalEnabled = true;
- #endif
- }
- }
- device = nil;
- #else
- reasonNoMetal = @"Metal is not enabled in this build";
- isMetalEnabled = false;
- #endif
-
- if (skipGpuDevices) {
- std::vector<lm_ggml_backend_dev_t> cpu_devs;
- for (size_t i = 0; i < lm_ggml_backend_dev_count(); ++i) {
- lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_get(i);
- switch (lm_ggml_backend_dev_type(dev)) {
- case LM_GGML_BACKEND_DEVICE_TYPE_CPU:
- case LM_GGML_BACKEND_DEVICE_TYPE_ACCEL:
- cpu_devs.push_back(dev);
- break;
- case LM_GGML_BACKEND_DEVICE_TYPE_GPU:
- break;
- }
- }
- if (cpu_devs.size() > 0) {
- defaultParams.devices = cpu_devs;
- }
- }
-
- if (params[@"n_batch"]) defaultParams.n_batch = [params[@"n_batch"] intValue];
- if (params[@"n_ubatch"]) defaultParams.n_ubatch = [params[@"n_ubatch"] intValue];
- if (params[@"use_mmap"]) defaultParams.use_mmap = [params[@"use_mmap"] boolValue];
-
- if (params[@"pooling_type"] && [params[@"pooling_type"] isKindOfClass:[NSNumber class]]) {
- defaultParams.pooling_type = static_cast<enum llama_pooling_type>([params[@"pooling_type"] intValue]);
- }
-
- if (params[@"embedding"] && [params[@"embedding"] boolValue]) {
- defaultParams.embedding = true;
- // For non-causal models, batch size must be equal to ubatch size
- defaultParams.n_ubatch = defaultParams.n_batch;
-
- if (params[@"embd_normalize"] && [params[@"embd_normalize"] isKindOfClass:[NSNumber class]]) {
- defaultParams.embd_normalize = [params[@"embd_normalize"] intValue];
- }
- }
-
- if (params[@"rope_freq_base"]) defaultParams.rope_freq_base = [params[@"rope_freq_base"] floatValue];
- if (params[@"rope_freq_scale"]) defaultParams.rope_freq_scale = [params[@"rope_freq_scale"] floatValue];
-
- if (params[@"flash_attn"] && [params[@"flash_attn"] boolValue]) defaultParams.flash_attn = true;
-
- if (params[@"cache_type_k"]) defaultParams.cache_type_k = rnllama::kv_cache_type_from_str([params[@"cache_type_k"] UTF8String]);
- if (params[@"cache_type_v"]) defaultParams.cache_type_v = rnllama::kv_cache_type_from_str([params[@"cache_type_v"] UTF8String]);
-
- int nThreads = params[@"n_threads"] ? [params[@"n_threads"] intValue] : 0;
- const int maxThreads = (int) [[NSProcessInfo processInfo] processorCount];
- // Use 2 threads by default on 4-core devices, 4 threads on more cores
- const int defaultNThreads = nThreads == 4 ? 2 : MIN(4, maxThreads);
- defaultParams.cpuparams.n_threads = nThreads > 0 ? nThreads : defaultNThreads;
-
- RNLlamaContext *context = [[RNLlamaContext alloc] init];
- context->llama = new rnllama::llama_rn_context();
- context->llama->is_load_interrupted = false;
- context->llama->loading_progress = 0;
- context->onProgress = onProgress;
-
- if (params[@"use_progress_callback"] && [params[@"use_progress_callback"] boolValue]) {
- defaultParams.progress_callback = [](float progress, void * user_data) {
- RNLlamaContext *context = (__bridge RNLlamaContext *)(user_data);
- unsigned percentage = (unsigned) (100 * progress);
- if (percentage > context->llama->loading_progress) {
- context->llama->loading_progress = percentage;
- context->onProgress(percentage);
- }
- return !context->llama->is_load_interrupted;
- };
- defaultParams.progress_callback_user_data = context;
- }
-
- context->is_model_loaded = context->llama->loadModel(defaultParams);
-
- if (
- params[@"embedding"] && [params[@"embedding"] boolValue] &&
- llama_model_has_encoder(context->llama->model) && llama_model_has_decoder(context->llama->model)
- ) {
- delete context->llama;
- @throw [NSException exceptionWithName:@"LlamaException" reason:@"Embedding is not supported in encoder-decoder models" userInfo:nil];
- }
-
- std::vector<common_adapter_lora_info> lora;
- if (params[@"lora"]) {
- common_adapter_lora_info la;
- la.path = [params[@"lora"] UTF8String];
- la.scale = 1.0f;
- if (params[@"lora_scaled"]) la.scale = [params[@"lora_scaled"] floatValue];
- lora.push_back(la);
- }
- if (params[@"lora_list"] && [params[@"lora_list"] isKindOfClass:[NSArray class]]) {
- NSArray *lora_list = params[@"lora_list"];
- for (NSDictionary *lora_adapter in lora_list) {
- NSString *path = lora_adapter[@"path"];
- if (!path) continue;
- float scale = [lora_adapter[@"scaled"] floatValue];
- common_adapter_lora_info la;
- la.path = [path UTF8String];
- la.scale = scale;
- lora.push_back(la);
- }
- }
- if (lora.size() > 0) {
- int result = context->llama->applyLoraAdapters(lora);
- if (result != 0) {
- delete context->llama;
- @throw [NSException exceptionWithName:@"LlamaException" reason:@"Failed to apply lora adapters" userInfo:nil];
- }
- }
-
- context->is_metal_enabled = isMetalEnabled;
- context->reason_no_metal = reasonNoMetal;
-
- return context;
- }
-
- - (void)interruptLoad {
- llama->is_load_interrupted = true;
- }
-
- - (bool)isMetalEnabled {
- return is_metal_enabled;
- }
-
- - (NSString *)reasonNoMetal {
- return reason_no_metal;
- }
-
- - (NSDictionary *)modelInfo {
- char desc[1024];
- llama_model_desc(llama->model, desc, sizeof(desc));
-
- int count = llama_model_meta_count(llama->model);
- NSDictionary *meta = [[NSMutableDictionary alloc] init];
- for (int i = 0; i < count; i++) {
- char key[256];
- llama_model_meta_key_by_index(llama->model, i, key, sizeof(key));
- char val[4096];
- llama_model_meta_val_str_by_index(llama->model, i, val, sizeof(val));
-
- NSString *keyStr = [NSString stringWithUTF8String:key];
- NSString *valStr = [NSString stringWithUTF8String:val];
- [meta setValue:valStr forKey:keyStr];
- }
-
- auto template_tool_use = llama->templates.get()->template_tool_use.get();
- NSDictionary *tool_use_caps_dir = nil;
- if (template_tool_use) {
- auto tool_use_caps = template_tool_use->original_caps();
- tool_use_caps_dir = @{
- @"tools": @(tool_use_caps.supports_tools),
- @"toolCalls": @(tool_use_caps.supports_tool_calls),
- @"toolResponses": @(tool_use_caps.supports_tool_responses),
- @"systemRole": @(tool_use_caps.supports_system_role),
- @"parallelToolCalls": @(tool_use_caps.supports_parallel_tool_calls),
- @"toolCallId": @(tool_use_caps.supports_tool_call_id)
- };
- }
-
- auto default_tmpl = llama->templates.get()->template_default.get();
- auto default_tmpl_caps = default_tmpl->original_caps();
-
- return @{
- @"desc": [NSString stringWithUTF8String:desc],
- @"size": @(llama_model_size(llama->model)),
- @"nEmbd": @(llama_model_n_embd(llama->model)),
- @"nParams": @(llama_model_n_params(llama->model)),
- @"chatTemplates": @{
- @"llamaChat": @(llama->validateModelChatTemplate(false, nullptr)),
- @"minja": @{
- @"default": @(llama->validateModelChatTemplate(true, nullptr)),
- @"defaultCaps": @{
- @"tools": @(default_tmpl_caps.supports_tools),
- @"toolCalls": @(default_tmpl_caps.supports_tool_calls),
- @"toolResponses": @(default_tmpl_caps.supports_tool_responses),
- @"systemRole": @(default_tmpl_caps.supports_system_role),
- @"parallelToolCalls": @(default_tmpl_caps.supports_parallel_tool_calls),
- @"toolCallId": @(default_tmpl_caps.supports_tool_call_id)
- },
- @"toolUse": @(llama->validateModelChatTemplate(true, "tool_use")),
- @"toolUseCaps": tool_use_caps_dir ?: @{}
- }
- },
- @"metadata": meta,
-
- // deprecated
- @"isChatTemplateSupported": @(llama->validateModelChatTemplate(false, nullptr))
- };
- }
-
- - (bool)isModelLoaded {
- return is_model_loaded;
- }
-
- - (bool)isPredicting {
- return llama->is_predicting;
- }
-
- - (NSDictionary *)getFormattedChatWithJinja:(NSString *)messages
- withChatTemplate:(NSString *)chatTemplate
- withJsonSchema:(NSString *)jsonSchema
- withTools:(NSString *)tools
- withParallelToolCalls:(BOOL)parallelToolCalls
- withToolChoice:(NSString *)toolChoice
- {
- auto tmpl_str = chatTemplate == nil ? "" : [chatTemplate UTF8String];
-
- NSMutableDictionary *result = [[NSMutableDictionary alloc] init];
- auto chatParams = llama->getFormattedChatWithJinja(
- [messages UTF8String],
- tmpl_str,
- jsonSchema == nil ? "" : [jsonSchema UTF8String],
- tools == nil ? "" : [tools UTF8String],
- parallelToolCalls,
- toolChoice == nil ? "" : [toolChoice UTF8String]
- );
- result[@"prompt"] = [NSString stringWithUTF8String:chatParams.prompt.c_str()];
- result[@"chat_format"] = @(static_cast<int>(chatParams.format));
- result[@"grammar"] = [NSString stringWithUTF8String:chatParams.grammar.c_str()];
- result[@"grammar_lazy"] = @(chatParams.grammar_lazy);
- NSMutableArray *grammar_triggers = [[NSMutableArray alloc] init];
- for (const auto & trigger : chatParams.grammar_triggers) {
- [grammar_triggers addObject:@{
- @"type": @(trigger.type),
- @"value": [NSString stringWithUTF8String:trigger.value.c_str()],
- @"token": @(trigger.token),
- }];
- }
- result[@"grammar_triggers"] = grammar_triggers;
- NSMutableArray *preserved_tokens = [[NSMutableArray alloc] init];
- for (const auto & token : chatParams.preserved_tokens) {
- [preserved_tokens addObject:[NSString stringWithUTF8String:token.c_str()]];
- }
- result[@"preserved_tokens"] = preserved_tokens;
- NSMutableArray *additional_stops = [[NSMutableArray alloc] init];
- for (const auto & stop : chatParams.additional_stops) {
- [additional_stops addObject:[NSString stringWithUTF8String:stop.c_str()]];
- }
- result[@"additional_stops"] = additional_stops;
-
- return result;
- }
-
- - (NSString *)getFormattedChat:(NSString *)messages withChatTemplate:(NSString *)chatTemplate {
- auto tmpl_str = chatTemplate == nil ? "" : [chatTemplate UTF8String];
- return [NSString stringWithUTF8String:llama->getFormattedChat(
- [messages UTF8String],
- tmpl_str
- ).c_str()];;
- }
-
- - (NSArray *)tokenProbsToDict:(std::vector<rnllama::completion_token_output>)probs {
- NSMutableArray *out = [[NSMutableArray alloc] init];
- for (const auto &prob : probs)
- {
- NSMutableArray *probsForToken = [[NSMutableArray alloc] init];
- for (const auto &p : prob.probs)
- {
- std::string tokStr = rnllama::tokens_to_output_formatted_string(llama->ctx, p.tok);
- [probsForToken addObject:@{
- @"tok_str": [NSString stringWithUTF8String:tokStr.c_str()],
- @"prob": [NSNumber numberWithDouble:p.prob]
- }];
- }
- std::string tokStr = rnllama::tokens_to_output_formatted_string(llama->ctx, prob.tok);
- [out addObject:@{
- @"content": [NSString stringWithUTF8String:tokStr.c_str()],
- @"probs": probsForToken
- }];
- }
- return out;
- }
-
- - (NSDictionary *)completion:(NSDictionary *)params
- onToken:(void (^)(NSMutableDictionary * tokenResult))onToken
- {
- llama->rewind();
-
- //llama_reset_timings(llama->ctx);
-
- NSString *prompt = [params objectForKey:@"prompt"];
-
- llama->params.prompt = [prompt UTF8String];
- llama->params.sampling.seed = params[@"seed"] ? [params[@"seed"] intValue] : -1;
-
- if (params[@"n_threads"]) {
- int nThreads = params[@"n_threads"] ? [params[@"n_threads"] intValue] : llama->params.cpuparams.n_threads;
- const int maxThreads = (int) [[NSProcessInfo processInfo] processorCount];
- // Use 2 threads by default on 4-core devices, 4 threads on more cores
- const int defaultNThreads = nThreads == 4 ? 2 : MIN(4, maxThreads);
- llama->params.cpuparams.n_threads = nThreads > 0 ? nThreads : defaultNThreads;
- }
- if (params[@"n_predict"]) llama->params.n_predict = [params[@"n_predict"] intValue];
- if (params[@"ignore_eos"]) llama->params.sampling.ignore_eos = [params[@"ignore_eos"] boolValue];
-
- auto & sparams = llama->params.sampling;
-
- if (params[@"temperature"]) sparams.temp = [params[@"temperature"] doubleValue];
-
- if (params[@"n_probs"]) sparams.n_probs = [params[@"n_probs"] intValue];
-
- if (params[@"penalty_last_n"]) sparams.penalty_last_n = [params[@"penalty_last_n"] intValue];
- if (params[@"penalty_repeat"]) sparams.penalty_repeat = [params[@"penalty_repeat"] doubleValue];
- if (params[@"penalty_freq"]) sparams.penalty_freq = [params[@"penalty_freq"] doubleValue];
- if (params[@"penalty_present"]) sparams.penalty_present = [params[@"penalty_present"] doubleValue];
-
- if (params[@"mirostat"]) sparams.mirostat = [params[@"mirostat"] intValue];
- if (params[@"mirostat_tau"]) sparams.mirostat_tau = [params[@"mirostat_tau"] doubleValue];
- if (params[@"mirostat_eta"]) sparams.mirostat_eta = [params[@"mirostat_eta"] doubleValue];
-
- if (params[@"top_k"]) sparams.top_k = [params[@"top_k"] intValue];
- if (params[@"top_p"]) sparams.top_p = [params[@"top_p"] doubleValue];
- if (params[@"min_p"]) sparams.min_p = [params[@"min_p"] doubleValue];
- if (params[@"xtc_threshold"]) sparams.xtc_threshold = [params[@"xtc_threshold"] doubleValue];
- if (params[@"xtc_probability"]) sparams.xtc_probability = [params[@"xtc_probability"] doubleValue];
- if (params[@"typical_p"]) sparams.typ_p = [params[@"typical_p"] doubleValue];
-
- if (params[@"dry_multiplier"]) sparams.dry_multiplier = [params[@"dry_multiplier"] doubleValue];
- if (params[@"dry_base"]) sparams.dry_base = [params[@"dry_base"] doubleValue];
- if (params[@"dry_allowed_length"]) sparams.dry_allowed_length = [params[@"dry_allowed_length"] intValue];
- if (params[@"dry_penalty_last_n"]) sparams.dry_penalty_last_n = [params[@"dry_penalty_last_n"] intValue];
-
- if (params[@"top_n_sigma"]) sparams.top_n_sigma = [params[@"top_n_sigma"] doubleValue];
-
- // dry break seq
- if (params[@"dry_sequence_breakers"] && [params[@"dry_sequence_breakers"] isKindOfClass:[NSArray class]]) {
- NSArray *dry_sequence_breakers = params[@"dry_sequence_breakers"];
- for (NSString *s in dry_sequence_breakers) {
- sparams.dry_sequence_breakers.push_back([s UTF8String]);
- }
- }
-
- if (params[@"grammar"]) {
- sparams.grammar = [params[@"grammar"] UTF8String];
- }
-
- if (params[@"json_schema"] && !params[@"grammar"]) {
- sparams.grammar = json_schema_to_grammar(json::parse([params[@"json_schema"] UTF8String]));
- }
-
- if (params[@"grammar_lazy"]) {
- sparams.grammar_lazy = [params[@"grammar_lazy"] boolValue];
- }
-
- if (params[@"preserved_tokens"] && [params[@"preserved_tokens"] isKindOfClass:[NSArray class]]) {
- NSArray *preserved_tokens = params[@"preserved_tokens"];
- for (NSString *token in preserved_tokens) {
- auto ids = common_tokenize(llama->ctx, [token UTF8String], /* add_special= */ false, /* parse_special= */ true);
- if (ids.size() == 1) {
- sparams.preserved_tokens.insert(ids[0]);
- } else {
- // LOG_WRN("Not preserved because more than 1 token (wrong chat template override?): %s\n", [token UTF8String]);
- }
- }
- }
-
- if (params[@"grammar_triggers"] && [params[@"grammar_triggers"] isKindOfClass:[NSArray class]]) {
- NSArray *grammar_triggers = params[@"grammar_triggers"];
- for (NSDictionary *grammar_trigger in grammar_triggers) {
- const auto type = static_cast<common_grammar_trigger_type>([grammar_trigger[@"type"] intValue]);
- const auto & word = [grammar_trigger[@"value"] UTF8String];
-
- if (type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) {
- auto ids = common_tokenize(llama->ctx, word, /* add_special= */ false, /* parse_special= */ true);
- if (ids.size() == 1) {
- auto token = ids[0];
- if (std::find(sparams.preserved_tokens.begin(), sparams.preserved_tokens.end(), (llama_token) token) == sparams.preserved_tokens.end()) {
- throw std::runtime_error("Grammar trigger word should be marked as preserved token");
- }
- common_grammar_trigger trigger;
- trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN;
- trigger.value = word;
- trigger.token = token;
- sparams.grammar_triggers.push_back(std::move(trigger));
- } else {
- sparams.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word});
- }
- } else {
- common_grammar_trigger trigger;
- trigger.type = type;
- trigger.value = word;
- if (type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
- const auto token = (llama_token) [grammar_trigger[@"token"] intValue];
- trigger.token = token;
- }
- sparams.grammar_triggers.push_back(std::move(trigger));
- }
- }
- }
-
- llama->params.antiprompt.clear();
- if (params[@"stop"]) {
- NSArray *stop = params[@"stop"];
- for (NSString *s in stop) {
- llama->params.antiprompt.push_back([s UTF8String]);
- }
- }
-
- const llama_model * model = llama_get_model(llama->ctx);
- const llama_vocab * vocab = llama_model_get_vocab(model);
-
- sparams.logit_bias.clear();
- if (params[@"ignore_eos"] && [params[@"ignore_eos"] boolValue]) {
- sparams.logit_bias[llama_vocab_eos(vocab)].bias = -INFINITY;
- }
-
- if (params[@"logit_bias"] && [params[@"logit_bias"] isKindOfClass:[NSArray class]]) {
- const int n_vocab = llama_vocab_n_tokens(vocab);
- NSArray *logit_bias = params[@"logit_bias"];
- for (NSArray *el in logit_bias) {
- if ([el isKindOfClass:[NSArray class]] && [el count] == 2) {
- llama_token tok = [el[0] intValue];
- if (tok >= 0 && tok < n_vocab) {
- if ([el[1] isKindOfClass:[NSNumber class]]) {
- sparams.logit_bias[tok].bias = [el[1] doubleValue];
- } else if ([el[1] isKindOfClass:[NSNumber class]] && ![el[1] boolValue]) {
- sparams.logit_bias[tok].bias = -INFINITY;
- }
- }
- }
- }
- }
-
- if (!llama->initSampling()) {
- @throw [NSException exceptionWithName:@"LlamaException" reason:@"Failed to initialize sampling" userInfo:nil];
- }
- llama->beginCompletion();
- llama->loadPrompt();
-
- size_t sent_count = 0;
- size_t sent_token_probs_index = 0;
-
- while (llama->has_next_token && !llama->is_interrupted) {
576
- const rnllama::completion_token_output token_with_probs = llama->doCompletion();
577
- if (token_with_probs.tok == -1 || llama->incomplete) {
578
- continue;
579
- }
580
- const std::string token_text = common_token_to_piece(llama->ctx, token_with_probs.tok);
581
-
582
- size_t pos = std::min(sent_count, llama->generated_text.size());
583
-
584
- const std::string str_test = llama->generated_text.substr(pos);
585
- bool is_stop_full = false;
586
- size_t stop_pos =
587
- llama->findStoppingStrings(str_test, token_text.size(), rnllama::STOP_FULL);
588
- if (stop_pos != std::string::npos) {
589
- is_stop_full = true;
590
- llama->generated_text.erase(
591
- llama->generated_text.begin() + pos + stop_pos,
592
- llama->generated_text.end());
593
- pos = std::min(sent_count, llama->generated_text.size());
594
- } else {
595
- is_stop_full = false;
596
- stop_pos = llama->findStoppingStrings(str_test, token_text.size(),
597
- rnllama::STOP_PARTIAL);
598
- }
599
-
600
- if (
601
- stop_pos == std::string::npos ||
602
- // Send rest of the text if we are at the end of the generation
603
- (!llama->has_next_token && !is_stop_full && stop_pos > 0)
604
- ) {
605
- const std::string to_send = llama->generated_text.substr(pos, std::string::npos);
606
-
607
- sent_count += to_send.size();
608
-
609
- std::vector<rnllama::completion_token_output> probs_output = {};
610
-
611
- NSMutableDictionary *tokenResult = [[NSMutableDictionary alloc] init];
612
- tokenResult[@"token"] = [NSString stringWithUTF8String:to_send.c_str()];
613
-
614
- if (llama->params.sampling.n_probs > 0) {
615
- const std::vector<llama_token> to_send_toks = common_tokenize(llama->ctx, to_send, false);
616
- size_t probs_pos = std::min(sent_token_probs_index, llama->generated_token_probs.size());
617
- size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama->generated_token_probs.size());
618
- if (probs_pos < probs_stop_pos) {
619
- probs_output = std::vector<rnllama::completion_token_output>(llama->generated_token_probs.begin() + probs_pos, llama->generated_token_probs.begin() + probs_stop_pos);
620
- }
621
- sent_token_probs_index = probs_stop_pos;
622
-
623
- tokenResult[@"completion_probabilities"] = [self tokenProbsToDict:probs_output];
624
- }
625
-
626
- onToken(tokenResult);
627
- }
628
- }
629
-
630
- llama_perf_context_print(llama->ctx);
631
- llama->is_predicting = false;
632
-
633
- const auto timings = llama_perf_context(llama->ctx);
634
-
635
- NSMutableArray *toolCalls = nil;
636
- NSString *reasoningContent = nil;
637
- NSString *content = nil;
638
- if (!llama->is_interrupted) {
639
- try {
640
- auto chat_format = params[@"chat_format"] ? [params[@"chat_format"] intValue] : COMMON_CHAT_FORMAT_CONTENT_ONLY;
641
- common_chat_msg message = common_chat_parse(llama->generated_text, static_cast<common_chat_format>(chat_format));
642
- if (!message.reasoning_content.empty()) {
643
- reasoningContent = [NSString stringWithUTF8String:message.reasoning_content.c_str()];
644
- }
645
- content = [NSString stringWithUTF8String:message.content.c_str()];
646
- toolCalls = [[NSMutableArray alloc] init];
647
- for (const auto &tc : message.tool_calls) {
648
- [toolCalls addObject:@{
649
- @"type": @"function",
650
- @"function": @{
651
- @"name": [NSString stringWithUTF8String:tc.name.c_str()],
652
- @"arguments": [NSString stringWithUTF8String:tc.arguments.c_str()],
653
- },
654
- @"id": tc.id.empty() ? [NSNull null] : [NSString stringWithUTF8String:tc.id.c_str()],
655
- }];
656
- }
657
- } catch (const std::exception &e) {
658
- // NSLog(@"Error parsing tool calls: %s", e.what());
659
- }
660
- }
661
-
662
- NSMutableDictionary *result = [[NSMutableDictionary alloc] init];
663
- result[@"text"] = [NSString stringWithUTF8String:llama->generated_text.c_str()]; // Original text
664
- if (content) result[@"content"] = content;
665
- if (reasoningContent) result[@"reasoning_content"] = reasoningContent;
666
- if (toolCalls && toolCalls.count > 0) result[@"tool_calls"] = toolCalls;
667
- result[@"completion_probabilities"] = [self tokenProbsToDict:llama->generated_token_probs];
668
- result[@"tokens_predicted"] = @(llama->num_tokens_predicted);
669
- result[@"tokens_evaluated"] = @(llama->num_prompt_tokens);
670
- result[@"truncated"] = @(llama->truncated);
671
- result[@"stopped_eos"] = @(llama->stopped_eos);
672
- result[@"stopped_word"] = @(llama->stopped_word);
673
- result[@"stopped_limit"] = @(llama->stopped_limit);
674
- result[@"stopping_word"] = [NSString stringWithUTF8String:llama->stopping_word.c_str()];
675
- result[@"tokens_cached"] = @(llama->n_past);
676
- result[@"timings"] = @{
677
- @"prompt_n": @(timings.n_p_eval),
678
- @"prompt_ms": @(timings.t_p_eval_ms),
679
- @"prompt_per_token_ms": @(timings.t_p_eval_ms / timings.n_p_eval),
680
- @"prompt_per_second": @(1e3 / timings.t_p_eval_ms * timings.n_p_eval),
681
- @"predicted_n": @(timings.n_eval),
682
- @"predicted_n": @(timings.n_eval),
683
- @"predicted_ms": @(timings.t_eval_ms),
684
- @"predicted_per_token_ms": @(timings.t_eval_ms / timings.n_eval),
685
- @"predicted_per_second": @(1e3 / timings.t_eval_ms * timings.n_eval),
686
- };
687
- return result;
688
- }
689
-
690
- - (void)stopCompletion {
691
- llama->is_interrupted = true;
692
- }
693
-
694
- - (NSArray *)tokenize:(NSString *)text {
695
- const std::vector<llama_token> toks = common_tokenize(llama->ctx, [text UTF8String], false);
696
- NSMutableArray *result = [[NSMutableArray alloc] init];
697
- for (llama_token tok : toks) {
698
- [result addObject:@(tok)];
699
- }
700
- return result;
701
- }
702
-
703
- - (NSString *)detokenize:(NSArray *)tokens {
704
- std::vector<llama_token> toks;
705
- for (NSNumber *tok in tokens) {
706
- toks.push_back([tok intValue]);
707
- }
708
- const std::string text = rnllama::tokens_to_str(llama->ctx, toks.cbegin(), toks.cend());
709
- return [NSString stringWithUTF8String:text.c_str()];
710
- }
711
-
712
- - (NSDictionary *)embedding:(NSString *)text params:(NSDictionary *)params {
713
- if (llama->params.embedding != true) {
714
- @throw [NSException exceptionWithName:@"LlamaException" reason:@"Embedding is not enabled" userInfo:nil];
715
- }
716
-
717
- common_params embdParams;
718
- embdParams.embedding = true;
719
- embdParams.embd_normalize = llama->params.embd_normalize;
720
-
721
- if (params[@"embd_normalize"] && [params[@"embd_normalize"] isKindOfClass:[NSNumber class]]) {
722
- embdParams.embd_normalize = [params[@"embd_normalize"] intValue];
723
- }
724
-
725
- llama->rewind();
726
-
727
- llama_perf_context_reset(llama->ctx);
728
-
729
- llama->params.prompt = [text UTF8String];
730
-
731
- llama->params.n_predict = 0;
732
-
733
- if (!llama->initSampling()) {
734
- @throw [NSException exceptionWithName:@"LlamaException" reason:@"Failed to initialize sampling" userInfo:nil];
735
- }
736
- llama->beginCompletion();
737
- llama->loadPrompt();
738
- llama->doCompletion();
739
-
740
- std::vector<float> result = llama->getEmbedding(embdParams);
741
-
742
- NSMutableDictionary *resultDict = [[NSMutableDictionary alloc] init];
743
- NSMutableArray *embeddingResult = [[NSMutableArray alloc] init];
744
- for (float f : result) {
745
- [embeddingResult addObject:@(f)];
746
- }
747
- resultDict[@"embedding"] = embeddingResult;
748
- NSMutableArray *promptTokens = [[NSMutableArray alloc] init];
749
- for (llama_token tok : llama->embd) {
750
- [promptTokens addObject:[NSString stringWithUTF8String:common_token_to_piece(llama->ctx, tok).c_str()]];
751
- }
752
- resultDict[@"prompt_tokens"] = promptTokens;
753
-
754
- llama->is_predicting = false;
755
- return resultDict;
756
- }
757
-
758
- - (NSDictionary *)loadSession:(NSString *)path {
759
- if (!path || [path length] == 0) {
760
- @throw [NSException exceptionWithName:@"LlamaException" reason:@"Session path is empty" userInfo:nil];
761
- }
762
- if (![[NSFileManager defaultManager] fileExistsAtPath:path]) {
763
- @throw [NSException exceptionWithName:@"LlamaException" reason:@"Session file does not exist" userInfo:nil];
764
- }
765
-
766
- size_t n_token_count_out = 0;
767
- llama->embd.resize(llama->params.n_ctx);
768
- if (!llama_state_load_file(llama->ctx, [path UTF8String], llama->embd.data(), llama->embd.capacity(), &n_token_count_out)) {
769
- @throw [NSException exceptionWithName:@"LlamaException" reason:@"Failed to load session" userInfo:nil];
770
- }
771
- llama->embd.resize(n_token_count_out);
772
- const std::string text = rnllama::tokens_to_str(llama->ctx, llama->embd.cbegin(), llama->embd.cend());
773
- return @{
774
- @"tokens_loaded": @(n_token_count_out),
775
- @"prompt": [NSString stringWithUTF8String:text.c_str()]
776
- };
777
- }
778
-
779
- - (int)saveSession:(NSString *)path size:(int)size {
780
- if (!path || [path length] == 0) {
781
- @throw [NSException exceptionWithName:@"LlamaException" reason:@"Session path is empty" userInfo:nil];
782
- }
783
- std::vector<llama_token> session_tokens = llama->embd;
784
- int default_size = session_tokens.size();
785
- int save_size = size > 0 && size <= default_size ? size : default_size;
786
- if (!llama_state_save_file(llama->ctx, [path UTF8String], session_tokens.data(), save_size)) {
787
- @throw [NSException exceptionWithName:@"LlamaException" reason:@"Failed to save session" userInfo:nil];
788
- }
789
- return session_tokens.size();
790
- }
791
-
792
- - (NSString *)bench:(int)pp tg:(int)tg pl:(int)pl nr:(int)nr {
793
- return [NSString stringWithUTF8String:llama->bench(pp, tg, pl, nr).c_str()];
794
- }
795
-
796
- - (void)applyLoraAdapters:(NSArray *)loraAdapters {
797
- std::vector<common_adapter_lora_info> lora_adapters;
798
- for (NSDictionary *loraAdapter in loraAdapters) {
799
- common_adapter_lora_info la;
800
- la.path = [loraAdapter[@"path"] UTF8String];
801
- la.scale = [loraAdapter[@"scaled"] doubleValue];
802
- la.ptr = llama_adapter_lora_init(llama->model, la.path.c_str());
803
- if (la.ptr == nullptr) {
804
- @throw [NSException exceptionWithName:@"LlamaException" reason:@"Failed to apply lora adapter" userInfo:nil];
805
- }
806
- lora_adapters.push_back(la);
807
- }
808
- int result = llama->applyLoraAdapters(lora_adapters);
809
- if (result != 0) {
810
- @throw [NSException exceptionWithName:@"LlamaException" reason:@"Failed to apply lora adapters" userInfo:nil];
811
- }
812
- }
813
-
814
- - (void)removeLoraAdapters {
815
- llama->removeLoraAdapters();
816
- }
817
-
818
- - (NSArray *)getLoadedLoraAdapters {
819
- std::vector<common_adapter_lora_info> loaded_lora_adapters = llama->getLoadedLoraAdapters();
820
- NSMutableArray *result = [[NSMutableArray alloc] init];
821
- for (common_adapter_lora_info &la : loaded_lora_adapters) {
822
- [result addObject:@{
823
- @"path": [NSString stringWithUTF8String:la.path.c_str()],
824
- @"scale": @(la.scale)
825
- }];
826
- }
827
- return result;
828
- }
829
-
830
- - (void)invalidate {
831
- delete llama;
832
- // llama_backend_free();
833
- }
834
-
835
- @end
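For reference, the streaming branch above delivers each text chunk through the `onToken` block as a dictionary holding a `token` string and, when `n_probs` > 0, a `completion_probabilities` array built by `tokenProbsToDict:`. A minimal consumer sketch follows (illustrative only, not from the package sources; the context, prompt, and parameter values are assumptions):

    #import <Foundation/Foundation.h>
    #import "RNLlamaContext.h"

    // Sketch: consuming the -completion:onToken: streaming callback shown above.
    // `ctx` is assumed to be an already-initialized RNLlamaContext.
    static void streamDemo(RNLlamaContext *ctx) {
        NSMutableString *streamed = [NSMutableString string];
        NSDictionary *result = [ctx completion:@{
            @"prompt": @"Hello",   // hypothetical prompt
            @"n_predict": @(32),
            @"n_probs": @(2)       // > 0, so each chunk also carries probabilities
        } onToken:^(NSMutableDictionary *tokenResult) {
            // Each callback carries the newly emitted text piece...
            [streamed appendString:tokenResult[@"token"]];
            // ...and, because n_probs > 0, the per-token probability entries.
            NSArray *probs = tokenResult[@"completion_probabilities"];
            NSLog(@"chunk: %@ (%lu prob entries)", tokenResult[@"token"], (unsigned long)probs.count);
        }];
        NSLog(@"stopped_word=%@ stopped_eos=%@, full text: %@",
              result[@"stopped_word"], result[@"stopped_eos"], result[@"text"]);
    }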
1
+ #import "RNLlamaContext.h"
2
+ #import <Metal/Metal.h>
3
+
4
+ @implementation RNLlamaContext
5
+
6
+ + (void)toggleNativeLog:(BOOL)enabled onEmitLog:(void (^)(NSString *level, NSString *text))onEmitLog {
7
+ if (enabled) {
8
+ void (^copiedBlock)(NSString *, NSString *) = [onEmitLog copy];
9
+ llama_log_set([](lm_ggml_log_level level, const char * text, void * data) {
10
+ llama_log_callback_default(level, text, data);
11
+ NSString *levelStr = @"";
12
+ if (level == LM_GGML_LOG_LEVEL_ERROR) {
13
+ levelStr = @"error";
14
+ } else if (level == LM_GGML_LOG_LEVEL_INFO) {
15
+ levelStr = @"info";
16
+ } else if (level == LM_GGML_LOG_LEVEL_WARN) {
17
+ levelStr = @"warn";
18
+ }
19
+
20
+ NSString *textStr = [NSString stringWithUTF8String:text];
21
+ // NOTE: Convert to UTF-8 string may fail
22
+ if (!textStr) {
23
+ return;
24
+ }
25
+ void (^block)(NSString *, NSString *) = (__bridge void (^)(NSString *, NSString *))(data);
26
+ block(levelStr, textStr);
27
+ }, copiedBlock);
28
+ } else {
29
+ llama_log_set(llama_log_callback_default, nullptr);
30
+ }
31
+ }
32
+
33
+ + (NSDictionary *)modelInfo:(NSString *)path skip:(NSArray *)skip {
34
+ struct lm_gguf_init_params params = {
35
+ /*.no_alloc = */ false,
36
+ /*.ctx = */ NULL,
37
+ };
38
+
39
+ struct lm_gguf_context * ctx = lm_gguf_init_from_file([path UTF8String], params);
40
+
41
+ if (!ctx) {
42
+ NSLog(@"%s: failed to load '%s'\n", __func__, [path UTF8String]);
43
+ return @{};
44
+ }
45
+
46
+ NSMutableDictionary *info = [[NSMutableDictionary alloc] init];
47
+
48
+ info[@"version"] = @(lm_gguf_get_version(ctx));
49
+ info[@"alignment"] = @(lm_gguf_get_alignment(ctx));
50
+ info[@"data_offset"] = @(lm_gguf_get_data_offset(ctx));
51
+
52
+ // kv
53
+ {
54
+ const int n_kv = lm_gguf_get_n_kv(ctx);
55
+
56
+ for (int i = 0; i < n_kv; ++i) {
57
+ const char * key = lm_gguf_get_key(ctx, i);
58
+
59
+ if (skip && [skip containsObject:[NSString stringWithUTF8String:key]]) {
60
+ continue;
61
+ }
62
+ const std::string value = lm_gguf_kv_to_str(ctx, i);
63
+ info[[NSString stringWithUTF8String:key]] = [NSString stringWithUTF8String:value.c_str()];
64
+ }
65
+ }
66
+
67
+ lm_gguf_free(ctx);
68
+
69
+ return info;
70
+ }
71
+
72
+ + (instancetype)initWithParams:(NSDictionary *)params onProgress:(void (^)(unsigned int progress))onProgress {
73
+ // llama_backend_init(false);
74
+ common_params defaultParams;
75
+
76
+ if (params[@"vocab_only"]) {
77
+ defaultParams.vocab_only = [params[@"vocab_only"] boolValue];
78
+ defaultParams.warmup = false;
79
+ }
80
+
81
+ NSString *modelPath = params[@"model"];
82
+ BOOL isAsset = [params[@"is_model_asset"] boolValue];
83
+ NSString *path = modelPath;
84
+ if (isAsset) path = [[NSBundle mainBundle] pathForResource:modelPath ofType:nil];
85
+ defaultParams.model = {[path UTF8String]};
86
+
87
+ NSString *chatTemplate = params[@"chat_template"];
88
+ if (chatTemplate) {
89
+ defaultParams.chat_template = [chatTemplate UTF8String];
90
+ NSLog(@"chatTemplate: %@", chatTemplate);
91
+ }
92
+
93
+ NSString *reasoningFormat = params[@"reasoning_format"];
94
+ if (reasoningFormat && [reasoningFormat isEqualToString:@"deepseek"]) {
95
+ defaultParams.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
96
+ } else {
97
+ defaultParams.reasoning_format = COMMON_REASONING_FORMAT_NONE;
98
+ }
99
+
100
+ if (params[@"n_ctx"]) defaultParams.n_ctx = [params[@"n_ctx"] intValue];
101
+ if (params[@"use_mlock"]) defaultParams.use_mlock = [params[@"use_mlock"]boolValue];
102
+
103
+ BOOL skipGpuDevices = params[@"no_gpu_devices"] && [params[@"no_gpu_devices"] boolValue];
104
+
105
+ BOOL isMetalEnabled = false;
106
+ NSString *reasonNoMetal = @"";
107
+ defaultParams.n_gpu_layers = 0;
108
+ #ifdef LM_GGML_USE_METAL
109
+ // Check ggml-metal availability
110
+ NSError * error = nil;
111
+ id<MTLDevice> device = MTLCreateSystemDefaultDevice();
112
+ id<MTLLibrary> library = [device
113
+ newLibraryWithSource:@"#include <metal_stdlib>\n"
114
+ "using namespace metal;"
115
+ "typedef matrix<bfloat, 4, 4> bfloat4x4;"
116
+ "kernel void test() { simd_sum(0); }"
117
+ options:nil
118
+ error:&error
119
+ ];
120
+ if (error) {
121
+ reasonNoMetal = [error localizedDescription];
122
+ skipGpuDevices = true;
123
+ } else {
124
+ id<MTLFunction> kernel = [library newFunctionWithName:@"test"];
125
+ id<MTLComputePipelineState> pipeline = [device newComputePipelineStateWithFunction:kernel error:&error];
126
+ if (pipeline == nil) {
127
+ reasonNoMetal = [error localizedDescription];
128
+ skipGpuDevices = true;
129
+ } else {
130
+ #if TARGET_OS_SIMULATOR
131
+ // Use the backend, but no layers because not supported fully on simulator
132
+ defaultParams.n_gpu_layers = 0;
133
+ isMetalEnabled = true;
134
+ #else
135
+ defaultParams.n_gpu_layers = [params[@"n_gpu_layers"] intValue];
136
+ isMetalEnabled = true;
137
+ #endif
138
+ }
139
+ }
140
+ device = nil;
141
+ #else
142
+ reasonNoMetal = @"Metal is not enabled in this build";
143
+ isMetalEnabled = false;
144
+ #endif
145
+
146
+ if (skipGpuDevices) {
147
+ std::vector<lm_ggml_backend_dev_t> cpu_devs;
148
+ for (size_t i = 0; i < lm_ggml_backend_dev_count(); ++i) {
149
+ lm_ggml_backend_dev_t dev = lm_ggml_backend_dev_get(i);
150
+ switch (lm_ggml_backend_dev_type(dev)) {
151
+ case LM_GGML_BACKEND_DEVICE_TYPE_CPU:
152
+ case LM_GGML_BACKEND_DEVICE_TYPE_ACCEL:
153
+ cpu_devs.push_back(dev);
154
+ break;
155
+ case LM_GGML_BACKEND_DEVICE_TYPE_GPU:
156
+ break;
157
+ }
158
+ }
159
+ if (cpu_devs.size() > 0) {
160
+ defaultParams.devices = cpu_devs;
161
+ }
162
+ }
163
+
164
+ if (params[@"n_batch"]) defaultParams.n_batch = [params[@"n_batch"] intValue];
165
+ if (params[@"n_ubatch"]) defaultParams.n_ubatch = [params[@"n_ubatch"] intValue];
166
+ if (params[@"use_mmap"]) defaultParams.use_mmap = [params[@"use_mmap"] boolValue];
167
+
168
+ if (params[@"pooling_type"] && [params[@"pooling_type"] isKindOfClass:[NSNumber class]]) {
169
+ defaultParams.pooling_type = static_cast<enum llama_pooling_type>([params[@"pooling_type"] intValue]);
170
+ }
171
+
172
+ if (params[@"embedding"] && [params[@"embedding"] boolValue]) {
173
+ defaultParams.embedding = true;
174
+ // For non-causal models, batch size must be equal to ubatch size
175
+ defaultParams.n_ubatch = defaultParams.n_batch;
176
+
177
+ if (params[@"embd_normalize"] && [params[@"embd_normalize"] isKindOfClass:[NSNumber class]]) {
178
+ defaultParams.embd_normalize = [params[@"embd_normalize"] intValue];
179
+ }
180
+ }
181
+
182
+ if (params[@"rope_freq_base"]) defaultParams.rope_freq_base = [params[@"rope_freq_base"] floatValue];
183
+ if (params[@"rope_freq_scale"]) defaultParams.rope_freq_scale = [params[@"rope_freq_scale"] floatValue];
184
+
185
+ if (params[@"flash_attn"] && [params[@"flash_attn"] boolValue]) defaultParams.flash_attn = true;
186
+
187
+ if (params[@"cache_type_k"]) defaultParams.cache_type_k = rnllama::kv_cache_type_from_str([params[@"cache_type_k"] UTF8String]);
188
+ if (params[@"cache_type_v"]) defaultParams.cache_type_v = rnllama::kv_cache_type_from_str([params[@"cache_type_v"] UTF8String]);
189
+
190
+ int nThreads = params[@"n_threads"] ? [params[@"n_threads"] intValue] : 0;
191
+ const int maxThreads = (int) [[NSProcessInfo processInfo] processorCount];
192
+ // Use 2 threads by default on 4-core devices, 4 threads on more cores
193
+ const int defaultNThreads = nThreads == 4 ? 2 : MIN(4, maxThreads);
194
+ defaultParams.cpuparams.n_threads = nThreads > 0 ? nThreads : defaultNThreads;
195
+
196
+ RNLlamaContext *context = [[RNLlamaContext alloc] init];
197
+ context->llama = new rnllama::llama_rn_context();
198
+ context->llama->is_load_interrupted = false;
199
+ context->llama->loading_progress = 0;
200
+ context->onProgress = onProgress;
201
+
202
+ if (params[@"use_progress_callback"] && [params[@"use_progress_callback"] boolValue]) {
203
+ defaultParams.progress_callback = [](float progress, void * user_data) {
204
+ RNLlamaContext *context = (__bridge RNLlamaContext *)(user_data);
205
+ unsigned percentage = (unsigned) (100 * progress);
206
+ if (percentage > context->llama->loading_progress) {
207
+ context->llama->loading_progress = percentage;
208
+ context->onProgress(percentage);
209
+ }
210
+ return !context->llama->is_load_interrupted;
211
+ };
212
+ defaultParams.progress_callback_user_data = context;
213
+ }
214
+
215
+ context->is_model_loaded = context->llama->loadModel(defaultParams);
216
+
217
+ if (
218
+ params[@"embedding"] && [params[@"embedding"] boolValue] &&
219
+ llama_model_has_encoder(context->llama->model) && llama_model_has_decoder(context->llama->model)
220
+ ) {
221
+ delete context->llama;
222
+ @throw [NSException exceptionWithName:@"LlamaException" reason:@"Embedding is not supported in encoder-decoder models" userInfo:nil];
223
+ }
224
+
225
+ std::vector<common_adapter_lora_info> lora;
226
+ if (params[@"lora"]) {
227
+ common_adapter_lora_info la;
228
+ la.path = [params[@"lora"] UTF8String];
229
+ la.scale = 1.0f;
230
+ if (params[@"lora_scaled"]) la.scale = [params[@"lora_scaled"] floatValue];
231
+ lora.push_back(la);
232
+ }
233
+ if (params[@"lora_list"] && [params[@"lora_list"] isKindOfClass:[NSArray class]]) {
234
+ NSArray *lora_list = params[@"lora_list"];
235
+ for (NSDictionary *lora_adapter in lora_list) {
236
+ NSString *path = lora_adapter[@"path"];
237
+ if (!path) continue;
238
+ float scale = [lora_adapter[@"scaled"] floatValue];
239
+ common_adapter_lora_info la;
240
+ la.path = [path UTF8String];
241
+ la.scale = scale;
242
+ lora.push_back(la);
243
+ }
244
+ }
245
+ if (lora.size() > 0) {
246
+ int result = context->llama->applyLoraAdapters(lora);
247
+ if (result != 0) {
248
+ delete context->llama;
249
+ @throw [NSException exceptionWithName:@"LlamaException" reason:@"Failed to apply lora adapters" userInfo:nil];
250
+ }
251
+ }
252
+
253
+ context->is_metal_enabled = isMetalEnabled;
254
+ context->reason_no_metal = reasonNoMetal;
255
+
256
+ return context;
257
+ }
258
+
259
+ - (void)interruptLoad {
260
+ llama->is_load_interrupted = true;
261
+ }
262
+
263
+ - (bool)isMetalEnabled {
264
+ return is_metal_enabled;
265
+ }
266
+
267
+ - (NSString *)reasonNoMetal {
268
+ return reason_no_metal;
269
+ }
270
+
271
+ - (NSDictionary *)modelInfo {
272
+ char desc[1024];
273
+ llama_model_desc(llama->model, desc, sizeof(desc));
274
+
275
+ int count = llama_model_meta_count(llama->model);
276
+ NSDictionary *meta = [[NSMutableDictionary alloc] init];
277
+ for (int i = 0; i < count; i++) {
278
+ char key[256];
279
+ llama_model_meta_key_by_index(llama->model, i, key, sizeof(key));
280
+ char val[4096];
281
+ llama_model_meta_val_str_by_index(llama->model, i, val, sizeof(val));
282
+
283
+ NSString *keyStr = [NSString stringWithUTF8String:key];
284
+ NSString *valStr = [NSString stringWithUTF8String:val];
285
+ [meta setValue:valStr forKey:keyStr];
286
+ }
287
+
288
+ auto template_tool_use = llama->templates.get()->template_tool_use.get();
289
+ NSDictionary *tool_use_caps_dir = nil;
290
+ if (template_tool_use) {
291
+ auto tool_use_caps = template_tool_use->original_caps();
292
+ tool_use_caps_dir = @{
293
+ @"tools": @(tool_use_caps.supports_tools),
294
+ @"toolCalls": @(tool_use_caps.supports_tool_calls),
295
+ @"toolResponses": @(tool_use_caps.supports_tool_responses),
296
+ @"systemRole": @(tool_use_caps.supports_system_role),
297
+ @"parallelToolCalls": @(tool_use_caps.supports_parallel_tool_calls),
298
+ @"toolCallId": @(tool_use_caps.supports_tool_call_id)
299
+ };
300
+ }
301
+
302
+ auto default_tmpl = llama->templates.get()->template_default.get();
303
+ auto default_tmpl_caps = default_tmpl->original_caps();
304
+
305
+ return @{
306
+ @"desc": [NSString stringWithUTF8String:desc],
307
+ @"size": @(llama_model_size(llama->model)),
308
+ @"nEmbd": @(llama_model_n_embd(llama->model)),
309
+ @"nParams": @(llama_model_n_params(llama->model)),
310
+ @"chatTemplates": @{
311
+ @"llamaChat": @(llama->validateModelChatTemplate(false, nullptr)),
312
+ @"minja": @{
313
+ @"default": @(llama->validateModelChatTemplate(true, nullptr)),
314
+ @"defaultCaps": @{
315
+ @"tools": @(default_tmpl_caps.supports_tools),
316
+ @"toolCalls": @(default_tmpl_caps.supports_tool_calls),
317
+ @"toolResponses": @(default_tmpl_caps.supports_tool_responses),
318
+ @"systemRole": @(default_tmpl_caps.supports_system_role),
319
+ @"parallelToolCalls": @(default_tmpl_caps.supports_parallel_tool_calls),
320
+ @"toolCallId": @(default_tmpl_caps.supports_tool_call_id)
321
+ },
322
+ @"toolUse": @(llama->validateModelChatTemplate(true, "tool_use")),
323
+ @"toolUseCaps": tool_use_caps_dir ?: @{}
324
+ }
325
+ },
326
+ @"metadata": meta,
327
+
328
+ // deprecated
329
+ @"isChatTemplateSupported": @(llama->validateModelChatTemplate(false, nullptr))
330
+ };
331
+ }
332
+
333
+ - (bool)isModelLoaded {
334
+ return is_model_loaded;
335
+ }
336
+
337
+ - (bool)isPredicting {
338
+ return llama->is_predicting;
339
+ }
340
+
341
+ - (NSDictionary *)getFormattedChatWithJinja:(NSString *)messages
342
+ withChatTemplate:(NSString *)chatTemplate
343
+ withJsonSchema:(NSString *)jsonSchema
344
+ withTools:(NSString *)tools
345
+ withParallelToolCalls:(BOOL)parallelToolCalls
346
+ withToolChoice:(NSString *)toolChoice
347
+ {
348
+ auto tmpl_str = chatTemplate == nil ? "" : [chatTemplate UTF8String];
349
+
350
+ NSMutableDictionary *result = [[NSMutableDictionary alloc] init];
351
+ auto chatParams = llama->getFormattedChatWithJinja(
352
+ [messages UTF8String],
353
+ tmpl_str,
354
+ jsonSchema == nil ? "" : [jsonSchema UTF8String],
355
+ tools == nil ? "" : [tools UTF8String],
356
+ parallelToolCalls,
357
+ toolChoice == nil ? "" : [toolChoice UTF8String]
358
+ );
359
+ result[@"prompt"] = [NSString stringWithUTF8String:chatParams.prompt.c_str()];
360
+ result[@"chat_format"] = @(static_cast<int>(chatParams.format));
361
+ result[@"grammar"] = [NSString stringWithUTF8String:chatParams.grammar.c_str()];
362
+ result[@"grammar_lazy"] = @(chatParams.grammar_lazy);
363
+ NSMutableArray *grammar_triggers = [[NSMutableArray alloc] init];
364
+ for (const auto & trigger : chatParams.grammar_triggers) {
365
+ [grammar_triggers addObject:@{
366
+ @"type": @(trigger.type),
367
+ @"value": [NSString stringWithUTF8String:trigger.value.c_str()],
368
+ @"token": @(trigger.token),
369
+ }];
370
+ }
371
+ result[@"grammar_triggers"] = grammar_triggers;
372
+ NSMutableArray *preserved_tokens = [[NSMutableArray alloc] init];
373
+ for (const auto & token : chatParams.preserved_tokens) {
374
+ [preserved_tokens addObject:[NSString stringWithUTF8String:token.c_str()]];
375
+ }
376
+ result[@"preserved_tokens"] = preserved_tokens;
377
+ NSMutableArray *additional_stops = [[NSMutableArray alloc] init];
378
+ for (const auto & stop : chatParams.additional_stops) {
379
+ [additional_stops addObject:[NSString stringWithUTF8String:stop.c_str()]];
380
+ }
381
+ result[@"additional_stops"] = additional_stops;
382
+
383
+ return result;
384
+ }
385
+
386
+ - (NSString *)getFormattedChat:(NSString *)messages withChatTemplate:(NSString *)chatTemplate {
387
+ auto tmpl_str = chatTemplate == nil ? "" : [chatTemplate UTF8String];
388
+ return [NSString stringWithUTF8String:llama->getFormattedChat(
389
+ [messages UTF8String],
390
+ tmpl_str
391
+ ).c_str()];
392
+ }
393
+
394
+ - (NSArray *)tokenProbsToDict:(std::vector<rnllama::completion_token_output>)probs {
395
+ NSMutableArray *out = [[NSMutableArray alloc] init];
396
+ for (const auto &prob : probs)
397
+ {
398
+ NSMutableArray *probsForToken = [[NSMutableArray alloc] init];
399
+ for (const auto &p : prob.probs)
400
+ {
401
+ std::string tokStr = rnllama::tokens_to_output_formatted_string(llama->ctx, p.tok);
402
+ [probsForToken addObject:@{
403
+ @"tok_str": [NSString stringWithUTF8String:tokStr.c_str()],
404
+ @"prob": [NSNumber numberWithDouble:p.prob]
405
+ }];
406
+ }
407
+ std::string tokStr = rnllama::tokens_to_output_formatted_string(llama->ctx, prob.tok);
408
+ [out addObject:@{
409
+ @"content": [NSString stringWithUTF8String:tokStr.c_str()],
410
+ @"probs": probsForToken
411
+ }];
412
+ }
413
+ return out;
414
+ }
415
+
416
+ - (NSDictionary *)completion:(NSDictionary *)params
417
+ onToken:(void (^)(NSMutableDictionary * tokenResult))onToken
418
+ {
419
+ llama->rewind();
420
+
421
+ //llama_reset_timings(llama->ctx);
422
+
423
+ NSString *prompt = [params objectForKey:@"prompt"];
424
+
425
+ llama->params.prompt = [prompt UTF8String];
426
+ llama->params.sampling.seed = params[@"seed"] ? [params[@"seed"] intValue] : -1;
427
+
428
+ if (params[@"n_threads"]) {
429
+ int nThreads = params[@"n_threads"] ? [params[@"n_threads"] intValue] : llama->params.cpuparams.n_threads;
430
+ const int maxThreads = (int) [[NSProcessInfo processInfo] processorCount];
431
+ // Use 2 threads by default on 4-core devices, 4 threads on more cores
432
+ const int defaultNThreads = nThreads == 4 ? 2 : MIN(4, maxThreads);
433
+ llama->params.cpuparams.n_threads = nThreads > 0 ? nThreads : defaultNThreads;
434
+ }
435
+ if (params[@"n_predict"]) llama->params.n_predict = [params[@"n_predict"] intValue];
436
+ if (params[@"ignore_eos"]) llama->params.sampling.ignore_eos = [params[@"ignore_eos"] boolValue];
437
+
438
+ auto & sparams = llama->params.sampling;
439
+
440
+ if (params[@"temperature"]) sparams.temp = [params[@"temperature"] doubleValue];
441
+
442
+ if (params[@"n_probs"]) sparams.n_probs = [params[@"n_probs"] intValue];
443
+
444
+ if (params[@"penalty_last_n"]) sparams.penalty_last_n = [params[@"penalty_last_n"] intValue];
445
+ if (params[@"penalty_repeat"]) sparams.penalty_repeat = [params[@"penalty_repeat"] doubleValue];
446
+ if (params[@"penalty_freq"]) sparams.penalty_freq = [params[@"penalty_freq"] doubleValue];
447
+ if (params[@"penalty_present"]) sparams.penalty_present = [params[@"penalty_present"] doubleValue];
448
+
449
+ if (params[@"mirostat"]) sparams.mirostat = [params[@"mirostat"] intValue];
450
+ if (params[@"mirostat_tau"]) sparams.mirostat_tau = [params[@"mirostat_tau"] doubleValue];
451
+ if (params[@"mirostat_eta"]) sparams.mirostat_eta = [params[@"mirostat_eta"] doubleValue];
452
+
453
+ if (params[@"top_k"]) sparams.top_k = [params[@"top_k"] intValue];
454
+ if (params[@"top_p"]) sparams.top_p = [params[@"top_p"] doubleValue];
455
+ if (params[@"min_p"]) sparams.min_p = [params[@"min_p"] doubleValue];
456
+ if (params[@"xtc_threshold"]) sparams.xtc_threshold = [params[@"xtc_threshold"] doubleValue];
457
+ if (params[@"xtc_probability"]) sparams.xtc_probability = [params[@"xtc_probability"] doubleValue];
458
+ if (params[@"typical_p"]) sparams.typ_p = [params[@"typical_p"] doubleValue];
459
+
460
+ if (params[@"dry_multiplier"]) sparams.dry_multiplier = [params[@"dry_multiplier"] doubleValue];
461
+ if (params[@"dry_base"]) sparams.dry_base = [params[@"dry_base"] doubleValue];
462
+ if (params[@"dry_allowed_length"]) sparams.dry_allowed_length = [params[@"dry_allowed_length"] intValue];
463
+ if (params[@"dry_penalty_last_n"]) sparams.dry_penalty_last_n = [params[@"dry_penalty_last_n"] intValue];
464
+
465
+ if (params[@"top_n_sigma"]) sparams.top_n_sigma = [params[@"top_n_sigma"] doubleValue];
466
+
467
+ // dry break seq
468
+ if (params[@"dry_sequence_breakers"] && [params[@"dry_sequence_breakers"] isKindOfClass:[NSArray class]]) {
469
+ NSArray *dry_sequence_breakers = params[@"dry_sequence_breakers"];
470
+ for (NSString *s in dry_sequence_breakers) {
471
+ sparams.dry_sequence_breakers.push_back([s UTF8String]);
472
+ }
473
+ }
474
+
475
+ if (params[@"grammar"]) {
476
+ sparams.grammar = [params[@"grammar"] UTF8String];
477
+ }
478
+
479
+ if (params[@"json_schema"] && !params[@"grammar"]) {
480
+ sparams.grammar = json_schema_to_grammar(json::parse([params[@"json_schema"] UTF8String]));
481
+ }
482
+
483
+ if (params[@"grammar_lazy"]) {
484
+ sparams.grammar_lazy = [params[@"grammar_lazy"] boolValue];
485
+ }
486
+
487
+ if (params[@"preserved_tokens"] && [params[@"preserved_tokens"] isKindOfClass:[NSArray class]]) {
488
+ NSArray *preserved_tokens = params[@"preserved_tokens"];
489
+ for (NSString *token in preserved_tokens) {
490
+ auto ids = common_tokenize(llama->ctx, [token UTF8String], /* add_special= */ false, /* parse_special= */ true);
491
+ if (ids.size() == 1) {
492
+ sparams.preserved_tokens.insert(ids[0]);
493
+ } else {
494
+ // LOG_WRN("Not preserved because more than 1 token (wrong chat template override?): %s\n", [token UTF8String]);
495
+ }
496
+ }
497
+ }
498
+
499
+ if (params[@"grammar_triggers"] && [params[@"grammar_triggers"] isKindOfClass:[NSArray class]]) {
500
+ NSArray *grammar_triggers = params[@"grammar_triggers"];
501
+ for (NSDictionary *grammar_trigger in grammar_triggers) {
502
+ const auto type = static_cast<common_grammar_trigger_type>([grammar_trigger[@"type"] intValue]);
503
+ const auto & word = [grammar_trigger[@"value"] UTF8String];
504
+
505
+ if (type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) {
506
+ auto ids = common_tokenize(llama->ctx, word, /* add_special= */ false, /* parse_special= */ true);
507
+ if (ids.size() == 1) {
508
+ auto token = ids[0];
509
+ if (std::find(sparams.preserved_tokens.begin(), sparams.preserved_tokens.end(), (llama_token) token) == sparams.preserved_tokens.end()) {
510
+ throw std::runtime_error("Grammar trigger word should be marked as preserved token");
511
+ }
512
+ common_grammar_trigger trigger;
513
+ trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN;
514
+ trigger.value = word;
515
+ trigger.token = token;
516
+ sparams.grammar_triggers.push_back(std::move(trigger));
517
+ } else {
518
+ sparams.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word});
519
+ }
520
+ } else {
521
+ common_grammar_trigger trigger;
522
+ trigger.type = type;
523
+ trigger.value = word;
524
+ if (type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
525
+ const auto token = (llama_token) [grammar_trigger[@"token"] intValue];
526
+ trigger.token = token;
527
+ }
528
+ sparams.grammar_triggers.push_back(std::move(trigger));
529
+ }
530
+ }
531
+ }
532
+
533
+ llama->params.antiprompt.clear();
534
+ if (params[@"stop"]) {
535
+ NSArray *stop = params[@"stop"];
536
+ for (NSString *s in stop) {
537
+ llama->params.antiprompt.push_back([s UTF8String]);
538
+ }
539
+ }
540
+
541
+ const llama_model * model = llama_get_model(llama->ctx);
542
+ const llama_vocab * vocab = llama_model_get_vocab(model);
543
+
544
+ sparams.logit_bias.clear();
545
+ if (params[@"ignore_eos"] && [params[@"ignore_eos"] boolValue]) {
546
+ sparams.logit_bias[llama_vocab_eos(vocab)].bias = -INFINITY;
547
+ }
548
+
549
+ if (params[@"logit_bias"] && [params[@"logit_bias"] isKindOfClass:[NSArray class]]) {
550
+ const int n_vocab = llama_vocab_n_tokens(vocab);
551
+ NSArray *logit_bias = params[@"logit_bias"];
552
+ for (NSArray *el in logit_bias) {
553
+ if ([el isKindOfClass:[NSArray class]] && [el count] == 2) {
554
+ llama_token tok = [el[0] intValue];
555
+ if (tok >= 0 && tok < n_vocab) {
556
+ if ([el[1] isKindOfClass:[NSNumber class]]) {
557
+ sparams.logit_bias[tok].bias = [el[1] doubleValue];
558
+ } else if ([el[1] isKindOfClass:[NSNumber class]] && ![el[1] boolValue]) {
559
+ sparams.logit_bias[tok].bias = -INFINITY;
560
+ }
561
+ }
562
+ }
563
+ }
564
+ }
565
+
566
+ if (!llama->initSampling()) {
567
+ @throw [NSException exceptionWithName:@"LlamaException" reason:@"Failed to initialize sampling" userInfo:nil];
568
+ }
569
+ llama->beginCompletion();
570
+ llama->loadPrompt();
571
+
572
+ size_t sent_count = 0;
573
+ size_t sent_token_probs_index = 0;
574
+
575
+ while (llama->has_next_token && !llama->is_interrupted) {
576
+ const rnllama::completion_token_output token_with_probs = llama->doCompletion();
577
+ if (token_with_probs.tok == -1 || llama->incomplete) {
578
+ continue;
579
+ }
580
+ const std::string token_text = common_token_to_piece(llama->ctx, token_with_probs.tok);
581
+
582
+ size_t pos = std::min(sent_count, llama->generated_text.size());
583
+
584
+ const std::string str_test = llama->generated_text.substr(pos);
585
+ bool is_stop_full = false;
586
+ size_t stop_pos =
587
+ llama->findStoppingStrings(str_test, token_text.size(), rnllama::STOP_FULL);
588
+ if (stop_pos != std::string::npos) {
589
+ is_stop_full = true;
590
+ llama->generated_text.erase(
591
+ llama->generated_text.begin() + pos + stop_pos,
592
+ llama->generated_text.end());
593
+ pos = std::min(sent_count, llama->generated_text.size());
594
+ } else {
595
+ is_stop_full = false;
596
+ stop_pos = llama->findStoppingStrings(str_test, token_text.size(),
597
+ rnllama::STOP_PARTIAL);
598
+ }
599
+
600
+ if (
601
+ stop_pos == std::string::npos ||
602
+ // Send rest of the text if we are at the end of the generation
603
+ (!llama->has_next_token && !is_stop_full && stop_pos > 0)
604
+ ) {
605
+ const std::string to_send = llama->generated_text.substr(pos, std::string::npos);
606
+
607
+ sent_count += to_send.size();
608
+
609
+ std::vector<rnllama::completion_token_output> probs_output = {};
610
+
611
+ NSMutableDictionary *tokenResult = [[NSMutableDictionary alloc] init];
612
+ tokenResult[@"token"] = [NSString stringWithUTF8String:to_send.c_str()];
613
+
614
+ if (llama->params.sampling.n_probs > 0) {
615
+ const std::vector<llama_token> to_send_toks = common_tokenize(llama->ctx, to_send, false);
616
+ size_t probs_pos = std::min(sent_token_probs_index, llama->generated_token_probs.size());
617
+ size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama->generated_token_probs.size());
618
+ if (probs_pos < probs_stop_pos) {
619
+ probs_output = std::vector<rnllama::completion_token_output>(llama->generated_token_probs.begin() + probs_pos, llama->generated_token_probs.begin() + probs_stop_pos);
620
+ }
621
+ sent_token_probs_index = probs_stop_pos;
622
+
623
+ tokenResult[@"completion_probabilities"] = [self tokenProbsToDict:probs_output];
624
+ }
625
+
626
+ onToken(tokenResult);
627
+ }
628
+ }
629
+
630
+ llama_perf_context_print(llama->ctx);
631
+ llama->is_predicting = false;
632
+
633
+ const auto timings = llama_perf_context(llama->ctx);
634
+
635
+ NSMutableArray *toolCalls = nil;
636
+ NSString *reasoningContent = nil;
637
+ NSString *content = nil;
638
+ if (!llama->is_interrupted) {
639
+ try {
640
+ auto chat_format = params[@"chat_format"] ? [params[@"chat_format"] intValue] : COMMON_CHAT_FORMAT_CONTENT_ONLY;
641
+ common_chat_msg message = common_chat_parse(llama->generated_text, static_cast<common_chat_format>(chat_format));
642
+ if (!message.reasoning_content.empty()) {
643
+ reasoningContent = [NSString stringWithUTF8String:message.reasoning_content.c_str()];
644
+ }
645
+ content = [NSString stringWithUTF8String:message.content.c_str()];
646
+ toolCalls = [[NSMutableArray alloc] init];
647
+ for (const auto &tc : message.tool_calls) {
648
+ [toolCalls addObject:@{
649
+ @"type": @"function",
650
+ @"function": @{
651
+ @"name": [NSString stringWithUTF8String:tc.name.c_str()],
652
+ @"arguments": [NSString stringWithUTF8String:tc.arguments.c_str()],
653
+ },
654
+ @"id": tc.id.empty() ? [NSNull null] : [NSString stringWithUTF8String:tc.id.c_str()],
655
+ }];
656
+ }
657
+ } catch (const std::exception &e) {
658
+ // NSLog(@"Error parsing tool calls: %s", e.what());
659
+ }
660
+ }
661
+
662
+ NSMutableDictionary *result = [[NSMutableDictionary alloc] init];
663
+ result[@"text"] = [NSString stringWithUTF8String:llama->generated_text.c_str()]; // Original text
664
+ if (content) result[@"content"] = content;
665
+ if (reasoningContent) result[@"reasoning_content"] = reasoningContent;
666
+ if (toolCalls && toolCalls.count > 0) result[@"tool_calls"] = toolCalls;
667
+ result[@"completion_probabilities"] = [self tokenProbsToDict:llama->generated_token_probs];
668
+ result[@"tokens_predicted"] = @(llama->num_tokens_predicted);
669
+ result[@"tokens_evaluated"] = @(llama->num_prompt_tokens);
670
+ result[@"truncated"] = @(llama->truncated);
671
+ result[@"stopped_eos"] = @(llama->stopped_eos);
672
+ result[@"stopped_word"] = @(llama->stopped_word);
673
+ result[@"stopped_limit"] = @(llama->stopped_limit);
674
+ result[@"stopping_word"] = [NSString stringWithUTF8String:llama->stopping_word.c_str()];
675
+ result[@"tokens_cached"] = @(llama->n_past);
676
+ result[@"timings"] = @{
677
+ @"prompt_n": @(timings.n_p_eval),
678
+ @"prompt_ms": @(timings.t_p_eval_ms),
679
+ @"prompt_per_token_ms": @(timings.t_p_eval_ms / timings.n_p_eval),
680
+ @"prompt_per_second": @(1e3 / timings.t_p_eval_ms * timings.n_p_eval),
681
+ @"predicted_n": @(timings.n_eval),
682
+ @"predicted_n": @(timings.n_eval),
683
+ @"predicted_ms": @(timings.t_eval_ms),
684
+ @"predicted_per_token_ms": @(timings.t_eval_ms / timings.n_eval),
685
+ @"predicted_per_second": @(1e3 / timings.t_eval_ms * timings.n_eval),
686
+ };
687
+ return result;
688
+ }
689
+
690
+ - (void)stopCompletion {
691
+ llama->is_interrupted = true;
692
+ }
693
+
694
+ - (NSArray *)tokenize:(NSString *)text {
695
+ const std::vector<llama_token> toks = common_tokenize(llama->ctx, [text UTF8String], false);
696
+ NSMutableArray *result = [[NSMutableArray alloc] init];
697
+ for (llama_token tok : toks) {
698
+ [result addObject:@(tok)];
699
+ }
700
+ return result;
701
+ }
702
+
703
+ - (NSString *)detokenize:(NSArray *)tokens {
704
+ std::vector<llama_token> toks;
705
+ for (NSNumber *tok in tokens) {
706
+ toks.push_back([tok intValue]);
707
+ }
708
+ const std::string text = rnllama::tokens_to_str(llama->ctx, toks.cbegin(), toks.cend());
709
+ return [NSString stringWithUTF8String:text.c_str()];
710
+ }
711
+
712
+ - (NSDictionary *)embedding:(NSString *)text params:(NSDictionary *)params {
713
+ if (llama->params.embedding != true) {
714
+ @throw [NSException exceptionWithName:@"LlamaException" reason:@"Embedding is not enabled" userInfo:nil];
715
+ }
716
+
717
+ common_params embdParams;
718
+ embdParams.embedding = true;
719
+ embdParams.embd_normalize = llama->params.embd_normalize;
720
+
721
+ if (params[@"embd_normalize"] && [params[@"embd_normalize"] isKindOfClass:[NSNumber class]]) {
722
+ embdParams.embd_normalize = [params[@"embd_normalize"] intValue];
723
+ }
724
+
725
+ llama->rewind();
726
+
727
+ llama_perf_context_reset(llama->ctx);
728
+
729
+ llama->params.prompt = [text UTF8String];
730
+
731
+ llama->params.n_predict = 0;
732
+
733
+ if (!llama->initSampling()) {
734
+ @throw [NSException exceptionWithName:@"LlamaException" reason:@"Failed to initialize sampling" userInfo:nil];
735
+ }
736
+ llama->beginCompletion();
737
+ llama->loadPrompt();
738
+ llama->doCompletion();
739
+
740
+ std::vector<float> result = llama->getEmbedding(embdParams);
741
+
742
+ NSMutableDictionary *resultDict = [[NSMutableDictionary alloc] init];
743
+ NSMutableArray *embeddingResult = [[NSMutableArray alloc] init];
744
+ for (float f : result) {
745
+ [embeddingResult addObject:@(f)];
746
+ }
747
+ resultDict[@"embedding"] = embeddingResult;
748
+ NSMutableArray *promptTokens = [[NSMutableArray alloc] init];
749
+ for (llama_token tok : llama->embd) {
750
+ [promptTokens addObject:[NSString stringWithUTF8String:common_token_to_piece(llama->ctx, tok).c_str()]];
751
+ }
752
+ resultDict[@"prompt_tokens"] = promptTokens;
753
+
754
+ llama->is_predicting = false;
755
+ return resultDict;
756
+ }
757
+
758
+ - (NSDictionary *)loadSession:(NSString *)path {
759
+ if (!path || [path length] == 0) {
760
+ @throw [NSException exceptionWithName:@"LlamaException" reason:@"Session path is empty" userInfo:nil];
761
+ }
762
+ if (![[NSFileManager defaultManager] fileExistsAtPath:path]) {
763
+ @throw [NSException exceptionWithName:@"LlamaException" reason:@"Session file does not exist" userInfo:nil];
764
+ }
765
+
766
+ size_t n_token_count_out = 0;
767
+ llama->embd.resize(llama->params.n_ctx);
768
+ if (!llama_state_load_file(llama->ctx, [path UTF8String], llama->embd.data(), llama->embd.capacity(), &n_token_count_out)) {
769
+ @throw [NSException exceptionWithName:@"LlamaException" reason:@"Failed to load session" userInfo:nil];
770
+ }
771
+ llama->embd.resize(n_token_count_out);
772
+ const std::string text = rnllama::tokens_to_str(llama->ctx, llama->embd.cbegin(), llama->embd.cend());
773
+ return @{
774
+ @"tokens_loaded": @(n_token_count_out),
775
+ @"prompt": [NSString stringWithUTF8String:text.c_str()]
776
+ };
777
+ }
778
+
779
+ - (int)saveSession:(NSString *)path size:(int)size {
780
+ if (!path || [path length] == 0) {
781
+ @throw [NSException exceptionWithName:@"LlamaException" reason:@"Session path is empty" userInfo:nil];
782
+ }
783
+ std::vector<llama_token> session_tokens = llama->embd;
784
+ int default_size = session_tokens.size();
785
+ int save_size = size > 0 && size <= default_size ? size : default_size;
786
+ if (!llama_state_save_file(llama->ctx, [path UTF8String], session_tokens.data(), save_size)) {
787
+ @throw [NSException exceptionWithName:@"LlamaException" reason:@"Failed to save session" userInfo:nil];
788
+ }
789
+ return session_tokens.size();
790
+ }
791
+
792
+ - (NSString *)bench:(int)pp tg:(int)tg pl:(int)pl nr:(int)nr {
793
+ return [NSString stringWithUTF8String:llama->bench(pp, tg, pl, nr).c_str()];
794
+ }
795
+
796
+ - (void)applyLoraAdapters:(NSArray *)loraAdapters {
797
+ std::vector<common_adapter_lora_info> lora_adapters;
798
+ for (NSDictionary *loraAdapter in loraAdapters) {
799
+ common_adapter_lora_info la;
800
+ la.path = [loraAdapter[@"path"] UTF8String];
801
+ la.scale = [loraAdapter[@"scaled"] doubleValue];
802
+ la.ptr = llama_adapter_lora_init(llama->model, la.path.c_str());
803
+ if (la.ptr == nullptr) {
804
+ @throw [NSException exceptionWithName:@"LlamaException" reason:@"Failed to apply lora adapter" userInfo:nil];
805
+ }
806
+ lora_adapters.push_back(la);
807
+ }
808
+ int result = llama->applyLoraAdapters(lora_adapters);
809
+ if (result != 0) {
810
+ @throw [NSException exceptionWithName:@"LlamaException" reason:@"Failed to apply lora adapters" userInfo:nil];
811
+ }
812
+ }
813
+
814
+ - (void)removeLoraAdapters {
815
+ llama->removeLoraAdapters();
816
+ }
817
+
818
+ - (NSArray *)getLoadedLoraAdapters {
819
+ std::vector<common_adapter_lora_info> loaded_lora_adapters = llama->getLoadedLoraAdapters();
820
+ NSMutableArray *result = [[NSMutableArray alloc] init];
821
+ for (common_adapter_lora_info &la : loaded_lora_adapters) {
822
+ [result addObject:@{
823
+ @"path": [NSString stringWithUTF8String:la.path.c_str()],
824
+ @"scale": @(la.scale)
825
+ }];
826
+ }
827
+ return result;
828
+ }
829
+
830
+ - (void)invalidate {
831
+ delete llama;
832
+ // llama_backend_free();
833
+ }
834
+
835
+ @end
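Taken together, the methods in this file form the native surface that RNLlama.mm calls into. A rough end-to-end usage sketch follows (illustrative only, not from the package sources; the model path and parameter values are placeholders):

    #import <Foundation/Foundation.h>
    #import "RNLlamaContext.h"

    // Sketch of driving RNLlamaContext directly, using only selectors and
    // dictionary keys that appear in the implementation above.
    static void lifecycleDemo(void) {
        RNLlamaContext *ctx = [RNLlamaContext initWithParams:@{
            @"model": @"/path/to/model.gguf",   // hypothetical path
            @"n_ctx": @(2048),
            @"n_gpu_layers": @(99),             // only honored when Metal is usable
            @"use_progress_callback": @(YES)
        } onProgress:^(unsigned int progress) {
            NSLog(@"load: %u%%", progress);
        }];

        if (![ctx isModelLoaded]) return;

        NSDictionary *result = [ctx completion:@{
            @"prompt": @"Q: What is 2 + 2?\nA:",
            @"n_predict": @(16),
            @"temperature": @(0.7),
            @"stop": @[@"\n"]
        } onToken:^(NSMutableDictionary *tokenResult) {
            NSLog(@"%@", tokenResult[@"token"]);
        }];

        NSLog(@"text: %@", result[@"text"]);
        NSLog(@"predicted %@ tokens, timings: %@", result[@"tokens_predicted"], result[@"timings"]);

        [ctx invalidate]; // deletes the underlying rnllama::llama_rn_context
    }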