cui-llama.rn 1.7.3 → 1.7.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (276)
  1. package/README.md +217 -17
  2. package/android/src/main/CMakeLists.txt +34 -15
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +94 -8
  4. package/android/src/main/java/com/rnllama/RNLlama.java +247 -0
  5. package/android/src/main/jni.cpp +213 -14
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  11. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  14. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +35 -0
  15. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +34 -0
  16. package/cpp/README.md +1 -1
  17. package/cpp/chat-parser.cpp +385 -0
  18. package/cpp/chat-parser.h +120 -0
  19. package/cpp/chat.cpp +726 -596
  20. package/cpp/chat.h +71 -6
  21. package/cpp/common.cpp +56 -38
  22. package/cpp/common.h +9 -3
  23. package/cpp/ggml-backend-reg.cpp +5 -0
  24. package/cpp/ggml-backend.cpp +10 -2
  25. package/cpp/ggml-common.h +4 -0
  26. package/cpp/ggml-cpu/amx/amx.cpp +1 -1
  27. package/cpp/ggml-cpu/amx/mmq.cpp +11 -10
  28. package/cpp/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  29. package/cpp/ggml-cpu/arch/arm/quants.c +4114 -0
  30. package/cpp/ggml-cpu/arch/arm/repack.cpp +2163 -0
  31. package/cpp/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  32. package/cpp/ggml-cpu/arch/x86/quants.c +4311 -0
  33. package/cpp/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
  34. package/cpp/ggml-cpu/arch-fallback.h +184 -0
  35. package/cpp/ggml-cpu/common.h +4 -3
  36. package/cpp/ggml-cpu/ggml-cpu-impl.h +21 -16
  37. package/cpp/ggml-cpu/ggml-cpu.c +123 -104
  38. package/cpp/ggml-cpu/ggml-cpu.cpp +11 -8
  39. package/cpp/ggml-cpu/ops.cpp +330 -148
  40. package/cpp/ggml-cpu/ops.h +1 -0
  41. package/cpp/ggml-cpu/quants.c +1158 -0
  42. package/cpp/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  43. package/cpp/ggml-cpu/repack.cpp +1571 -0
  44. package/cpp/ggml-cpu/repack.h +98 -0
  45. package/cpp/ggml-cpu/simd-mappings.h +330 -38
  46. package/cpp/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  47. package/cpp/ggml-cpu/vec.cpp +87 -18
  48. package/cpp/ggml-cpu/vec.h +249 -94
  49. package/cpp/ggml-cpu.h +1 -0
  50. package/cpp/ggml-impl.h +63 -183
  51. package/cpp/ggml-llama-sim.metallib +0 -0
  52. package/cpp/ggml-llama.metallib +0 -0
  53. package/cpp/ggml-metal.m +152 -45
  54. package/cpp/ggml-quants.c +0 -2
  55. package/cpp/ggml.c +61 -21
  56. package/cpp/ggml.h +22 -3
  57. package/cpp/gguf.cpp +24 -3
  58. package/cpp/json-partial.cpp +256 -0
  59. package/cpp/json-partial.h +38 -0
  60. package/cpp/json-schema-to-grammar.cpp +5 -47
  61. package/cpp/json-schema-to-grammar.h +4 -4
  62. package/cpp/llama-arch.cpp +153 -3
  63. package/cpp/llama-arch.h +27 -1
  64. package/cpp/llama-batch.cpp +741 -272
  65. package/cpp/llama-batch.h +112 -54
  66. package/cpp/llama-chat.cpp +30 -8
  67. package/cpp/llama-chat.h +1 -0
  68. package/cpp/llama-context.cpp +524 -339
  69. package/cpp/llama-context.h +38 -17
  70. package/cpp/llama-cparams.cpp +4 -0
  71. package/cpp/llama-cparams.h +2 -0
  72. package/cpp/llama-grammar.cpp +12 -2
  73. package/cpp/llama-graph.cpp +431 -356
  74. package/cpp/llama-graph.h +126 -58
  75. package/cpp/llama-hparams.cpp +10 -2
  76. package/cpp/llama-hparams.h +19 -2
  77. package/cpp/llama-kv-cache-unified-iswa.cpp +279 -0
  78. package/cpp/llama-kv-cache-unified-iswa.h +128 -0
  79. package/cpp/llama-kv-cache-unified.cpp +1841 -0
  80. package/cpp/llama-kv-cache-unified.h +303 -0
  81. package/cpp/llama-kv-cells.h +439 -0
  82. package/cpp/llama-memory-hybrid.cpp +246 -0
  83. package/cpp/llama-memory-hybrid.h +138 -0
  84. package/cpp/llama-memory-recurrent.cpp +1112 -0
  85. package/cpp/llama-memory-recurrent.h +183 -0
  86. package/cpp/llama-memory.cpp +41 -0
  87. package/cpp/llama-memory.h +86 -5
  88. package/cpp/llama-mmap.cpp +1 -1
  89. package/cpp/llama-model-loader.cpp +42 -17
  90. package/cpp/llama-model-saver.cpp +1 -0
  91. package/cpp/llama-model.cpp +1639 -513
  92. package/cpp/llama-model.h +26 -0
  93. package/cpp/llama-sampling.cpp +2 -2
  94. package/cpp/llama-vocab.cpp +65 -28
  95. package/cpp/llama-vocab.h +1 -0
  96. package/cpp/llama.cpp +11 -7
  97. package/cpp/llama.h +150 -42
  98. package/cpp/minja/chat-template.hpp +1 -1
  99. package/cpp/minja/minja.hpp +1 -1
  100. package/cpp/{json.hpp → nlohmann/json.hpp} +3027 -2267
  101. package/cpp/nlohmann/json_fwd.hpp +187 -0
  102. package/cpp/regex-partial.cpp +204 -0
  103. package/cpp/regex-partial.h +56 -0
  104. package/cpp/rn-llama.cpp +646 -35
  105. package/cpp/rn-llama.h +32 -1
  106. package/cpp/rn-tts.h +39 -0
  107. package/cpp/sampling.cpp +7 -8
  108. package/cpp/tools/mtmd/clip-impl.h +5 -0
  109. package/cpp/tools/mtmd/clip.cpp +572 -436
  110. package/cpp/tools/mtmd/clip.h +14 -4
  111. package/cpp/tools/mtmd/mtmd-audio.cpp +0 -86
  112. package/cpp/tools/mtmd/mtmd-audio.h +2 -17
  113. package/cpp/tools/mtmd/mtmd-helper.cpp +175 -12
  114. package/cpp/tools/mtmd/mtmd-helper.h +91 -0
  115. package/cpp/tools/mtmd/mtmd.cpp +368 -248
  116. package/cpp/tools/mtmd/mtmd.h +6 -70
  117. package/cpp/unicode.cpp +5 -0
  118. package/ios/CMakeLists.txt +26 -6
  119. package/ios/RNLlama.h +1 -1
  120. package/ios/RNLlama.mm +153 -3
  121. package/ios/RNLlamaContext.h +9 -1
  122. package/ios/RNLlamaContext.mm +112 -9
  123. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
  124. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +71 -6
  125. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +9 -3
  126. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
  127. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
  128. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
  129. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +22 -3
  130. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-partial.h +38 -0
  131. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  132. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
  133. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
  134. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
  135. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +38 -17
  136. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
  137. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
  138. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
  139. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  140. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  141. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  142. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  143. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  144. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
  145. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +26 -0
  146. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
  147. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +150 -42
  148. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  149. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
  150. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/{json.hpp → nlohmann/json.hpp} +3027 -2267
  151. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  152. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
  153. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
  154. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
  155. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  156. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
  157. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
  158. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
  159. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
  160. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
  161. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
  162. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
  163. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
  164. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
  165. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  166. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
  167. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
  168. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
  169. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
  170. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
  171. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
  172. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
  173. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  174. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  175. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  176. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  177. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  178. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
  179. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
  180. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
  181. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
  182. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  183. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
  184. package/ios/rnllama.xcframework/{tvos-arm64/rnllama.framework/Headers → ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
  185. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  186. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
  187. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
  188. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
  189. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  190. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  191. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
  192. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +71 -6
  193. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +9 -3
  194. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
  195. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
  196. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
  197. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +22 -3
  198. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-partial.h +38 -0
  199. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  200. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
  201. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
  202. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
  203. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +38 -17
  204. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
  205. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
  206. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
  207. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  208. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  209. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  210. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  211. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  212. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
  213. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +26 -0
  214. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
  215. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +150 -42
  216. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  217. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
  218. package/ios/rnllama.xcframework/{ios-arm64_x86_64-simulator/rnllama.framework/Headers → tvos-arm64/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
  219. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  220. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
  221. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
  222. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
  223. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  224. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
  225. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
  226. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
  227. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
  228. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
  229. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
  230. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
  231. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
  232. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
  233. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  234. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
  235. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
  236. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
  237. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
  238. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
  239. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
  240. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
  241. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  242. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  243. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  244. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  245. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  246. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
  247. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
  248. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
  249. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
  250. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  251. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
  252. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json.hpp +25526 -0
  253. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  254. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
  255. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
  256. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
  257. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  258. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  259. package/jest/mock.js +24 -0
  260. package/package.json +1 -1
  261. package/src/NativeRNLlama.ts +46 -2
  262. package/src/index.ts +105 -1
  263. package/cpp/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  264. package/cpp/ggml-cpu/ggml-cpu-quants.c +0 -13326
  265. package/cpp/ggml-cpu/sgemm.cpp +0 -3544
  266. package/cpp/ggml-cpu/sgemm.h +0 -14
  267. package/cpp/llama-kv-cache.cpp +0 -2827
  268. package/cpp/llama-kv-cache.h +0 -515
  269. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  270. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  271. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  272. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json.hpp +0 -24766
  273. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  274. /package/cpp/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
  275. /package/cpp/tools/mtmd/{miniaudio.h → miniaudio/miniaudio.h} +0 -0
  276. /package/cpp/tools/mtmd/{stb_image.h → stb/stb_image.h} +0 -0
package/cpp/rn-llama.cpp CHANGED
@@ -1,7 +1,9 @@
 #include "rn-llama.h"
+#include "rn-tts.h"
 
 // Include multimodal support
 #include "tools/mtmd/mtmd.h"
+#include "tools/mtmd/mtmd-helper.h"
 #include "tools/mtmd/clip.h"
 
 namespace rnllama {
@@ -23,38 +25,39 @@ static const std::string base64_chars =
     "abcdefghijklmnopqrstuvwxyz"
     "0123456789+/";
 
-// Base64 decoding function
-static std::vector<uint8_t> base64_decode(const std::string &encoded_string) {
-    std::vector<uint8_t> decoded;
-    int in_len = encoded_string.size();
+static inline bool is_base64(uint8_t c) {
+    return (isalnum(c) || (c == '+') || (c == '/'));
+}
+
+using raw_buffer = std::vector<uint8_t>;
+
+static inline raw_buffer base64_decode(const std::string & encoded_string) {
     int i = 0;
     int j = 0;
     int in_ = 0;
-    unsigned char char_array_4[4], char_array_3[3];
 
-    while (in_len-- && (encoded_string[in_] != '=')) {
-        if (isspace(encoded_string[in_])) {
-            in_++;
-            continue;
-        }
+    int in_len = encoded_string.size();
 
-        if (encoded_string[in_] == '=' || base64_chars.find(encoded_string[in_]) == std::string::npos) {
-            break;
-        }
+    uint8_t char_array_4[4];
+    uint8_t char_array_3[3];
 
+    raw_buffer ret;
+
+    while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_])) {
         char_array_4[i++] = encoded_string[in_]; in_++;
         if (i == 4) {
             for (i = 0; i < 4; i++) {
                 char_array_4[i] = base64_chars.find(char_array_4[i]);
             }
 
-            char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
+            char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4);
             char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
-            char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
+            char_array_3[2] = ((char_array_4[2] & 0x3) << 6) +   char_array_4[3];
 
-            for (i = 0; i < 3; i++) {
-                decoded.push_back(char_array_3[i]);
+            for (i = 0; (i < 3); i++) {
+                ret.push_back(char_array_3[i]);
             }
+
             i = 0;
         }
     }
@@ -68,16 +71,16 @@ static std::vector<uint8_t> base64_decode(const std::string &encoded_string) {
             char_array_4[j] = base64_chars.find(char_array_4[j]);
         }
 
-        char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
+        char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4);
         char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
-        char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
+        char_array_3[2] = ((char_array_4[2] & 0x3) << 6) +   char_array_4[3];
 
         for (j = 0; j < i - 1; j++) {
-            decoded.push_back(char_array_3[j]);
+            ret.push_back(char_array_3[j]);
         }
     }
 
-    return decoded;
+    return ret;
 }
 
 static const std::vector<lm_ggml_type> kv_cache_types = {
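Note: the rewritten decoder follows the widely used Nyffenegger-style implementation: instead of skipping whitespace, decoding now stops at the first `=` padding byte or non-alphabet byte. A minimal round-trip sketch exercising the helpers above (the standalone `main` is hypothetical, added here only for illustration):

```cpp
// Sketch: exercising the new decoder; assumes is_base64/base64_decode above are in scope.
#include <cassert>

int main() {
    raw_buffer out = base64_decode("TWFu");    // "TWFu" encodes "Man"
    assert(out.size() == 3 && out[0] == 'M' && out[1] == 'a' && out[2] == 'n');

    raw_buffer padded = base64_decode("TWE="); // '=' ends decoding early: "Ma"
    assert(padded.size() == 2);
    return 0;
}
```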
@@ -248,6 +251,7 @@ void llama_rn_context::rewind() {
     generated_text = "";
     generated_text.reserve(params.n_ctx);
     generated_token_probs.clear();
+    audio_tokens.clear();
     truncated = false;
     context_full = false;
     stopped_eos = false;
@@ -258,6 +262,8 @@ void llama_rn_context::rewind() {
     n_remain = 0;
     n_past = 0;
     params.sampling.n_prev = n_ctx;
+    next_token_uses_guide_token = true;
+    guide_tokens.clear();
 }
 
 bool llama_rn_context::initSampling() {
@@ -305,7 +311,8 @@ common_chat_params llama_rn_context::getFormattedChatWithJinja(
     const std::string &json_schema,
     const std::string &tools,
     const bool &parallel_tool_calls,
-    const std::string &tool_choice
+    const std::string &tool_choice,
+    const bool &enable_thinking
 ) const {
     common_chat_templates_inputs inputs;
     inputs.use_jinja = true;
@@ -321,7 +328,7 @@ common_chat_params llama_rn_context::getFormattedChatWithJinja(
     if (!json_schema.empty()) {
         inputs.json_schema = json::parse(json_schema);
     }
-    inputs.extract_reasoning = params.reasoning_format != COMMON_REASONING_FORMAT_NONE;
+    inputs.enable_thinking = enable_thinking;
 
     // If chat_template is provided, create new one and use it (probably slow)
     if (!chat_template.empty()) {
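Note: `enable_thinking` supersedes the removed `extract_reasoning` toggle. Previously the templating input was derived from the global `params.reasoning_format`; now the flag is threaded through the new parameter on every call, so the host layer can enable or suppress the model's thinking block per request. Side by side:

```cpp
// 1.7.3: derived from the global reasoning format
//   inputs.extract_reasoning = params.reasoning_format != COMMON_REASONING_FORMAT_NONE;
// 1.7.6: explicit per-request flag from the new parameter
//   inputs.enable_thinking = enable_thinking;
```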
@@ -419,7 +426,8 @@ void llama_rn_context::loadPrompt(const std::vector<std::string> &media_paths) {
     }
 
     // Manage KV cache
-    llama_kv_self_seq_rm(ctx, 0, n_past, -1);
+    auto * kv = llama_get_memory(ctx);
+    llama_memory_seq_rm(kv, 0, n_past, -1);
 
     LOG_VERBOSE("prompt ingested, n_past: %d, cached: %s, to_eval: %s",
         n_past,
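Note: every removed `llama_kv_self_*` call in this file is migrated to the newer `llama_memory_*` API, which operates on the handle returned by `llama_get_memory(ctx)` rather than on the context directly. The mapping applied throughout this diff (`p0`, `p1`, `delta` are placeholders):

```cpp
auto * kv = llama_get_memory(ctx);
llama_memory_seq_rm (kv, 0, p0, p1);        // was: llama_kv_self_seq_rm (ctx, 0, p0, p1);
llama_memory_seq_add(kv, 0, p0, p1, delta); // was: llama_kv_self_seq_add(ctx, 0, p0, p1, delta);
llama_memory_clear  (kv, true);             // was: llama_kv_self_clear  (ctx); the bool controls
                                            // whether the data buffers are cleared as well
```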
@@ -438,6 +446,10 @@ void llama_rn_context::loadPrompt(const std::vector<std::string> &media_paths) {
         n_past, embd.size(), num_prompt_tokens, has_media ? 1 : 0);
 }
 
+void llama_rn_context::setGuideTokens(const std::vector<llama_token> &tokens) {
+    guide_tokens = tokens;
+}
+
 void llama_rn_context::beginCompletion() {
     // number of tokens to keep when resetting context
     n_remain = params.n_predict;
@@ -469,8 +481,9 @@ completion_token_output llama_rn_context::nextToken()
         const int n_left = n_past - params.n_keep - 1;
         const int n_discard = n_left/2;
 
-        llama_kv_self_seq_rm (ctx, 0, params.n_keep + 1            , params.n_keep + n_discard + 1);
-        llama_kv_self_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
+        auto * kv = llama_get_memory(ctx);
+        llama_memory_seq_rm (kv, 0, params.n_keep + 1            , params.n_keep + n_discard + 1);
+        llama_memory_seq_add(kv, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
 
         for (size_t i = params.n_keep + 1 + n_discard; i < embd.size(); i++)
         {
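Note: the context shift keeps the first `n_keep + 1` cells and evicts half of what remains. A worked example with hypothetical values:

```cpp
// n_past = 1024, params.n_keep = 256
//   n_left    = 1024 - 256 - 1 = 767
//   n_discard = 767 / 2       = 383
// llama_memory_seq_rm (kv, 0, 257, 640)        -> drop cached positions [257, 640)
// llama_memory_seq_add(kv, 0, 640, 1024, -383) -> slide [640, 1024) left to [257, 641)
```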
@@ -528,7 +541,14 @@ completion_token_output llama_rn_context::nextToken()
     std::vector<llama_token_data> candidates;
     candidates.reserve(llama_vocab_n_tokens(vocab));
 
-    result.tok = common_sampler_sample(ctx_sampling, ctx, -1);
+    llama_token new_token_id = common_sampler_sample(ctx_sampling, ctx, -1);
+
+    if (next_token_uses_guide_token && !guide_tokens.empty() && !llama_vocab_is_control(vocab, new_token_id) && !llama_vocab_is_eog(vocab, new_token_id)) {
+        new_token_id = guide_tokens[0];
+        guide_tokens.erase(guide_tokens.begin());
+    }
+    next_token_uses_guide_token = (new_token_id == 198);
+    result.tok = new_token_id;
 
     llama_token_data_array cur_p = *common_sampler_get_candidates(ctx_sampling);
 
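Note: this is the guide-token mechanism from llama.cpp's TTS example: while armed, the sampled token is overridden by the next pre-tokenized word unless the model produced a control or end-of-generation token, and the override re-arms after token id 198 (the "\n" token in the OuteTTS vocabulary, hard-coded here as upstream does). A condensed restatement of the same control flow, with standalone names (sketch only):

```cpp
static llama_token apply_guide_token(llama_token sampled,
                                     std::vector<llama_token> & guide,
                                     bool & armed,
                                     const llama_vocab * vocab) {
    if (armed && !guide.empty() &&
        !llama_vocab_is_control(vocab, sampled) &&
        !llama_vocab_is_eog(vocab, sampled)) {
        sampled = guide.front();     // force the next expected word token
        guide.erase(guide.begin());
    }
    armed = (sampled == 198);        // re-arm on newline (OuteTTS-specific)
    return sampled;
}
```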
@@ -611,6 +631,13 @@ completion_token_output llama_rn_context::doCompletion()
     const std::string token_text = token_with_probs.tok == -1 ? "" : common_token_to_piece(ctx, token_with_probs.tok);
     generated_text += token_text;
 
+    if (isVocoderEnabled()) {
+        tts_type type = getTTSType();
+        if ((type == OUTETTS_V0_2 || type == OUTETTS_V0_3) && (token_with_probs.tok >= 151672 && token_with_probs.tok <= 155772)) {
+            audio_tokens.push_back(token_with_probs.tok);
+        }
+    }
+
     if (params.sampling.n_probs > 0)
     {
         generated_token_probs.push_back(token_with_probs);
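Note: ids 151672–155772 are the 4101 OuteTTS audio-code tokens; `doCompletion` buffers them into `audio_tokens` so that `decodeAudioTokens` (added further below) can map them back to codebook indices by subtracting the base id. The equivalent predicate, factored out for clarity (hypothetical helper, not in the diff):

```cpp
// Same range test as in doCompletion above; the bounds are the OuteTTS
// audio-code vocabulary as hard-coded in this diff.
static inline bool is_outetts_audio_token(llama_token t) {
    return t >= 151672 && t <= 155772; // 4101 codes, remapped to 0..4100 later
}
```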
@@ -687,6 +714,94 @@ std::vector<float> llama_rn_context::getEmbedding(common_params &embd_params)
     return out;
 }
 
+// Helper function to format rerank task: [BOS]query[EOS][SEP]doc[EOS]
+static std::vector<llama_token> format_rerank(const llama_vocab * vocab, const std::vector<llama_token> & query, const std::vector<llama_token> & doc) {
+    std::vector<llama_token> result;
+
+    // Get EOS token - use SEP token as fallback if EOS is not available
+    llama_token eos_token = llama_vocab_eos(vocab);
+    if (eos_token == LLAMA_TOKEN_NULL) {
+        eos_token = llama_vocab_sep(vocab);
+    }
+
+    result.reserve(doc.size() + query.size() + 4);
+    if (llama_vocab_get_add_bos(vocab)) {
+        result.push_back(llama_vocab_bos(vocab));
+    }
+    result.insert(result.end(), query.begin(), query.end());
+    if (llama_vocab_get_add_eos(vocab)) {
+        result.push_back(eos_token);
+    }
+    if (llama_vocab_get_add_sep(vocab)) {
+        result.push_back(llama_vocab_sep(vocab));
+    }
+    result.insert(result.end(), doc.begin(), doc.end());
+    if (llama_vocab_get_add_eos(vocab)) {
+        result.push_back(eos_token);
+    }
+
+    return result;
+}
+
+std::vector<float> llama_rn_context::rerank(const std::string &query, const std::vector<std::string> &documents)
+{
+    std::vector<float> scores;
+
+    // Check if this model supports reranking (requires rank pooling type)
+    const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
+    if (pooling_type != LLAMA_POOLING_TYPE_RANK) {
+        throw std::runtime_error("reranking not supported, pooling_type: " + std::to_string(pooling_type));
+    }
+
+    if (!params.embedding) {
+        throw std::runtime_error("embedding disabled but required for reranking");
+    }
+
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    std::vector<llama_token> query_tokens = common_tokenize(vocab, query, false, true);
+
+    scores.reserve(documents.size());
+
+    for (size_t i = 0; i < documents.size(); ++i) {
+        rewind();
+        embd = {};
+
+        const std::string & document = documents[i];
+
+        std::vector<llama_token> doc_tokens = common_tokenize(vocab, document, false, true);
+
+        std::vector<llama_token> rerank_tokens = format_rerank(vocab, query_tokens, doc_tokens);
+
+        llama_memory_clear(llama_get_memory(ctx), false);
+
+        // Process the rerank input
+        try {
+            params.prompt = tokens_to_str(ctx, rerank_tokens.begin(), rerank_tokens.end());
+            initSampling();
+            loadPrompt({}); // No media paths for rerank
+            beginCompletion();
+            doCompletion();
+
+            // Get the rerank score (single embedding value for rank pooling)
+            float *data = llama_get_embeddings_seq(ctx, 0);
+            if (data) {
+                scores.push_back(data[0]); // For rank pooling, the score is the first (and only) dimension
+            } else {
+                scores.push_back(-1e6f); // Default low score if computation failed
+            }
+        } catch (const std::exception &e) {
+            LOG_WARNING("rerank computation failed for document %zu: %s", i, e.what());
+            scores.push_back(-1e6f);
+        }
+        endCompletion();
+
+        // Clear KV cache again to prepare for next document or restore original state
+        llama_memory_clear(llama_get_memory(ctx), false);
+    }
+
+    return scores;
+}
+
 std::string llama_rn_context::bench(int pp, int tg, int pl, int nr)
 {
     if (is_predicting) {
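Note: `format_rerank` lays the pair out as `[BOS]query[EOS][SEP]doc[EOS]`, gated by the vocab's add-BOS/EOS/SEP flags, and `rerank` scores each document independently via rank pooling. A hedged usage sketch (the `ctx_rn` setup is elided and hypothetical):

```cpp
// Sketch: scoring documents with the new rerank() method. Assumes ctx_rn is a
// llama_rn_context initialized with a reranking model (pooling type
// LLAMA_POOLING_TYPE_RANK) and with params.embedding == true.
std::vector<std::string> docs = {
    "A panda is a bear native to China.",
    "ggml is a tensor library for machine learning.",
};
std::vector<float> scores = ctx_rn.rerank("what is ggml?", docs);
// Higher raw score means more relevant; -1e6 marks a document whose
// evaluation failed.
```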
@@ -721,7 +836,7 @@ std::string llama_rn_context::bench(int pp, int tg, int pl, int nr)
         }
         batch.logits[batch.n_tokens - 1] = 1; // true
 
-        llama_kv_self_clear(ctx);
+        llama_memory_clear(llama_get_memory(ctx), true);
 
         const int64_t t_pp_start = llama_time_us();
         if (llama_decode(ctx, batch) != 0)
@@ -729,7 +844,8 @@ std::string llama_rn_context::bench(int pp, int tg, int pl, int nr)
            LOG_ERROR("llama_decode() failed during prompt", "");
         }
         const int64_t t_pp_end = llama_time_us();
-        llama_kv_self_clear(ctx);
+
+        llama_memory_clear(llama_get_memory(ctx), true);
 
         if (is_interrupted) break;
 
@@ -753,7 +869,7 @@ std::string llama_rn_context::bench(int pp, int tg, int pl, int nr)
 
         const int64_t t_tg_end = llama_time_us();
 
-        llama_kv_self_clear(ctx);
+        llama_memory_clear(llama_get_memory(ctx), true);
 
         const double t_pp = (t_pp_end - t_pp_start) / 1000000.0;
         const double t_tg = (t_tg_end - t_tg_start) / 1000000.0;
@@ -779,7 +895,7 @@ std::string llama_rn_context::bench(int pp, int tg, int pl, int nr)
         tg_std = 0;
     }
 
-    if (is_interrupted) llama_kv_self_clear(ctx);
+    if (is_interrupted) llama_memory_clear(llama_get_memory(ctx), true);
     endCompletion();
 
     char model_desc[128];
@@ -903,11 +1019,11 @@ mtmd_tokenize_result tokenizeWithMedia(llama_rn_context_mtmd *mtmd_wrapper, cons
             }
 
             // Decode base64
-            std::vector<uint8_t> media_data = base64_decode(base64_data);
+            raw_buffer media_data = base64_decode(base64_data);
             LOG_INFO("[DEBUG] Base64 decoded, size: %zu bytes", media_data.size());
 
             // Load bitmap from memory buffer using direct initialization
-            mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(media_data.data(), media_data.size()));
+            mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(mtmd_wrapper->mtmd_ctx, media_data.data(), media_data.size()));
             if (!bmp.ptr) {
                 bitmaps.entries.clear();
                 throw std::runtime_error("Failed to load base64 media");
@@ -942,7 +1058,7 @@ mtmd_tokenize_result tokenizeWithMedia(llama_rn_context_mtmd *mtmd_wrapper, cons
             fclose(file);
 
             // Create bitmap directly
-            mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(media_path.c_str()));
+            mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(mtmd_wrapper->mtmd_ctx, media_path.c_str()));
             if (!bmp.ptr) {
                 bitmaps.entries.clear();
                 throw std::runtime_error("Failed to load media");
@@ -1176,7 +1292,8 @@ void llama_rn_context::processMedia(
     }
 
     // Clear all KV cache entries after position n_past
-    llama_kv_self_seq_rm(ctx, 0, n_past, -1);
+    auto * kv = llama_get_memory(ctx);
+    llama_memory_seq_rm(kv, 0, n_past, -1);
 
     LOG_INFO("[DEBUG] Evaluating chunks: n_past=%d, n_batch=%d", n_past, params.n_batch);
 
@@ -1282,4 +1399,498 @@ void llama_rn_context::releaseMultimodal() {
     }
 }
 
+struct llama_rn_context_vocoder {
+    common_init_result init_result;
+    llama_model *model = nullptr;
+    llama_context *ctx = nullptr;
+    tts_type type = UNKNOWN;
+};
+
+bool llama_rn_context::initVocoder(const std::string &vocoder_model_path) {
+    if (vocoder_wrapper != nullptr) {
+        return true;
+    }
+    params.model.path = vocoder_model_path;
+    params.embedding = true;
+    params.ctx_shift = false;
+    params.n_ubatch = params.n_batch;
+
+    llama_rn_context_vocoder *wrapper = new llama_rn_context_vocoder{
+        .init_result = common_init_from_params(params),
+    };
+
+    wrapper->model = wrapper->init_result.model.get();
+    wrapper->ctx = wrapper->init_result.context.get();
+
+    if (wrapper->model == nullptr || wrapper->ctx == nullptr) {
+        LOG_ERROR("Failed to load vocoder model: %s", vocoder_model_path.c_str());
+        delete wrapper;
+        return false;
+    }
+
+    wrapper->type = getTTSType();
+    vocoder_wrapper = wrapper;
+    has_vocoder = true;
+    return true;
+}
+
+bool llama_rn_context::isVocoderEnabled() const {
+    return has_vocoder && vocoder_wrapper != nullptr;
+}
+
+void llama_rn_context::releaseVocoder() {
+    if (vocoder_wrapper != nullptr) {
+        delete vocoder_wrapper;
+        vocoder_wrapper = nullptr;
+    }
+    has_vocoder = false;
+}
+
+tts_type llama_rn_context::getTTSType(json speaker) {
+    if (vocoder_wrapper == nullptr) {
+        return UNKNOWN;
+    }
+    if (speaker.is_object() && speaker.contains("version")) {
+        std::string version = speaker["version"].get<std::string>();
+        if (version == "0.2") {
+            return OUTETTS_V0_2;
+        } else if (version == "0.3") {
+            return OUTETTS_V0_3;
+        } else {
+            LOG_ERROR("Unsupported speaker version '%s'\n", version.c_str());
+        }
+    }
+    if (vocoder_wrapper->type != UNKNOWN) {
+        return vocoder_wrapper->type;
+    }
+    const char *chat_template = llama_model_chat_template(model, nullptr);
+    if (chat_template && std::string(chat_template) == "outetts-0.3") {
+        return OUTETTS_V0_3;
+    }
+    return OUTETTS_V0_2;
+}
+
+static std::string audio_text_from_speaker(json speaker, const tts_type type = OUTETTS_V0_2) {
+    std::string audio_text = "<|text_start|>";
+
+    if (type == OUTETTS_V0_2 || type == OUTETTS_V0_3) {
+        std::string separator = (type == OUTETTS_V0_3) ? "<|space|>" : "<|text_sep|>";
+        for (const auto &word : speaker["words"]) {
+            audio_text += word["word"].get<std::string>() + separator;
+        }
+    }
+
+    return audio_text;
+}
+
+static std::string audio_data_from_speaker(json speaker, const tts_type type = OUTETTS_V0_2) {
+    std::string audio_data = "<|audio_start|>\n";
+
+    if (type == OUTETTS_V0_2 || type == OUTETTS_V0_3) {
+        std::string code_start = (type == OUTETTS_V0_3) ? "" : "<|code_start|>";
+        std::string code_end = (type == OUTETTS_V0_3) ? "<|space|>" : "<|code_end|>";
+        for (const auto &word : speaker["words"]) {
+            std::string word_text = word["word"].get<std::string>();
+            double duration = word["duration"].get<double>();
+            std::vector<int> codes = word["codes"].get<std::vector<int>>();
+
+            // Create the audio output entry
+            std::ostringstream word_entry;
+            word_entry << word_text << "<|t_" << std::fixed << std::setprecision(2)
+                       << duration << "|>" + code_start;
+            for (const auto &Code : codes) {
+                word_entry << "<|" << Code << "|>";
+            }
+            word_entry << code_end << "\n";
+            audio_data += word_entry.str();
+        }
+    }
+
+    return audio_data;
+}
+
+static const std::map<int, std::string> ones = {
+    {0, "zero"}, {1, "one"}, {2, "two"}, {3, "three"}, {4, "four"},
+    {5, "five"}, {6, "six"}, {7, "seven"}, {8, "eight"}, {9, "nine"},
+    {10, "ten"}, {11, "eleven"}, {12, "twelve"}, {13, "thirteen"}, {14, "fourteen"},
+    {15, "fifteen"}, {16, "sixteen"}, {17, "seventeen"}, {18, "eighteen"}, {19, "nineteen"}
+};
+
+static const std::map<int, std::string> tens = {
+    {2, "twenty"}, {3, "thirty"}, {4, "forty"}, {5, "fifty"},
+    {6, "sixty"}, {7, "seventy"}, {8, "eighty"}, {9, "ninety"}
+};
+
+// Convert a number less than 1000 to words
+static std::string convert_less_than_thousand(int num) {
+    std::string result;
+
+    if (num >= 100) {
+        result += ones.at(num / 100) + " hundred ";
+        num %= 100;
+    }
+
+    if (num >= 20) {
+        result += tens.at(num / 10);
+        if (num % 10 > 0) {
+            result += "-" + ones.at(num % 10);
+        }
+    } else if (num > 0) {
+        result += ones.at(num);
+    }
+
+    return result;
+}
+
+static std::string number_to_words(const std::string & number_str) {
+    try {
+        size_t decimal_pos = number_str.find('.');
+        std::string integer_part = number_str.substr(0, decimal_pos);
+
+        int int_number = std::stoi(integer_part);
+        std::string result;
+
+        if (int_number == 0) {
+            result = "zero";
+        } else {
+            if (int_number >= 1000000000) {
+                int billions = int_number / 1000000000;
+                result += convert_less_than_thousand(billions) + " billion ";
+                int_number %= 1000000000;
+            }
+
+            if (int_number >= 1000000) {
+                int millions = int_number / 1000000;
+                result += convert_less_than_thousand(millions) + " million ";
+                int_number %= 1000000;
+            }
+
+            if (int_number >= 1000) {
+                int thousands = int_number / 1000;
+                result += convert_less_than_thousand(thousands) + " thousand ";
+                int_number %= 1000;
+            }
+
+            if (int_number > 0) {
+                result += convert_less_than_thousand(int_number);
+            }
+        }
+
+        // Handle decimal part
+        if (decimal_pos != std::string::npos) {
+            result += " point";
+            std::string decimal_part = number_str.substr(decimal_pos + 1);
+            for (char digit : decimal_part) {
+                result += " " + ones.at(digit - '0');
+            }
+        }
+
+        return result;
+    } catch (const std::exception& e) {
+        // Skip if fails
+        return " ";
+    }
+}
+
+static std::string replace_numbers_with_words(const std::string & input_text) {
+    std::regex number_pattern(R"(\d+(\.\d+)?)");
+    std::string result;
+    auto it = std::sregex_iterator(input_text.begin(), input_text.end(), number_pattern);
+    auto end = std::sregex_iterator();
+
+    size_t last_pos = 0;
+    for (std::sregex_iterator i = it; i != end; ++i) {
+        const std::smatch& match = *i;
+        result.append(input_text, last_pos, match.position() - last_pos);
+        result.append(number_to_words(match.str()));
+        last_pos = match.position() + match.length();
+    }
+    result.append(input_text, last_pos);
+
+    return result;
+}
+
+// Based on: https://github.com/edwko/OuteTTS/blob/a613e79c489d8256dd657ea9168d78de75895d82/outetts/version/v1/prompt_processor.py#L39
+static std::string process_text(const std::string & text, const tts_type tts_type = OUTETTS_V0_2) {
+
+    // For now I skipped text romanization as I am unsure how to handle
+    // uroman and MeCab implementations in C++
+    // maybe something like https://github.com/anyascii/anyascii/ could work.
+    // currently only English would be supported in this function
+
+    std::string processed_text = replace_numbers_with_words(text);
+
+    std::transform(processed_text.begin(), processed_text.end(),
+                   processed_text.begin(), ::tolower);
+
+    std::regex special_chars(R"([-_/,\.\\])");
+    processed_text = std::regex_replace(processed_text, special_chars, " ");
+
+    std::regex non_alpha(R"([^a-z\s])");
+    processed_text = std::regex_replace(processed_text, non_alpha, "");
+
+    std::regex multiple_spaces(R"(\s+)");
+    processed_text = std::regex_replace(processed_text, multiple_spaces, " ");
+
+    processed_text = std::regex_replace(processed_text, std::regex(R"(^\s+|\s+$)"), "");
+
+    /*
+        Replace spaces with the separator token same as in line 365
+
+        for (auto & c : prompt_user) {
+            if (c == ' ') {
+                prompt_clean += "<|text_sep|>";
+    */
+    std::string separator = (tts_type == OUTETTS_V0_3) ? "<|space|>" : "<|text_sep|>";
+    processed_text = std::regex_replace(processed_text, std::regex(R"(\s)"), separator);
+
+    return processed_text;
+}
+
+std::string llama_rn_context::getFormattedAudioCompletion(const std::string &speaker_json_str, const std::string &text_to_speak) {
+    if (!isVocoderEnabled()) {
+        throw std::runtime_error("Vocoder is not enabled but audio completion is requested");
+    }
+    std::string audio_text = default_audio_text;
+    std::string audio_data = default_audio_data;
+
+    json speaker = speaker_json_str.empty() ? json::object() : json::parse(speaker_json_str);
+    const tts_type type = getTTSType(speaker);
+    if (type == UNKNOWN) {
+        LOG_ERROR("Unknown TTS version");
+        return "";
+    }
+
+    if (type == OUTETTS_V0_3) {
+        audio_text = std::regex_replace(audio_text, std::regex(R"(<\|text_sep\|>)"), "<|space|>");
+        audio_data = std::regex_replace(audio_data, std::regex(R"(<\|code_start\|>)"), "");
+        audio_data = std::regex_replace(audio_data, std::regex(R"(<\|code_end\|>)"), "<|space|>");
+    }
+
+    if (!speaker_json_str.empty()) {
+        audio_text = audio_text_from_speaker(speaker, type);
+        audio_data = audio_data_from_speaker(speaker, type);
+    }
+
+    return "<|im_start|>\n" + audio_text + process_text(text_to_speak, type) + "<|text_end|>\n" + audio_data + "\n";
+}
+
+std::vector<llama_token> llama_rn_context::getAudioCompletionGuideTokens(const std::string &text_to_speak) {
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    const tts_type type = getTTSType();
+    std::string clean_text = process_text(text_to_speak, type);
+
+    const std::string& delimiter = (type == OUTETTS_V0_3 ? "<|space|>" : "<|text_sep|>");
+
+    std::vector<llama_token> result;
+    size_t start = 0;
+    size_t end = clean_text.find(delimiter);
+
+    // first token is always a newline, as it was not previously added
+    result.push_back(common_tokenize(vocab, "\n", false, true)[0]);
+
+    while (end != std::string::npos) {
+        std::string current_word = clean_text.substr(start, end - start);
+        auto tmp = common_tokenize(vocab, current_word, false, true);
+        result.push_back(tmp[0]);
+        start = end + delimiter.length();
+        end = clean_text.find(delimiter, start);
+    }
+
+    // Add the last part
+    std::string current_word = clean_text.substr(start);
+    auto tmp = common_tokenize(vocab, current_word, false, true);
+    if (tmp.size() > 0) {
+        result.push_back(tmp[0]);
+    }
+    return result;
+}
+
+static void fill_hann_window(int length, bool periodic, float * output) {
+    int offset = -1;
+    if (periodic) {
+        offset = 0;
+    }
+    for (int i = 0; i < length; i++) {
+        output[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset)));
+    }
+}
+
+static void twiddle(float * real, float * imag, int k, int N) {
+    float angle = 2 * M_PI * k / N;
+    *real = cos(angle);
+    *imag = sin(angle);
+}
+
+static void irfft(int n, const float * inp_cplx, float * out_real) {
+    int N = n / 2 + 1;
+
+    std::vector<float> real_input(N);
+    std::vector<float> imag_input(N);
+    for (int i = 0; i < N; ++i) {
+        real_input[i] = inp_cplx[2 * i];
+        imag_input[i] = inp_cplx[2 * i + 1];
+    }
+
+    std::vector<float> real_output(n);
+    std::vector<float> imag_output(n);
+
+    for (int k = 0; k < n; ++k) {
+        real_output[k] = 0.0f;
+        imag_output[k] = 0.0f;
+        for (int m = 0; m < N; ++m) {
+            float twiddle_real;
+            float twiddle_imag;
+
+            twiddle(&twiddle_real, &twiddle_imag, k * m, n);
+
+            real_output[k] += real_input[m] * twiddle_real - imag_input[m] * twiddle_imag;
+            imag_output[k] += real_input[m] * twiddle_imag + imag_input[m] * twiddle_real;
+        }
+    }
+
+    for (int i = 0; i < n; ++i) {
+        out_real[i] = real_output[i] / N;
+    }
+}
+
+static void fold(const std::vector<float> & data, int64_t n_out, int64_t n_win, int64_t n_hop, int64_t n_pad, std::vector<float> & output) {
+    int64_t output_height = n_out;
+    int64_t kernel_w = n_win;
+    int64_t stride_w = n_hop;
+    int64_t width = n_out;
+
+    output.resize(width, 0.0f);
+
+    int64_t col_idx = 0;
+    for (int64_t w_col = 0; w_col < width; ++w_col) {
+        int64_t start = w_col * stride_w - n_pad;
+        int64_t end = start + kernel_w;
+
+        for (int64_t w_im = start; w_im < end; ++w_im) {
+            if (w_im >= 0 && w_im < output_height && col_idx < (int64_t) data.size()) {
+                output[w_im] += data[col_idx];
+            }
+            col_idx++;
+        }
+    }
+
+    output.resize(n_out - 2 * n_pad);
+}
+
+static std::vector<float> embd_to_audio(
+        const float * embd,
+        const int n_codes,
+        const int n_embd,
+        const int n_thread) {
+    const int n_fft = 1280;
+    const int n_hop = 320;
+    const int n_win = 1280;
+    const int n_pad = (n_win - n_hop)/2;
+    const int n_out = (n_codes - 1)*n_hop + n_win;
+
+    std::vector<float> hann(n_fft);
+
+    fill_hann_window(hann.size(), true, hann.data());
+
+    int n_spec = n_embd*n_codes;
+
+    std::vector<float> E (n_spec);
+    std::vector<float> S (n_spec);
+    std::vector<float> ST(n_spec);
+
+    for (int l = 0; l < n_codes; ++l) {
+        for (int k = 0; k < n_embd; ++k) {
+            E[k*n_codes + l] = embd[l*n_embd + k];
+        }
+    }
+
+    for (int k = 0; k < n_embd/2; ++k) {
+        for (int l = 0; l < n_codes; ++l) {
+            float mag = E[(k           )*n_codes + l];
+            float phi = E[(k + n_embd/2)*n_codes + l];
+
+            mag = exp(mag);
+
+            if (mag > 1e2) {
+                mag = 1e2;
+            }
+            S[2*(k*n_codes + l) + 0] = mag*cosf(phi);
+            S[2*(k*n_codes + l) + 1] = mag*sinf(phi);
+        }
+    }
+
+    for (int l = 0; l < n_codes; ++l) {
+        for (int k = 0; k < n_embd/2; ++k) {
+            ST[l*n_embd + 2*k + 0] = S[2*(k*n_codes + l) + 0];
+            ST[l*n_embd + 2*k + 1] = S[2*(k*n_codes + l) + 1];
+        }
+    }
+
+    std::vector<float> res (n_codes*n_fft);
+    std::vector<float> hann2(n_codes*n_fft);
+
+    std::vector<std::thread> workers(n_thread);
+    for (int i = 0; i < n_thread; ++i) {
+        workers[i] = std::thread([&, i]() {
+            for (int l = i; l < n_codes; l += n_thread) {
+                irfft(n_fft, ST.data() + l*n_embd, res.data() + l*n_fft);
+                for (int j = 0; j < n_fft; ++j) {
+                    res  [l*n_fft + j] *= hann[j];
+                    hann2[l*n_fft + j]  = hann[j] * hann[j];
+                }
+            }
+        });
+    }
+    for (int i = 0; i < n_thread; ++i) {
+        workers[i].join();
+    }
+
+    std::vector<float> audio;
+    std::vector<float> env;
+
+    fold(res,   n_out, n_win, n_hop, n_pad, audio);
+    fold(hann2, n_out, n_win, n_hop, n_pad, env); // TODO: can be done once
+
+    for (size_t i = 0; i < audio.size(); ++i) {
+        audio[i] /= env[i];
+    }
+
+    return audio;
+}
+
+std::vector<float> llama_rn_context::decodeAudioTokens(const std::vector<llama_token> &tokens) {
+    if (!isVocoderEnabled()) {
+        throw std::runtime_error("Vocoder is not enabled but audio completion is requested");
+    }
+    std::vector<llama_token> tokens_audio = tokens;
+    tts_type type = getTTSType();
+    if (type == OUTETTS_V0_3 || type == OUTETTS_V0_2) {
+        tokens_audio.erase(std::remove_if(tokens_audio.begin(), tokens_audio.end(), [](llama_token t) { return t < 151672 || t > 155772; }), tokens_audio.end());
+        for (auto & token : tokens_audio) {
+            token -= 151672;
+        }
+    } else {
+        LOG_ERROR("Unsupported audio tokens");
+        return std::vector<float>();
+    }
+    const int n_codes = tokens_audio.size();
+    llama_batch batch = llama_batch_init(n_codes, 0, 1);
+    for (size_t i = 0; i < tokens_audio.size(); ++i) {
+        llama_batch_add(&batch, tokens_audio[i], i, { 0 }, true);
+    }
+    if (batch.n_tokens != n_codes) {
+        LOG_ERROR("batch.n_tokens != n_codes: %d != %d", batch.n_tokens, n_codes);
+        return std::vector<float>();
+    }
+    if (llama_encode(vocoder_wrapper->ctx, batch) != 0) {
+        LOG_ERROR("llama_encode() failed");
+        return std::vector<float>();
+    }
+    llama_synchronize(vocoder_wrapper->ctx);
+    const int n_embd = llama_model_n_embd(vocoder_wrapper->model);
+    const float * embd = llama_get_embeddings(vocoder_wrapper->ctx);
+    return embd_to_audio(embd, n_codes, n_embd, params.cpuparams.n_threads);
+}
+
 }
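Note: end to end, the new TTS path added above is: `initVocoder(path)` → `getFormattedAudioCompletion(speaker_json, text)` to build the prompt → optionally `setGuideTokens(getAudioCompletionGuideTokens(text))` → run completion, during which `doCompletion` accumulates audio codes into `audio_tokens` → `decodeAudioTokens(...)`, which runs the codes through the vocoder with `llama_encode` and converts the resulting embeddings to PCM via `embd_to_audio` (per-frame inverse RFFT, Hann windowing, overlap-add through `fold`). A hedged sketch of the decode tail (setup elided; sample rate is nominally 24 kHz for WavTokenizer-style vocoders, an assumption not stated in this diff):

```cpp
// Sketch: decode step of the TTS pipeline. Assumes `rn` is a llama_rn_context
// with an OuteTTS model loaded, initVocoder() returned true, and audio_tokens
// (declared in rn-llama.h) was filled during completion.
if (rn.isVocoderEnabled()) {
    std::vector<float> pcm = rn.decodeAudioTokens(rn.audio_tokens);
    // pcm holds mono float samples; write them to a ~24 kHz WAV to play.
}
```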