cui-llama.rn 1.7.3 → 1.7.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (276)
  1. package/README.md +217 -17
  2. package/android/src/main/CMakeLists.txt +34 -15
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +94 -8
  4. package/android/src/main/java/com/rnllama/RNLlama.java +247 -0
  5. package/android/src/main/jni.cpp +213 -14
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  11. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  14. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +35 -0
  15. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +34 -0
  16. package/cpp/README.md +1 -1
  17. package/cpp/chat-parser.cpp +385 -0
  18. package/cpp/chat-parser.h +120 -0
  19. package/cpp/chat.cpp +726 -596
  20. package/cpp/chat.h +71 -6
  21. package/cpp/common.cpp +56 -38
  22. package/cpp/common.h +9 -3
  23. package/cpp/ggml-backend-reg.cpp +5 -0
  24. package/cpp/ggml-backend.cpp +10 -2
  25. package/cpp/ggml-common.h +4 -0
  26. package/cpp/ggml-cpu/amx/amx.cpp +1 -1
  27. package/cpp/ggml-cpu/amx/mmq.cpp +11 -10
  28. package/cpp/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  29. package/cpp/ggml-cpu/arch/arm/quants.c +4114 -0
  30. package/cpp/ggml-cpu/arch/arm/repack.cpp +2163 -0
  31. package/cpp/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  32. package/cpp/ggml-cpu/arch/x86/quants.c +4311 -0
  33. package/cpp/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
  34. package/cpp/ggml-cpu/arch-fallback.h +184 -0
  35. package/cpp/ggml-cpu/common.h +4 -3
  36. package/cpp/ggml-cpu/ggml-cpu-impl.h +21 -16
  37. package/cpp/ggml-cpu/ggml-cpu.c +123 -104
  38. package/cpp/ggml-cpu/ggml-cpu.cpp +11 -8
  39. package/cpp/ggml-cpu/ops.cpp +330 -148
  40. package/cpp/ggml-cpu/ops.h +1 -0
  41. package/cpp/ggml-cpu/quants.c +1158 -0
  42. package/cpp/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  43. package/cpp/ggml-cpu/repack.cpp +1571 -0
  44. package/cpp/ggml-cpu/repack.h +98 -0
  45. package/cpp/ggml-cpu/simd-mappings.h +330 -38
  46. package/cpp/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  47. package/cpp/ggml-cpu/vec.cpp +87 -18
  48. package/cpp/ggml-cpu/vec.h +249 -94
  49. package/cpp/ggml-cpu.h +1 -0
  50. package/cpp/ggml-impl.h +63 -183
  51. package/cpp/ggml-llama-sim.metallib +0 -0
  52. package/cpp/ggml-llama.metallib +0 -0
  53. package/cpp/ggml-metal.m +152 -45
  54. package/cpp/ggml-quants.c +0 -2
  55. package/cpp/ggml.c +61 -21
  56. package/cpp/ggml.h +22 -3
  57. package/cpp/gguf.cpp +24 -3
  58. package/cpp/json-partial.cpp +256 -0
  59. package/cpp/json-partial.h +38 -0
  60. package/cpp/json-schema-to-grammar.cpp +5 -47
  61. package/cpp/json-schema-to-grammar.h +4 -4
  62. package/cpp/llama-arch.cpp +153 -3
  63. package/cpp/llama-arch.h +27 -1
  64. package/cpp/llama-batch.cpp +741 -272
  65. package/cpp/llama-batch.h +112 -54
  66. package/cpp/llama-chat.cpp +30 -8
  67. package/cpp/llama-chat.h +1 -0
  68. package/cpp/llama-context.cpp +524 -339
  69. package/cpp/llama-context.h +38 -17
  70. package/cpp/llama-cparams.cpp +4 -0
  71. package/cpp/llama-cparams.h +2 -0
  72. package/cpp/llama-grammar.cpp +12 -2
  73. package/cpp/llama-graph.cpp +431 -356
  74. package/cpp/llama-graph.h +126 -58
  75. package/cpp/llama-hparams.cpp +10 -2
  76. package/cpp/llama-hparams.h +19 -2
  77. package/cpp/llama-kv-cache-unified-iswa.cpp +279 -0
  78. package/cpp/llama-kv-cache-unified-iswa.h +128 -0
  79. package/cpp/llama-kv-cache-unified.cpp +1841 -0
  80. package/cpp/llama-kv-cache-unified.h +303 -0
  81. package/cpp/llama-kv-cells.h +439 -0
  82. package/cpp/llama-memory-hybrid.cpp +246 -0
  83. package/cpp/llama-memory-hybrid.h +138 -0
  84. package/cpp/llama-memory-recurrent.cpp +1112 -0
  85. package/cpp/llama-memory-recurrent.h +183 -0
  86. package/cpp/llama-memory.cpp +41 -0
  87. package/cpp/llama-memory.h +86 -5
  88. package/cpp/llama-mmap.cpp +1 -1
  89. package/cpp/llama-model-loader.cpp +42 -17
  90. package/cpp/llama-model-saver.cpp +1 -0
  91. package/cpp/llama-model.cpp +1639 -513
  92. package/cpp/llama-model.h +26 -0
  93. package/cpp/llama-sampling.cpp +2 -2
  94. package/cpp/llama-vocab.cpp +65 -28
  95. package/cpp/llama-vocab.h +1 -0
  96. package/cpp/llama.cpp +11 -7
  97. package/cpp/llama.h +150 -42
  98. package/cpp/minja/chat-template.hpp +1 -1
  99. package/cpp/minja/minja.hpp +1 -1
  100. package/cpp/{json.hpp → nlohmann/json.hpp} +3027 -2267
  101. package/cpp/nlohmann/json_fwd.hpp +187 -0
  102. package/cpp/regex-partial.cpp +204 -0
  103. package/cpp/regex-partial.h +56 -0
  104. package/cpp/rn-llama.cpp +646 -35
  105. package/cpp/rn-llama.h +32 -1
  106. package/cpp/rn-tts.h +39 -0
  107. package/cpp/sampling.cpp +7 -8
  108. package/cpp/tools/mtmd/clip-impl.h +5 -0
  109. package/cpp/tools/mtmd/clip.cpp +572 -436
  110. package/cpp/tools/mtmd/clip.h +14 -4
  111. package/cpp/tools/mtmd/mtmd-audio.cpp +0 -86
  112. package/cpp/tools/mtmd/mtmd-audio.h +2 -17
  113. package/cpp/tools/mtmd/mtmd-helper.cpp +175 -12
  114. package/cpp/tools/mtmd/mtmd-helper.h +91 -0
  115. package/cpp/tools/mtmd/mtmd.cpp +368 -248
  116. package/cpp/tools/mtmd/mtmd.h +6 -70
  117. package/cpp/unicode.cpp +5 -0
  118. package/ios/CMakeLists.txt +26 -6
  119. package/ios/RNLlama.h +1 -1
  120. package/ios/RNLlama.mm +153 -3
  121. package/ios/RNLlamaContext.h +9 -1
  122. package/ios/RNLlamaContext.mm +112 -9
  123. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
  124. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +71 -6
  125. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +9 -3
  126. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
  127. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
  128. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
  129. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +22 -3
  130. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-partial.h +38 -0
  131. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  132. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
  133. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
  134. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
  135. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +38 -17
  136. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
  137. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
  138. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
  139. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  140. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  141. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  142. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  143. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  144. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
  145. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +26 -0
  146. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
  147. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +150 -42
  148. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  149. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
  150. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/{json.hpp → nlohmann/json.hpp} +3027 -2267
  151. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  152. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
  153. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
  154. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
  155. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  156. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
  157. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
  158. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
  159. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
  160. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
  161. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
  162. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
  163. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
  164. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
  165. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  166. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
  167. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
  168. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
  169. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
  170. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
  171. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
  172. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
  173. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  174. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  175. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  176. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  177. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  178. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
  179. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
  180. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
  181. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
  182. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  183. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
  184. package/ios/rnllama.xcframework/{tvos-arm64/rnllama.framework/Headers → ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
  185. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  186. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
  187. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
  188. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
  189. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  190. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  191. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
  192. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +71 -6
  193. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +9 -3
  194. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
  195. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
  196. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
  197. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +22 -3
  198. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-partial.h +38 -0
  199. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  200. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
  201. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
  202. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
  203. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +38 -17
  204. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
  205. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
  206. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
  207. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  208. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  209. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  210. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  211. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  212. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
  213. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +26 -0
  214. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
  215. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +150 -42
  216. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  217. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
  218. package/ios/rnllama.xcframework/{ios-arm64_x86_64-simulator/rnllama.framework/Headers → tvos-arm64/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
  219. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  220. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
  221. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
  222. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
  223. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  224. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
  225. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
  226. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
  227. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
  228. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
  229. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
  230. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
  231. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
  232. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
  233. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  234. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
  235. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
  236. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
  237. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
  238. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
  239. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
  240. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
  241. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  242. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  243. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  244. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  245. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  246. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
  247. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
  248. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
  249. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
  250. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  251. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
  252. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json.hpp +25526 -0
  253. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  254. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
  255. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
  256. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
  257. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  258. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  259. package/jest/mock.js +24 -0
  260. package/package.json +1 -1
  261. package/src/NativeRNLlama.ts +46 -2
  262. package/src/index.ts +105 -1
  263. package/cpp/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  264. package/cpp/ggml-cpu/ggml-cpu-quants.c +0 -13326
  265. package/cpp/ggml-cpu/sgemm.cpp +0 -3544
  266. package/cpp/ggml-cpu/sgemm.h +0 -14
  267. package/cpp/llama-kv-cache.cpp +0 -2827
  268. package/cpp/llama-kv-cache.h +0 -515
  269. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  270. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  271. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  272. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json.hpp +0 -24766
  273. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  274. /package/cpp/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
  275. /package/cpp/tools/mtmd/{miniaudio.h → miniaudio/miniaudio.h} +0 -0
  276. /package/cpp/tools/mtmd/{stb_image.h → stb/stb_image.h} +0 -0
package/cpp/ggml-cpu/ggml-cpu.c
@@ -3,11 +3,11 @@
 
  #include "ggml-backend-impl.h"
  #include "ggml-backend.h"
- #include "ggml-cpu-traits.h"
+ #include "traits.h"
  #include "ggml-cpu-impl.h"
  #include "ggml-cpu.h"
  #include "ggml-impl.h"
- #include "ggml-cpu-quants.h"
+ #include "quants.h"
  #include "ggml-threading.h"
  #include "unary-ops.h"
  #include "binary-ops.h"
@@ -72,15 +72,13 @@
  #define UNUSED LM_GGML_UNUSED
  #define SWAP(x, y, T) do { T SWAP = x; (x) = y; (y) = SWAP; } while (0)
 
+ // precomputed f32 table for f16 (256 KB) (simd-mappings.h)
+ float lm_ggml_table_f32_f16[1 << 16];
+
  #if defined(__ARM_ARCH)
  struct lm_ggml_arm_arch_features_type {
-     int has_neon;
-     int has_dotprod;
-     int has_i8mm;
-     int has_sve;
      int sve_cnt;
-     int has_sme;
- } lm_ggml_arm_arch_features = {-1, -1, -1, -1, 0, -1};
+ } lm_ggml_arm_arch_features = { 0 };
  #endif
 
 
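The new lm_ggml_table_f32_f16 array trades 256 KB of memory for branch-free fp16 decoding: lm_ggml_cpu_init() (see the hunk further down) fills one float per possible 16-bit pattern, so a conversion becomes a single indexed load. A minimal sketch of how such a table is consumed; the helper name here is hypothetical, the real accessor lives in simd-mappings.h:

    #include <stdint.h>

    extern float lm_ggml_table_f32_f16[1 << 16]; // filled once by lm_ggml_cpu_init()

    // Hypothetical helper: the raw fp16 bit pattern is itself the table index.
    static inline float fp16_to_fp32_lookup(uint16_t bits) {
        return lm_ggml_table_f32_f16[bits];
    }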
@@ -270,7 +268,11 @@ static const struct lm_ggml_type_traits_cpu type_traits_cpu[LM_GGML_TYPE_COUNT]
      .from_float = quantize_row_q4_K,
      .vec_dot = lm_ggml_vec_dot_q4_K_q8_K,
      .vec_dot_type = LM_GGML_TYPE_Q8_K,
+ #if defined (__ARM_FEATURE_MATMUL_INT8)
+     .nrows = 2,
+ #else
      .nrows = 1,
+ #endif
  },
  [LM_GGML_TYPE_Q5_K] = {
      .from_float = quantize_row_q5_K,
@@ -555,6 +557,14 @@ void lm_ggml_barrier(struct lm_ggml_threadpool * tp) {
  #endif
  }
 
+ void lm_ggml_threadpool_chunk_set(struct lm_ggml_threadpool * tp, int value) {
+     atomic_store_explicit(&tp->current_chunk, value, memory_order_relaxed);
+ }
+
+ int lm_ggml_threadpool_chunk_add(struct lm_ggml_threadpool * tp, int value) {
+     return atomic_fetch_add_explicit(&tp->current_chunk, value, memory_order_relaxed);
+ }
+
  #if defined(__gnu_linux__)
  static cpu_set_t lm_ggml_get_numa_affinity(void) {
      cpu_set_t cpuset;
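The two new helpers wrap atomic access to the threadpool's shared current_chunk counter. A hedged sketch of the dynamic chunk-claiming pattern they enable; worker, process_chunk, and n_chunks are hypothetical names, while lm_ggml_barrier is the function shown above:

    extern void process_chunk(int chunk); // hypothetical per-chunk kernel

    static void worker(struct lm_ggml_threadpool * tp, int ith, int n_chunks) {
        if (ith == 0) {
            lm_ggml_threadpool_chunk_set(tp, 0); // one thread resets the shared counter
        }
        lm_ggml_barrier(tp);                     // all threads wait for the reset
        int chunk;
        // atomic_fetch_add returns the pre-increment value, i.e. the claimed index
        while ((chunk = lm_ggml_threadpool_chunk_add(tp, 1)) < n_chunks) {
            process_chunk(chunk);
        }
    }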
@@ -666,87 +676,15 @@ bool lm_ggml_is_numa(void) {
 
  #if defined(__linux__) && defined(__aarch64__)
  #include <sys/auxv.h>
- #elif defined(__APPLE__)
- #include <sys/sysctl.h>
- #endif
-
- #if !defined(HWCAP2_I8MM)
- #define HWCAP2_I8MM (1 << 13)
- #endif
-
- #if !defined(HWCAP2_SME)
- #define HWCAP2_SME (1 << 23)
  #endif
 
  static void lm_ggml_init_arm_arch_features(void) {
- #if defined(__linux__) && defined(__aarch64__)
-     uint32_t hwcap = getauxval(AT_HWCAP);
-     uint32_t hwcap2 = getauxval(AT_HWCAP2);
-
-     lm_ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD);
-     lm_ggml_arm_arch_features.has_dotprod = !!(hwcap & HWCAP_ASIMDDP);
-     lm_ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
-     lm_ggml_arm_arch_features.has_sve = !!(hwcap & HWCAP_SVE);
-     lm_ggml_arm_arch_features.has_sme = !!(hwcap2 & HWCAP2_SME);
-
- #if defined(__ARM_FEATURE_SVE)
+ #if defined(__linux__) && defined(__aarch64__) && defined(__ARM_FEATURE_SVE)
      lm_ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
  #endif
- #elif defined(__APPLE__)
-     int oldp = 0;
-     size_t size = sizeof(oldp);
-     if (sysctlbyname("hw.optional.AdvSIMD", &oldp, &size, NULL, 0) != 0) {
-         oldp = 0;
-     }
-     lm_ggml_arm_arch_features.has_neon = oldp;
-
-     if (sysctlbyname("hw.optional.arm.FEAT_DotProd", &oldp, &size, NULL, 0) != 0) {
-         oldp = 0;
-     }
-     lm_ggml_arm_arch_features.has_dotprod = oldp;
-
-     if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) != 0) {
-         oldp = 0;
-     }
-     lm_ggml_arm_arch_features.has_i8mm = oldp;
-
-     if (sysctlbyname("hw.optional.arm.FEAT_SME", &oldp, &size, NULL, 0) != 0) {
-         oldp = 0;
-     }
-     lm_ggml_arm_arch_features.has_sme = oldp;
-
-     lm_ggml_arm_arch_features.has_sve = 0;
-     lm_ggml_arm_arch_features.sve_cnt = 0;
- #else
- // Run-time CPU feature detection not implemented for this platform, fallback to compile time
- #if defined(__ARM_NEON)
-     lm_ggml_arm_arch_features.has_neon = 1;
- #else
-     lm_ggml_arm_arch_features.has_neon = 0;
- #endif
-
- #if defined(__ARM_FEATURE_MATMUL_INT8)
-     lm_ggml_arm_arch_features.has_i8mm = 1;
- #else
-     lm_ggml_arm_arch_features.has_i8mm = 0;
- #endif
-
- #if defined(__ARM_FEATURE_SVE)
-     lm_ggml_arm_arch_features.has_sve = 1;
-     lm_ggml_arm_arch_features.sve_cnt = 16;
- #else
-     lm_ggml_arm_arch_features.has_sve = 0;
-     lm_ggml_arm_arch_features.sve_cnt = 0;
- #endif
-
- #if defined(__ARM_FEATURE_SME) || defined(__ARM_FEATURE_SME2)
-     lm_ggml_arm_arch_features.has_sme = 1;
- #else
-     lm_ggml_arm_arch_features.has_sme = 0;
- #endif
- #endif
  }
- #endif
+
+ #endif // __ARM_ARCH
 
  struct lm_ggml_tensor * lm_ggml_new_i32(struct lm_ggml_context * ctx, int32_t value) {
      LM_GGML_ASSERT(!lm_ggml_get_no_alloc(ctx));
@@ -801,7 +739,7 @@ struct lm_ggml_tensor * lm_ggml_set_i32 (struct lm_ggml_tensor * tensor, int32_t
          {
              assert(tensor->nb[0] == sizeof(lm_ggml_fp16_t));
              for (int i = 0; i < n; i++) {
-                 lm_ggml_vec_set_f16(nc, (lm_ggml_fp16_t *)(data + i*n1), LM_GGML_FP32_TO_FP16(value));
+                 lm_ggml_vec_set_f16(nc, (lm_ggml_fp16_t *)(data + i*n1), LM_GGML_CPU_FP32_TO_FP16(value));
              }
          } break;
      case LM_GGML_TYPE_BF16:
@@ -860,7 +798,7 @@ struct lm_ggml_tensor * lm_ggml_set_f32(struct lm_ggml_tensor * tensor, float va
          {
              assert(tensor->nb[0] == sizeof(lm_ggml_fp16_t));
              for (int i = 0; i < n; i++) {
-                 lm_ggml_vec_set_f16(nc, (lm_ggml_fp16_t *)(data + i*n1), LM_GGML_FP32_TO_FP16(value));
+                 lm_ggml_vec_set_f16(nc, (lm_ggml_fp16_t *)(data + i*n1), LM_GGML_CPU_FP32_TO_FP16(value));
              }
          } break;
      case LM_GGML_TYPE_BF16:
@@ -911,7 +849,7 @@ int32_t lm_ggml_get_i32_1d(const struct lm_ggml_tensor * tensor, int i) {
      case LM_GGML_TYPE_F16:
          {
              LM_GGML_ASSERT(tensor->nb[0] == sizeof(lm_ggml_fp16_t));
-             return LM_GGML_FP16_TO_FP32(((lm_ggml_fp16_t *)(tensor->data))[i]);
+             return LM_GGML_CPU_FP16_TO_FP32(((lm_ggml_fp16_t *)(tensor->data))[i]);
          }
      case LM_GGML_TYPE_BF16:
          {
@@ -956,7 +894,7 @@ void lm_ggml_set_i32_1d(const struct lm_ggml_tensor * tensor, int i, int32_t val
      case LM_GGML_TYPE_F16:
          {
              LM_GGML_ASSERT(tensor->nb[0] == sizeof(lm_ggml_fp16_t));
-             ((lm_ggml_fp16_t *)(tensor->data))[i] = LM_GGML_FP32_TO_FP16(value);
+             ((lm_ggml_fp16_t *)(tensor->data))[i] = LM_GGML_CPU_FP32_TO_FP16(value);
          } break;
      case LM_GGML_TYPE_BF16:
          {
@@ -985,7 +923,7 @@ int32_t lm_ggml_get_i32_nd(const struct lm_ggml_tensor * tensor, int i0, int i1,
      case LM_GGML_TYPE_I32:
          return ((int32_t *) data)[0];
      case LM_GGML_TYPE_F16:
-         return LM_GGML_FP16_TO_FP32(((lm_ggml_fp16_t *) data)[0]);
+         return LM_GGML_CPU_FP16_TO_FP32(((lm_ggml_fp16_t *) data)[0]);
      case LM_GGML_TYPE_BF16:
          return LM_GGML_BF16_TO_FP32(((lm_ggml_bf16_t *) data)[0]);
      case LM_GGML_TYPE_F32:
@@ -1012,7 +950,7 @@ void lm_ggml_set_i32_nd(const struct lm_ggml_tensor * tensor, int i0, int i1, in
          } break;
      case LM_GGML_TYPE_F16:
          {
-             ((lm_ggml_fp16_t *)(data))[0] = LM_GGML_FP32_TO_FP16(value);
+             ((lm_ggml_fp16_t *)(data))[0] = LM_GGML_CPU_FP32_TO_FP16(value);
          } break;
      case LM_GGML_TYPE_BF16:
          {
@@ -1050,7 +988,7 @@ float lm_ggml_get_f32_1d(const struct lm_ggml_tensor * tensor, int i) {
          }
      case LM_GGML_TYPE_F16:
          {
-             return LM_GGML_FP16_TO_FP32(((lm_ggml_fp16_t *)(tensor->data))[i]);
+             return LM_GGML_CPU_FP16_TO_FP32(((lm_ggml_fp16_t *)(tensor->data))[i]);
          }
      case LM_GGML_TYPE_BF16:
          {
@@ -1089,7 +1027,7 @@ void lm_ggml_set_f32_1d(const struct lm_ggml_tensor * tensor, int i, float value
          } break;
      case LM_GGML_TYPE_F16:
          {
-             ((lm_ggml_fp16_t *)(tensor->data))[i] = LM_GGML_FP32_TO_FP16(value);
+             ((lm_ggml_fp16_t *)(tensor->data))[i] = LM_GGML_CPU_FP32_TO_FP16(value);
          } break;
      case LM_GGML_TYPE_BF16:
          {
@@ -1116,7 +1054,7 @@ float lm_ggml_get_f32_nd(const struct lm_ggml_tensor * tensor, int i0, int i1, i
      case LM_GGML_TYPE_I32:
          return ((int32_t *) data)[0];
      case LM_GGML_TYPE_F16:
-         return LM_GGML_FP16_TO_FP32(((lm_ggml_fp16_t *) data)[0]);
+         return LM_GGML_CPU_FP16_TO_FP32(((lm_ggml_fp16_t *) data)[0]);
      case LM_GGML_TYPE_BF16:
          return LM_GGML_BF16_TO_FP32(((lm_ggml_bf16_t *) data)[0]);
      case LM_GGML_TYPE_F32:
@@ -1143,7 +1081,7 @@ void lm_ggml_set_f32_nd(const struct lm_ggml_tensor * tensor, int i0, int i1, in
          } break;
      case LM_GGML_TYPE_F16:
          {
-             ((lm_ggml_fp16_t *)(data))[0] = LM_GGML_FP32_TO_FP16(value);
+             ((lm_ggml_fp16_t *)(data))[0] = LM_GGML_CPU_FP32_TO_FP16(value);
          } break;
      case LM_GGML_TYPE_BF16:
          {
@@ -1955,6 +1893,10 @@ static void lm_ggml_compute_forward(struct lm_ggml_compute_params * params, stru
          {
              lm_ggml_compute_forward_pad_reflect_1d(params, tensor);
          } break;
+     case LM_GGML_OP_ROLL:
+         {
+             lm_ggml_compute_forward_roll(params, tensor);
+         } break;
      case LM_GGML_OP_ARANGE:
          {
              lm_ggml_compute_forward_arange(params, tensor);
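LM_GGML_OP_ROLL is a newly dispatched op that circularly shifts tensor elements along each axis, in the spirit of numpy.roll. A hedged sketch of graph-level usage, assuming the lm_ggml_roll builder mirrors upstream ggml_roll(ctx, a, shift0, shift1, shift2, shift3):

    #include "ggml.h"

    // Roll each row one element to the right, wrapping the last element around to the front.
    static struct lm_ggml_tensor * roll_rows(struct lm_ggml_context * ctx,
                                             struct lm_ggml_tensor * t) {
        return lm_ggml_roll(ctx, t, /*shift0=*/1, /*shift1=*/0, /*shift2=*/0, /*shift3=*/0);
    }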
@@ -2279,6 +2221,7 @@ static int lm_ggml_get_n_tasks(struct lm_ggml_tensor * node, int n_threads) {
      case LM_GGML_OP_UPSCALE:
      case LM_GGML_OP_PAD:
      case LM_GGML_OP_PAD_REFLECT_1D:
+     case LM_GGML_OP_ROLL:
      case LM_GGML_OP_ARANGE:
      case LM_GGML_OP_TIMESTEP_EMBEDDING:
      case LM_GGML_OP_ARGSORT:
@@ -2414,12 +2357,32 @@ static bool lm_ggml_thread_apply_priority(int32_t prio) {
      // This is up to the applications.
      DWORD p = THREAD_PRIORITY_NORMAL;
      switch (prio) {
+         case LM_GGML_SCHED_PRIO_LOW: p = THREAD_PRIORITY_BELOW_NORMAL; break;
          case LM_GGML_SCHED_PRIO_NORMAL: p = THREAD_PRIORITY_NORMAL; break;
          case LM_GGML_SCHED_PRIO_MEDIUM: p = THREAD_PRIORITY_ABOVE_NORMAL; break;
          case LM_GGML_SCHED_PRIO_HIGH: p = THREAD_PRIORITY_HIGHEST; break;
          case LM_GGML_SCHED_PRIO_REALTIME: p = THREAD_PRIORITY_TIME_CRITICAL; break;
      }
 
+     if (prio != LM_GGML_SCHED_PRIO_LOW) {
+         // Tell Windows that this thread should not be throttled (needs its own CPU core).
+         // Newer Windows 11 versions aggresively park (offline) CPU cores and often place
+         // all our threads onto the first 4 cores which results in terrible performance with
+         // n_threads > 4
+     #if _WIN32_WINNT >= 0x0602
+         THREAD_POWER_THROTTLING_STATE t;
+         ZeroMemory(&t, sizeof(t));
+         t.Version = THREAD_POWER_THROTTLING_CURRENT_VERSION;
+         t.ControlMask = THREAD_POWER_THROTTLING_EXECUTION_SPEED;
+         t.StateMask = 0;
+
+         if (!SetThreadInformation(GetCurrentThread(), ThreadPowerThrottling, &t, sizeof(t))) {
+             LM_GGML_LOG_DEBUG("failed to disable thread power throttling %d : (%d)\n", prio, (int) GetLastError());
+             return false;
+         }
+     #endif
+     }
+
      if (prio == LM_GGML_SCHED_PRIO_NORMAL) {
          // Keep inherited policy/priority
          return true;
@@ -2447,6 +2410,8 @@ static bool lm_ggml_thread_apply_priority(int32_t prio) {
      struct sched_param p;
      int32_t policy = SCHED_OTHER;
      switch (prio) {
+         // TODO: there seems to be no way to set lower prio on Apple platforms
+         case LM_GGML_SCHED_PRIO_LOW: policy = SCHED_OTHER; p.sched_priority = 0; break;
          case LM_GGML_SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break;
          case LM_GGML_SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break;
          case LM_GGML_SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break;
@@ -2503,6 +2468,7 @@ static bool lm_ggml_thread_apply_priority(int32_t prio) {
      struct sched_param p;
      int32_t policy = SCHED_OTHER;
      switch (prio) {
+         case LM_GGML_SCHED_PRIO_LOW: policy = SCHED_BATCH; p.sched_priority = 0; break;
          case LM_GGML_SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break;
          case LM_GGML_SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break;
          case LM_GGML_SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break;
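Taken together, these three hunks add an LM_GGML_SCHED_PRIO_LOW level: THREAD_PRIORITY_BELOW_NORMAL on Windows (which also skips the power-throttling opt-out), plain SCHED_OTHER on Apple platforms per the TODO, and SCHED_BATCH on Linux. A hedged sketch of selecting it, assuming the lm_-prefixed threadpool API mirrors upstream ggml:

    #include "ggml.h"
    #include "ggml-cpu.h"

    // Run background inference without starving foreground threads.
    static struct lm_ggml_threadpool * make_low_prio_pool(void) {
        struct lm_ggml_threadpool_params params = lm_ggml_threadpool_params_default(4);
        params.prio = LM_GGML_SCHED_PRIO_LOW; // mapped per-OS by lm_ggml_thread_apply_priority()
        return lm_ggml_threadpool_new(&params);
    }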
@@ -3178,9 +3144,24 @@ void lm_ggml_cpu_fp32_to_fp16(const float * x, lm_ggml_fp16_t * y, int64_t n) {
          __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
          _mm_storel_epi64((__m128i *)(y + i), y_vec);
      }
+ #elif defined(__NNPA__)
+     for (; i + 7 < n; i += 8) {
+         float32x4_t v_xh = vec_xl(0, (const float *)(x + i + 0));
+         float32x4_t v_xl = vec_xl(0, (const float *)(x + i + 4));
+         uint16x8_t v_yd = vec_round_from_fp32(v_xh, v_xl, 0);
+         uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
+         vec_xst(v_y, 0, (lm_ggml_fp16_t *)(y + i));
+     }
+     for (; i + 3 < n; i += 4) {
+         float32x4_t v_x = vec_xl(0, (const float *)(x + i));
+         float32x4_t v_zero = vec_splats(0.0f);
+         uint16x8_t v_yd = vec_round_from_fp32(v_x, v_zero, 0);
+         uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
+         vec_xst(v_y, 0, (lm_ggml_fp16_t *)(y + i));
+     }
  #endif
      for (; i < n; ++i) {
-         y[i] = LM_GGML_FP32_TO_FP16(x[i]);
+         y[i] = LM_GGML_CPU_FP32_TO_FP16(x[i]);
      }
  }
 
@@ -3204,9 +3185,25 @@ void lm_ggml_cpu_fp16_to_fp32(const lm_ggml_fp16_t * x, float * y, int64_t n) {
          __m128 y_vec = _mm_cvtph_ps(x_vec);
          _mm_storeu_ps(y + i, y_vec);
      }
+ #elif defined(__NNPA__)
+     for (; i + 7 < n; i += 8) {
+         uint16x8_t v_x = vec_xl(0, (const lm_ggml_fp16_t *)(x + i));
+         uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
+         float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
+         float32x4_t v_yl = vec_extend_to_fp32_lo(v_yd, 0);
+         vec_xst(v_yh, 0, (float *)(y + i + 0));
+         vec_xst(v_yl, 0, (float *)(y + i + 4));
+     }
+     for (; i + 3 < n; i += 4) {
+         uint16x8_t v_x = vec_xl(0, (const lm_ggml_fp16_t *)(x + i));
+         uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
+         float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
+         vec_xst(v_yh, 0, (float *)(y + i));
+     }
  #endif
+
      for (; i < n; ++i) {
-         y[i] = LM_GGML_FP16_TO_FP32(x[i]);
+         y[i] = LM_GGML_CPU_FP16_TO_FP32(x[i]);
      }
  }
 
@@ -3406,9 +3403,17 @@ int lm_ggml_cpu_has_vxe(void) {
  #endif
  }
 
+ int lm_ggml_cpu_has_nnpa(void) {
+ #if defined(LM_GGML_NNPA)
+     return 1;
+ #else
+     return 0;
+ #endif
+ }
+
  int lm_ggml_cpu_has_neon(void) {
  #if defined(__ARM_ARCH) && defined(__ARM_NEON)
-     return lm_ggml_arm_arch_features.has_neon;
+     return 1;
  #else
      return 0;
  #endif
@@ -3416,7 +3421,7 @@ int lm_ggml_cpu_has_neon(void) {
 
  int lm_ggml_cpu_has_dotprod(void) {
  #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_DOTPROD)
-     return lm_ggml_arm_arch_features.has_dotprod;
+     return 1;
  #else
      return 0;
  #endif
@@ -3424,7 +3429,7 @@ int lm_ggml_cpu_has_dotprod(void) {
 
  int lm_ggml_cpu_has_sve(void) {
  #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE)
-     return lm_ggml_arm_arch_features.has_sve;
+     return 1;
  #else
      return 0;
  #endif
@@ -3432,7 +3437,7 @@ int lm_ggml_cpu_has_sve(void) {
 
  int lm_ggml_cpu_has_matmul_int8(void) {
  #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_MATMUL_INT8)
-     return lm_ggml_arm_arch_features.has_i8mm;
+     return 1;
  #else
      return 0;
  #endif
@@ -3448,14 +3453,14 @@ int lm_ggml_cpu_get_sve_cnt(void) {
 
  int lm_ggml_cpu_has_sme(void) {
  #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SME)
-     return lm_ggml_arm_arch_features.has_sme;
+     return 1;
  #else
      return 0;
  #endif
  }
 
  void lm_ggml_cpu_init(void) {
-     // needed to initialize f16 tables
+     // needed to initialize lm_ggml_time
      {
          struct lm_ggml_init_params params = { 0, NULL, false };
          struct lm_ggml_context * ctx = lm_ggml_init(params);
@@ -3476,14 +3481,28 @@ void lm_ggml_cpu_init(void) {
              uint16_t u16;
              lm_ggml_fp16_t fp16;
          } u = {i};
-         float f = LM_GGML_FP16_TO_FP32(u.fp16);
-         lm_ggml_table_gelu_f16[i] = LM_GGML_FP32_TO_FP16(lm_ggml_gelu_f32(f));
-         lm_ggml_table_gelu_quick_f16[i] = LM_GGML_FP32_TO_FP16(lm_ggml_gelu_quick_f32(f));
+         float f = LM_GGML_COMPUTE_FP16_TO_FP32(u.fp16);
+         lm_ggml_table_f32_f16[i] = f;
+         lm_ggml_table_gelu_f16[i] = LM_GGML_CPU_FP32_TO_FP16(lm_ggml_gelu_f32(f));
+         lm_ggml_table_gelu_quick_f16[i] = LM_GGML_CPU_FP32_TO_FP16(lm_ggml_gelu_quick_f32(f));
      }
 
      const uint64_t t_end = lm_ggml_time_us(); UNUSED(t_end);
 
      LM_GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0);
+
+ #ifdef LM_GGML_USE_OPENMP
+     //if (!getenv("OMP_WAIT_POLICY")) {
+     //    // set the wait policy to active, so that OpenMP threads don't sleep
+     //    putenv("OMP_WAIT_POLICY=active");
+     //}
+
+     if (!getenv("KMP_BLOCKTIME")) {
+         // set the time to wait before sleeping a thread
+         // this is less aggressive than setting the wait policy to active, but should achieve similar results in most cases
+         putenv("KMP_BLOCKTIME=200"); // 200ms
+     }
+ #endif
  }
 
  #if defined(__ARM_ARCH)
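Note that the KMP_BLOCKTIME default above is only applied when the variable is unset, so embedders can still pick their own OpenMP spin time before initialization. A hedged sketch (POSIX setenv; the 200 ms default comes from the hunk above):

    #include <stdlib.h>
    #include "ggml-cpu.h"

    static void init_for_battery_life(void) {
        setenv("KMP_BLOCKTIME", "0", /*overwrite=*/1); // idle OpenMP threads sleep immediately
        lm_ggml_cpu_init();                            // the library default of 200 ms is then skipped
    }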
package/cpp/ggml-cpu/ggml-cpu.cpp
@@ -1,8 +1,8 @@
  #include "ggml-backend.h"
  #include "ggml-backend-impl.h"
  #include "ggml-cpu.h"
- #include "ggml-cpu-aarch64.h"
- #include "ggml-cpu-traits.h"
+ #include "repack.h"
+ #include "traits.h"
  #include "ggml-impl.h"
  #include "amx/amx.h"
 
@@ -11,7 +11,7 @@
  #include <vector>
 
  #ifdef LM_GGML_USE_CPU_HBM
- #    include "ggml-cpu-hbm.h"
+ #    include "hbm.h"
  #endif
 
  #ifdef LM_GGML_USE_CPU_KLEIDIAI
@@ -51,9 +51,9 @@ std::vector<lm_ggml_backend_buffer_type_t>& lm_ggml_backend_cpu_get_extra_buffer
  }
  #endif
 
- #ifdef LM_GGML_USE_CPU_AARCH64
-     if (lm_ggml_backend_cpu_aarch64_buffer_type()) {
-         bufts.push_back(lm_ggml_backend_cpu_aarch64_buffer_type());
+ #ifdef LM_GGML_USE_CPU_REPACK
+     if (lm_ggml_backend_cpu_repack_buffer_type()) {
+         bufts.push_back(lm_ggml_backend_cpu_repack_buffer_type());
      }
  #endif
 
@@ -578,6 +578,9 @@ static lm_ggml_backend_feature * lm_ggml_backend_cpu_get_features(lm_ggml_backen
      if (lm_ggml_cpu_has_vxe()) {
          features.push_back({ "VXE", "1" });
      }
+     if (lm_ggml_cpu_has_nnpa()) {
+         features.push_back({ "NNPA", "1" });
+     }
      if (lm_ggml_cpu_has_wasm_simd()) {
          features.push_back({ "WASM_SIMD", "1" });
      }
@@ -596,8 +599,8 @@ static lm_ggml_backend_feature * lm_ggml_backend_cpu_get_features(lm_ggml_backen
  #ifdef LM_GGML_USE_CPU_KLEIDIAI
      features.push_back({ "KLEIDIAI", "1" });
  #endif
- #ifdef LM_GGML_USE_CPU_AARCH64
-     features.push_back({ "AARCH64_REPACK", "1" });
+ #ifdef LM_GGML_USE_CPU_REPACK
+     features.push_back({ "REPACK", "1" });
  #endif
 
      features.push_back({ nullptr, nullptr });
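The feature list is nullptr-terminated (last push_back above), so callers can iterate until name is NULL. A hedged sketch of enumerating it through the backend registry; the proc-address string and the lm_-prefixed registry API are assumptions based on upstream ggml:

    #include "ggml-backend.h"
    #include <stdio.h>

    static void print_cpu_features(void) {
        lm_ggml_backend_reg_t reg = lm_ggml_backend_reg_by_name("CPU");
        if (!reg) return;
        // Assumed proc name; upstream ggml registers "ggml_backend_get_features".
        lm_ggml_backend_get_features_t get_features = (lm_ggml_backend_get_features_t)
            lm_ggml_backend_reg_get_proc_address(reg, "lm_ggml_backend_get_features");
        if (!get_features) return;
        for (struct lm_ggml_backend_feature * f = get_features(reg); f->name; f++) {
            printf("%s = %s\n", f->name, f->value); // e.g. "NNPA = 1", "REPACK = 1"
        }
    }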