cui-llama.rn 1.6.0 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (285)
  1. package/README.md +35 -7
  2. package/android/src/main/CMakeLists.txt +22 -11
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +42 -6
  4. package/android/src/main/java/com/rnllama/RNLlama.java +139 -4
  5. package/android/src/main/jni.cpp +173 -18
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  11. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  14. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +24 -4
  15. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +22 -2
  16. package/cpp/LICENSE +21 -0
  17. package/cpp/chat.cpp +129 -107
  18. package/cpp/chat.h +2 -0
  19. package/cpp/common.cpp +58 -78
  20. package/cpp/common.h +29 -21
  21. package/cpp/ggml-alloc.c +4 -1
  22. package/cpp/ggml-backend.cpp +9 -5
  23. package/cpp/ggml-backend.h +4 -4
  24. package/cpp/ggml-cpp.h +1 -1
  25. package/cpp/ggml-cpu/amx/amx.cpp +221 -0
  26. package/cpp/ggml-cpu/amx/amx.h +8 -0
  27. package/cpp/ggml-cpu/amx/common.h +91 -0
  28. package/cpp/ggml-cpu/amx/mmq.cpp +2511 -0
  29. package/cpp/ggml-cpu/amx/mmq.h +10 -0
  30. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/binary-ops.h +1 -1
  31. package/cpp/ggml-cpu/common.h +72 -0
  32. package/cpp/{ggml-cpu-aarch64.cpp → ggml-cpu/ggml-cpu-aarch64.cpp} +809 -103
  33. package/cpp/{ggml-cpu-quants.c → ggml-cpu/ggml-cpu-quants.c} +306 -6
  34. package/cpp/{ggml-cpu.c → ggml-cpu/ggml-cpu.c} +114 -55
  35. package/cpp/{ggml-cpu.cpp → ggml-cpu/ggml-cpu.cpp} +32 -16
  36. package/cpp/{ops.cpp → ggml-cpu/ops.cpp} +353 -173
  37. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/ops.h +2 -20
  38. package/cpp/{sgemm.cpp → ggml-cpu/sgemm.cpp} +501 -0
  39. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/simd-mappings.h +7 -3
  40. package/{ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers → cpp/ggml-cpu}/unary-ops.h +1 -1
  41. package/cpp/{vec.cpp → ggml-cpu/vec.cpp} +0 -6
  42. package/{ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers → cpp/ggml-cpu}/vec.h +16 -0
  43. package/cpp/ggml-cpu.h +5 -0
  44. package/cpp/ggml-impl.h +16 -9
  45. package/cpp/ggml-llama-sim.metallib +0 -0
  46. package/cpp/ggml-llama.metallib +0 -0
  47. package/cpp/ggml-metal-impl.h +36 -11
  48. package/cpp/ggml-metal.m +810 -176
  49. package/cpp/ggml-opt.cpp +373 -190
  50. package/cpp/ggml-opt.h +49 -28
  51. package/cpp/ggml-quants.c +0 -6
  52. package/cpp/ggml.c +227 -282
  53. package/cpp/ggml.h +82 -101
  54. package/cpp/gguf.cpp +33 -33
  55. package/cpp/json-schema-to-grammar.cpp +3 -0
  56. package/cpp/llama-adapter.cpp +6 -0
  57. package/cpp/llama-arch.cpp +49 -17
  58. package/cpp/llama-arch.h +9 -0
  59. package/cpp/llama-batch.cpp +8 -2
  60. package/cpp/llama-batch.h +2 -1
  61. package/cpp/llama-chat.cpp +39 -16
  62. package/cpp/llama-chat.h +4 -2
  63. package/cpp/llama-context.cpp +440 -611
  64. package/cpp/llama-context.h +44 -33
  65. package/cpp/llama-cparams.h +1 -0
  66. package/cpp/llama-graph.cpp +214 -291
  67. package/cpp/llama-graph.h +69 -21
  68. package/cpp/llama-hparams.cpp +17 -1
  69. package/cpp/llama-hparams.h +39 -5
  70. package/cpp/llama-kv-cache.cpp +2067 -620
  71. package/cpp/llama-kv-cache.h +410 -108
  72. package/cpp/llama-memory.h +12 -1
  73. package/cpp/llama-model-loader.cpp +24 -15
  74. package/cpp/llama-model-saver.cpp +281 -0
  75. package/cpp/llama-model-saver.h +37 -0
  76. package/cpp/llama-model.cpp +1089 -359
  77. package/cpp/llama-model.h +19 -3
  78. package/cpp/llama-sampling.cpp +20 -7
  79. package/cpp/llama-vocab.cpp +54 -9
  80. package/cpp/llama-vocab.h +6 -0
  81. package/cpp/llama.cpp +14 -0
  82. package/cpp/llama.h +86 -142
  83. package/cpp/minja/chat-template.hpp +9 -5
  84. package/cpp/minja/minja.hpp +69 -36
  85. package/cpp/rn-llama.cpp +602 -190
  86. package/cpp/rn-llama.h +34 -8
  87. package/cpp/sampling.cpp +57 -50
  88. package/cpp/tools/mtmd/clip-impl.h +462 -0
  89. package/cpp/tools/mtmd/clip.cpp +4024 -0
  90. package/cpp/tools/mtmd/clip.h +101 -0
  91. package/cpp/tools/mtmd/miniaudio.h +93468 -0
  92. package/cpp/tools/mtmd/mtmd-audio.cpp +855 -0
  93. package/cpp/tools/mtmd/mtmd-audio.h +62 -0
  94. package/cpp/tools/mtmd/mtmd-helper.cpp +297 -0
  95. package/cpp/tools/mtmd/mtmd.cpp +942 -0
  96. package/cpp/tools/mtmd/mtmd.h +362 -0
  97. package/cpp/tools/mtmd/stb_image.h +7988 -0
  98. package/ios/CMakeLists.txt +20 -10
  99. package/ios/RNLlama.h +6 -0
  100. package/ios/RNLlama.mm +82 -3
  101. package/ios/RNLlamaContext.h +5 -1
  102. package/ios/RNLlamaContext.mm +131 -38
  103. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +2 -0
  104. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +29 -21
  105. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-backend.h +4 -4
  106. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpp.h +1 -1
  107. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +5 -0
  108. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +16 -9
  109. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  110. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-opt.h +49 -28
  111. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +82 -101
  112. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +9 -0
  113. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +2 -1
  114. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +4 -2
  115. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +44 -33
  116. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +1 -0
  117. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +69 -21
  118. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +39 -5
  119. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +410 -108
  120. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +12 -1
  121. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model-saver.h +37 -0
  122. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +19 -3
  123. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +6 -0
  124. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +86 -142
  125. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  126. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +69 -36
  127. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +34 -8
  128. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Info.plist +0 -0
  129. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  130. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
  131. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +2 -0
  132. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +29 -21
  133. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend.h +4 -4
  134. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +1 -1
  135. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +5 -0
  136. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +16 -9
  137. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  138. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-opt.h +49 -28
  139. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +82 -101
  140. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +9 -0
  141. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +2 -1
  142. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +4 -2
  143. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +44 -33
  144. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +1 -0
  145. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +69 -21
  146. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +39 -5
  147. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +410 -108
  148. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +12 -1
  149. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model-saver.h +37 -0
  150. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +19 -3
  151. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +6 -0
  152. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +86 -142
  153. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  154. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +69 -36
  155. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +34 -8
  156. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Info.plist +0 -0
  157. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/_CodeSignature/CodeResources +1 -1
  158. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  159. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  160. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +2 -0
  161. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +29 -21
  162. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-backend.h +4 -4
  163. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpp.h +1 -1
  164. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +5 -0
  165. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +16 -9
  166. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  167. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-opt.h +49 -28
  168. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +82 -101
  169. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +9 -0
  170. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +2 -1
  171. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +4 -2
  172. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +44 -33
  173. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +1 -0
  174. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +69 -21
  175. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +39 -5
  176. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +410 -108
  177. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +12 -1
  178. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model-saver.h +37 -0
  179. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +19 -3
  180. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +6 -0
  181. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +86 -142
  182. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  183. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +69 -36
  184. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +34 -8
  185. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Info.plist +0 -0
  186. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  187. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
  188. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +2 -0
  189. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +29 -21
  190. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend.h +4 -4
  191. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +1 -1
  192. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +5 -0
  193. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +16 -9
  194. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  195. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-opt.h +49 -28
  196. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +82 -101
  197. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +9 -0
  198. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +2 -1
  199. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +4 -2
  200. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +44 -33
  201. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +1 -0
  202. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +69 -21
  203. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +39 -5
  204. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +410 -108
  205. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +12 -1
  206. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model-saver.h +37 -0
  207. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +19 -3
  208. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +6 -0
  209. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +86 -142
  210. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  211. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +69 -36
  212. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +34 -8
  213. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Info.plist +0 -0
  214. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/_CodeSignature/CodeResources +1 -1
  215. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  216. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  217. package/jest/mock.js +33 -7
  218. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  219. package/lib/commonjs/index.js +153 -21
  220. package/lib/commonjs/index.js.map +1 -1
  221. package/lib/module/NativeRNLlama.js.map +1 -1
  222. package/lib/module/index.js +152 -20
  223. package/lib/module/index.js.map +1 -1
  224. package/lib/typescript/NativeRNLlama.d.ts +54 -4
  225. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  226. package/lib/typescript/index.d.ts +72 -6
  227. package/lib/typescript/index.d.ts.map +1 -1
  228. package/package.json +1 -1
  229. package/src/NativeRNLlama.ts +72 -4
  230. package/src/index.ts +212 -38
  231. package/cpp/binary-ops.h +0 -16
  232. package/cpp/ops.h +0 -128
  233. package/cpp/simd-mappings.h +0 -888
  234. package/cpp/unary-ops.h +0 -28
  235. package/cpp/vec.h +0 -802
  236. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/binary-ops.h +0 -16
  237. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  238. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  239. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  240. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  241. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ops.h +0 -128
  242. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/sgemm.h +0 -14
  243. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/simd-mappings.h +0 -888
  244. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/vec.h +0 -802
  245. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  246. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  247. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  248. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  249. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +0 -14
  250. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +0 -28
  251. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +0 -802
  252. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/binary-ops.h +0 -16
  253. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  254. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  255. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  256. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  257. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ops.h +0 -128
  258. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/sgemm.h +0 -14
  259. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/simd-mappings.h +0 -888
  260. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/unary-ops.h +0 -28
  261. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/binary-ops.h +0 -16
  262. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  263. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  264. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  265. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  266. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ops.h +0 -128
  267. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +0 -14
  268. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/simd-mappings.h +0 -888
  269. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +0 -28
  270. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +0 -802
  271. package/lib/commonjs/chat.js +0 -37
  272. package/lib/commonjs/chat.js.map +0 -1
  273. package/lib/module/chat.js +0 -33
  274. package/lib/module/chat.js.map +0 -1
  275. package/lib/typescript/chat.d.ts +0 -10
  276. package/lib/typescript/chat.d.ts.map +0 -1
  277. package/src/chat.ts +0 -44
  278. /package/cpp/{binary-ops.cpp → ggml-cpu/binary-ops.cpp} +0 -0
  279. /package/cpp/{ggml-cpu-aarch64.h → ggml-cpu/ggml-cpu-aarch64.h} +0 -0
  280. /package/cpp/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -0
  281. /package/cpp/{ggml-cpu-quants.h → ggml-cpu/ggml-cpu-quants.h} +0 -0
  282. /package/cpp/{ggml-cpu-traits.cpp → ggml-cpu/ggml-cpu-traits.cpp} +0 -0
  283. /package/cpp/{ggml-cpu-traits.h → ggml-cpu/ggml-cpu-traits.h} +0 -0
  284. /package/cpp/{sgemm.h → ggml-cpu/sgemm.h} +0 -0
  285. /package/cpp/{unary-ops.cpp → ggml-cpu/unary-ops.cpp} +0 -0
package/cpp/{ggml-cpu-quants.c → ggml-cpu/ggml-cpu-quants.c}

@@ -20,12 +20,6 @@
 #define GROUP_MAX_EPS_IQ1_M 1e-7f
 #define GROUP_MAX_EPS_IQ1_S 1e-12f
 
-#if defined(_MSC_VER)
-// disable "possible loss of data" to avoid warnings for hundreds of casts
-// we should just be careful :)
-#pragma warning(disable: 4244 4267)
-#endif
-
 #define UNUSED LM_GGML_UNUSED
 
 // some compilers don't provide _mm256_set_m128i, e.g. gcc 7
@@ -6596,7 +6590,118 @@ void lm_ggml_vec_dot_q3_K_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, con
     }
 
     *s = hsum_float_8(acc);
+#elif defined(__VXE__) || defined(__VXE2__)
+    uint32_t aux[3];
+    uint32_t utmp[4];
+
+    const int32x4_t v_z = vec_splat_s32(0);
+    const uint8x16_t v_3m = vec_splat_u8(0x03);
+
+    const uint8x16_t v_0c = vec_splat_u8(1);
+    const uint8x16_t v_1c = vec_sl(v_0c, 1);
+    const uint8x16_t v_2c = vec_sl(v_0c, 2);
+    const uint8x16_t v_3c = vec_sl(v_0c, 3);
+
+    uint8x16_t q3h[4];
+    uint8x16_t q3b[2];
+    int8x16_t q3bytes[4];
+    int8x16_t q8bytes[8];  // 8 vectors are loaded per 128-element chunk below
+    uint8x16_t qhbits[2];
+
+    float sum = 0;
+
+    for (int i = 0; i < nb; ++i) {
+        const float d = y[i].d * LM_GGML_FP16_TO_FP32(x[i].d);
+
+        const uint8_t * restrict x0l = x[i].qs;
+        const uint8_t * restrict x0h = x[i].hmask;
+        const int8_t  * restrict y0 = y[i].qs;
+
+        qhbits[0] = vec_xl(0 , x0h);
+        qhbits[1] = vec_xl(16, x0h);
+
+        int32_t isum = 0;
+
+        memcpy(aux, x[i].scales, 12);
+        utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
+        utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
+        utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
+        utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
+
+        int8_t * scale = (int8_t *)utmp;
+        for (int j = 0; j < 16; ++j) scale[j] -= 32;
+
+        for (int j = 0; j < QK_K/128; ++j) {
+            int32x4_t isum0, isum1, isum2, isum3;
+
+            q3b[0] = vec_xl(0 , x0l);
+            q3b[1] = vec_xl(16, x0l);
+            x0l += 32;
+
+            q8bytes[0] = vec_xl(0  , y0);
+            q8bytes[1] = vec_xl(16 , y0);
+            q8bytes[2] = vec_xl(32 , y0);
+            q8bytes[3] = vec_xl(48 , y0);
+            q8bytes[4] = vec_xl(64 , y0);
+            q8bytes[5] = vec_xl(80 , y0);
+            q8bytes[6] = vec_xl(96 , y0);
+            q8bytes[7] = vec_xl(112, y0);
+            y0 += 128;
+
+            q3h[0] = vec_sl(vec_andc(v_0c, qhbits[0]), 2);
+            q3h[1] = vec_sl(vec_andc(v_0c, qhbits[1]), 2);
+            q3h[2] = vec_sl(vec_andc(v_1c, qhbits[0]), 1);
+            q3h[3] = vec_sl(vec_andc(v_1c, qhbits[1]), 1);
+
+            q3bytes[0] = vec_sub((int8x16_t)vec_and(q3b[0], v_3m), (int8x16_t)q3h[0]);
+            q3bytes[1] = vec_sub((int8x16_t)vec_and(q3b[1], v_3m), (int8x16_t)q3h[1]);
+            q3bytes[2] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 2), v_3m), (int8x16_t)q3h[2]);
+            q3bytes[3] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 2), v_3m), (int8x16_t)q3h[3]);
+
+            isum0 = lm_ggml_vec_dot(v_z, q3bytes[0], q8bytes[0]);
+            isum1 = lm_ggml_vec_dot(v_z, q3bytes[1], q8bytes[1]);
+            isum2 = lm_ggml_vec_dot(v_z, q3bytes[2], q8bytes[2]);
+            isum3 = lm_ggml_vec_dot(v_z, q3bytes[3], q8bytes[3]);
+
+            isum += (isum0[0] + isum0[1] + isum0[2] + isum0[3]) * scale[0];
+            isum += (isum1[0] + isum1[1] + isum1[2] + isum1[3]) * scale[1];
+            isum += (isum2[0] + isum2[1] + isum2[2] + isum2[3]) * scale[2];
+            isum += (isum3[0] + isum3[1] + isum3[2] + isum3[3]) * scale[3];
+
+            scale += 4;
+
+            q3h[0] = vec_andc(v_2c, qhbits[0]);
+            q3h[1] = vec_andc(v_2c, qhbits[1]);
+            q3h[2] = vec_sr(vec_andc(v_3c, qhbits[0]), 1);
+            q3h[3] = vec_sr(vec_andc(v_3c, qhbits[1]), 1);
+
+            q3bytes[0] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 4), v_3m), (int8x16_t)q3h[0]);
+            q3bytes[1] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 4), v_3m), (int8x16_t)q3h[1]);
+            q3bytes[2] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 6), v_3m), (int8x16_t)q3h[2]);
+            q3bytes[3] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 6), v_3m), (int8x16_t)q3h[3]);
+
+            isum0 = lm_ggml_vec_dot(v_z, q3bytes[0], q8bytes[4]);
+            isum1 = lm_ggml_vec_dot(v_z, q3bytes[1], q8bytes[5]);
+            isum2 = lm_ggml_vec_dot(v_z, q3bytes[2], q8bytes[6]);
+            isum3 = lm_ggml_vec_dot(v_z, q3bytes[3], q8bytes[7]);
+
+            isum += (isum0[0] + isum0[1] + isum0[2] + isum0[3]) * scale[0];
+            isum += (isum1[0] + isum1[1] + isum1[2] + isum1[3]) * scale[1];
+            isum += (isum2[0] + isum2[1] + isum2[2] + isum2[3]) * scale[2];
+            isum += (isum3[0] + isum3[1] + isum3[2] + isum3[3]) * scale[3];
+
+            scale += 4;
+
+            if (j == 0) {
+                qhbits[0] = vec_sr(qhbits[0], 4);
+                qhbits[1] = vec_sr(qhbits[1], 4);
+            }
+        }
+
+        sum += d * isum;
+    }
 
+    *s = sum;
 #else
     // scalar version
     // This function is written like this so the compiler can manage to vectorize most of it
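The aux/utmp bit twiddling in the VXE hunk above is the standard q3_K scale unpacking, which is easier to audit in scalar form. Below is a minimal C sketch mirroring the formulas from the hunk; the kmask1/kmask2 values (0x03030303 and 0x0f0f0f0f, as defined elsewhere in upstream ggml) and the helper name are assumptions for illustration, not part of this package:

    #include <stdint.h>
    #include <string.h>

    // Hypothetical helper: unpack the 16 signed 6-bit q3_K scales from the
    // 12-byte packed field, exactly as the aux/utmp code in the hunk does.
    static void unpack_q3_K_scales(const uint8_t packed[12], int8_t scale[16]) {
        const uint32_t kmask1 = 0x03030303;  // assumed values, per upstream ggml
        const uint32_t kmask2 = 0x0f0f0f0f;

        uint32_t aux[3];
        uint32_t utmp[4];
        memcpy(aux, packed, 12);

        // each output byte combines a 4-bit field from aux[0]/aux[1]
        // with a 2-bit field from aux[2]
        utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
        utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
        utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
        utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);

        memcpy(scale, utmp, 16);
        for (int j = 0; j < 16; ++j) {
            scale[j] -= 32;  // re-center the unsigned 6-bit values to [-32, 31]
        }
    }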
@@ -8414,7 +8519,11 @@ void lm_ggml_vec_dot_q5_K_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, con
 
 void lm_ggml_vec_dot_q6_K_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, size_t bx, const void * LM_GGML_RESTRICT vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
+#ifdef __ARM_FEATURE_MATMUL_INT8
+    assert((nrc == 2) || (nrc == 1));
+#else
     assert(nrc == 1);
+#endif
     UNUSED(nrc);
     UNUSED(bx);
     UNUSED(by);
@@ -8425,6 +8534,197 @@ void lm_ggml_vec_dot_q6_K_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, con
 
     const int nb = n / QK_K;
 
+#if defined(__ARM_FEATURE_MATMUL_INT8)
+    if (nrc == 2) {
+        const block_q6_K * LM_GGML_RESTRICT x0 = x;
+        const block_q6_K * LM_GGML_RESTRICT x1 = (const block_q6_K *) ((const uint8_t *)vx + bx);
+        const block_q8_K * LM_GGML_RESTRICT y0 = y;
+        const block_q8_K * LM_GGML_RESTRICT y1 = (const block_q8_K *) ((const uint8_t *)vy + by);
+
+        float32x4_t vfsum = vdupq_n_f32(0.0f);
+
+        for (int i = 0; i < nb; ++i, ++x0, ++x1, ++y0, ++y1) {
+            const uint8_t * LM_GGML_RESTRICT ql0 = x0->ql;
+            const uint8_t * LM_GGML_RESTRICT ql1 = x1->ql;
+            const uint8_t * LM_GGML_RESTRICT qh0 = x0->qh;
+            const uint8_t * LM_GGML_RESTRICT qh1 = x1->qh;
+            const int8_t * LM_GGML_RESTRICT qy0 = y0->qs;
+            const int8_t * LM_GGML_RESTRICT qy1 = y1->qs;
+
+            const uint8x16_t mone = vdupq_n_u8(0x30);
+            const uint8x16_t m4b = vdupq_n_u8(0x0f);
+
+            int32x4_t visum = vdupq_n_s32(0);
+
+            // process 8 blocks per iteration, 16 blocks in total
+            for (int j = 0; j < 2; ++j, qh0 += 32, ql0 += 64, qh1 += 32, ql1 += 64) {
+                int8x16_t vx0[8], vx1[8];
+
+                // de-quantize vx0[8]
+                {
+                    const uint8x16x2_t qh_bits = vld1q_u8_x2(qh0);
+                    const uint8x16x4_t ql_bits = vld1q_u8_x4(ql0);
+
+                    uint8x16_t q6h_0 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[0], 4));
+                    uint8x16_t q6h_1 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[1], 4));
+                    uint8x16_t q6h_2 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[0], 2));
+                    uint8x16_t q6h_3 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[1], 2));
+
+                    vx0[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[0], m4b), q6h_0));
+                    vx0[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[1], m4b), q6h_1));
+                    vx0[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[2], m4b), q6h_2));
+                    vx0[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[3], m4b), q6h_3));
+
+                    q6h_0 = vandq_u8(mone, qh_bits.val[0]);
+                    q6h_1 = vandq_u8(mone, qh_bits.val[1]);
+                    q6h_2 = vandq_u8(mone, vshrq_n_u8(qh_bits.val[0], 2));
+                    q6h_3 = vandq_u8(mone, vshrq_n_u8(qh_bits.val[1], 2));
+
+                    vx0[4] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[0], 4), q6h_0));
+                    vx0[5] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[1], 4), q6h_1));
+                    vx0[6] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[2], 4), q6h_2));
+                    vx0[7] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[3], 4), q6h_3));
+                }
+
+                // de-quantize vx1[8]
+                {
+                    const uint8x16x2_t qh_bits = vld1q_u8_x2(qh1);
+                    const uint8x16x4_t ql_bits = vld1q_u8_x4(ql1);
+
+                    uint8x16_t q6h_0 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[0], 4));
+                    uint8x16_t q6h_1 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[1], 4));
+                    uint8x16_t q6h_2 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[0], 2));
+                    uint8x16_t q6h_3 = vandq_u8(mone, vshlq_n_u8(qh_bits.val[1], 2));
+
+                    vx1[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[0], m4b), q6h_0));
+                    vx1[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[1], m4b), q6h_1));
+                    vx1[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[2], m4b), q6h_2));
+                    vx1[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(ql_bits.val[3], m4b), q6h_3));
+
+                    q6h_0 = vandq_u8(mone, qh_bits.val[0]);
+                    q6h_1 = vandq_u8(mone, qh_bits.val[1]);
+                    q6h_2 = vandq_u8(mone, vshrq_n_u8(qh_bits.val[0], 2));
+                    q6h_3 = vandq_u8(mone, vshrq_n_u8(qh_bits.val[1], 2));
+
+                    vx1[4] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[0], 4), q6h_0));
+                    vx1[5] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[1], 4), q6h_1));
+                    vx1[6] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[2], 4), q6h_2));
+                    vx1[7] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(ql_bits.val[3], 4), q6h_3));
+                }
+
+                // process 16 elements (one block with same scale) per iteration
+                // - vx = concat(ql, qh) - 32
+                // - r1,r2,r3,r4 = smmla(vx, vy)
+                for (int k = 0; k < 8; ++k) {
+                    const int blk = j * 8 + k;
+
+                    const int8x16_t vy0 = vld1q_s8(qy0);
+                    const int8x16_t vy1 = vld1q_s8(qy1);
+                    qy0 += 16;
+                    qy1 += 16;
+
+                    const int32x4_t block_scale = {
+                        x0->scales[blk],
+                        x0->scales[blk],
+                        x1->scales[blk],
+                        x1->scales[blk],
+                    };
+
+                    // calculate four results at once with outer product
+                    const int8x16_t vx_l = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(vx0[k]), vreinterpretq_s64_s8(vx1[k])));
+                    const int8x16_t vx_h = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(vx0[k]), vreinterpretq_s64_s8(vx1[k])));
+                    const int8x16_t vy_l = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(vy0), vreinterpretq_s64_s8(vy1)));
+                    const int8x16_t vy_h = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(vy0), vreinterpretq_s64_s8(vy1)));
+                    int32x4_t vr = vdupq_n_s32(0);
+                    vr = vmmlaq_s32(vr, vx_l, vy_l);
+                    vr = vmmlaq_s32(vr, vx_h, vy_h);
+
+                    // apply block scale, will NOT overflow
+                    // block_scale * sum_256(int6*int8) <= 2^(8+8+6+8) = 30 bits
+                    visum = vmlaq_s32(visum, vr, block_scale);
+                }
+            }
+
+            // adjust bias, apply superblock scale
+            {
+                int32_t bias[4];
+#ifdef __ARM_FEATURE_SVE
+                const svbool_t pg16_8 = svptrue_pat_b16(SV_VL8);
+                const svbool_t pg8_8 = svptrue_pat_b8(SV_VL8);
+                const svint16_t y0_q8sums_0 = svld1_s16(pg16_8, y0->bsums);
+                const svint16_t y0_q8sums_1 = svld1_s16(pg16_8, y0->bsums + 8);
+                const svint16_t y1_q8sums_0 = svld1_s16(pg16_8, y1->bsums);
+                const svint16_t y1_q8sums_1 = svld1_s16(pg16_8, y1->bsums + 8);
+                const svint16_t x0_q6scales_0 = svunpklo_s16(svld1_s8(pg8_8, x0->scales));
+                const svint16_t x0_q6scales_1 = svunpklo_s16(svld1_s8(pg8_8, x0->scales + 8));
+                const svint16_t x1_q6scales_0 = svunpklo_s16(svld1_s8(pg8_8, x1->scales));
+                const svint16_t x1_q6scales_1 = svunpklo_s16(svld1_s8(pg8_8, x1->scales + 8));
+                const svint64_t zero = svdup_n_s64(0);
+                bias[0] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y0_q8sums_0, x0_q6scales_0),
+                                                                               svdot_s64(zero, y0_q8sums_1, x0_q6scales_1)));
+                bias[1] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y1_q8sums_0, x0_q6scales_0),
+                                                                               svdot_s64(zero, y1_q8sums_1, x0_q6scales_1)));
+                bias[2] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y0_q8sums_0, x1_q6scales_0),
+                                                                               svdot_s64(zero, y0_q8sums_1, x1_q6scales_1)));
+                bias[3] = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), svdot_s64(zero, y1_q8sums_0, x1_q6scales_0),
+                                                                               svdot_s64(zero, y1_q8sums_1, x1_q6scales_1)));
+#else
+                // NEON doesn't support int16 dot product, fall back to separate mul and add
+                const int16x8x2_t q8sums0 = vld1q_s16_x2(y0->bsums);
+                const int16x8x2_t q8sums1 = vld1q_s16_x2(y1->bsums);
+
+                int8x16_t scales_s8 = vld1q_s8(x0->scales);
+                const int16x8x2_t q6scales0 = {{vmovl_s8(vget_low_s8(scales_s8)), vmovl_s8(vget_high_s8(scales_s8))}};
+                scales_s8 = vld1q_s8(x1->scales);
+                const int16x8x2_t q6scales1 = {{vmovl_s8(vget_low_s8(scales_s8)), vmovl_s8(vget_high_s8(scales_s8))}};
+
+                int32x4_t prod;
+                prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums0.val[0]), vget_low_s16 (q6scales0.val[0])),
+                                           vmull_s16(vget_high_s16(q8sums0.val[0]), vget_high_s16(q6scales0.val[0]))),
+                                 vaddq_s32(vmull_s16(vget_low_s16 (q8sums0.val[1]), vget_low_s16 (q6scales0.val[1])),
+                                           vmull_s16(vget_high_s16(q8sums0.val[1]), vget_high_s16(q6scales0.val[1]))));
+                bias[0] = vaddvq_s32(prod);
+                prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums1.val[0]), vget_low_s16 (q6scales0.val[0])),
+                                           vmull_s16(vget_high_s16(q8sums1.val[0]), vget_high_s16(q6scales0.val[0]))),
+                                 vaddq_s32(vmull_s16(vget_low_s16 (q8sums1.val[1]), vget_low_s16 (q6scales0.val[1])),
+                                           vmull_s16(vget_high_s16(q8sums1.val[1]), vget_high_s16(q6scales0.val[1]))));
+                bias[1] = vaddvq_s32(prod);
+                prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums0.val[0]), vget_low_s16 (q6scales1.val[0])),
+                                           vmull_s16(vget_high_s16(q8sums0.val[0]), vget_high_s16(q6scales1.val[0]))),
+                                 vaddq_s32(vmull_s16(vget_low_s16 (q8sums0.val[1]), vget_low_s16 (q6scales1.val[1])),
+                                           vmull_s16(vget_high_s16(q8sums0.val[1]), vget_high_s16(q6scales1.val[1]))));
+                bias[2] = vaddvq_s32(prod);
+                prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums1.val[0]), vget_low_s16 (q6scales1.val[0])),
+                                           vmull_s16(vget_high_s16(q8sums1.val[0]), vget_high_s16(q6scales1.val[0]))),
+                                 vaddq_s32(vmull_s16(vget_low_s16 (q8sums1.val[1]), vget_low_s16 (q6scales1.val[1])),
+                                           vmull_s16(vget_high_s16(q8sums1.val[1]), vget_high_s16(q6scales1.val[1]))));
+                bias[3] = vaddvq_s32(prod);
+
+#endif
+                const int32x4_t vibias = vmulq_n_s32(vld1q_s32(bias), 32);
+
+                const float32x4_t superblock_scale = {
+                    LM_GGML_FP16_TO_FP32(x0->d) * y0->d,
+                    LM_GGML_FP16_TO_FP32(x0->d) * y1->d,
+                    LM_GGML_FP16_TO_FP32(x1->d) * y0->d,
+                    LM_GGML_FP16_TO_FP32(x1->d) * y1->d,
+                };
+
+                visum = vsubq_s32(visum, vibias);
+                vfsum = vmlaq_f32(vfsum, vcvtq_f32_s32(visum), superblock_scale);
+            }
+        }
+
+        // vfsum = ABCD -> ACBD
+        // AC -> s, BD -> (s+bs)
+        vfsum = vzip1q_f32(vfsum, vextq_f32(vfsum, vfsum, 2));
+        vst1_f32(s,      vget_low_f32 (vfsum));
+        vst1_f32(s + bs, vget_high_f32(vfsum));
+
+        return;
+    }
+#endif
+
 #ifdef __ARM_FEATURE_SVE
     const int vector_length = lm_ggml_cpu_get_sve_cnt()*8;
     float sum = 0;
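The "will NOT overflow" comment in the smmla loop above compresses a worked bound. Per int32 accumulator lane, one superblock contributes a scale-weighted sum over 16 blocks of 16 products each, and with the magnitudes the comment assumes (8-bit block scales, unsigned 6-bit quants before the bias is subtracted, 8-bit activations, 2^8 = 256 products):

    \left|\sum_{b=0}^{15} s_b \sum_{k=0}^{15} q^{(6)}_{b,k}\, q^{(8)}_{b,k}\right|
        \le \underbrace{2^{8}}_{|s_b|} \cdot \underbrace{2^{8}}_{256\ \text{terms}} \cdot \underbrace{2^{6}}_{|q^{(6)}|} \cdot \underbrace{2^{8}}_{|q^{(8)}|}
        = 2^{30} < 2^{31}

so the accumulation fits in int32, and the bias term (32 times the scale-weighted bsums) can be subtracted afterwards without widening.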
package/cpp/{ggml-cpu.c → ggml-cpu/ggml-cpu.c}

@@ -50,19 +50,6 @@
 #include "llamafile/sgemm.h"
 #endif
 
-#if defined(_MSC_VER)
-// disable "possible loss of data" to avoid hundreds of casts
-// we should just be careful :)
-#pragma warning(disable: 4244 4267)
-
-// disable POSIX deprecation warnings
-// these functions are never going away, anyway
-#pragma warning(disable: 4996)
-
-// unreachable code because of multiple instances of code after LM_GGML_ABORT
-#pragma warning(disable: 4702)
-#endif
-
 // Note: once we move threading into a separate C++ file
 // will use std::hardware_destructive_interference_size instead of hardcoding it here
 // and we'll use C++ attribute syntax.
@@ -215,7 +202,7 @@ static const struct lm_ggml_type_traits_cpu type_traits_cpu[LM_GGML_TYPE_COUNT]
         .nrows = 1,
     },
     [LM_GGML_TYPE_F16] = {
-        .from_float = (lm_ggml_from_float_t) lm_ggml_fp32_to_fp16_row,
+        .from_float = (lm_ggml_from_float_t) lm_ggml_cpu_fp32_to_fp16,
         .vec_dot = (lm_ggml_vec_dot_t) lm_ggml_vec_dot_f16,
         .vec_dot_type = LM_GGML_TYPE_F16,
         .nrows = 1,
@@ -295,7 +282,11 @@ static const struct lm_ggml_type_traits_cpu type_traits_cpu[LM_GGML_TYPE_COUNT]
         .from_float = quantize_row_q6_K,
         .vec_dot = lm_ggml_vec_dot_q6_K_q8_K,
         .vec_dot_type = LM_GGML_TYPE_Q8_K,
+#if defined (__ARM_FEATURE_MATMUL_INT8)
+        .nrows = 2,
+#else
         .nrows = 1,
+#endif
     },
     [LM_GGML_TYPE_IQ2_XXS] = {
         .from_float = NULL,
@@ -356,7 +347,7 @@ static const struct lm_ggml_type_traits_cpu type_traits_cpu[LM_GGML_TYPE_COUNT]
         .from_float = quantize_row_q8_K,
     },
     [LM_GGML_TYPE_BF16] = {
-        .from_float = (lm_ggml_from_float_t) lm_ggml_fp32_to_bf16_row,
+        .from_float = (lm_ggml_from_float_t) lm_ggml_cpu_fp32_to_bf16,
        .vec_dot = (lm_ggml_vec_dot_t) lm_ggml_vec_dot_bf16,
        .vec_dot_type = LM_GGML_TYPE_BF16,
        .nrows = 1,
@@ -1932,6 +1923,10 @@ static void lm_ggml_compute_forward(struct lm_ggml_compute_params * params, stru
            {
                lm_ggml_compute_forward_im2col_back_f32(params, tensor);
            } break;
+       case LM_GGML_OP_CONV_2D_DW:
+           {
+               lm_ggml_compute_forward_conv_2d_dw(params, tensor);
+           } break;
        case LM_GGML_OP_CONV_TRANSPOSE_2D:
            {
                lm_ggml_compute_forward_conv_transpose_2d(params, tensor);
@@ -2027,41 +2022,6 @@ static void lm_ggml_compute_forward(struct lm_ggml_compute_params * params, stru
            {
                lm_ggml_compute_forward_rwkv_wkv7(params, tensor);
            } break;
-       case LM_GGML_OP_MAP_UNARY:
-           {
-               lm_ggml_unary_op_f32_t fun;
-               memcpy(&fun, tensor->op_params, sizeof(fun));
-               lm_ggml_compute_forward_map_unary(params, tensor, fun);
-           }
-           break;
-       case LM_GGML_OP_MAP_BINARY:
-           {
-               lm_ggml_binary_op_f32_t fun;
-               memcpy(&fun, tensor->op_params, sizeof(fun));
-               lm_ggml_compute_forward_map_binary(params, tensor, fun);
-           }
-           break;
-       case LM_GGML_OP_MAP_CUSTOM1_F32:
-           {
-               lm_ggml_custom1_op_f32_t fun;
-               memcpy(&fun, tensor->op_params, sizeof(fun));
-               lm_ggml_compute_forward_map_custom1_f32(params, tensor, fun);
-           }
-           break;
-       case LM_GGML_OP_MAP_CUSTOM2_F32:
-           {
-               lm_ggml_custom2_op_f32_t fun;
-               memcpy(&fun, tensor->op_params, sizeof(fun));
-               lm_ggml_compute_forward_map_custom2_f32(params, tensor, fun);
-           }
-           break;
-       case LM_GGML_OP_MAP_CUSTOM3_F32:
-           {
-               lm_ggml_custom3_op_f32_t fun;
-               memcpy(&fun, tensor->op_params, sizeof(fun));
-               lm_ggml_compute_forward_map_custom3_f32(params, tensor, fun);
-           }
-           break;
        case LM_GGML_OP_MAP_CUSTOM1:
            {
                lm_ggml_compute_forward_map_custom1(params, tensor);
@@ -2077,6 +2037,11 @@ static void lm_ggml_compute_forward(struct lm_ggml_compute_params * params, stru
                lm_ggml_compute_forward_map_custom3(params, tensor);
            }
            break;
+       case LM_GGML_OP_CUSTOM:
+           {
+               lm_ggml_compute_forward_custom(params, tensor);
+           }
+           break;
        case LM_GGML_OP_CROSS_ENTROPY_LOSS:
            {
                lm_ggml_compute_forward_cross_entropy_loss(params, tensor);
@@ -2237,6 +2202,7 @@ static int lm_ggml_get_n_tasks(struct lm_ggml_tensor * node, int n_threads) {
                } break;
 
            case LM_GGML_UNARY_OP_GELU:
+           case LM_GGML_UNARY_OP_GELU_ERF:
            case LM_GGML_UNARY_OP_GELU_QUICK:
            case LM_GGML_UNARY_OP_SILU:
                {
@@ -2298,6 +2264,7 @@ static int lm_ggml_get_n_tasks(struct lm_ggml_tensor * node, int n_threads) {
            } break;
        case LM_GGML_OP_IM2COL:
        case LM_GGML_OP_IM2COL_BACK:
+       case LM_GGML_OP_CONV_2D_DW:
        case LM_GGML_OP_CONV_TRANSPOSE_1D:
        case LM_GGML_OP_CONV_TRANSPOSE_2D:
            {
@@ -2328,11 +2295,6 @@ static int lm_ggml_get_n_tasks(struct lm_ggml_tensor * node, int n_threads) {
        case LM_GGML_OP_WIN_PART:
        case LM_GGML_OP_WIN_UNPART:
        case LM_GGML_OP_GET_REL_POS:
-       case LM_GGML_OP_MAP_UNARY:
-       case LM_GGML_OP_MAP_BINARY:
-       case LM_GGML_OP_MAP_CUSTOM1_F32:
-       case LM_GGML_OP_MAP_CUSTOM2_F32:
-       case LM_GGML_OP_MAP_CUSTOM3_F32:
            {
                n_tasks = 1;
            } break;
@@ -2366,6 +2328,16 @@ static int lm_ggml_get_n_tasks(struct lm_ggml_tensor * node, int n_threads) {
                n_tasks = MIN(p.n_tasks, n_threads);
            }
        } break;
+       case LM_GGML_OP_CUSTOM:
+           {
+               struct lm_ggml_custom_op_params p;
+               memcpy(&p, node->op_params, sizeof(p));
+               if (p.n_tasks == LM_GGML_N_TASKS_MAX) {
+                   n_tasks = n_threads;
+               } else {
+                   n_tasks = MIN(p.n_tasks, n_threads);
+               }
+           } break;
        case LM_GGML_OP_CROSS_ENTROPY_LOSS:
        case LM_GGML_OP_CROSS_ENTROPY_LOSS_BACK:
        case LM_GGML_OP_OPT_STEP_ADAMW:
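The existing LM_GGML_OP_MAP_CUSTOM1..3 branch (whose tail is the context above) and the new LM_GGML_OP_CUSTOM branch resolve their task count with the same policy. A standalone sketch of that policy, assuming LM_GGML_N_TASKS_MAX is the -1 sentinel it is in upstream ggml.h; resolve_n_tasks is a hypothetical helper, not package API:

    // Hypothetical illustration of the n_tasks policy used by both cases above.
    #define LM_GGML_N_TASKS_MAX (-1)  // assumed sentinel value, per upstream ggml.h
    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    static int resolve_n_tasks(int requested, int n_threads) {
        if (requested == LM_GGML_N_TASKS_MAX) {
            return n_threads;             // sentinel: use every available thread
        }
        return MIN(requested, n_threads); // never schedule more tasks than threads
    }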
@@ -3186,6 +3158,93 @@ enum lm_ggml_status lm_ggml_graph_compute_with_ctx(struct lm_ggml_context * ctx,
     return lm_ggml_graph_compute(cgraph, &cplan);
 }
 
+void lm_ggml_cpu_fp32_to_fp16(const float * x, lm_ggml_fp16_t * y, int64_t n) {
+    int64_t i = 0;
+#if defined(__F16C__)
+#if defined(__AVX512F__)
+    for (; i + 15 < n; i += 16) {
+        __m512 x_vec = _mm512_loadu_ps(x + i);
+        __m256i y_vec = _mm512_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm256_storeu_si256((__m256i *)(y + i), y_vec);
+    }
+#endif
+    for (; i + 7 < n; i += 8) {
+        __m256 x_vec = _mm256_loadu_ps(x + i);
+        __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm_storeu_si128((__m128i *)(y + i), y_vec);
+    }
+    for (; i + 3 < n; i += 4) {
+        __m128 x_vec = _mm_loadu_ps(x + i);
+        __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm_storel_epi64((__m128i *)(y + i), y_vec);
+    }
+#endif
+    for (; i < n; ++i) {
+        y[i] = LM_GGML_FP32_TO_FP16(x[i]);
+    }
+}
+
+void lm_ggml_cpu_fp16_to_fp32(const lm_ggml_fp16_t * x, float * y, int64_t n) {
+    int64_t i = 0;
+#if defined(__F16C__)
+#if defined(__AVX512F__)
+    for (; i + 15 < n; i += 16) {
+        __m256i x_vec = _mm256_loadu_si256((const __m256i *)(x + i));
+        __m512 y_vec = _mm512_cvtph_ps(x_vec);
+        _mm512_storeu_ps(y + i, y_vec);
+    }
+#endif
+    for (; i + 7 < n; i += 8) {
+        __m128i x_vec = _mm_loadu_si128((const __m128i *)(x + i));
+        __m256 y_vec = _mm256_cvtph_ps(x_vec);
+        _mm256_storeu_ps(y + i, y_vec);
+    }
+    for (; i + 3 < n; i += 4) {
+        __m128i x_vec = _mm_loadl_epi64((const __m128i *)(x + i));
+        __m128 y_vec = _mm_cvtph_ps(x_vec);
+        _mm_storeu_ps(y + i, y_vec);
+    }
+#endif
+    for (; i < n; ++i) {
+        y[i] = LM_GGML_FP16_TO_FP32(x[i]);
+    }
+}
+
+void lm_ggml_cpu_fp32_to_bf16(const float * x, lm_ggml_bf16_t * y, int64_t n) {
+    int64_t i = 0;
+    for (; i < n; ++i) {
+        y[i] = LM_GGML_FP32_TO_BF16(x[i]);
+    }
+}
+
+void lm_ggml_cpu_bf16_to_fp32(const lm_ggml_bf16_t * x, float * y, int64_t n) {
+    int64_t i = 0;
+#if defined(__AVX2__)
+#if defined(__AVX512F__)
+    for (; i + 15 < n; i += 16) {
+        _mm512_storeu_ps(y + i,
+                         _mm512_castsi512_ps(
+                             _mm512_slli_epi32(
+                                 _mm512_cvtepu16_epi32(
+                                     _mm256_loadu_si256(
+                                         (const __m256i *)(x + i))),
+                                 16)));
+    }
+#endif
+    for (; i + 7 < n; i += 8) {
+        _mm256_storeu_ps(y + i,
+                         _mm256_castsi256_ps(
+                             _mm256_slli_epi32(
+                                 _mm256_cvtepu16_epi32(
+                                     _mm_loadu_si128(
+                                         (const __m128i *)(x + i))),
+                                 16)));
+    }
+#endif
+    for (; i < n; i++) {
+        y[i] = LM_GGML_BF16_TO_FP32(x[i]);
+    }
+}
 
 int lm_ggml_cpu_has_avx(void) {
 #if defined(__AVX__)
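The bf16 fast path above leans on one identity: bf16 is the top half of an IEEE-754 binary32 bit pattern, so widening it is a plain 16-bit left shift (exactly what the _mm256_slli_epi32/_mm512_slli_epi32 calls do after zero-extending each 16-bit lane). A scalar sketch of the same per-element conversion; the helper name is hypothetical:

    #include <stdint.h>
    #include <string.h>

    // Hypothetical helper: widen one bf16 bit pattern to float, matching the
    // shift-by-16 semantics of the vectorized loops above.
    static float bf16_bits_to_float(uint16_t h) {
        uint32_t bits = (uint32_t) h << 16;  // bf16 bits become the float's top half
        float f;
        memcpy(&f, &bits, sizeof f);         // bit-cast without aliasing UB
        return f;
    }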
package/cpp/{ggml-cpu.cpp → ggml-cpu/ggml-cpu.cpp}

@@ -4,30 +4,33 @@
 #include "ggml-cpu-aarch64.h"
 #include "ggml-cpu-traits.h"
 #include "ggml-impl.h"
+#include "amx/amx.h"
 
 #include <cctype>
 #include <string>
 #include <vector>
 
 #ifdef LM_GGML_USE_CPU_HBM
-#include "ggml-cpu-hbm.h"
+#    include "ggml-cpu-hbm.h"
 #endif
 
 #ifdef LM_GGML_USE_CPU_KLEIDIAI
-#include "kleidiai/kleidiai.h"
-#endif
-
-#if defined(__APPLE__)
-#include <sys/types.h>
-#include <sys/sysctl.h>
+#    include "kleidiai/kleidiai.h"
 #endif
 
 #if defined(_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#ifndef NOMINMAX
-#define NOMINMAX
+#    define WIN32_LEAN_AND_MEAN
+#    ifndef NOMINMAX
+#        define NOMINMAX
+#    endif
+#    include <windows.h>
+#else
+#    include <unistd.h>
 #endif
-#include <windows.h>
+
+#if defined(__APPLE__)
+#    include <sys/sysctl.h>
+#    include <sys/types.h>
 #endif
 
 // ggml-backend interface
@@ -69,8 +72,10 @@ static lm_ggml_backend_buffer_type_t * lm_ggml_backend_cpu_device_get_extra_buff
 }
 
 static bool lm_ggml_backend_cpu_is_extra_buffer_type(lm_ggml_backend_buffer_type_t buft) {
-    for (auto extra : lm_ggml_backend_cpu_get_extra_buffers_type()) {
-        if (extra && extra == buft) return true;
+    for (auto * extra : lm_ggml_backend_cpu_get_extra_buffers_type()) {
+        if (extra && extra == buft) {
+            return true;
+        }
     }
     return false;
 }
@@ -329,9 +334,18 @@ static const char * lm_ggml_backend_cpu_device_get_description(lm_ggml_backend_d
 }
 
 static void lm_ggml_backend_cpu_device_get_memory(lm_ggml_backend_dev_t dev, size_t * free, size_t * total) {
-    // TODO
-    *free = 0;
-    *total = 0;
+#ifdef _WIN32
+    MEMORYSTATUSEX status;
+    status.dwLength = sizeof(status);
+    GlobalMemoryStatusEx(&status);
+    *total = status.ullTotalPhys;
+    *free = status.ullAvailPhys;
+#else
+    long pages = sysconf(_SC_PHYS_PAGES);
+    long page_size = sysconf(_SC_PAGE_SIZE);
+    *total = pages * page_size;
+    *free = *total;
+#endif
 
     LM_GGML_UNUSED(dev);
 }
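One caveat on the new POSIX branch: it reports *free == *total, since plain POSIX offers no portable "available memory" query. glibc exposes the non-standard _SC_AVPHYS_PAGES counter, which gives a closer estimate; the variant below is an illustrative sketch only, not what the package ships:

    #include <stddef.h>
    #include <unistd.h>

    // Hypothetical variant of the memory query using glibc's non-standard
    // _SC_AVPHYS_PAGES to report an actual free-memory estimate.
    static void cpu_device_get_memory_glibc(size_t * free_mem, size_t * total_mem) {
        const long page_size = sysconf(_SC_PAGE_SIZE);
        *total_mem = (size_t) sysconf(_SC_PHYS_PAGES) * (size_t) page_size;
    #ifdef _SC_AVPHYS_PAGES
        *free_mem = (size_t) sysconf(_SC_AVPHYS_PAGES) * (size_t) page_size;
    #else
        *free_mem = *total_mem;  // fall back to the package's simplification
    #endif
    }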
@@ -424,6 +438,8 @@ static bool lm_ggml_backend_cpu_device_supports_op(lm_ggml_backend_dev_t dev, co
        }
        case LM_GGML_OP_IM2COL_BACK:
            return src0->type == LM_GGML_TYPE_F32 && src1->type == LM_GGML_TYPE_F32;
+       case LM_GGML_OP_GET_ROWS_BACK:
+           return src0->type == LM_GGML_TYPE_F32 || src0->type == LM_GGML_TYPE_F16;
        case LM_GGML_OP_OUT_PROD:
            return (src0->type == LM_GGML_TYPE_F32 || (lm_ggml_is_quantized(src0->type) && src0->ne[2] == src1->ne[2] && src0->ne[3] == src1->ne[3])) &&
                   src1->type == LM_GGML_TYPE_F32 && op->type == LM_GGML_TYPE_F32;