cui-llama.rn 1.6.0 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (285)
  1. package/README.md +35 -7
  2. package/android/src/main/CMakeLists.txt +22 -11
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +42 -6
  4. package/android/src/main/java/com/rnllama/RNLlama.java +139 -4
  5. package/android/src/main/jni.cpp +173 -18
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  11. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  14. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +24 -4
  15. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +22 -2
  16. package/cpp/LICENSE +21 -0
  17. package/cpp/chat.cpp +129 -107
  18. package/cpp/chat.h +2 -0
  19. package/cpp/common.cpp +58 -78
  20. package/cpp/common.h +29 -21
  21. package/cpp/ggml-alloc.c +4 -1
  22. package/cpp/ggml-backend.cpp +9 -5
  23. package/cpp/ggml-backend.h +4 -4
  24. package/cpp/ggml-cpp.h +1 -1
  25. package/cpp/ggml-cpu/amx/amx.cpp +221 -0
  26. package/cpp/ggml-cpu/amx/amx.h +8 -0
  27. package/cpp/ggml-cpu/amx/common.h +91 -0
  28. package/cpp/ggml-cpu/amx/mmq.cpp +2511 -0
  29. package/cpp/ggml-cpu/amx/mmq.h +10 -0
  30. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/binary-ops.h +1 -1
  31. package/cpp/ggml-cpu/common.h +72 -0
  32. package/cpp/{ggml-cpu-aarch64.cpp → ggml-cpu/ggml-cpu-aarch64.cpp} +809 -103
  33. package/cpp/{ggml-cpu-quants.c → ggml-cpu/ggml-cpu-quants.c} +306 -6
  34. package/cpp/{ggml-cpu.c → ggml-cpu/ggml-cpu.c} +114 -55
  35. package/cpp/{ggml-cpu.cpp → ggml-cpu/ggml-cpu.cpp} +32 -16
  36. package/cpp/{ops.cpp → ggml-cpu/ops.cpp} +353 -173
  37. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/ops.h +2 -20
  38. package/cpp/{sgemm.cpp → ggml-cpu/sgemm.cpp} +501 -0
  39. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/simd-mappings.h +7 -3
  40. package/{ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers → cpp/ggml-cpu}/unary-ops.h +1 -1
  41. package/cpp/{vec.cpp → ggml-cpu/vec.cpp} +0 -6
  42. package/{ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers → cpp/ggml-cpu}/vec.h +16 -0
  43. package/cpp/ggml-cpu.h +5 -0
  44. package/cpp/ggml-impl.h +16 -9
  45. package/cpp/ggml-llama-sim.metallib +0 -0
  46. package/cpp/ggml-llama.metallib +0 -0
  47. package/cpp/ggml-metal-impl.h +36 -11
  48. package/cpp/ggml-metal.m +810 -176
  49. package/cpp/ggml-opt.cpp +373 -190
  50. package/cpp/ggml-opt.h +49 -28
  51. package/cpp/ggml-quants.c +0 -6
  52. package/cpp/ggml.c +227 -282
  53. package/cpp/ggml.h +82 -101
  54. package/cpp/gguf.cpp +33 -33
  55. package/cpp/json-schema-to-grammar.cpp +3 -0
  56. package/cpp/llama-adapter.cpp +6 -0
  57. package/cpp/llama-arch.cpp +49 -17
  58. package/cpp/llama-arch.h +9 -0
  59. package/cpp/llama-batch.cpp +8 -2
  60. package/cpp/llama-batch.h +2 -1
  61. package/cpp/llama-chat.cpp +39 -16
  62. package/cpp/llama-chat.h +4 -2
  63. package/cpp/llama-context.cpp +440 -611
  64. package/cpp/llama-context.h +44 -33
  65. package/cpp/llama-cparams.h +1 -0
  66. package/cpp/llama-graph.cpp +214 -291
  67. package/cpp/llama-graph.h +69 -21
  68. package/cpp/llama-hparams.cpp +17 -1
  69. package/cpp/llama-hparams.h +39 -5
  70. package/cpp/llama-kv-cache.cpp +2067 -620
  71. package/cpp/llama-kv-cache.h +410 -108
  72. package/cpp/llama-memory.h +12 -1
  73. package/cpp/llama-model-loader.cpp +24 -15
  74. package/cpp/llama-model-saver.cpp +281 -0
  75. package/cpp/llama-model-saver.h +37 -0
  76. package/cpp/llama-model.cpp +1089 -359
  77. package/cpp/llama-model.h +19 -3
  78. package/cpp/llama-sampling.cpp +20 -7
  79. package/cpp/llama-vocab.cpp +54 -9
  80. package/cpp/llama-vocab.h +6 -0
  81. package/cpp/llama.cpp +14 -0
  82. package/cpp/llama.h +86 -142
  83. package/cpp/minja/chat-template.hpp +9 -5
  84. package/cpp/minja/minja.hpp +69 -36
  85. package/cpp/rn-llama.cpp +602 -190
  86. package/cpp/rn-llama.h +34 -8
  87. package/cpp/sampling.cpp +57 -50
  88. package/cpp/tools/mtmd/clip-impl.h +462 -0
  89. package/cpp/tools/mtmd/clip.cpp +4024 -0
  90. package/cpp/tools/mtmd/clip.h +101 -0
  91. package/cpp/tools/mtmd/miniaudio.h +93468 -0
  92. package/cpp/tools/mtmd/mtmd-audio.cpp +855 -0
  93. package/cpp/tools/mtmd/mtmd-audio.h +62 -0
  94. package/cpp/tools/mtmd/mtmd-helper.cpp +297 -0
  95. package/cpp/tools/mtmd/mtmd.cpp +942 -0
  96. package/cpp/tools/mtmd/mtmd.h +362 -0
  97. package/cpp/tools/mtmd/stb_image.h +7988 -0
  98. package/ios/CMakeLists.txt +20 -10
  99. package/ios/RNLlama.h +6 -0
  100. package/ios/RNLlama.mm +82 -3
  101. package/ios/RNLlamaContext.h +5 -1
  102. package/ios/RNLlamaContext.mm +131 -38
  103. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +2 -0
  104. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +29 -21
  105. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-backend.h +4 -4
  106. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpp.h +1 -1
  107. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +5 -0
  108. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +16 -9
  109. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  110. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-opt.h +49 -28
  111. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +82 -101
  112. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +9 -0
  113. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +2 -1
  114. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +4 -2
  115. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +44 -33
  116. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +1 -0
  117. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +69 -21
  118. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +39 -5
  119. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +410 -108
  120. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +12 -1
  121. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model-saver.h +37 -0
  122. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +19 -3
  123. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +6 -0
  124. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +86 -142
  125. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  126. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +69 -36
  127. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +34 -8
  128. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Info.plist +0 -0
  129. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  130. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
  131. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +2 -0
  132. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +29 -21
  133. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend.h +4 -4
  134. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +1 -1
  135. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +5 -0
  136. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +16 -9
  137. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  138. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-opt.h +49 -28
  139. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +82 -101
  140. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +9 -0
  141. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +2 -1
  142. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +4 -2
  143. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +44 -33
  144. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +1 -0
  145. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +69 -21
  146. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +39 -5
  147. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +410 -108
  148. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +12 -1
  149. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model-saver.h +37 -0
  150. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +19 -3
  151. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +6 -0
  152. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +86 -142
  153. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  154. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +69 -36
  155. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +34 -8
  156. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Info.plist +0 -0
  157. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/_CodeSignature/CodeResources +1 -1
  158. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  159. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  160. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +2 -0
  161. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +29 -21
  162. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-backend.h +4 -4
  163. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpp.h +1 -1
  164. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +5 -0
  165. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +16 -9
  166. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  167. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-opt.h +49 -28
  168. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +82 -101
  169. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +9 -0
  170. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +2 -1
  171. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +4 -2
  172. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +44 -33
  173. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +1 -0
  174. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +69 -21
  175. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +39 -5
  176. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +410 -108
  177. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +12 -1
  178. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model-saver.h +37 -0
  179. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +19 -3
  180. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +6 -0
  181. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +86 -142
  182. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  183. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +69 -36
  184. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +34 -8
  185. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Info.plist +0 -0
  186. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  187. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
  188. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +2 -0
  189. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +29 -21
  190. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend.h +4 -4
  191. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +1 -1
  192. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +5 -0
  193. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +16 -9
  194. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
  195. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-opt.h +49 -28
  196. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +82 -101
  197. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +9 -0
  198. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +2 -1
  199. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +4 -2
  200. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +44 -33
  201. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +1 -0
  202. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +69 -21
  203. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +39 -5
  204. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +410 -108
  205. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +12 -1
  206. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model-saver.h +37 -0
  207. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +19 -3
  208. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +6 -0
  209. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +86 -142
  210. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
  211. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +69 -36
  212. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +34 -8
  213. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Info.plist +0 -0
  214. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/_CodeSignature/CodeResources +1 -1
  215. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  216. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  217. package/jest/mock.js +33 -7
  218. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  219. package/lib/commonjs/index.js +153 -21
  220. package/lib/commonjs/index.js.map +1 -1
  221. package/lib/module/NativeRNLlama.js.map +1 -1
  222. package/lib/module/index.js +152 -20
  223. package/lib/module/index.js.map +1 -1
  224. package/lib/typescript/NativeRNLlama.d.ts +54 -4
  225. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  226. package/lib/typescript/index.d.ts +72 -6
  227. package/lib/typescript/index.d.ts.map +1 -1
  228. package/package.json +1 -1
  229. package/src/NativeRNLlama.ts +72 -4
  230. package/src/index.ts +212 -38
  231. package/cpp/binary-ops.h +0 -16
  232. package/cpp/ops.h +0 -128
  233. package/cpp/simd-mappings.h +0 -888
  234. package/cpp/unary-ops.h +0 -28
  235. package/cpp/vec.h +0 -802
  236. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/binary-ops.h +0 -16
  237. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  238. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  239. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  240. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  241. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ops.h +0 -128
  242. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/sgemm.h +0 -14
  243. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/simd-mappings.h +0 -888
  244. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/vec.h +0 -802
  245. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  246. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  247. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  248. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  249. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +0 -14
  250. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +0 -28
  251. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +0 -802
  252. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/binary-ops.h +0 -16
  253. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  254. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  255. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  256. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  257. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ops.h +0 -128
  258. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/sgemm.h +0 -14
  259. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/simd-mappings.h +0 -888
  260. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/unary-ops.h +0 -28
  261. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/binary-ops.h +0 -16
  262. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  263. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  264. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  265. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  266. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ops.h +0 -128
  267. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +0 -14
  268. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/simd-mappings.h +0 -888
  269. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +0 -28
  270. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +0 -802
  271. package/lib/commonjs/chat.js +0 -37
  272. package/lib/commonjs/chat.js.map +0 -1
  273. package/lib/module/chat.js +0 -33
  274. package/lib/module/chat.js.map +0 -1
  275. package/lib/typescript/chat.d.ts +0 -10
  276. package/lib/typescript/chat.d.ts.map +0 -1
  277. package/src/chat.ts +0 -44
  278. /package/cpp/{binary-ops.cpp → ggml-cpu/binary-ops.cpp} +0 -0
  279. /package/cpp/{ggml-cpu-aarch64.h → ggml-cpu/ggml-cpu-aarch64.h} +0 -0
  280. /package/cpp/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -0
  281. /package/cpp/{ggml-cpu-quants.h → ggml-cpu/ggml-cpu-quants.h} +0 -0
  282. /package/cpp/{ggml-cpu-traits.cpp → ggml-cpu/ggml-cpu-traits.cpp} +0 -0
  283. /package/cpp/{ggml-cpu-traits.h → ggml-cpu/ggml-cpu-traits.h} +0 -0
  284. /package/cpp/{sgemm.h → ggml-cpu/sgemm.h} +0 -0
  285. /package/cpp/{unary-ops.cpp → ggml-cpu/unary-ops.cpp} +0 -0
@@ -1,888 +0,0 @@
1
- #pragma once
2
-
3
- #include "ggml-cpu-impl.h"
4
-
5
- //
6
- // simd mappings
7
- //
8
-
9
- // we define a common set of C macros which map to specific intrinsics based on the current architecture
10
- // we then implement the fundamental computation operations below using only these macros
11
- // adding support for new architectures requires to define the corresponding SIMD macros
12
- //
13
- // LM_GGML_F32_STEP / LM_GGML_F16_STEP
14
- // number of elements to process in a single step
15
- //
16
- // LM_GGML_F32_EPR / LM_GGML_F16_EPR
17
- // number of elements to fit in a single register
18
- //
19
-
20
- #if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
21
-
22
- #define LM_GGML_SIMD
23
-
24
- // F32 NEON
25
-
26
- #define LM_GGML_F32_STEP 16
27
- #define LM_GGML_F32_EPR 4
28
-
29
- #define LM_GGML_F32x4 float32x4_t
30
- #define LM_GGML_F32x4_ZERO vdupq_n_f32(0.0f)
31
- #define LM_GGML_F32x4_SET1(x) vdupq_n_f32(x)
32
- #define LM_GGML_F32x4_LOAD vld1q_f32
33
- #define LM_GGML_F32x4_STORE vst1q_f32
34
- #define LM_GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c)
35
- #define LM_GGML_F32x4_ADD vaddq_f32
36
- #define LM_GGML_F32x4_MUL vmulq_f32
37
- #define LM_GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
38
- #define LM_GGML_F32x4_REDUCE(res, x) \
39
- { \
40
- int offset = LM_GGML_F32_ARR >> 1; \
41
- for (int i = 0; i < offset; ++i) { \
42
- (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
43
- } \
44
- offset >>= 1; \
45
- for (int i = 0; i < offset; ++i) { \
46
- (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
47
- } \
48
- offset >>= 1; \
49
- for (int i = 0; i < offset; ++i) { \
50
- (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
51
- } \
52
- (res) = (lm_ggml_float) LM_GGML_F32x4_REDUCE_ONE((x)[0]); \
53
- }
54
-
55
- #define LM_GGML_F32_VEC LM_GGML_F32x4
56
- #define LM_GGML_F32_VEC_ZERO LM_GGML_F32x4_ZERO
57
- #define LM_GGML_F32_VEC_SET1 LM_GGML_F32x4_SET1
58
- #define LM_GGML_F32_VEC_LOAD LM_GGML_F32x4_LOAD
59
- #define LM_GGML_F32_VEC_STORE LM_GGML_F32x4_STORE
60
- #define LM_GGML_F32_VEC_FMA LM_GGML_F32x4_FMA
61
- #define LM_GGML_F32_VEC_ADD LM_GGML_F32x4_ADD
62
- #define LM_GGML_F32_VEC_MUL LM_GGML_F32x4_MUL
63
- #define LM_GGML_F32_VEC_REDUCE LM_GGML_F32x4_REDUCE
64
-
65
- // F16 NEON
66
-
67
- #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
68
- #define LM_GGML_F16_STEP 32
69
- #define LM_GGML_F16_EPR 8
70
-
71
- #define LM_GGML_F16x8 float16x8_t
72
- #define LM_GGML_F16x8_ZERO vdupq_n_f16(0.0f)
73
- #define LM_GGML_F16x8_SET1(x) vdupq_n_f16(x)
74
- #define LM_GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x))
75
- #define LM_GGML_F16x8_STORE vst1q_f16
76
- #define LM_GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
77
- #define LM_GGML_F16x8_ADD vaddq_f16
78
- #define LM_GGML_F16x8_MUL vmulq_f16
79
- #define LM_GGML_F16x8_REDUCE(res, x) \
80
- do { \
81
- int offset = LM_GGML_F16_ARR >> 1; \
82
- for (int i = 0; i < offset; ++i) { \
83
- (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
84
- } \
85
- offset >>= 1; \
86
- for (int i = 0; i < offset; ++i) { \
87
- (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
88
- } \
89
- offset >>= 1; \
90
- for (int i = 0; i < offset; ++i) { \
91
- (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
92
- } \
93
- const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
94
- const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
95
- (res) = (lm_ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \
96
- } while (0)
97
-
98
- #define LM_GGML_F16_VEC LM_GGML_F16x8
99
- #define LM_GGML_F16_VEC_ZERO LM_GGML_F16x8_ZERO
100
- #define LM_GGML_F16_VEC_SET1 LM_GGML_F16x8_SET1
101
- #define LM_GGML_F16_VEC_LOAD(p, i) LM_GGML_F16x8_LOAD(p)
102
- #define LM_GGML_F16_VEC_STORE(p, r, i) LM_GGML_F16x8_STORE((__fp16 *)(p), (r)[i])
103
- #define LM_GGML_F16_VEC_FMA LM_GGML_F16x8_FMA
104
- #define LM_GGML_F16_VEC_ADD LM_GGML_F16x8_ADD
105
- #define LM_GGML_F16_VEC_MUL LM_GGML_F16x8_MUL
106
- #define LM_GGML_F16_VEC_REDUCE LM_GGML_F16x8_REDUCE
107
- #else
108
- // if FP16 vector arithmetic is not supported, we use FP32 instead
109
- // and take advantage of the vcvt_ functions to convert to/from FP16
110
-
111
- #define LM_GGML_F16_STEP 16
112
- #define LM_GGML_F16_EPR 4
113
-
114
- #define LM_GGML_F32Cx4 float32x4_t
115
- #define LM_GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
116
- #define LM_GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
117
- #define LM_GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
118
- #define LM_GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
119
- #define LM_GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
120
- #define LM_GGML_F32Cx4_ADD vaddq_f32
121
- #define LM_GGML_F32Cx4_MUL vmulq_f32
122
- #define LM_GGML_F32Cx4_REDUCE LM_GGML_F32x4_REDUCE
123
-
124
- #define LM_GGML_F16_VEC LM_GGML_F32Cx4
125
- #define LM_GGML_F16_VEC_ZERO LM_GGML_F32Cx4_ZERO
126
- #define LM_GGML_F16_VEC_SET1 LM_GGML_F32Cx4_SET1
127
- #define LM_GGML_F16_VEC_LOAD(p, i) LM_GGML_F32Cx4_LOAD(p)
128
- #define LM_GGML_F16_VEC_STORE(p, r, i) LM_GGML_F32Cx4_STORE((__fp16 *)(p), r[i])
129
- #define LM_GGML_F16_VEC_FMA LM_GGML_F32Cx4_FMA
130
- #define LM_GGML_F16_VEC_ADD LM_GGML_F32Cx4_ADD
131
- #define LM_GGML_F16_VEC_MUL LM_GGML_F32Cx4_MUL
132
- #define LM_GGML_F16_VEC_REDUCE LM_GGML_F32Cx4_REDUCE
133
- #endif
134
-
135
- #elif defined(__AVX512F__)
136
-
137
- #define LM_GGML_SIMD
138
-
139
- // F32 AVX512
140
-
141
- #define LM_GGML_F32_STEP 64
142
- #define LM_GGML_F32_EPR 16
143
-
144
- #define LM_GGML_F32x16 __m512
145
- #define LM_GGML_F32x16_ZERO _mm512_setzero_ps()
146
- #define LM_GGML_F32x16_SET1(x) _mm512_set1_ps(x)
147
- #define LM_GGML_F32x16_LOAD _mm512_loadu_ps
148
- #define LM_GGML_F32x16_STORE _mm512_storeu_ps
149
- // _mm512_fmadd_ps is defined in AVX512F so no guard is required
150
- #define LM_GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
151
- #define LM_GGML_F32x16_ADD _mm512_add_ps
152
- #define LM_GGML_F32x16_MUL _mm512_mul_ps
153
- #define LM_GGML_F32x16_REDUCE(res, x) \
154
- do { \
155
- int offset = LM_GGML_F32_ARR >> 1; \
156
- for (int i = 0; i < offset; ++i) { \
157
- x[i] = _mm512_add_ps(x[i], x[offset+i]); \
158
- } \
159
- offset >>= 1; \
160
- for (int i = 0; i < offset; ++i) { \
161
- x[i] = _mm512_add_ps(x[i], x[offset+i]); \
162
- } \
163
- offset >>= 1; \
164
- for (int i = 0; i < offset; ++i) { \
165
- x[i] = _mm512_add_ps(x[i], x[offset+i]); \
166
- } \
167
- res = (lm_ggml_float) _mm512_reduce_add_ps(x[0]); \
168
- } while (0)
169
-
170
- // TODO: is this optimal ?
171
-
172
- #define LM_GGML_F32_VEC LM_GGML_F32x16
173
- #define LM_GGML_F32_VEC_ZERO LM_GGML_F32x16_ZERO
174
- #define LM_GGML_F32_VEC_SET1 LM_GGML_F32x16_SET1
175
- #define LM_GGML_F32_VEC_LOAD LM_GGML_F32x16_LOAD
176
- #define LM_GGML_F32_VEC_STORE LM_GGML_F32x16_STORE
177
- #define LM_GGML_F32_VEC_FMA LM_GGML_F32x16_FMA
178
- #define LM_GGML_F32_VEC_ADD LM_GGML_F32x16_ADD
179
- #define LM_GGML_F32_VEC_MUL LM_GGML_F32x16_MUL
180
- #define LM_GGML_F32_VEC_REDUCE LM_GGML_F32x16_REDUCE
181
-
182
- // F16 AVX512
183
-
184
- // F16 AVX
185
-
186
- #define LM_GGML_F16_STEP 64
187
- #define LM_GGML_F16_EPR 16
188
-
189
- // AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead
190
-
191
- #define LM_GGML_F32Cx16 __m512
192
- #define LM_GGML_F32Cx16_ZERO _mm512_setzero_ps()
193
- #define LM_GGML_F32Cx16_SET1(x) _mm512_set1_ps(x)
194
-
195
- // unlike _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F
196
- // so F16C guard isn't required
197
- #define LM_GGML_F32Cx16_LOAD(x) _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(x)))
198
- #define LM_GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0))
199
-
200
- #define LM_GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
201
- #define LM_GGML_F32Cx16_ADD _mm512_add_ps
202
- #define LM_GGML_F32Cx16_MUL _mm512_mul_ps
203
- #define LM_GGML_F32Cx16_REDUCE(res, x) \
204
- do { \
205
- int offset = LM_GGML_F32_ARR >> 1; \
206
- for (int i = 0; i < offset; ++i) { \
207
- x[i] = _mm512_add_ps(x[i], x[offset+i]); \
208
- } \
209
- offset >>= 1; \
210
- for (int i = 0; i < offset; ++i) { \
211
- x[i] = _mm512_add_ps(x[i], x[offset+i]); \
212
- } \
213
- offset >>= 1; \
214
- for (int i = 0; i < offset; ++i) { \
215
- x[i] = _mm512_add_ps(x[i], x[offset+i]); \
216
- } \
217
- res = (lm_ggml_float) _mm512_reduce_add_ps(x[0]); \
218
- } while (0)
219
-
220
- #define LM_GGML_F16_VEC LM_GGML_F32Cx16
221
- #define LM_GGML_F16_VEC_ZERO LM_GGML_F32Cx16_ZERO
222
- #define LM_GGML_F16_VEC_SET1 LM_GGML_F32Cx16_SET1
223
- #define LM_GGML_F16_VEC_LOAD(p, i) LM_GGML_F32Cx16_LOAD(p)
224
- #define LM_GGML_F16_VEC_STORE(p, r, i) LM_GGML_F32Cx16_STORE(p, r[i])
225
- #define LM_GGML_F16_VEC_FMA LM_GGML_F32Cx16_FMA
226
- #define LM_GGML_F16_VEC_ADD LM_GGML_F32Cx16_ADD
227
- #define LM_GGML_F16_VEC_MUL LM_GGML_F32Cx16_MUL
228
-
229
- #define LM_GGML_F16_VEC_REDUCE LM_GGML_F32Cx16_REDUCE
230
- #elif defined(__AVX__)
231
-
232
- #define LM_GGML_SIMD
233
-
234
- // F32 AVX
235
-
236
- #define LM_GGML_F32_STEP 32
237
- #define LM_GGML_F32_EPR 8
238
-
239
- #define LM_GGML_F32x8 __m256
240
- #define LM_GGML_F32x8_ZERO _mm256_setzero_ps()
241
- #define LM_GGML_F32x8_SET1(x) _mm256_set1_ps(x)
242
- #define LM_GGML_F32x8_LOAD _mm256_loadu_ps
243
- #define LM_GGML_F32x8_STORE _mm256_storeu_ps
244
- #if defined(__FMA__)
245
- #define LM_GGML_F32x8_FMA(a, b, c) _mm256_fmadd_ps(b, c, a)
246
- #else
247
- #define LM_GGML_F32x8_FMA(a, b, c) _mm256_add_ps(_mm256_mul_ps(b, c), a)
248
- #endif
249
- #define LM_GGML_F32x8_ADD _mm256_add_ps
250
- #define LM_GGML_F32x8_MUL _mm256_mul_ps
251
- #define LM_GGML_F32x8_REDUCE(res, x) \
252
- do { \
253
- int offset = LM_GGML_F32_ARR >> 1; \
254
- for (int i = 0; i < offset; ++i) { \
255
- x[i] = _mm256_add_ps(x[i], x[offset+i]); \
256
- } \
257
- offset >>= 1; \
258
- for (int i = 0; i < offset; ++i) { \
259
- x[i] = _mm256_add_ps(x[i], x[offset+i]); \
260
- } \
261
- offset >>= 1; \
262
- for (int i = 0; i < offset; ++i) { \
263
- x[i] = _mm256_add_ps(x[i], x[offset+i]); \
264
- } \
265
- const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \
266
- _mm256_extractf128_ps(x[0], 1)); \
267
- const __m128 t1 = _mm_hadd_ps(t0, t0); \
268
- res = (lm_ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \
269
- } while (0)
270
- // TODO: is this optimal ?
271
-
272
- #define LM_GGML_F32_VEC LM_GGML_F32x8
273
- #define LM_GGML_F32_VEC_ZERO LM_GGML_F32x8_ZERO
274
- #define LM_GGML_F32_VEC_SET1 LM_GGML_F32x8_SET1
275
- #define LM_GGML_F32_VEC_LOAD LM_GGML_F32x8_LOAD
276
- #define LM_GGML_F32_VEC_STORE LM_GGML_F32x8_STORE
277
- #define LM_GGML_F32_VEC_FMA LM_GGML_F32x8_FMA
278
- #define LM_GGML_F32_VEC_ADD LM_GGML_F32x8_ADD
279
- #define LM_GGML_F32_VEC_MUL LM_GGML_F32x8_MUL
280
- #define LM_GGML_F32_VEC_REDUCE LM_GGML_F32x8_REDUCE
281
-
282
- // F16 AVX
283
-
284
- #define LM_GGML_F16_STEP 32
285
- #define LM_GGML_F16_EPR 8
286
-
287
- // F16 arithmetic is not supported by AVX, so we use F32 instead
288
-
289
- #define LM_GGML_F32Cx8 __m256
290
- #define LM_GGML_F32Cx8_ZERO _mm256_setzero_ps()
291
- #define LM_GGML_F32Cx8_SET1(x) _mm256_set1_ps(x)
292
-
293
- #if defined(__F16C__)
294
- // the _mm256_cvt intrinsics require F16C
295
- #define LM_GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
296
- #define LM_GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
297
- #else
298
- static inline __m256 __avx_f32cx8_load(const lm_ggml_fp16_t * x) {
299
- float tmp[8];
300
-
301
- for (int i = 0; i < 8; i++) {
302
- tmp[i] = LM_GGML_FP16_TO_FP32(x[i]);
303
- }
304
-
305
- return _mm256_loadu_ps(tmp);
306
- }
307
- static inline void __avx_f32cx8_store(lm_ggml_fp16_t *x, __m256 y) {
308
- float arr[8];
309
-
310
- _mm256_storeu_ps(arr, y);
311
-
312
- for (int i = 0; i < 8; i++)
313
- x[i] = LM_GGML_FP32_TO_FP16(arr[i]);
314
- }
315
- #define LM_GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x)
316
- #define LM_GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y)
317
- #endif
318
-
319
- #define LM_GGML_F32Cx8_FMA LM_GGML_F32x8_FMA
320
- #define LM_GGML_F32Cx8_ADD _mm256_add_ps
321
- #define LM_GGML_F32Cx8_MUL _mm256_mul_ps
322
- #define LM_GGML_F32Cx8_REDUCE LM_GGML_F32x8_REDUCE
323
-
324
- #define LM_GGML_F16_VEC LM_GGML_F32Cx8
325
- #define LM_GGML_F16_VEC_ZERO LM_GGML_F32Cx8_ZERO
326
- #define LM_GGML_F16_VEC_SET1 LM_GGML_F32Cx8_SET1
327
- #define LM_GGML_F16_VEC_LOAD(p, i) LM_GGML_F32Cx8_LOAD(p)
328
- #define LM_GGML_F16_VEC_STORE(p, r, i) LM_GGML_F32Cx8_STORE(p, r[i])
329
- #define LM_GGML_F16_VEC_FMA LM_GGML_F32Cx8_FMA
330
- #define LM_GGML_F16_VEC_ADD LM_GGML_F32Cx8_ADD
331
- #define LM_GGML_F16_VEC_MUL LM_GGML_F32Cx8_MUL
332
- #define LM_GGML_F16_VEC_REDUCE LM_GGML_F32Cx8_REDUCE
333
-
334
- #elif defined(__POWER9_VECTOR__)
335
-
336
- #define LM_GGML_SIMD
337
-
338
- // F32 POWER9
339
-
340
- #define LM_GGML_F32_STEP 32
341
- #define LM_GGML_F32_EPR 4
342
-
343
- #define LM_GGML_F32x4 vector float
344
- #define LM_GGML_F32x4_ZERO 0.0f
345
- #define LM_GGML_F32x4_SET1 vec_splats
346
- #define LM_GGML_F32x4_LOAD(p) vec_xl(0, p)
347
- #define LM_GGML_F32x4_STORE(p, r) vec_xst(r, 0, p)
348
- #define LM_GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
349
- #define LM_GGML_F32x4_ADD vec_add
350
- #define LM_GGML_F32x4_MUL vec_mul
351
- #define LM_GGML_F32x4_REDUCE(res, x) \
352
- { \
353
- int offset = LM_GGML_F32_ARR >> 1; \
354
- for (int i = 0; i < offset; ++i) { \
355
- x[i] = vec_add(x[i], x[offset+i]); \
356
- } \
357
- offset >>= 1; \
358
- for (int i = 0; i < offset; ++i) { \
359
- x[i] = vec_add(x[i], x[offset+i]); \
360
- } \
361
- offset >>= 1; \
362
- for (int i = 0; i < offset; ++i) { \
363
- x[i] = vec_add(x[i], x[offset+i]); \
364
- } \
365
- res = vec_extract(x[0], 0) + \
366
- vec_extract(x[0], 1) + \
367
- vec_extract(x[0], 2) + \
368
- vec_extract(x[0], 3); \
369
- }
370
-
371
- #define LM_GGML_F32_VEC LM_GGML_F32x4
372
- #define LM_GGML_F32_VEC_ZERO LM_GGML_F32x4_ZERO
373
- #define LM_GGML_F32_VEC_SET1 LM_GGML_F32x4_SET1
374
- #define LM_GGML_F32_VEC_LOAD LM_GGML_F32x4_LOAD
375
- #define LM_GGML_F32_VEC_STORE LM_GGML_F32x4_STORE
376
- #define LM_GGML_F32_VEC_FMA LM_GGML_F32x4_FMA
377
- #define LM_GGML_F32_VEC_ADD LM_GGML_F32x4_ADD
378
- #define LM_GGML_F32_VEC_MUL LM_GGML_F32x4_MUL
379
- #define LM_GGML_F32_VEC_REDUCE LM_GGML_F32x4_REDUCE
380
-
381
- // F16 POWER9
382
- #define LM_GGML_F16_STEP LM_GGML_F32_STEP
383
- #define LM_GGML_F16_EPR LM_GGML_F32_EPR
384
- #define LM_GGML_F16_VEC LM_GGML_F32x4
385
- #define LM_GGML_F16_VEC_ZERO LM_GGML_F32x4_ZERO
386
- #define LM_GGML_F16_VEC_SET1 LM_GGML_F32x4_SET1
387
- #define LM_GGML_F16_VEC_FMA LM_GGML_F32x4_FMA
388
- #define LM_GGML_F16_VEC_ADD LM_GGML_F32x4_ADD
389
- #define LM_GGML_F16_VEC_MUL LM_GGML_F32x4_MUL
390
- #define LM_GGML_F16_VEC_REDUCE LM_GGML_F32x4_REDUCE
391
- // Use vec_xl, not vec_ld, in case the load address is not aligned.
392
- #define LM_GGML_F16_VEC_LOAD(p, i) (i & 0x1) ? \
393
- vec_extract_fp32_from_shorth(vec_xl(0, p - LM_GGML_F16_EPR)) : \
394
- vec_extract_fp32_from_shortl(vec_xl(0, p))
395
- static inline unsigned char lm_ggml_endian_byte(int i) {
396
- uint16_t tmp_val = 1;
397
- return ((unsigned char *)&tmp_val)[i];
398
- }
399
- #define LM_GGML_ENDIAN_BYTE(i) lm_ggml_endian_byte(i)
400
- #define LM_GGML_F16_VEC_STORE(p, r, i) \
401
- if (i & 0x1) \
402
- vec_xst(vec_pack_to_short_fp32(r[i - LM_GGML_ENDIAN_BYTE(1)], \
403
- r[i - LM_GGML_ENDIAN_BYTE(0)]), \
404
- 0, p - LM_GGML_F16_EPR)
405
-
406
- #elif defined(__wasm_simd128__)
407
-
408
- #define LM_GGML_SIMD
409
-
410
- // F32 WASM
411
-
412
- #define LM_GGML_F32_STEP 16
413
- #define LM_GGML_F32_EPR 4
414
-
415
- #define LM_GGML_F32x4 v128_t
416
- #define LM_GGML_F32x4_ZERO wasm_f32x4_splat(0.0f)
417
- #define LM_GGML_F32x4_SET1(x) wasm_f32x4_splat(x)
418
- #define LM_GGML_F32x4_LOAD wasm_v128_load
419
- #define LM_GGML_F32x4_STORE wasm_v128_store
420
- #define LM_GGML_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a)
421
- #define LM_GGML_F32x4_ADD wasm_f32x4_add
422
- #define LM_GGML_F32x4_MUL wasm_f32x4_mul
423
- #define LM_GGML_F32x4_REDUCE(res, x) \
424
- { \
425
- int offset = LM_GGML_F32_ARR >> 1; \
426
- for (int i = 0; i < offset; ++i) { \
427
- x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
428
- } \
429
- offset >>= 1; \
430
- for (int i = 0; i < offset; ++i) { \
431
- x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
432
- } \
433
- offset >>= 1; \
434
- for (int i = 0; i < offset; ++i) { \
435
- x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
436
- } \
437
- res = wasm_f32x4_extract_lane(x[0], 0) + \
438
- wasm_f32x4_extract_lane(x[0], 1) + \
439
- wasm_f32x4_extract_lane(x[0], 2) + \
440
- wasm_f32x4_extract_lane(x[0], 3); \
441
- }
442
-
443
- #define LM_GGML_F32_VEC LM_GGML_F32x4
444
- #define LM_GGML_F32_VEC_ZERO LM_GGML_F32x4_ZERO
445
- #define LM_GGML_F32_VEC_SET1 LM_GGML_F32x4_SET1
446
- #define LM_GGML_F32_VEC_LOAD LM_GGML_F32x4_LOAD
447
- #define LM_GGML_F32_VEC_STORE LM_GGML_F32x4_STORE
448
- #define LM_GGML_F32_VEC_FMA LM_GGML_F32x4_FMA
449
- #define LM_GGML_F32_VEC_ADD LM_GGML_F32x4_ADD
450
- #define LM_GGML_F32_VEC_MUL LM_GGML_F32x4_MUL
451
- #define LM_GGML_F32_VEC_REDUCE LM_GGML_F32x4_REDUCE
452
-
453
- // F16 WASM
454
-
455
- #define LM_GGML_F16_STEP 16
456
- #define LM_GGML_F16_EPR 4
457
-
458
- inline static v128_t __wasm_f16x4_load(const lm_ggml_fp16_t * p) {
459
- float tmp[4];
460
-
461
- tmp[0] = LM_GGML_FP16_TO_FP32(p[0]);
462
- tmp[1] = LM_GGML_FP16_TO_FP32(p[1]);
463
- tmp[2] = LM_GGML_FP16_TO_FP32(p[2]);
464
- tmp[3] = LM_GGML_FP16_TO_FP32(p[3]);
465
-
466
- return wasm_v128_load(tmp);
467
- }
468
-
469
- inline static void __wasm_f16x4_store(lm_ggml_fp16_t * p, v128_t x) {
470
- float tmp[4];
471
-
472
- wasm_v128_store(tmp, x);
473
-
474
- p[0] = LM_GGML_FP32_TO_FP16(tmp[0]);
475
- p[1] = LM_GGML_FP32_TO_FP16(tmp[1]);
476
- p[2] = LM_GGML_FP32_TO_FP16(tmp[2]);
477
- p[3] = LM_GGML_FP32_TO_FP16(tmp[3]);
478
- }
479
-
480
- #define LM_GGML_F16x4 v128_t
481
- #define LM_GGML_F16x4_ZERO wasm_f32x4_splat(0.0f)
482
- #define LM_GGML_F16x4_SET1(x) wasm_f32x4_splat(x)
483
- #define LM_GGML_F16x4_LOAD(x) __wasm_f16x4_load(x)
484
- #define LM_GGML_F16x4_STORE(x, y) __wasm_f16x4_store(x, y)
485
- #define LM_GGML_F16x4_FMA LM_GGML_F32x4_FMA
486
- #define LM_GGML_F16x4_ADD wasm_f32x4_add
487
- #define LM_GGML_F16x4_MUL wasm_f32x4_mul
488
- #define LM_GGML_F16x4_REDUCE(res, x) \
489
- { \
490
- int offset = LM_GGML_F16_ARR >> 1; \
491
- for (int i = 0; i < offset; ++i) { \
492
- x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
493
- } \
494
- offset >>= 1; \
495
- for (int i = 0; i < offset; ++i) { \
496
- x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
497
- } \
498
- offset >>= 1; \
499
- for (int i = 0; i < offset; ++i) { \
500
- x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
501
- } \
502
- res = (lm_ggml_float) (wasm_f32x4_extract_lane(x[0], 0) + \
503
- wasm_f32x4_extract_lane(x[0], 1) + \
504
- wasm_f32x4_extract_lane(x[0], 2) + \
505
- wasm_f32x4_extract_lane(x[0], 3)); \
506
- }
507
-
508
- #define LM_GGML_F16_VEC LM_GGML_F16x4
509
- #define LM_GGML_F16_VEC_ZERO LM_GGML_F16x4_ZERO
510
- #define LM_GGML_F16_VEC_SET1 LM_GGML_F16x4_SET1
511
- #define LM_GGML_F16_VEC_LOAD(p, i) LM_GGML_F16x4_LOAD(p)
512
- #define LM_GGML_F16_VEC_STORE(p, r, i) LM_GGML_F16x4_STORE(p, r[i])
513
- #define LM_GGML_F16_VEC_FMA LM_GGML_F16x4_FMA
514
- #define LM_GGML_F16_VEC_ADD LM_GGML_F16x4_ADD
515
- #define LM_GGML_F16_VEC_MUL LM_GGML_F16x4_MUL
516
- #define LM_GGML_F16_VEC_REDUCE LM_GGML_F16x4_REDUCE
517
-
518
- #elif defined(__SSE3__)
519
-
520
- #define LM_GGML_SIMD
521
-
522
- // F32 SSE
523
-
524
- #define LM_GGML_F32_STEP 32
525
- #define LM_GGML_F32_EPR 4
526
-
527
- #define LM_GGML_F32x4 __m128
528
- #define LM_GGML_F32x4_ZERO _mm_setzero_ps()
529
- #define LM_GGML_F32x4_SET1(x) _mm_set1_ps(x)
530
- #define LM_GGML_F32x4_LOAD _mm_loadu_ps
531
- #define LM_GGML_F32x4_STORE _mm_storeu_ps
532
- #if defined(__FMA__)
533
- // TODO: Does this work?
534
- #define LM_GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a)
535
- #else
536
- #define LM_GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a)
537
- #endif
538
- #define LM_GGML_F32x4_ADD _mm_add_ps
539
- #define LM_GGML_F32x4_MUL _mm_mul_ps
540
- #define LM_GGML_F32x4_REDUCE(res, x) \
541
- { \
542
- int offset = LM_GGML_F32_ARR >> 1; \
543
- for (int i = 0; i < offset; ++i) { \
544
- x[i] = _mm_add_ps(x[i], x[offset+i]); \
545
- } \
546
- offset >>= 1; \
547
- for (int i = 0; i < offset; ++i) { \
548
- x[i] = _mm_add_ps(x[i], x[offset+i]); \
549
- } \
550
- offset >>= 1; \
551
- for (int i = 0; i < offset; ++i) { \
552
- x[i] = _mm_add_ps(x[i], x[offset+i]); \
553
- } \
554
- const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \
555
- res = (lm_ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \
556
- }
557
- // TODO: is this optimal ?
558
-
559
- #define LM_GGML_F32_VEC LM_GGML_F32x4
560
- #define LM_GGML_F32_VEC_ZERO LM_GGML_F32x4_ZERO
561
- #define LM_GGML_F32_VEC_SET1 LM_GGML_F32x4_SET1
562
- #define LM_GGML_F32_VEC_LOAD LM_GGML_F32x4_LOAD
563
- #define LM_GGML_F32_VEC_STORE LM_GGML_F32x4_STORE
564
- #define LM_GGML_F32_VEC_FMA LM_GGML_F32x4_FMA
565
- #define LM_GGML_F32_VEC_ADD LM_GGML_F32x4_ADD
566
- #define LM_GGML_F32_VEC_MUL LM_GGML_F32x4_MUL
567
- #define LM_GGML_F32_VEC_REDUCE LM_GGML_F32x4_REDUCE
568
-
569
- // F16 SSE
570
-
571
- #define LM_GGML_F16_STEP 32
572
- #define LM_GGML_F16_EPR 4
573
-
574
- static inline __m128 __sse_f16x4_load(const lm_ggml_fp16_t * x) {
575
- float tmp[4];
576
-
577
- tmp[0] = LM_GGML_FP16_TO_FP32(x[0]);
578
- tmp[1] = LM_GGML_FP16_TO_FP32(x[1]);
579
- tmp[2] = LM_GGML_FP16_TO_FP32(x[2]);
580
- tmp[3] = LM_GGML_FP16_TO_FP32(x[3]);
581
-
582
- return _mm_loadu_ps(tmp);
583
- }
584
-
585
- static inline void __sse_f16x4_store(lm_ggml_fp16_t * x, __m128 y) {
586
- float arr[4];
587
-
588
- _mm_storeu_ps(arr, y);
589
-
590
- x[0] = LM_GGML_FP32_TO_FP16(arr[0]);
591
- x[1] = LM_GGML_FP32_TO_FP16(arr[1]);
592
- x[2] = LM_GGML_FP32_TO_FP16(arr[2]);
593
- x[3] = LM_GGML_FP32_TO_FP16(arr[3]);
594
- }
595
-
596
- #define LM_GGML_F32Cx4 __m128
597
- #define LM_GGML_F32Cx4_ZERO _mm_setzero_ps()
598
- #define LM_GGML_F32Cx4_SET1(x) _mm_set1_ps(x)
599
- #define LM_GGML_F32Cx4_LOAD(x) __sse_f16x4_load(x)
600
- #define LM_GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y)
601
- #define LM_GGML_F32Cx4_FMA LM_GGML_F32x4_FMA
602
- #define LM_GGML_F32Cx4_ADD _mm_add_ps
603
- #define LM_GGML_F32Cx4_MUL _mm_mul_ps
604
- #define LM_GGML_F32Cx4_REDUCE LM_GGML_F32x4_REDUCE
605
-
606
- #define LM_GGML_F16_VEC LM_GGML_F32Cx4
607
- #define LM_GGML_F16_VEC_ZERO LM_GGML_F32Cx4_ZERO
608
- #define LM_GGML_F16_VEC_SET1 LM_GGML_F32Cx4_SET1
609
- #define LM_GGML_F16_VEC_LOAD(p, i) LM_GGML_F32Cx4_LOAD(p)
610
- #define LM_GGML_F16_VEC_STORE(p, r, i) LM_GGML_F32Cx4_STORE(p, r[i])
611
- #define LM_GGML_F16_VEC_FMA LM_GGML_F32Cx4_FMA
612
- #define LM_GGML_F16_VEC_ADD LM_GGML_F32Cx4_ADD
613
- #define LM_GGML_F16_VEC_MUL LM_GGML_F32Cx4_MUL
614
- #define LM_GGML_F16_VEC_REDUCE LM_GGML_F32Cx4_REDUCE
615
-
616
- #elif defined(__loongarch_asx)
617
-
618
- #define LM_GGML_SIMD
619
-
620
- // F32 LASX
621
- #define LM_GGML_F32_STEP 32
622
- #define LM_GGML_F32_EPR 8
623
-
624
- #define LM_GGML_F32x8 __m256
625
- #define LM_GGML_F32x8_ZERO (__m256)__lasx_xvldi(0)
626
- #define LM_GGML_F32x8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x))
627
- #define LM_GGML_F32x8_LOAD(x) (__m256)__lasx_xvld((x), 0)
628
- #define LM_GGML_F32x8_STORE(x,y) __lasx_xvst((y), (x), 0)
629
- #define LM_GGML_F32x8_FMA(a, b, c) __lasx_xvfmadd_s(b, c, a)
630
- #define LM_GGML_F32x8_ADD __lasx_xvfadd_s
631
- #define LM_GGML_F32x8_MUL __lasx_xvfmul_s
632
- #define LM_GGML_F32x8_REDUCE(res, x) \
633
- do { \
634
- int offset = LM_GGML_F32_ARR >> 1; \
635
- for (int i = 0; i < offset; ++i) { \
636
- x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \
637
- } \
638
- offset >>= 1; \
639
- for (int i = 0; i < offset; ++i) { \
640
- x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \
641
- } \
642
- offset >>= 1; \
643
- for (int i = 0; i < offset; ++i) { \
644
- x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \
645
- } \
646
- float *tmp_p = (float *)&x[0]; \
647
- res = tmp_p[0] + tmp_p[1] + tmp_p[2] + tmp_p[3] + tmp_p[4] + tmp_p[5] + tmp_p[6] + tmp_p[7]; \
648
- } while (0)
649
- // TODO: is this optimal ?
650
-
651
- #define LM_GGML_F32_VEC LM_GGML_F32x8
652
- #define LM_GGML_F32_VEC_ZERO LM_GGML_F32x8_ZERO
653
- #define LM_GGML_F32_VEC_SET1 LM_GGML_F32x8_SET1
654
- #define LM_GGML_F32_VEC_LOAD LM_GGML_F32x8_LOAD
655
- #define LM_GGML_F32_VEC_STORE LM_GGML_F32x8_STORE
656
- #define LM_GGML_F32_VEC_FMA LM_GGML_F32x8_FMA
657
- #define LM_GGML_F32_VEC_ADD LM_GGML_F32x8_ADD
658
- #define LM_GGML_F32_VEC_MUL LM_GGML_F32x8_MUL
659
- #define LM_GGML_F32_VEC_REDUCE LM_GGML_F32x8_REDUCE
660
-
661
- // F16 LASX
662
-
663
- #define LM_GGML_F16_STEP 32
664
- #define LM_GGML_F16_EPR 8
665
-
666
- // F16 arithmetic is not supported by LASX, so we use F32 instead
667
-
668
- #define LM_GGML_F32Cx8 __m256
669
- #define LM_GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0)
670
- #define LM_GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))
671
-
672
- static inline __m256 __lasx_f32cx8_load(const lm_ggml_fp16_t * x) {
673
- __m256i a;
674
- memcpy(&a, x, sizeof(lm_ggml_fp16_t) * 8);
675
- a = __lasx_xvpermi_d(a, 0 | (1 << 4));
676
- return __lasx_xvfcvtl_s_h(a);
677
- }
678
-
679
- static inline void __lasx_f32cx8_store(lm_ggml_fp16_t * x, __m256 y) {
680
- __m256i a = __lasx_xvfcvt_h_s(y, y);
681
- a = __lasx_xvpermi_d(a, 0 | (2 << 2));
682
- memcpy(x, &a, sizeof(lm_ggml_fp16_t) * 8);
683
- }
684
- #define LM_GGML_F32Cx8_LOAD(x) __lasx_f32cx8_load(x)
685
- #define LM_GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)
686
-
687
- #define LM_GGML_F32Cx8_FMA LM_GGML_F32x8_FMA
688
- #define LM_GGML_F32Cx8_ADD __lasx_xvfadd_s
689
- #define LM_GGML_F32Cx8_MUL __lasx_xvfmul_s
690
- #define LM_GGML_F32Cx8_REDUCE LM_GGML_F32x8_REDUCE
691
-
692
- #define LM_GGML_F16_VEC LM_GGML_F32Cx8
693
- #define LM_GGML_F16_VEC_ZERO LM_GGML_F32Cx8_ZERO
694
- #define LM_GGML_F16_VEC_SET1 LM_GGML_F32Cx8_SET1
695
- #define LM_GGML_F16_VEC_LOAD(p, i) LM_GGML_F32Cx8_LOAD(p)
696
- #define LM_GGML_F16_VEC_STORE(p, r, i) LM_GGML_F32Cx8_STORE(p, r[i])
697
- #define LM_GGML_F16_VEC_FMA LM_GGML_F32Cx8_FMA
698
- #define LM_GGML_F16_VEC_ADD LM_GGML_F32Cx8_ADD
699
- #define LM_GGML_F16_VEC_MUL LM_GGML_F32Cx8_MUL
700
- #define LM_GGML_F16_VEC_REDUCE LM_GGML_F32Cx8_REDUCE
701
-
702
- #elif defined(__loongarch_sx)
703
-
704
- #define LM_GGML_SIMD
705
-
706
- // F32 LSX
707
-
708
- #define LM_GGML_F32_STEP 32
709
- #define LM_GGML_F32_EPR 4
710
-
711
- #define LM_GGML_F32x4 __m128
712
- #define LM_GGML_F32x4_ZERO __lsx_vldi(0)
713
- #define LM_GGML_F32x4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
714
- #define LM_GGML_F32x4_LOAD(x) __lsx_vld((x), 0)
715
- #define LM_GGML_F32x4_STORE((x),(y)) __lsx_vst((y), (x), 0)
716
- #define LM_GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
717
- #define LM_GGML_F32x4_ADD __lsx_vfadd_s
718
- #define LM_GGML_F32x4_MUL __lsx_vfmul_s
719
- #define LM_GGML_F32x4_REDUCE(res, x) \
720
- { \
721
- int offset = LM_GGML_F32_ARR >> 1; \
722
- for (int i = 0; i < offset; ++i) { \
723
- x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
724
- } \
725
- offset >>= 1; \
726
- for (int i = 0; i < offset; ++i) { \
727
- x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
728
- } \
729
- offset >>= 1; \
730
- for (int i = 0; i < offset; ++i) { \
731
- x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
732
- } \
733
- __m128i tmp = __lsx_vsrli_d((__m128i) x[0], 32); \
734
- tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, x[0]); \
735
- tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
736
- const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88); \
737
- tmp = __lsx_vsrli_d((__m128i) t0, 32); \
738
- tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, t0); \
739
- tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
740
- res = (lm_ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0); \
741
- }
742
-
743
- #define LM_GGML_F32_VEC LM_GGML_F32x4
744
- #define LM_GGML_F32_VEC_ZERO LM_GGML_F32x4_ZERO
745
- #define LM_GGML_F32_VEC_SET1 LM_GGML_F32x4_SET1
746
- #define LM_GGML_F32_VEC_LOAD LM_GGML_F32x4_LOAD
747
- #define LM_GGML_F32_VEC_STORE LM_GGML_F32x4_STORE
748
- #define LM_GGML_F32_VEC_FMA LM_GGML_F32x4_FMA
749
- #define LM_GGML_F32_VEC_ADD LM_GGML_F32x4_ADD
750
- #define LM_GGML_F32_VEC_MUL LM_GGML_F32x4_MUL
751
- #define LM_GGML_F32_VEC_REDUCE LM_GGML_F32x4_REDUCE
752
-
753
- // F16 LSX
754
-
755
- #define LM_GGML_F16_STEP 32
756
- #define LM_GGML_F16_EPR 4
757
-
758
- static inline __m128 __lsx_f16x4_load(const lm_ggml_fp16_t * x) {
759
- float tmp[4];
760
-
761
- tmp[0] = LM_GGML_FP16_TO_FP32(x[0]);
762
- tmp[1] = LM_GGML_FP16_TO_FP32(x[1]);
763
- tmp[2] = LM_GGML_FP16_TO_FP32(x[2]);
764
- tmp[3] = LM_GGML_FP16_TO_FP32(x[3]);
765
-
766
- return __lsx_vld(tmp, 0);
767
- }
768
-
769
- static inline void __lsx_f16x4_store(lm_ggml_fp16_t * x, __m128 y) {
770
- float arr[4];
771
-
772
- __lsx_vst(y, arr, 0);
773
-
774
- x[0] = LM_GGML_FP32_TO_FP16(arr[0]);
775
- x[1] = LM_GGML_FP32_TO_FP16(arr[1]);
776
- x[2] = LM_GGML_FP32_TO_FP16(arr[2]);
777
- x[3] = LM_GGML_FP32_TO_FP16(arr[3]);
778
- }
779
-
780
- #define LM_GGML_F32Cx4 __m128
781
- #define LM_GGML_F32Cx4_ZERO __lsx_vldi(0)
782
- #define LM_GGML_F32Cx4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
783
- #define LM_GGML_F32Cx4_LOAD(x) __lsx_f16x4_load(x)
784
- #define LM_GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
785
- #define LM_GGML_F32Cx4_FMA LM_GGML_F32x4_FMA
786
- #define LM_GGML_F32Cx4_ADD __lsx_vfadd_s
787
- #define LM_GGML_F32Cx4_MUL __lsx_vfmul_s
788
- #define LM_GGML_F32Cx4_REDUCE LM_GGML_F32x4_REDUCE
789
-
790
- #define LM_GGML_F16_VEC LM_GGML_F32Cx4
791
- #define LM_GGML_F16_VEC_ZERO LM_GGML_F32Cx4_ZERO
792
- #define LM_GGML_F16_VEC_SET1 LM_GGML_F32Cx4_SET1
793
- #define LM_GGML_F16_VEC_LOAD(p, i) LM_GGML_F32Cx4_LOAD(p)
794
- #define LM_GGML_F16_VEC_STORE(p, r, i) LM_GGML_F32Cx4_STORE(p, r[i])
795
- #define LM_GGML_F16_VEC_FMA LM_GGML_F32Cx4_FMA
796
- #define LM_GGML_F16_VEC_ADD LM_GGML_F32Cx4_ADD
797
- #define LM_GGML_F16_VEC_MUL LM_GGML_F32Cx4_MUL
798
- #define LM_GGML_F16_VEC_REDUCE LM_GGML_F32Cx4_REDUCE
799
-
800
- #elif defined(__VXE__) || defined(__VXE2__)
801
-
802
- #define LM_GGML_SIMD
803
-
804
- // F32 s390x
805
-
806
- #define LM_GGML_F32_STEP 32
807
- #define LM_GGML_F32_EPR 4
808
-
809
- #define LM_GGML_F32x4 __vector float
810
- #define LM_GGML_F32x4_ZERO vec_splats(0.0f)
811
- #define LM_GGML_F32x4_SET1 vec_splats
812
- #define LM_GGML_F32x4_LOAD(p) vec_xl(0, p)
813
- #define LM_GGML_F32x4_STORE(p, r) vec_xst(r, 0, p)
814
- #define LM_GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
815
- #define LM_GGML_F32x4_ADD vec_add
816
- #define LM_GGML_F32x4_MUL vec_mul
817
- #define LM_GGML_F32x4_REDUCE(res, x) \
818
- { \
819
- int offset = LM_GGML_F32_ARR >> 1; \
820
- for (int i = 0; i < offset; ++i) { \
821
- x[i] = vec_add(x[i], x[offset + i]); \
822
- } \
823
- offset >>= 1; \
824
- for (int i = 0; i < offset; ++i) { \
825
- x[i] = vec_add(x[i], x[offset + i]); \
826
- } \
827
- offset >>= 1; \
828
- for (int i = 0; i < offset; ++i) { \
829
- x[i] = vec_add(x[i], x[offset + i]); \
830
- } \
831
- res = vec_extract(x[0], 0) + \
832
- vec_extract(x[0], 1) + \
833
- vec_extract(x[0], 2) + \
834
- vec_extract(x[0], 3); \
835
- }
836
-
837
- #define LM_GGML_F32_VEC LM_GGML_F32x4
838
- #define LM_GGML_F32_VEC_ZERO LM_GGML_F32x4_ZERO
839
- #define LM_GGML_F32_VEC_SET1 LM_GGML_F32x4_SET1
840
- #define LM_GGML_F32_VEC_LOAD LM_GGML_F32x4_LOAD
841
- #define LM_GGML_F32_VEC_STORE LM_GGML_F32x4_STORE
842
- #define LM_GGML_F32_VEC_FMA LM_GGML_F32x4_FMA
843
- #define LM_GGML_F32_VEC_ADD LM_GGML_F32x4_ADD
844
- #define LM_GGML_F32_VEC_MUL LM_GGML_F32x4_MUL
845
- #define LM_GGML_F32_VEC_REDUCE LM_GGML_F32x4_REDUCE
846
-
847
- // F16 s390x
848
- #define LM_GGML_F16_STEP LM_GGML_F32_STEP
849
- #define LM_GGML_F16_EPR LM_GGML_F32_EPR
850
-
851
- static inline __vector float __lzs_f16cx4_load(const lm_ggml_fp16_t * x) {
852
- float tmp[4];
853
-
854
- for (int i = 0; i < 4; i++) {
855
- tmp[i] = LM_GGML_FP16_TO_FP32(x[i]);
856
- }
857
-
858
- return vec_xl(0, tmp);
859
- }
860
-
861
- static inline void __lzs_f16cx4_store(lm_ggml_fp16_t * x, __vector float y) {
862
- float arr[4];
863
-
864
- vec_xst(y, 0, arr);
865
-
866
- for (int i = 0; i < 4; i++) {
867
- x[i] = LM_GGML_FP32_TO_FP16(arr[i]);
868
- }
869
- }
870
-
871
- #define LM_GGML_F16_VEC LM_GGML_F32x4
872
- #define LM_GGML_F16_VEC_ZERO LM_GGML_F32x4_ZERO
873
- #define LM_GGML_F16_VEC_SET1 LM_GGML_F32x4_SET1
874
- #define LM_GGML_F16_VEC_LOAD(p, i) __lzs_f16cx4_load(p)
875
- #define LM_GGML_F16_VEC_STORE(p, r, i) __lzs_f16cx4_store(p, r[i])
876
- #define LM_GGML_F16_VEC_FMA LM_GGML_F32x4_FMA
877
- #define LM_GGML_F16_VEC_ADD LM_GGML_F32x4_ADD
878
- #define LM_GGML_F16_VEC_MUL LM_GGML_F32x4_MUL
879
- #define LM_GGML_F16_VEC_REDUCE LM_GGML_F32x4_REDUCE
880
-
881
- #endif
882
-
883
- // LM_GGML_F32_ARR / LM_GGML_F16_ARR
884
- // number of registers to use per step
885
- #ifdef LM_GGML_SIMD
886
- #define LM_GGML_F32_ARR (LM_GGML_F32_STEP/LM_GGML_F32_EPR)
887
- #define LM_GGML_F16_ARR (LM_GGML_F16_STEP/LM_GGML_F16_EPR)
888
- #endif