cui-llama.rn 1.7.3 → 1.7.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (276)
  1. package/README.md +217 -17
  2. package/android/src/main/CMakeLists.txt +34 -15
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +94 -8
  4. package/android/src/main/java/com/rnllama/RNLlama.java +247 -0
  5. package/android/src/main/jni.cpp +213 -14
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  11. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  14. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +35 -0
  15. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +34 -0
  16. package/cpp/README.md +1 -1
  17. package/cpp/chat-parser.cpp +385 -0
  18. package/cpp/chat-parser.h +120 -0
  19. package/cpp/chat.cpp +726 -596
  20. package/cpp/chat.h +71 -6
  21. package/cpp/common.cpp +56 -38
  22. package/cpp/common.h +9 -3
  23. package/cpp/ggml-backend-reg.cpp +5 -0
  24. package/cpp/ggml-backend.cpp +10 -2
  25. package/cpp/ggml-common.h +4 -0
  26. package/cpp/ggml-cpu/amx/amx.cpp +1 -1
  27. package/cpp/ggml-cpu/amx/mmq.cpp +11 -10
  28. package/cpp/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  29. package/cpp/ggml-cpu/arch/arm/quants.c +4114 -0
  30. package/cpp/ggml-cpu/arch/arm/repack.cpp +2163 -0
  31. package/cpp/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  32. package/cpp/ggml-cpu/arch/x86/quants.c +4311 -0
  33. package/cpp/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
  34. package/cpp/ggml-cpu/arch-fallback.h +184 -0
  35. package/cpp/ggml-cpu/common.h +4 -3
  36. package/cpp/ggml-cpu/ggml-cpu-impl.h +21 -16
  37. package/cpp/ggml-cpu/ggml-cpu.c +123 -104
  38. package/cpp/ggml-cpu/ggml-cpu.cpp +11 -8
  39. package/cpp/ggml-cpu/ops.cpp +330 -148
  40. package/cpp/ggml-cpu/ops.h +1 -0
  41. package/cpp/ggml-cpu/quants.c +1158 -0
  42. package/cpp/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  43. package/cpp/ggml-cpu/repack.cpp +1571 -0
  44. package/cpp/ggml-cpu/repack.h +98 -0
  45. package/cpp/ggml-cpu/simd-mappings.h +330 -38
  46. package/cpp/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  47. package/cpp/ggml-cpu/vec.cpp +87 -18
  48. package/cpp/ggml-cpu/vec.h +249 -94
  49. package/cpp/ggml-cpu.h +1 -0
  50. package/cpp/ggml-impl.h +63 -183
  51. package/cpp/ggml-llama-sim.metallib +0 -0
  52. package/cpp/ggml-llama.metallib +0 -0
  53. package/cpp/ggml-metal.m +152 -45
  54. package/cpp/ggml-quants.c +0 -2
  55. package/cpp/ggml.c +61 -21
  56. package/cpp/ggml.h +22 -3
  57. package/cpp/gguf.cpp +24 -3
  58. package/cpp/json-partial.cpp +256 -0
  59. package/cpp/json-partial.h +38 -0
  60. package/cpp/json-schema-to-grammar.cpp +5 -47
  61. package/cpp/json-schema-to-grammar.h +4 -4
  62. package/cpp/llama-arch.cpp +153 -3
  63. package/cpp/llama-arch.h +27 -1
  64. package/cpp/llama-batch.cpp +741 -272
  65. package/cpp/llama-batch.h +112 -54
  66. package/cpp/llama-chat.cpp +30 -8
  67. package/cpp/llama-chat.h +1 -0
  68. package/cpp/llama-context.cpp +524 -339
  69. package/cpp/llama-context.h +38 -17
  70. package/cpp/llama-cparams.cpp +4 -0
  71. package/cpp/llama-cparams.h +2 -0
  72. package/cpp/llama-grammar.cpp +12 -2
  73. package/cpp/llama-graph.cpp +431 -356
  74. package/cpp/llama-graph.h +126 -58
  75. package/cpp/llama-hparams.cpp +10 -2
  76. package/cpp/llama-hparams.h +19 -2
  77. package/cpp/llama-kv-cache-unified-iswa.cpp +279 -0
  78. package/cpp/llama-kv-cache-unified-iswa.h +128 -0
  79. package/cpp/llama-kv-cache-unified.cpp +1841 -0
  80. package/cpp/llama-kv-cache-unified.h +303 -0
  81. package/cpp/llama-kv-cells.h +439 -0
  82. package/cpp/llama-memory-hybrid.cpp +246 -0
  83. package/cpp/llama-memory-hybrid.h +138 -0
  84. package/cpp/llama-memory-recurrent.cpp +1112 -0
  85. package/cpp/llama-memory-recurrent.h +183 -0
  86. package/cpp/llama-memory.cpp +41 -0
  87. package/cpp/llama-memory.h +86 -5
  88. package/cpp/llama-mmap.cpp +1 -1
  89. package/cpp/llama-model-loader.cpp +42 -17
  90. package/cpp/llama-model-saver.cpp +1 -0
  91. package/cpp/llama-model.cpp +1639 -513
  92. package/cpp/llama-model.h +26 -0
  93. package/cpp/llama-sampling.cpp +2 -2
  94. package/cpp/llama-vocab.cpp +65 -28
  95. package/cpp/llama-vocab.h +1 -0
  96. package/cpp/llama.cpp +11 -7
  97. package/cpp/llama.h +150 -42
  98. package/cpp/minja/chat-template.hpp +1 -1
  99. package/cpp/minja/minja.hpp +1 -1
  100. package/cpp/{json.hpp → nlohmann/json.hpp} +3027 -2267
  101. package/cpp/nlohmann/json_fwd.hpp +187 -0
  102. package/cpp/regex-partial.cpp +204 -0
  103. package/cpp/regex-partial.h +56 -0
  104. package/cpp/rn-llama.cpp +646 -35
  105. package/cpp/rn-llama.h +32 -1
  106. package/cpp/rn-tts.h +39 -0
  107. package/cpp/sampling.cpp +7 -8
  108. package/cpp/tools/mtmd/clip-impl.h +5 -0
  109. package/cpp/tools/mtmd/clip.cpp +572 -436
  110. package/cpp/tools/mtmd/clip.h +14 -4
  111. package/cpp/tools/mtmd/mtmd-audio.cpp +0 -86
  112. package/cpp/tools/mtmd/mtmd-audio.h +2 -17
  113. package/cpp/tools/mtmd/mtmd-helper.cpp +175 -12
  114. package/cpp/tools/mtmd/mtmd-helper.h +91 -0
  115. package/cpp/tools/mtmd/mtmd.cpp +368 -248
  116. package/cpp/tools/mtmd/mtmd.h +6 -70
  117. package/cpp/unicode.cpp +5 -0
  118. package/ios/CMakeLists.txt +26 -6
  119. package/ios/RNLlama.h +1 -1
  120. package/ios/RNLlama.mm +153 -3
  121. package/ios/RNLlamaContext.h +9 -1
  122. package/ios/RNLlamaContext.mm +112 -9
  123. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
  124. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +71 -6
  125. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +9 -3
  126. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
  127. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
  128. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
  129. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +22 -3
  130. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-partial.h +38 -0
  131. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  132. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
  133. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
  134. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
  135. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +38 -17
  136. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
  137. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
  138. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
  139. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  140. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  141. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  142. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  143. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  144. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
  145. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +26 -0
  146. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
  147. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +150 -42
  148. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  149. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
  150. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/{json.hpp → nlohmann/json.hpp} +3027 -2267
  151. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  152. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
  153. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
  154. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
  155. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  156. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
  157. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
  158. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
  159. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
  160. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
  161. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
  162. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
  163. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
  164. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
  165. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  166. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
  167. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
  168. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
  169. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
  170. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
  171. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
  172. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
  173. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  174. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  175. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  176. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  177. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  178. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
  179. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
  180. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
  181. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
  182. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  183. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
  184. package/ios/rnllama.xcframework/{tvos-arm64/rnllama.framework/Headers → ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
  185. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  186. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
  187. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
  188. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
  189. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  190. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  191. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
  192. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +71 -6
  193. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +9 -3
  194. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
  195. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
  196. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
  197. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +22 -3
  198. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-partial.h +38 -0
  199. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  200. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
  201. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
  202. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
  203. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +38 -17
  204. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
  205. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
  206. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
  207. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  208. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  209. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  210. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  211. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  212. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
  213. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +26 -0
  214. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
  215. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +150 -42
  216. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  217. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
  218. package/ios/rnllama.xcframework/{ios-arm64_x86_64-simulator/rnllama.framework/Headers → tvos-arm64/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
  219. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  220. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
  221. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
  222. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
  223. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  224. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
  225. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
  226. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
  227. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
  228. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
  229. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
  230. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
  231. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
  232. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
  233. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  234. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
  235. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
  236. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
  237. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
  238. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
  239. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
  240. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
  241. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  242. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  243. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  244. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  245. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  246. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
  247. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
  248. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
  249. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
  250. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  251. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
  252. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json.hpp +25526 -0
  253. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  254. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
  255. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
  256. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
  257. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  258. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  259. package/jest/mock.js +24 -0
  260. package/package.json +1 -1
  261. package/src/NativeRNLlama.ts +46 -2
  262. package/src/index.ts +105 -1
  263. package/cpp/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  264. package/cpp/ggml-cpu/ggml-cpu-quants.c +0 -13326
  265. package/cpp/ggml-cpu/sgemm.cpp +0 -3544
  266. package/cpp/ggml-cpu/sgemm.h +0 -14
  267. package/cpp/llama-kv-cache.cpp +0 -2827
  268. package/cpp/llama-kv-cache.h +0 -515
  269. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  270. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  271. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  272. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json.hpp +0 -24766
  273. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  274. /package/cpp/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
  275. /package/cpp/tools/mtmd/{miniaudio.h → miniaudio/miniaudio.h} +0 -0
  276. /package/cpp/tools/mtmd/{stb_image.h → stb/stb_image.h} +0 -0
@@ -3,72 +3,20 @@
3
3
  #include "ggml-common.h"
4
4
  #include "ggml-backend-impl.h"
5
5
 
6
- #include "ggml-quants.h"
7
6
  #include "ggml-impl.h"
8
7
  #include "ggml-cpu.h"
9
8
  #include "ggml-cpu-impl.h"
10
- #include "ggml-cpu-traits.h"
9
+ #include "simd-mappings.h"
10
+ #include "traits.h"
11
11
 
12
12
  #include <cmath>
13
13
  #include <cstring>
14
14
  #include <cassert>
15
- #include <cfloat>
16
15
  #include <cstdlib> // for qsort
17
16
  #include <cstdio> // for LM_GGML_ASSERT
18
17
 
19
- #include "ggml-cpu-aarch64.h"
20
-
21
- // TODO: move to include file?
22
- template <int K> constexpr int QK_0() {
23
- if constexpr (K == 4) {
24
- return QK4_0;
25
- }
26
- if constexpr (K == 8) {
27
- return QK8_0;
28
- }
29
- return -1;
30
- }
31
-
32
- template <int K, int N> struct block {
33
- lm_ggml_half d[N]; // deltas for N qK_0 blocks
34
- int8_t qs[(QK_0<K>() * N * K) / 8]; // quants for N qK_0 blocks
35
- };
36
-
37
- // control size
38
- static_assert(sizeof(block<4, 4>) == 4 * sizeof(lm_ggml_half) + QK8_0 * 2, "wrong block<4,4> size/padding");
39
- static_assert(sizeof(block<4, 8>) == 8 * sizeof(lm_ggml_half) + QK8_0 * 4, "wrong block<4,8> size/padding");
40
- static_assert(sizeof(block<8, 4>) == 4 * sizeof(lm_ggml_half) + QK8_0 * 4, "wrong block<8,4> size/padding");
41
- static_assert(sizeof(block<8, 8>) == 8 * sizeof(lm_ggml_half) + QK8_0 * 8, "wrong block<8,8> size/padding");
42
-
43
- using block_q4_0x4 = block<4, 4>;
44
- using block_q4_0x8 = block<4, 8>;
45
- using block_q8_0x4 = block<8, 4>;
46
- using block_q8_0x8 = block<8, 8>;
47
-
48
-
49
- struct block_q4_Kx8 {
50
- lm_ggml_half d[8]; // super-block scale for quantized scales
51
- lm_ggml_half dmin[8]; // super-block scale for quantized mins
52
- uint8_t scales[96]; // scales and mins, quantized with 6 bits
53
- uint8_t qs[1024]; // 4--bit quants
54
- };
55
-
56
- static_assert(sizeof(block_q4_Kx8) == sizeof(lm_ggml_half) * 16 + K_SCALE_SIZE * 8 + QK_K * 4, "wrong q4_K block size/padding");
57
-
58
- struct block_q8_Kx4 {
59
- float d[4]; // delta
60
- int8_t qs[QK_K * 4]; // quants
61
- int16_t bsums[QK_K / 4]; // sum of quants in groups of 16
62
- };
63
-
64
- static_assert(sizeof(block_q8_Kx4) == sizeof(float) * 4 + QK_K * 4 + (QK_K / 4) * sizeof(int16_t), "wrong q8_K block size/padding");
65
-
66
- struct block_iq4_nlx4 {
67
- lm_ggml_half d[4]; // deltas for 4 iq4_nl blocks
68
- uint8_t qs[QK4_NL * 2]; // nibbles / quants for 4 iq4_nl blocks
69
- };
70
-
71
- static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(lm_ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");
18
+ #define LM_GGML_CPU_CLANG_WORKAROUND
19
+ #include "../../repack.h"
72
20
 
73
21
  #if defined(__GNUC__)
74
22
  #pragma GCC diagnostic ignored "-Woverlength-strings"
@@ -76,27 +24,6 @@ static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(lm_ggml_half) + QK4_NL * 2, "
76
24
 
77
25
  #define UNUSED LM_GGML_UNUSED
78
26
 
79
- static inline int nearest_int(float fval) {
80
- assert(fabsf(fval) <= 4194303.f);
81
- float val = fval + 12582912.f;
82
- int i; memcpy(&i, &val, sizeof(int));
83
- return (i & 0x007fffff) - 0x00400000;
84
- }
85
-
86
- // Functions to create the interleaved data layout formats
87
-
88
- // interleave 4 block_q4_0s in blocks of blck_size_interleave
89
- // returns an interleaved block_q4_0x4
90
- // in the interleaved block_q4_0x4, place deltas for 4 block_q4_0 blocks
91
- // first, then interleave quants from 4 block_q4_0s in blocks of blck_size_interleave
92
- //
93
- // - in : an array of block_q4_0 pointers
94
- // - blck_size_interleave : the block_q4_0 quants bytes are interleaved in blocks of
95
- // blck_size_interleave bytes
96
- // - xor_mask : the mask to convert the nibbles in block_q4_0 quants bytes
97
- // from bias offset form to pure sign form (this saves subtract
98
- // operations durin unpacking)
99
- //
100
27
  #if defined(__AVX__)
101
28
  #if defined(__F16C__)
102
29
  #if defined(__AVX512F__)
@@ -113,11 +40,11 @@ static inline __m512 __avx512_f32cx8x2_load(lm_ggml_fp16_t *x, lm_ggml_fp16_t *y
113
40
  float tmp[16];
114
41
 
115
42
  for (int i = 0; i < 8; i++) {
116
- tmp[i] = LM_GGML_FP16_TO_FP32(x[i]);
43
+ tmp[i] = LM_GGML_CPU_FP16_TO_FP32(x[i]);
117
44
  }
118
45
 
119
46
  for (int i = 0; i < 8; i++) {
120
- tmp[i + 8] = LM_GGML_FP16_TO_FP32(y[i]);
47
+ tmp[i + 8] = LM_GGML_CPU_FP16_TO_FP32(y[i]);
121
48
  }
122
49
 
123
50
  return _mm512_loadu_ps(tmp);
@@ -128,10 +55,10 @@ static inline __m512 __avx512_repeat_f32cx16_load(__m128i x) {
128
55
  _mm_storeu_si128((__m128i*)tmphalf, x);
129
56
 
130
57
  for (int i = 0; i < 4; i++) {
131
- tmp[i] = LM_GGML_FP16_TO_FP32(tmphalf[i]);
132
- tmp[i + 4] = LM_GGML_FP16_TO_FP32(tmphalf[i]);
133
- tmp[i + 8] = LM_GGML_FP16_TO_FP32(tmphalf[i]);
134
- tmp[i + 12] = LM_GGML_FP16_TO_FP32(tmphalf[i]);
58
+ tmp[i] = LM_GGML_CPU_FP16_TO_FP32(tmphalf[i]);
59
+ tmp[i + 4] = LM_GGML_CPU_FP16_TO_FP32(tmphalf[i]);
60
+ tmp[i + 8] = LM_GGML_CPU_FP16_TO_FP32(tmphalf[i]);
61
+ tmp[i + 12] = LM_GGML_CPU_FP16_TO_FP32(tmphalf[i]);
135
62
  }
136
63
 
137
64
  return _mm512_loadu_ps(tmp);
@@ -141,7 +68,7 @@ static inline __m256 __avx_f32cx8_load(lm_ggml_fp16_t *x) {
141
68
  float tmp[8];
142
69
 
143
70
  for (int i = 0; i < 8; i++) {
144
- tmp[i] = LM_GGML_FP16_TO_FP32(x[i]);
71
+ tmp[i] = LM_GGML_CPU_FP16_TO_FP32(x[i]);
145
72
  }
146
73
 
147
74
  return _mm256_loadu_ps(tmp);
@@ -150,8 +77,8 @@ static inline __m256 __avx_repeat_f32cx8_load(lm_ggml_fp16_t *x) {
150
77
  float tmp[8];
151
78
 
152
79
  for (int i = 0; i < 4; i++) {
153
- tmp[i] = LM_GGML_FP16_TO_FP32(x[i]);
154
- tmp[i + 4] = LM_GGML_FP16_TO_FP32(x[i]);
80
+ tmp[i] = LM_GGML_CPU_FP16_TO_FP32(x[i]);
81
+ tmp[i + 4] = LM_GGML_CPU_FP16_TO_FP32(x[i]);
155
82
  }
156
83
 
157
84
  return _mm256_loadu_ps(tmp);
@@ -162,7 +89,7 @@ static inline __m256 __avx_rearranged_f32cx8_load(lm_ggml_fp16_t *x, __m128i arr
162
89
 
163
90
  _mm_storeu_si128((__m128i*)tmphalf, _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *) x), arrangeMask));
164
91
  for (int i = 0; i < 8; i++) {
165
- tmp[i] = LM_GGML_FP16_TO_FP32(tmphalf[i]);
92
+ tmp[i] = LM_GGML_CPU_FP16_TO_FP32(tmphalf[i]);
166
93
  }
167
94
 
168
95
  return _mm256_loadu_ps(tmp);
@@ -178,6 +105,12 @@ static inline __m256 __avx_rearranged_f32cx8_load(lm_ggml_fp16_t *x, __m128i arr
178
105
  #endif
179
106
  #endif
180
107
 
108
+ static inline int nearest_int(float fval) {
109
+ assert(fabsf(fval) <= 4194303.f);
110
+ float val = fval + 12582912.f;
111
+ int i; memcpy(&i, &val, sizeof(int));
112
+ return (i & 0x007fffff) - 0x00400000;
113
+ }
181
114
 
182
115
  #if defined(__AVX2__) || defined(__AVX512F__)
183
116
  #if defined(__AVX512F__)
@@ -242,188 +175,14 @@ static inline __m256i mul_sum_i8_pairs_acc_int32x8(const __m256i acc, const __m2
242
175
  }
243
176
  #endif
244
177
 
245
- static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
246
-
247
- static void lm_ggml_quantize_mat_q8_0_4x4(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT vy, int64_t k) {
248
- assert(QK8_0 == 32);
249
- assert(k % QK8_0 == 0);
250
- const int nb = k / QK8_0;
251
-
252
- block_q8_0x4 * LM_GGML_RESTRICT y = (block_q8_0x4 *) vy;
253
-
254
- #if defined(__ARM_NEON)
255
- float32x4_t srcv[4][8];
256
- float id[4];
257
-
258
- for (int i = 0; i < nb; i++) {
259
- float32x4_t asrcv[8];
260
- float32x4_t amaxv[8];
261
-
262
- for (int row_iter = 0; row_iter < 4; row_iter++) {
263
- for (int j = 0; j < 8; j++) srcv[row_iter][j] = vld1q_f32(x + row_iter * k + i * 32 + 4 * j);
264
- for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[row_iter][j]);
265
-
266
- for (int j = 0; j < 4; j++) amaxv[2 * j] = vmaxq_f32(asrcv[2 * j], asrcv[2 * j + 1]);
267
- for (int j = 0; j < 2; j++) amaxv[4 * j] = vmaxq_f32(amaxv[4 * j], amaxv[4 * j + 2]);
268
- for (int j = 0; j < 1; j++) amaxv[8 * j] = vmaxq_f32(amaxv[8 * j], amaxv[8 * j + 4]);
269
-
270
- const float amax = vmaxvq_f32(amaxv[0]);
271
-
272
- const float d = amax / ((1 << 7) - 1);
273
- id[row_iter] = d ? 1.0f / d : 0.0f;
274
-
275
- y[i].d[row_iter] = LM_GGML_FP32_TO_FP16(d);
276
- }
277
-
278
- for (int j = 0; j < 8; j++) {
279
- float32x4_t v = vmulq_n_f32(srcv[0][j], id[0]);
280
- int32x4_t vi = vcvtnq_s32_f32(v);
281
- y[i].qs[16 * j + 0] = vgetq_lane_s32(vi, 0);
282
- y[i].qs[16 * j + 1] = vgetq_lane_s32(vi, 1);
283
- y[i].qs[16 * j + 2] = vgetq_lane_s32(vi, 2);
284
- y[i].qs[16 * j + 3] = vgetq_lane_s32(vi, 3);
285
-
286
- v = vmulq_n_f32(srcv[1][j], id[1]);
287
- vi = vcvtnq_s32_f32(v);
288
- y[i].qs[16 * j + 4] = vgetq_lane_s32(vi, 0);
289
- y[i].qs[16 * j + 5] = vgetq_lane_s32(vi, 1);
290
- y[i].qs[16 * j + 6] = vgetq_lane_s32(vi, 2);
291
- y[i].qs[16 * j + 7] = vgetq_lane_s32(vi, 3);
292
-
293
- v = vmulq_n_f32(srcv[2][j], id[2]);
294
- vi = vcvtnq_s32_f32(v);
295
- y[i].qs[16 * j + 8] = vgetq_lane_s32(vi, 0);
296
- y[i].qs[16 * j + 9] = vgetq_lane_s32(vi, 1);
297
- y[i].qs[16 * j + 10] = vgetq_lane_s32(vi, 2);
298
- y[i].qs[16 * j + 11] = vgetq_lane_s32(vi, 3);
299
-
300
- v = vmulq_n_f32(srcv[3][j], id[3]);
301
- vi = vcvtnq_s32_f32(v);
302
- y[i].qs[16 * j + 12] = vgetq_lane_s32(vi, 0);
303
- y[i].qs[16 * j + 13] = vgetq_lane_s32(vi, 1);
304
- y[i].qs[16 * j + 14] = vgetq_lane_s32(vi, 2);
305
- y[i].qs[16 * j + 15] = vgetq_lane_s32(vi, 3);
306
- }
307
- }
308
- #else
309
- // scalar
310
- const int blck_size_interleave = 4;
311
- float srcv[4][QK8_0];
312
- float id[4];
313
-
314
- for (int i = 0; i < nb; i++) {
315
- for (int row_iter = 0; row_iter < 4; row_iter++) {
316
- float amax = 0.0f; // absolute max
317
-
318
- for (int j = 0; j < QK8_0; j++) {
319
- srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
320
- amax = MAX(amax, fabsf(srcv[row_iter][j]));
321
- }
322
-
323
- const float d = amax / ((1 << 7) - 1);
324
- id[row_iter] = d ? 1.0f / d : 0.0f;
325
-
326
- y[i].d[row_iter] = LM_GGML_FP32_TO_FP16(d);
327
- }
328
-
329
- for (int j = 0; j < QK8_0 * 4; j++) {
330
- int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
331
- int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
332
- src_offset += (j % blck_size_interleave);
333
-
334
- float x0 = srcv[src_id][src_offset] * id[src_id];
335
- y[i].qs[j] = roundf(x0);
336
- }
337
- }
338
- #endif
339
- }
340
-
341
- static void lm_ggml_quantize_mat_q8_0_4x8(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT vy, int64_t k) {
178
+ void lm_ggml_quantize_mat_q8_0_4x8(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT vy, int64_t k) {
342
179
  assert(QK8_0 == 32);
343
180
  assert(k % QK8_0 == 0);
344
181
  const int nb = k / QK8_0;
345
182
 
346
183
  block_q8_0x4 * LM_GGML_RESTRICT y = (block_q8_0x4 *) vy;
347
184
 
348
- #if defined(__ARM_NEON)
349
- float32x4_t srcv[4][8];
350
- float id[4];
351
-
352
- for (int i = 0; i < nb; i++) {
353
- float32x4_t asrcv[8];
354
- float32x4_t amaxv[8];
355
-
356
- for (int row_iter = 0; row_iter < 4; row_iter++) {
357
- for (int j = 0; j < 8; j++) srcv[row_iter][j] = vld1q_f32(x + row_iter * k + i * 32 + 4 * j);
358
- for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[row_iter][j]);
359
-
360
- for (int j = 0; j < 4; j++) amaxv[2 * j] = vmaxq_f32(asrcv[2 * j], asrcv[2 * j + 1]);
361
- for (int j = 0; j < 2; j++) amaxv[4 * j] = vmaxq_f32(amaxv[4 * j], amaxv[4 * j + 2]);
362
- for (int j = 0; j < 1; j++) amaxv[8 * j] = vmaxq_f32(amaxv[8 * j], amaxv[8 * j + 4]);
363
-
364
- const float amax = vmaxvq_f32(amaxv[0]);
365
-
366
- const float d = amax / ((1 << 7) - 1);
367
- id[row_iter] = d ? 1.0f / d : 0.0f;
368
-
369
- y[i].d[row_iter] = LM_GGML_FP32_TO_FP16(d);
370
- }
371
-
372
- for (int j = 0; j < 4; j++) {
373
- float32x4_t v = vmulq_n_f32(srcv[0][2 * j], id[0]);
374
- int32x4_t vi = vcvtnq_s32_f32(v);
375
- y[i].qs[32 * j + 0] = vgetq_lane_s32(vi, 0);
376
- y[i].qs[32 * j + 1] = vgetq_lane_s32(vi, 1);
377
- y[i].qs[32 * j + 2] = vgetq_lane_s32(vi, 2);
378
- y[i].qs[32 * j + 3] = vgetq_lane_s32(vi, 3);
379
- v = vmulq_n_f32(srcv[0][2 * j + 1], id[0]);
380
- vi = vcvtnq_s32_f32(v);
381
- y[i].qs[32 * j + 4] = vgetq_lane_s32(vi, 0);
382
- y[i].qs[32 * j + 5] = vgetq_lane_s32(vi, 1);
383
- y[i].qs[32 * j + 6] = vgetq_lane_s32(vi, 2);
384
- y[i].qs[32 * j + 7] = vgetq_lane_s32(vi, 3);
385
-
386
- v = vmulq_n_f32(srcv[1][2 * j], id[1]);
387
- vi = vcvtnq_s32_f32(v);
388
- y[i].qs[32 * j + 8] = vgetq_lane_s32(vi, 0);
389
- y[i].qs[32 * j + 9] = vgetq_lane_s32(vi, 1);
390
- y[i].qs[32 * j + 10] = vgetq_lane_s32(vi, 2);
391
- y[i].qs[32 * j + 11] = vgetq_lane_s32(vi, 3);
392
- v = vmulq_n_f32(srcv[1][2 * j + 1], id[1]);
393
- vi = vcvtnq_s32_f32(v);
394
- y[i].qs[32 * j + 12] = vgetq_lane_s32(vi, 0);
395
- y[i].qs[32 * j + 13] = vgetq_lane_s32(vi, 1);
396
- y[i].qs[32 * j + 14] = vgetq_lane_s32(vi, 2);
397
- y[i].qs[32 * j + 15] = vgetq_lane_s32(vi, 3);
398
-
399
- v = vmulq_n_f32(srcv[2][2 * j], id[2]);
400
- vi = vcvtnq_s32_f32(v);
401
- y[i].qs[32 * j + 16] = vgetq_lane_s32(vi, 0);
402
- y[i].qs[32 * j + 17] = vgetq_lane_s32(vi, 1);
403
- y[i].qs[32 * j + 18] = vgetq_lane_s32(vi, 2);
404
- y[i].qs[32 * j + 19] = vgetq_lane_s32(vi, 3);
405
- v = vmulq_n_f32(srcv[2][2 * j + 1], id[2]);
406
- vi = vcvtnq_s32_f32(v);
407
- y[i].qs[32 * j + 20] = vgetq_lane_s32(vi, 0);
408
- y[i].qs[32 * j + 21] = vgetq_lane_s32(vi, 1);
409
- y[i].qs[32 * j + 22] = vgetq_lane_s32(vi, 2);
410
- y[i].qs[32 * j + 23] = vgetq_lane_s32(vi, 3);
411
-
412
- v = vmulq_n_f32(srcv[3][2 * j], id[3]);
413
- vi = vcvtnq_s32_f32(v);
414
- y[i].qs[32 * j + 24] = vgetq_lane_s32(vi, 0);
415
- y[i].qs[32 * j + 25] = vgetq_lane_s32(vi, 1);
416
- y[i].qs[32 * j + 26] = vgetq_lane_s32(vi, 2);
417
- y[i].qs[32 * j + 27] = vgetq_lane_s32(vi, 3);
418
- v = vmulq_n_f32(srcv[3][2 * j + 1], id[3]);
419
- vi = vcvtnq_s32_f32(v);
420
- y[i].qs[32 * j + 28] = vgetq_lane_s32(vi, 0);
421
- y[i].qs[32 * j + 29] = vgetq_lane_s32(vi, 1);
422
- y[i].qs[32 * j + 30] = vgetq_lane_s32(vi, 2);
423
- y[i].qs[32 * j + 31] = vgetq_lane_s32(vi, 3);
424
- }
425
- }
426
- #elif defined(__AVX2__) || defined(__AVX__)
185
+ #if defined(__AVX2__) || defined(__AVX__)
427
186
  float id[4];
428
187
  __m256 srcv[4][4];
429
188
  __m256 idvec[4];
@@ -453,7 +212,7 @@ static void lm_ggml_quantize_mat_q8_0_4x8(const float * LM_GGML_RESTRICT x, void
453
212
  id[row_iter] = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f; //d ? 1.0f / d : 0.0f;
454
213
 
455
214
  // Store the scale for the individual block
456
- y[i].d[row_iter] = LM_GGML_FP32_TO_FP16(d);
215
+ y[i].d[row_iter] = LM_GGML_CPU_FP32_TO_FP16(d);
457
216
 
458
217
  // Store the values in blocks of eight values - Aim is to use these later for block interleaving
459
218
  srcv[row_iter][0] = v0;
@@ -520,6 +279,7 @@ static void lm_ggml_quantize_mat_q8_0_4x8(const float * LM_GGML_RESTRICT x, void
520
279
  #endif
521
280
  }
522
281
  }
282
+
523
283
  #else
524
284
  // scalar
525
285
  const int blck_size_interleave = 8;
@@ -538,7 +298,7 @@ static void lm_ggml_quantize_mat_q8_0_4x8(const float * LM_GGML_RESTRICT x, void
538
298
  const float d = amax / ((1 << 7) - 1);
539
299
  id[row_iter] = d ? 1.0f / d : 0.0f;
540
300
 
541
- y[i].d[row_iter] = LM_GGML_FP32_TO_FP16(d);
301
+ y[i].d[row_iter] = LM_GGML_CPU_FP32_TO_FP16(d);
542
302
  }
543
303
 
544
304
  for (int j = 0; j < QK8_0 * 4; j++) {
@@ -553,7 +313,7 @@ static void lm_ggml_quantize_mat_q8_0_4x8(const float * LM_GGML_RESTRICT x, void
553
313
  #endif
554
314
  }
555
315
 
556
- static void lm_ggml_quantize_mat_q8_K_4x8(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT vy, int64_t k) {
316
+ void lm_ggml_quantize_mat_q8_K_4x8(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT vy, int64_t k) {
557
317
  assert(QK_K == 256);
558
318
  assert(k % QK_K == 0);
559
319
  const int nb = k / QK_K;
@@ -817,203 +577,7 @@ static void lm_ggml_quantize_mat_q8_K_4x8(const float * LM_GGML_RESTRICT x, void
817
577
  #endif
818
578
  }
819
579
 
820
- template <int64_t INTER_SIZE, lm_ggml_type PARAM_TYPE>
821
- void lm_ggml_quantize_mat_t(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row);
822
-
823
- template <> void lm_ggml_quantize_mat_t<4, LM_GGML_TYPE_Q8_0>(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
824
- assert(nrow == 4);
825
- UNUSED(nrow);
826
- lm_ggml_quantize_mat_q8_0_4x4(x, vy, n_per_row);
827
- }
828
-
829
- template <> void lm_ggml_quantize_mat_t<8, LM_GGML_TYPE_Q8_0>(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
830
- assert(nrow == 4);
831
- UNUSED(nrow);
832
- lm_ggml_quantize_mat_q8_0_4x8(x, vy, n_per_row);
833
- }
834
-
835
- template <> void lm_ggml_quantize_mat_t<8, LM_GGML_TYPE_Q8_K>(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
836
- assert(nrow == 4);
837
- UNUSED(nrow);
838
- lm_ggml_quantize_mat_q8_K_4x8(x, vy, n_per_row);
839
- }
840
-
841
- static void lm_ggml_gemv_q4_0_4x4_q8_0(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc) {
842
- const int qk = QK8_0;
843
- const int nb = n / qk;
844
- const int ncols_interleaved = 4;
845
- const int blocklen = 4;
846
-
847
- assert (n % qk == 0);
848
- assert (nc % ncols_interleaved == 0);
849
-
850
- UNUSED(s);
851
- UNUSED(bs);
852
- UNUSED(vx);
853
- UNUSED(vy);
854
- UNUSED(nr);
855
- UNUSED(nc);
856
- UNUSED(nb);
857
- UNUSED(ncols_interleaved);
858
- UNUSED(blocklen);
859
-
860
- #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
861
- if (lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_dotprod()) {
862
- const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx;
863
-
864
- for (int c = 0; c < nc; c += ncols_interleaved) {
865
- const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
866
- float32x4_t acc = vdupq_n_f32(0);
867
- for (int b = 0; b < nb; b++) {
868
- int8x16_t b0 = vld1q_s8((const int8_t *) b_ptr->qs);
869
- int8x16_t b1 = vld1q_s8((const int8_t *) b_ptr->qs + 16);
870
- int8x16_t b2 = vld1q_s8((const int8_t *) b_ptr->qs + 32);
871
- int8x16_t b3 = vld1q_s8((const int8_t *) b_ptr->qs + 48);
872
- float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d);
873
-
874
- int8x16_t a0 = vld1q_s8(a_ptr->qs);
875
- int8x16_t a1 = vld1q_s8(a_ptr->qs + qk/2);
876
- float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d);
877
-
878
- int32x4_t ret = vdupq_n_s32(0);
879
-
880
- ret = vdotq_laneq_s32(ret, b0 << 4, a0, 0);
881
- ret = vdotq_laneq_s32(ret, b1 << 4, a0, 1);
882
- ret = vdotq_laneq_s32(ret, b2 << 4, a0, 2);
883
- ret = vdotq_laneq_s32(ret, b3 << 4, a0, 3);
884
-
885
- ret = vdotq_laneq_s32(ret, b0 & 0xf0U, a1, 0);
886
- ret = vdotq_laneq_s32(ret, b1 & 0xf0U, a1, 1);
887
- ret = vdotq_laneq_s32(ret, b2 & 0xf0U, a1, 2);
888
- ret = vdotq_laneq_s32(ret, b3 & 0xf0U, a1, 3);
889
-
890
- acc = vfmaq_f32(acc, vcvtq_n_f32_s32(ret, 4),
891
- vmulq_f32(vcvt_f32_f16(ad), vcvt_f32_f16(bd)));
892
- a_ptr++;
893
- b_ptr++;
894
- }
895
- vst1q_f32(s, acc);
896
- s += ncols_interleaved;
897
- }
898
- return;
899
- }
900
- #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
901
- float sumf[4];
902
- int sumi;
903
-
904
- const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
905
- for (int x = 0; x < nc / ncols_interleaved; x++) {
906
- const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
907
-
908
- for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
909
- for (int l = 0; l < nb; l++) {
910
- for (int k = 0; k < (qk / (2 * blocklen)); k++) {
911
- for (int j = 0; j < ncols_interleaved; j++) {
912
- sumi = 0;
913
- for (int i = 0; i < blocklen; ++i) {
914
- const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
915
- const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
916
- sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
917
- }
918
- sumf[j] += sumi * LM_GGML_FP16_TO_FP32(b_ptr[l].d[j]) * LM_GGML_FP16_TO_FP32(a_ptr[l].d);
919
- }
920
- }
921
- }
922
- for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
923
- }
924
- }
925
-
926
- static void lm_ggml_gemv_q4_0_4x8_q8_0(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc) {
927
- const int qk = QK8_0;
928
- const int nb = n / qk;
929
- const int ncols_interleaved = 4;
930
- const int blocklen = 8;
931
-
932
- assert (n % qk == 0);
933
- assert (nc % ncols_interleaved == 0);
934
-
935
- UNUSED(s);
936
- UNUSED(bs);
937
- UNUSED(vx);
938
- UNUSED(vy);
939
- UNUSED(nr);
940
- UNUSED(nc);
941
- UNUSED(nb);
942
- UNUSED(ncols_interleaved);
943
- UNUSED(blocklen);
944
-
945
- #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
946
- if (lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_dotprod()) {
947
- const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx;
948
-
949
- for (int c = 0; c < nc; c += ncols_interleaved) {
950
- const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
951
- float32x4_t acc = vdupq_n_f32(0);
952
- for (int b = 0; b < nb; b++) {
953
- int8x16_t b0 = vld1q_s8((const int8_t *) b_ptr->qs);
954
- int8x16_t b1 = vld1q_s8((const int8_t *) b_ptr->qs + 16);
955
- int8x16_t b2 = vld1q_s8((const int8_t *) b_ptr->qs + 32);
956
- int8x16_t b3 = vld1q_s8((const int8_t *) b_ptr->qs + 48);
957
- float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d);
958
-
959
- int8x16_t a0 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs);
960
- int8x16_t a1 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 1);
961
- int8x16_t a2 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 2);
962
- int8x16_t a3 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 3);
963
- float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d);
964
-
965
- int32x4_t ret0 = vdupq_n_s32(0);
966
- int32x4_t ret1 = vdupq_n_s32(0);
967
-
968
- ret0 = vdotq_s32(ret0, b0 << 4, a0);
969
- ret1 = vdotq_s32(ret1, b1 << 4, a0);
970
- ret0 = vdotq_s32(ret0, b2 << 4, a1);
971
- ret1 = vdotq_s32(ret1, b3 << 4, a1);
972
-
973
- ret0 = vdotq_s32(ret0, b0 & 0xf0U, a2);
974
- ret1 = vdotq_s32(ret1, b1 & 0xf0U, a2);
975
- ret0 = vdotq_s32(ret0, b2 & 0xf0U, a3);
976
- ret1 = vdotq_s32(ret1, b3 & 0xf0U, a3);
977
-
978
- int32x4_t ret = vpaddq_s32(ret0, ret1);
979
-
980
- acc = vfmaq_f32(acc, vcvtq_n_f32_s32(ret, 4),
981
- vmulq_f32(vcvt_f32_f16(ad), vcvt_f32_f16(bd)));
982
- a_ptr++;
983
- b_ptr++;
984
- }
985
- vst1q_f32(s, acc);
986
- s += ncols_interleaved;
987
- }
988
- return;
989
- }
990
- #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
991
- float sumf[4];
992
- int sumi;
993
-
994
- const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
995
- for (int x = 0; x < nc / ncols_interleaved; x++) {
996
- const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
997
-
998
- for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
999
- for (int l = 0; l < nb; l++) {
1000
- for (int k = 0; k < (qk / (2 * blocklen)); k++) {
1001
- for (int j = 0; j < ncols_interleaved; j++) {
1002
- sumi = 0;
1003
- for (int i = 0; i < blocklen; ++i) {
1004
- const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
1005
- const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
1006
- sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
1007
- }
1008
- sumf[j] += sumi * LM_GGML_FP16_TO_FP32(b_ptr[l].d[j]) * LM_GGML_FP16_TO_FP32(a_ptr[l].d);
1009
- }
1010
- }
1011
- }
1012
- for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
1013
- }
1014
- }
1015
-
1016
- static void lm_ggml_gemv_q4_0_8x8_q8_0(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc) {
580
+ void lm_ggml_gemv_q4_0_8x8_q8_0(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc) {
1017
581
  const int qk = QK8_0;
1018
582
  const int nb = n / qk;
1019
583
  const int ncols_interleaved = 8;
@@ -1032,75 +596,7 @@ static void lm_ggml_gemv_q4_0_8x8_q8_0(int n, float * LM_GGML_RESTRICT s, size_t
1032
596
  UNUSED(ncols_interleaved);
1033
597
  UNUSED(blocklen);
1034
598
 
1035
- #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
1036
- #if defined(__ARM_FEATURE_SVE)
1037
- if (lm_ggml_cpu_has_sve() && lm_ggml_cpu_get_sve_cnt() == QK8_0) {
1038
- const void * b_ptr = vx;
1039
- const void * a_ptr = vy;
1040
- float * res_ptr = s;
1041
-
1042
- __asm__ __volatile__(
1043
- "ptrue p0.b\n"
1044
- "add %x[b_ptr], %x[b_ptr], #0x10\n"
1045
- "1:" // Column loop
1046
- "add x22, %x[a_ptr], #0x2\n"
1047
- "mov z31.b, #0x0\n"
1048
- "mov x21, %x[nb]\n"
1049
- "2:" // Block loop
1050
- "ld1b { z30.b }, p0/Z, [%x[b_ptr]]\n"
1051
- "ld1b { z29.b }, p0/Z, [%x[b_ptr], #1, MUL VL]\n"
1052
- "mov z28.s, #0x0\n"
1053
- "mov z27.s, #0x0\n"
1054
- "ld1rd { z26.d }, p0/Z, [x22]\n"
1055
- "ld1b { z25.b }, p0/Z, [%x[b_ptr], #2, MUL VL]\n"
1056
- "sub x20, x22, #0x2\n"
1057
- "sub x21, x21, #0x1\n"
1058
- "ld1b { z24.b }, p0/Z, [%x[b_ptr], #3, MUL VL]\n"
1059
- "ld1rd { z23.d }, p0/Z, [x22, #8]\n"
1060
- "lsl z22.b, z30.b, #0x4\n"
1061
- "lsl z16.b, z29.b, #0x4\n"
1062
- "and z30.b, z30.b, #0xf0\n"
1063
- "and z29.b, z29.b, #0xf0\n"
1064
- "ld1rd { z21.d }, p0/Z, [x22, #16]\n"
1065
- "ld1rd { z20.d }, p0/Z, [x22, #24]\n"
1066
- "lsl z19.b, z25.b, #0x4\n"
1067
- "and z25.b, z25.b, #0xf0\n"
1068
- "ld1rh { z17.h }, p0/Z, [x20]\n"
1069
- "ld1h { z18.s }, p0/Z, [%x[b_ptr], #-1, MUL VL]\n"
1070
- "sdot z28.s, z22.b, z26.b\n"
1071
- "sdot z27.s, z16.b, z26.b\n"
1072
- "lsl z16.b, z24.b, #0x4\n"
1073
- "add x22, x22, #0x22\n"
1074
- "and z24.b, z24.b, #0xf0\n"
1075
- "add %x[b_ptr], %x[b_ptr], #0x90\n"
1076
- "fcvt z17.s, p0/m, z17.h\n"
1077
- "fcvt z18.s, p0/m, z18.h\n"
1078
- "sdot z28.s, z19.b, z23.b\n"
1079
- "sdot z27.s, z16.b, z23.b\n"
1080
- "fmul z18.s, z18.s, z17.s\n"
1081
- "sdot z28.s, z30.b, z21.b\n"
1082
- "sdot z27.s, z29.b, z21.b\n"
1083
- "sdot z28.s, z25.b, z20.b\n"
1084
- "sdot z27.s, z24.b, z20.b\n"
1085
- "uzp1 z17.s, z28.s, z27.s\n"
1086
- "uzp2 z16.s, z28.s, z27.s\n"
1087
- "add z17.s, z17.s, z16.s\n"
1088
- "asr z17.s, z17.s, #0x4\n"
1089
- "scvtf z17.s, p0/m, z17.s\n"
1090
- "fmla z31.s, p0/M, z17.s, z18.s\n"
1091
- "cbnz x21, 2b\n"
1092
- "sub %x[nc], %x[nc], #0x8\n"
1093
- "st1w { z31.s }, p0, [%x[res_ptr]]\n"
1094
- "add %x[res_ptr], %x[res_ptr], #0x20\n"
1095
- "cbnz %x[nc], 1b\n"
1096
- : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc)
1097
- : [a_ptr] "r" (a_ptr), [nb] "r" (nb)
1098
- : "memory", "p0", "x20", "x21", "x22", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
1099
- );
1100
- return;
1101
- }
1102
- #endif // #if defined(__ARM_FEATURE_SVE)
1103
- #elif defined(__AVX2__)
599
+ #if defined(__AVX2__)
1104
600
  // Lookup table to convert signed nibbles to signed bytes
1105
601
  __m256i signextendlut = _mm256_castsi128_si256(_mm_set_epi8(-1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0));
1106
602
  signextendlut = _mm256_permute2f128_si256(signextendlut, signextendlut, 0);
@@ -1152,7 +648,7 @@ static void lm_ggml_gemv_q4_0_8x8_q8_0(int n, float * LM_GGML_RESTRICT s, size_t
1152
648
  const __m256 col_scale_f32 = LM_GGML_F32Cx8_REARRANGE_LOAD(b_ptr[b].d, changemask);
1153
649
 
1154
650
  // Load and convert to FP32 scale from block_q8_0
1155
- const __m256 row_scale_f32 = _mm256_set1_ps(LM_GGML_FP16_TO_FP32(a_ptr[b].d));
651
+ const __m256 row_scale_f32 = _mm256_set1_ps(LM_GGML_CPU_FP16_TO_FP32(a_ptr[b].d));
1156
652
 
1157
653
  // Load the block values in block_q8_0 in batches of 16 bytes and replicate the same across 256 bit vector
1158
654
  __m256i lhs_vec_0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)a_ptr[b].qs));
@@ -1191,74 +687,8 @@ static void lm_ggml_gemv_q4_0_8x8_q8_0(int n, float * LM_GGML_RESTRICT s, size_t
1191
687
  }
1192
688
  }
1193
689
  return;
1194
- #elif defined(__riscv_v_intrinsic)
1195
- if (__riscv_vlenb() >= QK4_0) {
1196
- const size_t vl = QK4_0;
1197
-
1198
- const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
1199
- for (int x = 0; x < nc / ncols_interleaved; x++) {
1200
- const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
1201
690
 
1202
- vfloat32m1_t sumf = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
1203
- for (int l = 0; l < nb; l++) {
1204
- const int64_t a0 = *(const int64_t *)&a_ptr[l].qs[0];
1205
- const int64_t a1 = *(const int64_t *)&a_ptr[l].qs[8];
1206
- const int64_t a2 = *(const int64_t *)&a_ptr[l].qs[16];
1207
- const int64_t a3 = *(const int64_t *)&a_ptr[l].qs[24];
1208
- __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment
1209
- const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a0, vl / 4));
1210
- const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a1, vl / 4));
1211
- const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a2, vl / 4));
1212
- const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a3, vl / 4));
1213
-
1214
- const vint8m4_t rhs_raw_vec = __riscv_vle8_v_i8m4((const int8_t *)b_ptr[l].qs, vl * 4);
1215
- const vint8m4_t rhs_vec_lo = __riscv_vsra_vx_i8m4(__riscv_vsll_vx_i8m4(rhs_raw_vec, 4, vl * 4), 4, vl * 4);
1216
- const vint8m4_t rhs_vec_hi = __riscv_vsra_vx_i8m4(rhs_raw_vec, 4, vl * 4);
1217
- const vint8m2_t rhs_vec_lo_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 0);
1218
- const vint8m2_t rhs_vec_lo_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 1);
1219
- const vint8m2_t rhs_vec_hi_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 0);
1220
- const vint8m2_t rhs_vec_hi_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 1);
1221
-
1222
- const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
1223
- const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
1224
- const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
1225
- const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
1226
-
1227
- const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_hi_m));
1228
- const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
1229
- const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
1230
- const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
1231
- const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
1232
- const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
1233
- const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
1234
- const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
1235
- const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
1236
- const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
1237
- const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
1238
- const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
1239
- const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
1240
-
1241
- // vector version needs Zvfhmin extension
1242
- const float a_scale = LM_GGML_FP16_TO_FP32(a_ptr[l].d);
1243
- const float b_scales[8] = {
1244
- LM_GGML_FP16_TO_FP32(b_ptr[l].d[0]),
1245
- LM_GGML_FP16_TO_FP32(b_ptr[l].d[1]),
1246
- LM_GGML_FP16_TO_FP32(b_ptr[l].d[2]),
1247
- LM_GGML_FP16_TO_FP32(b_ptr[l].d[3]),
1248
- LM_GGML_FP16_TO_FP32(b_ptr[l].d[4]),
1249
- LM_GGML_FP16_TO_FP32(b_ptr[l].d[5]),
1250
- LM_GGML_FP16_TO_FP32(b_ptr[l].d[6]),
1251
- LM_GGML_FP16_TO_FP32(b_ptr[l].d[7])
1252
- };
1253
- const vfloat32m1_t b_scales_vec = __riscv_vle32_v_f32m1(b_scales, vl / 4);
1254
- const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scale, vl / 4);
1255
- sumf = __riscv_vfmacc_vv_f32m1(sumf, tmp1, b_scales_vec, vl / 4);
1256
- }
1257
- __riscv_vse32_v_f32m1(s + x * ncols_interleaved, sumf, vl / 4);
1258
- }
1259
- return;
1260
- }
1261
- #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
691
+ #endif
1262
692
  {
1263
693
  float sumf[8];
1264
694
  int sumi;
@@ -1277,7 +707,7 @@ static void lm_ggml_gemv_q4_0_8x8_q8_0(int n, float * LM_GGML_RESTRICT s, size_t
1277
707
  const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
1278
708
  sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
1279
709
  }
1280
- sumf[j] += sumi * LM_GGML_FP16_TO_FP32(b_ptr[l].d[j]) * LM_GGML_FP16_TO_FP32(a_ptr[l].d);
710
+ sumf[j] += sumi * LM_GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * LM_GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
1281
711
  }
1282
712
  }
1283
713
  }
@@ -1286,7 +716,7 @@ static void lm_ggml_gemv_q4_0_8x8_q8_0(int n, float * LM_GGML_RESTRICT s, size_t
1286
716
  }
1287
717
  }
1288
718
 
1289
- static void lm_ggml_gemv_q4_K_8x8_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc) {
719
+ void lm_ggml_gemv_q4_K_8x8_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc) {
1290
720
  const int qk = QK_K;
1291
721
  const int nb = n / qk;
1292
722
  const int ncols_interleaved = 8;
@@ -1543,13 +973,13 @@ static void lm_ggml_gemv_q4_K_8x8_q8_K(int n, float * LM_GGML_RESTRICT s, size_t
1543
973
  sumi2 = sumi2 * scales_1[j];
1544
974
  sumi += sumi1 + sumi2;
1545
975
  }
1546
- sumf[j] += sumi * LM_GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
976
+ sumf[j] += sumi * LM_GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
1547
977
  }
1548
978
  }
1549
979
  for (int sb = 0; sb < 8; sb++) {
1550
980
  uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
1551
981
  for (int j = 0; j < ncols_interleaved; j++) {
1552
- sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * LM_GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
982
+ sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * LM_GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
1553
983
  }
1554
984
  }
1555
985
  }
@@ -1560,14 +990,14 @@ static void lm_ggml_gemv_q4_K_8x8_q8_K(int n, float * LM_GGML_RESTRICT s, size_t
1560
990
  #endif
1561
991
  }
1562
992
 
1563
-
1564
- static void lm_ggml_gemv_iq4_nl_4x4_q8_0(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc) {
993
+ void lm_ggml_gemm_q4_0_8x8_q8_0(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc) {
1565
994
  const int qk = QK8_0;
1566
995
  const int nb = n / qk;
1567
- const int ncols_interleaved = 4;
1568
- const int blocklen = 4;
996
+ const int ncols_interleaved = 8;
997
+ const int blocklen = 8;
1569
998
 
1570
999
  assert (n % qk == 0);
1000
+ assert (nr % 4 == 0);
1571
1001
  assert (nc % ncols_interleaved == 0);
1572
1002
 
1573
1003
  UNUSED(s);
@@ -1580,1529 +1010,49 @@ static void lm_ggml_gemv_iq4_nl_4x4_q8_0(int n, float * LM_GGML_RESTRICT s, size
1580
1010
  UNUSED(ncols_interleaved);
1581
1011
  UNUSED(blocklen);
1582
1012
 
1583
- #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
1584
- if (lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_dotprod()) {
1585
- const int8x16_t kvalues = vld1q_s8(kvalues_iq4nl);
1586
- const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
1587
- float * res_ptr = s;
1013
+ #if defined(__AVX2__) || defined(__AVX512F__)
1014
+ {
1015
+ const block_q4_0x8 * b_ptr_start = (const block_q4_0x8 *)vx;
1016
+ const block_q8_0x4 * a_ptr_start = (const block_q8_0x4 *)vy;
1017
+ int64_t b_nb = n / QK4_0;
1018
+ int64_t y = 0;
1019
+ // Mask to mask out nibbles from packed bytes
1020
+ const __m256i m4b = _mm256_set1_epi8(0x0F);
1021
+ const __m128i loadMask = _mm_blend_epi32(_mm_setzero_si128(), _mm_set1_epi32(0xFFFFFFFF), 3);
1022
+ // Lookup table to convert signed nibbles to signed bytes
1023
+ __m256i signextendlut = _mm256_castsi128_si256(_mm_set_epi8(-1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0));
1024
+ signextendlut = _mm256_permute2f128_si256(signextendlut, signextendlut, 0);
1025
+ // Permute mask used for easier vector processing at later stages
1026
+ __m256i requiredOrder = _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4);
1027
+ int64_t xstart = 0;
1028
+ int anr = nr - nr%16; // Used to align nr with boundary of 16
1029
+ #ifdef __AVX512F__
1030
+ int anc = nc - nc%16; // Used to align nc with boundary of 16
1031
+ // Mask to mask out nibbles from packed bytes expanded to 512 bit length
1032
+ const __m512i m4bexpanded = _mm512_set1_epi8(0x0F);
1033
+ // Lookup table to convert signed nibbles to signed bytes expanded to 512 bit length
1034
+ __m512i signextendlutexpanded = _mm512_inserti32x8(_mm512_castsi256_si512(signextendlut), signextendlut, 1);
1588
1035
 
1589
- for (int x = 0; x < nc / ncols_interleaved; x++) {
1590
- const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
1036
+ // Take group of four block_q8_0x4 structures at each pass of the loop and perform dot product operation
1037
+ for (; y < anr / 4; y += 4) {
1591
1038
 
1592
- float32x4_t sumf = vdupq_n_f32(0);
1593
- for (int l = 0; l < nb; l++) {
1594
- uint8x16_t b_0 = vld1q_u8(b_ptr[l].qs + 0);
1595
- uint8x16_t b_1 = vld1q_u8(b_ptr[l].qs + 16);
1596
- uint8x16_t b_2 = vld1q_u8(b_ptr[l].qs + 32);
1597
- uint8x16_t b_3 = vld1q_u8(b_ptr[l].qs + 48);
1598
-
1599
- int8x16_t b_0_hi = vqtbl1q_s8(kvalues, b_0 >> 4);
1600
- int8x16_t b_0_lo = vqtbl1q_s8(kvalues, b_0 & 0x0F);
1601
- int8x16_t b_1_hi = vqtbl1q_s8(kvalues, b_1 >> 4);
1602
- int8x16_t b_1_lo = vqtbl1q_s8(kvalues, b_1 & 0x0F);
1603
- int8x16_t b_2_hi = vqtbl1q_s8(kvalues, b_2 >> 4);
1604
- int8x16_t b_2_lo = vqtbl1q_s8(kvalues, b_2 & 0x0F);
1605
- int8x16_t b_3_hi = vqtbl1q_s8(kvalues, b_3 >> 4);
1606
- int8x16_t b_3_lo = vqtbl1q_s8(kvalues, b_3 & 0x0F);
1607
-
1608
- int8x16_t a_0 = vld1q_s8(a_ptr[l].qs + 0);
1609
- int8x16_t a_1 = vld1q_s8(a_ptr[l].qs + 16);
1610
-
1611
- int32x4_t sumi = vdupq_n_s32(0);
1612
- sumi = vdotq_laneq_s32(sumi, b_0_lo, a_0, 0);
1613
- sumi = vdotq_laneq_s32(sumi, b_0_hi, a_1, 0);
1614
- sumi = vdotq_laneq_s32(sumi, b_1_lo, a_0, 1);
1615
- sumi = vdotq_laneq_s32(sumi, b_1_hi, a_1, 1);
1616
- sumi = vdotq_laneq_s32(sumi, b_2_lo, a_0, 2);
1617
- sumi = vdotq_laneq_s32(sumi, b_2_hi, a_1, 2);
1618
- sumi = vdotq_laneq_s32(sumi, b_3_lo, a_0, 3);
1619
- sumi = vdotq_laneq_s32(sumi, b_3_hi, a_1, 3);
1620
-
1621
- float32x4_t a_d = vcvt_f32_f16(vld1_dup_f16((const float16_t *)&a_ptr[l].d));
1622
- float32x4_t b_d = vcvt_f32_f16(vld1_f16((const float16_t *)b_ptr[l].d));
1623
- float32x4_t d = a_d * b_d;
1624
-
1625
- sumf = vmlaq_f32(sumf, d, vcvtq_f32_s32(sumi));
1039
+ const block_q8_0x4 * a_ptrs[4];
1040
+
1041
+ a_ptrs[0] = a_ptr_start + (y * nb);
1042
+ for (int i = 0; i < 3; ++i) {
1043
+ a_ptrs[i + 1] = a_ptrs[i] + nb;
1626
1044
  }
1627
1045
 
1628
- vst1q_f32(res_ptr + x * 4, sumf);
1629
- }
1630
- return;
1631
- }
1632
- #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
1633
- {
1634
- float sumf[4];
1635
- int sumi;
1046
+ // Take group of two block_q4_0x8 structures at each pass of the loop and perform dot product operation
1047
+ for (int64_t x = 0; x < anc / 8; x += 2) {
1636
1048
 
1637
- const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
1638
- for (int x = 0; x < nc / ncols_interleaved; x++) {
1639
- const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
1049
+ const block_q4_0x8 * b_ptr_0 = b_ptr_start + ((x) * b_nb);
1050
+ const block_q4_0x8 * b_ptr_1 = b_ptr_start + ((x + 1) * b_nb);
1640
1051
 
1641
- for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
1642
- for (int l = 0; l < nb; l++) {
1643
- for (int k = 0; k < (qk / (2 * blocklen)); k++) {
1644
- for (int j = 0; j < ncols_interleaved; j++) {
1645
- sumi = 0;
1646
- for (int i = 0; i < blocklen; ++i) {
1647
- const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
1648
- const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
1649
- sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
1650
- }
1651
- sumf[j] += sumi * LM_GGML_FP16_TO_FP32(b_ptr[l].d[j]) * LM_GGML_FP16_TO_FP32(a_ptr[l].d);
1652
- }
1653
- }
1654
- }
1655
- for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
1656
- }
1657
- }
1658
- }
1659
-
1660
- static void lm_ggml_gemm_q4_0_4x4_q8_0(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc) {
1661
- const int qk = QK8_0;
1662
- const int nb = n / qk;
1663
- const int ncols_interleaved = 4;
1664
- const int blocklen = 4;
1665
-
1666
- assert (n % qk == 0);
1667
- assert (nr % 4 == 0);
1668
- assert (nc % ncols_interleaved == 0);
1669
-
1670
- UNUSED(s);
1671
- UNUSED(bs);
1672
- UNUSED(vx);
1673
- UNUSED(vy);
1674
- UNUSED(nr);
1675
- UNUSED(nc);
1676
- UNUSED(nb);
1677
- UNUSED(ncols_interleaved);
1678
- UNUSED(blocklen);
1679
-
1680
- #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
1681
- if (lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_dotprod()) {
1682
- const void * b_ptr = vx;
1683
- const void * a_ptr = vy;
1684
- float * res_ptr = s;
1685
- size_t res_stride = bs * sizeof(float);
1686
-
1687
- __asm__ __volatile__(
1688
- "mov x10, %x[nr]\n"
1689
- "mov x9, #0x88\n"
1690
- "cmp x10, #0x10\n"
1691
- "mul x9, %x[nb], x9\n"
1692
- "blt 4f\n"
1693
- "1:" // Row loop
1694
- "add x28, %x[b_ptr], #0x8\n"
1695
- "mov x27, %x[nc]\n"
1696
- "add x26, %x[res_ptr], %x[res_stride], LSL #4\n"
1697
- "2:" // Column loop
1698
- "add x25, %x[a_ptr], #0x8\n"
1699
- "movi v15.16b, #0x0\n"
1700
- "movi v19.16b, #0x0\n"
1701
- "mov x24, %x[nb]\n"
1702
- "add x23, x25, x9\n"
1703
- "movi v18.16b, #0x0\n"
1704
- "movi v14.16b, #0x0\n"
1705
- "add x22, x23, x9\n"
1706
- "movi v11.16b, #0x0\n"
1707
- "movi v13.16b, #0x0\n"
1708
- "add x21, x22, x9\n"
1709
- "movi v23.16b, #0x0\n"
1710
- "movi v16.16b, #0x0\n"
1711
- "movi v25.16b, #0x0\n"
1712
- "movi v7.16b, #0x0\n"
1713
- "movi v0.16b, #0x0\n"
1714
- "movi v4.16b, #0x0\n"
1715
- "movi v5.16b, #0x0\n"
1716
- "movi v21.16b, #0x0\n"
1717
- "movi v8.16b, #0x0\n"
1718
- "movi v1.16b, #0x0\n"
1719
- "3:" // Block loop
1720
- "ldr q3, [x28, #0x0]\n"
1721
- "ldr q31, [x25, #0x0]\n"
1722
- "movi v28.16b, #0x4\n"
1723
- "movi v10.4s, #0x0\n"
1724
- "ldr q22, [x28, #0x10]\n"
1725
- "ldr q6, [x25, #0x10]\n"
1726
- "movi v29.4s, #0x0\n"
1727
- "movi v9.4s, #0x0\n"
1728
- "ldr q27, [x28, #0x20]\n"
1729
- "ldr q30, [x28, #0x30]\n"
1730
- "movi v20.4s, #0x0\n"
1731
- "movi v24.16b, #0xf0\n"
1732
- "ldr d2, [x25, #-0x8]\n"
1733
- "ldr d26, [x23, #-0x8]\n"
1734
- "sshl v12.16b, v3.16b, v28.16b\n"
1735
- "sub x20, x28, #0x8\n"
1736
- "ldr d17, [x20, #0x0]\n"
1737
- "and v3.16b, v3.16b, v24.16b\n"
1738
- "subs x24, x24, #0x1\n"
1739
- "add x28, x28, #0x48\n"
1740
- ".inst 0x4f9fe18a // sdot v10.4s, v12.16b, v31.4b[0]\n"
1741
- ".inst 0x4fbfe19d // sdot v29.4s, v12.16b, v31.4b[1]\n"
1742
- ".inst 0x4f9fe989 // sdot v9.4s, v12.16b, v31.4b[2]\n"
1743
- ".inst 0x4fbfe994 // sdot v20.4s, v12.16b, v31.4b[3]\n"
1744
- "sshl v31.16b, v22.16b, v28.16b\n"
1745
- "and v22.16b, v22.16b, v24.16b\n"
1746
- "fcvtl v17.4s, v17.4h\n"
1747
- "fcvtl v2.4s, v2.4h\n"
1748
- "fcvtl v26.4s, v26.4h\n"
1749
- ".inst 0x4f86e3ea // sdot v10.4s, v31.16b, v6.4b[0]\n"
1750
- ".inst 0x4fa6e3fd // sdot v29.4s, v31.16b, v6.4b[1]\n"
1751
- ".inst 0x4f86ebe9 // sdot v9.4s, v31.16b, v6.4b[2]\n"
1752
- ".inst 0x4fa6ebf4 // sdot v20.4s, v31.16b, v6.4b[3]\n"
1753
- "sshl v6.16b, v27.16b, v28.16b\n"
1754
- "sshl v28.16b, v30.16b, v28.16b\n"
1755
- "and v27.16b, v27.16b, v24.16b\n"
1756
- "and v30.16b, v30.16b, v24.16b\n"
1757
- "ldr q24, [x25, #0x20]\n"
1758
- ".inst 0x4f98e0ca // sdot v10.4s, v6.16b, v24.4b[0]\n"
1759
- ".inst 0x4fb8e0dd // sdot v29.4s, v6.16b, v24.4b[1]\n"
1760
- ".inst 0x4f98e8c9 // sdot v9.4s, v6.16b, v24.4b[2]\n"
1761
- ".inst 0x4fb8e8d4 // sdot v20.4s, v6.16b, v24.4b[3]\n"
1762
- "ldr q24, [x25, #0x30]\n"
1763
- ".inst 0x4f98e38a // sdot v10.4s, v28.16b, v24.4b[0]\n"
1764
- ".inst 0x4fb8e39d // sdot v29.4s, v28.16b, v24.4b[1]\n"
1765
- ".inst 0x4f98eb89 // sdot v9.4s, v28.16b, v24.4b[2]\n"
1766
- ".inst 0x4fb8eb94 // sdot v20.4s, v28.16b, v24.4b[3]\n"
1767
- "ldr q24, [x25, #0x40]\n"
1768
- ".inst 0x4f98e06a // sdot v10.4s, v3.16b, v24.4b[0]\n"
1769
- ".inst 0x4fb8e07d // sdot v29.4s, v3.16b, v24.4b[1]\n"
1770
- ".inst 0x4f98e869 // sdot v9.4s, v3.16b, v24.4b[2]\n"
1771
- ".inst 0x4fb8e874 // sdot v20.4s, v3.16b, v24.4b[3]\n"
1772
- "ldr q24, [x25, #0x50]\n"
1773
- ".inst 0x4f98e2ca // sdot v10.4s, v22.16b, v24.4b[0]\n"
1774
- ".inst 0x4fb8e2dd // sdot v29.4s, v22.16b, v24.4b[1]\n"
1775
- ".inst 0x4f98eac9 // sdot v9.4s, v22.16b, v24.4b[2]\n"
1776
- ".inst 0x4fb8ead4 // sdot v20.4s, v22.16b, v24.4b[3]\n"
1777
- "ldr q24, [x25, #0x60]\n"
1778
- ".inst 0x4f98e36a // sdot v10.4s, v27.16b, v24.4b[0]\n"
1779
- ".inst 0x4fb8e37d // sdot v29.4s, v27.16b, v24.4b[1]\n"
1780
- ".inst 0x4f98eb69 // sdot v9.4s, v27.16b, v24.4b[2]\n"
1781
- ".inst 0x4fb8eb74 // sdot v20.4s, v27.16b, v24.4b[3]\n"
1782
- "ldr q24, [x25, #0x70]\n"
1783
- "add x25, x25, #0x88\n"
1784
- ".inst 0x4f98e3ca // sdot v10.4s, v30.16b, v24.4b[0]\n"
1785
- ".inst 0x4fb8e3dd // sdot v29.4s, v30.16b, v24.4b[1]\n"
1786
- ".inst 0x4f98ebc9 // sdot v9.4s, v30.16b, v24.4b[2]\n"
1787
- ".inst 0x4fb8ebd4 // sdot v20.4s, v30.16b, v24.4b[3]\n"
1788
- "fmul v24.4s, v17.4s, v2.s[0]\n"
1789
- "scvtf v10.4s, v10.4s, #0x4\n"
1790
- "scvtf v29.4s, v29.4s, #0x4\n"
1791
- "scvtf v9.4s, v9.4s, #0x4\n"
1792
- "scvtf v20.4s, v20.4s, #0x4\n"
1793
- "fmla v15.4s, v10.4s, v24.4s\n"
1794
- "ldr q24, [x23, #0x0]\n"
1795
- "fmul v10.4s, v17.4s, v2.s[1]\n"
1796
- "fmla v19.4s, v29.4s, v10.4s\n"
1797
- "ldr q10, [x23, #0x10]\n"
1798
- "fmul v29.4s, v17.4s, v2.s[2]\n"
1799
- "fmul v2.4s, v17.4s, v2.s[3]\n"
1800
- "fmla v18.4s, v9.4s, v29.4s\n"
1801
- "movi v9.4s, #0x0\n"
1802
- "movi v29.4s, #0x0\n"
1803
- ".inst 0x4f98e189 // sdot v9.4s, v12.16b, v24.4b[0]\n"
1804
- ".inst 0x4fb8e19d // sdot v29.4s, v12.16b, v24.4b[1]\n"
1805
- "fmla v14.4s, v20.4s, v2.4s\n"
1806
- "movi v20.4s, #0x0\n"
1807
- "movi v2.4s, #0x0\n"
1808
- ".inst 0x4f98e994 // sdot v20.4s, v12.16b, v24.4b[2]\n"
1809
- ".inst 0x4fb8e982 // sdot v2.4s, v12.16b, v24.4b[3]\n"
1810
- "ldr q24, [x23, #0x20]\n"
1811
- ".inst 0x4f8ae3e9 // sdot v9.4s, v31.16b, v10.4b[0]\n"
1812
- ".inst 0x4faae3fd // sdot v29.4s, v31.16b, v10.4b[1]\n"
1813
- ".inst 0x4f8aebf4 // sdot v20.4s, v31.16b, v10.4b[2]\n"
1814
- ".inst 0x4faaebe2 // sdot v2.4s, v31.16b, v10.4b[3]\n"
1815
- "ldr q10, [x23, #0x30]\n"
1816
- ".inst 0x4f98e0c9 // sdot v9.4s, v6.16b, v24.4b[0]\n"
1817
- ".inst 0x4fb8e0dd // sdot v29.4s, v6.16b, v24.4b[1]\n"
1818
- ".inst 0x4f98e8d4 // sdot v20.4s, v6.16b, v24.4b[2]\n"
1819
- ".inst 0x4fb8e8c2 // sdot v2.4s, v6.16b, v24.4b[3]\n"
1820
- "ldr q24, [x23, #0x40]\n"
1821
- ".inst 0x4f8ae389 // sdot v9.4s, v28.16b, v10.4b[0]\n"
1822
- ".inst 0x4faae39d // sdot v29.4s, v28.16b, v10.4b[1]\n"
1823
- ".inst 0x4f8aeb94 // sdot v20.4s, v28.16b, v10.4b[2]\n"
1824
- ".inst 0x4faaeb82 // sdot v2.4s, v28.16b, v10.4b[3]\n"
1825
- "ldr q10, [x23, #0x50]\n"
1826
- ".inst 0x4f98e069 // sdot v9.4s, v3.16b, v24.4b[0]\n"
1827
- ".inst 0x4fb8e07d // sdot v29.4s, v3.16b, v24.4b[1]\n"
1828
- ".inst 0x4f98e874 // sdot v20.4s, v3.16b, v24.4b[2]\n"
1829
- ".inst 0x4fb8e862 // sdot v2.4s, v3.16b, v24.4b[3]\n"
1830
- "ldr q24, [x23, #0x60]\n"
1831
- ".inst 0x4f8ae2c9 // sdot v9.4s, v22.16b, v10.4b[0]\n"
1832
- ".inst 0x4faae2dd // sdot v29.4s, v22.16b, v10.4b[1]\n"
1833
- ".inst 0x4f8aead4 // sdot v20.4s, v22.16b, v10.4b[2]\n"
1834
- ".inst 0x4faaeac2 // sdot v2.4s, v22.16b, v10.4b[3]\n"
1835
- "ldr q10, [x23, #0x70]\n"
1836
- "add x23, x23, #0x88\n"
1837
- ".inst 0x4f98e369 // sdot v9.4s, v27.16b, v24.4b[0]\n"
1838
- ".inst 0x4fb8e37d // sdot v29.4s, v27.16b, v24.4b[1]\n"
1839
- ".inst 0x4f98eb74 // sdot v20.4s, v27.16b, v24.4b[2]\n"
1840
- ".inst 0x4fb8eb62 // sdot v2.4s, v27.16b, v24.4b[3]\n"
1841
- "ldr q24, [x22, #0x0]\n"
1842
- ".inst 0x4f8ae3c9 // sdot v9.4s, v30.16b, v10.4b[0]\n"
1843
- ".inst 0x4faae3dd // sdot v29.4s, v30.16b, v10.4b[1]\n"
1844
- ".inst 0x4f8aebd4 // sdot v20.4s, v30.16b, v10.4b[2]\n"
1845
- ".inst 0x4faaebc2 // sdot v2.4s, v30.16b, v10.4b[3]\n"
1846
- "fmul v10.4s, v17.4s, v26.s[0]\n"
1847
- "scvtf v9.4s, v9.4s, #0x4\n"
1848
- "scvtf v29.4s, v29.4s, #0x4\n"
1849
- "scvtf v20.4s, v20.4s, #0x4\n"
1850
- "scvtf v2.4s, v2.4s, #0x4\n"
1851
- "fmla v11.4s, v9.4s, v10.4s\n"
1852
- "ldr q9, [x22, #0x10]\n"
1853
- "fmul v10.4s, v17.4s, v26.s[1]\n"
1854
- "fmla v13.4s, v29.4s, v10.4s\n"
1855
- "ldr d29, [x22, #-0x8]\n"
1856
- "fmul v10.4s, v17.4s, v26.s[2]\n"
1857
- "fmul v26.4s, v17.4s, v26.s[3]\n"
1858
- "fcvtl v29.4s, v29.4h\n"
1859
- "fmla v23.4s, v20.4s, v10.4s\n"
1860
- "movi v20.4s, #0x0\n"
1861
- "movi v10.4s, #0x0\n"
1862
- "fmla v16.4s, v2.4s, v26.4s\n"
1863
- "movi v26.4s, #0x0\n"
1864
- "movi v2.4s, #0x0\n"
1865
- ".inst 0x4f98e194 // sdot v20.4s, v12.16b, v24.4b[0]\n"
1866
- ".inst 0x4fb8e18a // sdot v10.4s, v12.16b, v24.4b[1]\n"
1867
- ".inst 0x4f98e99a // sdot v26.4s, v12.16b, v24.4b[2]\n"
1868
- ".inst 0x4fb8e982 // sdot v2.4s, v12.16b, v24.4b[3]\n"
1869
- "ldr q24, [x22, #0x20]\n"
1870
- ".inst 0x4f89e3f4 // sdot v20.4s, v31.16b, v9.4b[0]\n"
1871
- ".inst 0x4fa9e3ea // sdot v10.4s, v31.16b, v9.4b[1]\n"
1872
- ".inst 0x4f89ebfa // sdot v26.4s, v31.16b, v9.4b[2]\n"
1873
- ".inst 0x4fa9ebe2 // sdot v2.4s, v31.16b, v9.4b[3]\n"
1874
- "ldr q9, [x22, #0x30]\n"
1875
- ".inst 0x4f98e0d4 // sdot v20.4s, v6.16b, v24.4b[0]\n"
1876
- ".inst 0x4fb8e0ca // sdot v10.4s, v6.16b, v24.4b[1]\n"
1877
- ".inst 0x4f98e8da // sdot v26.4s, v6.16b, v24.4b[2]\n"
1878
- ".inst 0x4fb8e8c2 // sdot v2.4s, v6.16b, v24.4b[3]\n"
1879
- "ldr q24, [x22, #0x40]\n"
1880
- ".inst 0x4f89e394 // sdot v20.4s, v28.16b, v9.4b[0]\n"
1881
- ".inst 0x4fa9e38a // sdot v10.4s, v28.16b, v9.4b[1]\n"
1882
- ".inst 0x4f89eb9a // sdot v26.4s, v28.16b, v9.4b[2]\n"
1883
- ".inst 0x4fa9eb82 // sdot v2.4s, v28.16b, v9.4b[3]\n"
1884
- "ldr q9, [x22, #0x50]\n"
1885
- ".inst 0x4f98e074 // sdot v20.4s, v3.16b, v24.4b[0]\n"
1886
- ".inst 0x4fb8e06a // sdot v10.4s, v3.16b, v24.4b[1]\n"
1887
- ".inst 0x4f98e87a // sdot v26.4s, v3.16b, v24.4b[2]\n"
1888
- ".inst 0x4fb8e862 // sdot v2.4s, v3.16b, v24.4b[3]\n"
1889
- "ldr q24, [x22, #0x60]\n"
1890
- ".inst 0x4f89e2d4 // sdot v20.4s, v22.16b, v9.4b[0]\n"
1891
- ".inst 0x4fa9e2ca // sdot v10.4s, v22.16b, v9.4b[1]\n"
1892
- ".inst 0x4f89eada // sdot v26.4s, v22.16b, v9.4b[2]\n"
1893
- ".inst 0x4fa9eac2 // sdot v2.4s, v22.16b, v9.4b[3]\n"
1894
- "ldr q9, [x22, #0x70]\n"
1895
- "add x22, x22, #0x88\n"
1896
- ".inst 0x4f98e374 // sdot v20.4s, v27.16b, v24.4b[0]\n"
1897
- ".inst 0x4fb8e36a // sdot v10.4s, v27.16b, v24.4b[1]\n"
1898
- ".inst 0x4f98eb7a // sdot v26.4s, v27.16b, v24.4b[2]\n"
1899
- ".inst 0x4fb8eb62 // sdot v2.4s, v27.16b, v24.4b[3]\n"
1900
- "ldr q24, [x21, #0x0]\n"
1901
- ".inst 0x4f89e3d4 // sdot v20.4s, v30.16b, v9.4b[0]\n"
1902
- ".inst 0x4fa9e3ca // sdot v10.4s, v30.16b, v9.4b[1]\n"
1903
- ".inst 0x4f89ebda // sdot v26.4s, v30.16b, v9.4b[2]\n"
1904
- ".inst 0x4fa9ebc2 // sdot v2.4s, v30.16b, v9.4b[3]\n"
1905
- "fmul v9.4s, v17.4s, v29.s[0]\n"
1906
- "scvtf v20.4s, v20.4s, #0x4\n"
1907
- "scvtf v10.4s, v10.4s, #0x4\n"
1908
- "scvtf v26.4s, v26.4s, #0x4\n"
1909
- "scvtf v2.4s, v2.4s, #0x4\n"
1910
- "fmla v25.4s, v20.4s, v9.4s\n"
1911
- "ldr q9, [x21, #0x10]\n"
1912
- "fmul v20.4s, v17.4s, v29.s[1]\n"
1913
- "fmla v7.4s, v10.4s, v20.4s\n"
1914
- "ldr d20, [x21, #-0x8]\n"
1915
- "fmul v10.4s, v17.4s, v29.s[2]\n"
1916
- "fmul v29.4s, v17.4s, v29.s[3]\n"
1917
- "fcvtl v20.4s, v20.4h\n"
1918
- "fmla v0.4s, v26.4s, v10.4s\n"
1919
- "movi v26.4s, #0x0\n"
1920
- "movi v10.4s, #0x0\n"
1921
- "fmla v4.4s, v2.4s, v29.4s\n"
1922
- "movi v2.4s, #0x0\n"
1923
- "movi v29.4s, #0x0\n"
1924
- ".inst 0x4f98e19a // sdot v26.4s, v12.16b, v24.4b[0]\n"
1925
- ".inst 0x4fb8e18a // sdot v10.4s, v12.16b, v24.4b[1]\n"
1926
- ".inst 0x4f98e982 // sdot v2.4s, v12.16b, v24.4b[2]\n"
1927
- ".inst 0x4fb8e99d // sdot v29.4s, v12.16b, v24.4b[3]\n"
1928
- "ldr q12, [x21, #0x20]\n"
1929
- "fmul v24.4s, v17.4s, v20.s[0]\n"
1930
- ".inst 0x4f89e3fa // sdot v26.4s, v31.16b, v9.4b[0]\n"
1931
- ".inst 0x4fa9e3ea // sdot v10.4s, v31.16b, v9.4b[1]\n"
1932
- ".inst 0x4f89ebe2 // sdot v2.4s, v31.16b, v9.4b[2]\n"
1933
- ".inst 0x4fa9ebfd // sdot v29.4s, v31.16b, v9.4b[3]\n"
1934
- "ldr q9, [x21, #0x30]\n"
1935
- "fmul v31.4s, v17.4s, v20.s[1]\n"
1936
- ".inst 0x4f8ce0da // sdot v26.4s, v6.16b, v12.4b[0]\n"
1937
- ".inst 0x4face0ca // sdot v10.4s, v6.16b, v12.4b[1]\n"
1938
- ".inst 0x4f8ce8c2 // sdot v2.4s, v6.16b, v12.4b[2]\n"
1939
- ".inst 0x4face8dd // sdot v29.4s, v6.16b, v12.4b[3]\n"
1940
- "ldr q12, [x21, #0x40]\n"
1941
- "fmul v6.4s, v17.4s, v20.s[2]\n"
1942
- "fmul v20.4s, v17.4s, v20.s[3]\n"
1943
- ".inst 0x4f89e39a // sdot v26.4s, v28.16b, v9.4b[0]\n"
1944
- ".inst 0x4fa9e38a // sdot v10.4s, v28.16b, v9.4b[1]\n"
1945
- ".inst 0x4f89eb82 // sdot v2.4s, v28.16b, v9.4b[2]\n"
1946
- ".inst 0x4fa9eb9d // sdot v29.4s, v28.16b, v9.4b[3]\n"
1947
- "ldr q9, [x21, #0x50]\n"
1948
- ".inst 0x4f8ce07a // sdot v26.4s, v3.16b, v12.4b[0]\n"
1949
- ".inst 0x4face06a // sdot v10.4s, v3.16b, v12.4b[1]\n"
1950
- ".inst 0x4f8ce862 // sdot v2.4s, v3.16b, v12.4b[2]\n"
1951
- ".inst 0x4face87d // sdot v29.4s, v3.16b, v12.4b[3]\n"
1952
- "ldr q12, [x21, #0x60]\n"
1953
- ".inst 0x4f89e2da // sdot v26.4s, v22.16b, v9.4b[0]\n"
1954
- ".inst 0x4fa9e2ca // sdot v10.4s, v22.16b, v9.4b[1]\n"
1955
- ".inst 0x4f89eac2 // sdot v2.4s, v22.16b, v9.4b[2]\n"
1956
- ".inst 0x4fa9eadd // sdot v29.4s, v22.16b, v9.4b[3]\n"
1957
- "ldr q17, [x21, #0x70]\n"
1958
- "add x21, x21, #0x88\n"
1959
- ".inst 0x4f8ce37a // sdot v26.4s, v27.16b, v12.4b[0]\n"
1960
- ".inst 0x4face36a // sdot v10.4s, v27.16b, v12.4b[1]\n"
1961
- ".inst 0x4f8ceb62 // sdot v2.4s, v27.16b, v12.4b[2]\n"
1962
- ".inst 0x4faceb7d // sdot v29.4s, v27.16b, v12.4b[3]\n"
1963
- ".inst 0x4f91e3da // sdot v26.4s, v30.16b, v17.4b[0]\n"
1964
- ".inst 0x4fb1e3ca // sdot v10.4s, v30.16b, v17.4b[1]\n"
1965
- ".inst 0x4f91ebc2 // sdot v2.4s, v30.16b, v17.4b[2]\n"
1966
- ".inst 0x4fb1ebdd // sdot v29.4s, v30.16b, v17.4b[3]\n"
1967
- "scvtf v26.4s, v26.4s, #0x4\n"
1968
- "scvtf v10.4s, v10.4s, #0x4\n"
1969
- "fmla v5.4s, v26.4s, v24.4s\n"
1970
- "scvtf v2.4s, v2.4s, #0x4\n"
1971
- "scvtf v29.4s, v29.4s, #0x4\n"
1972
- "fmla v21.4s, v10.4s, v31.4s\n"
1973
- "fmla v8.4s, v2.4s, v6.4s\n"
1974
- "fmla v1.4s, v29.4s, v20.4s\n"
1975
- "bgt 3b\n"
1976
- "mov x20, %x[res_ptr]\n"
1977
- "subs x27, x27, #0x4\n"
1978
- "add %x[res_ptr], %x[res_ptr], #0x10\n"
1979
- "str q15, [x20, #0x0]\n"
1980
- "add x20, x20, %x[res_stride]\n"
1981
- "str q19, [x20, #0x0]\n"
1982
- "add x20, x20, %x[res_stride]\n"
1983
- "str q18, [x20, #0x0]\n"
1984
- "add x20, x20, %x[res_stride]\n"
1985
- "str q14, [x20, #0x0]\n"
1986
- "add x20, x20, %x[res_stride]\n"
1987
- "str q11, [x20, #0x0]\n"
1988
- "add x20, x20, %x[res_stride]\n"
1989
- "str q13, [x20, #0x0]\n"
1990
- "add x20, x20, %x[res_stride]\n"
1991
- "str q23, [x20, #0x0]\n"
1992
- "add x20, x20, %x[res_stride]\n"
1993
- "str q16, [x20, #0x0]\n"
1994
- "add x20, x20, %x[res_stride]\n"
1995
- "str q25, [x20, #0x0]\n"
1996
- "add x20, x20, %x[res_stride]\n"
1997
- "str q7, [x20, #0x0]\n"
1998
- "add x20, x20, %x[res_stride]\n"
1999
- "str q0, [x20, #0x0]\n"
2000
- "add x20, x20, %x[res_stride]\n"
2001
- "str q4, [x20, #0x0]\n"
2002
- "add x20, x20, %x[res_stride]\n"
2003
- "str q5, [x20, #0x0]\n"
2004
- "add x20, x20, %x[res_stride]\n"
2005
- "str q21, [x20, #0x0]\n"
2006
- "add x20, x20, %x[res_stride]\n"
2007
- "str q8, [x20, #0x0]\n"
2008
- "add x20, x20, %x[res_stride]\n"
2009
- "str q1, [x20, #0x0]\n"
2010
- "bne 2b\n"
2011
- "mov x20, #0x4\n"
2012
- "sub x10, x10, #0x10\n"
2013
- "cmp x10, #0x10\n"
2014
- "mov %x[res_ptr], x26\n"
2015
- "madd %x[a_ptr], x20, x9, %x[a_ptr]\n"
2016
- "bge 1b\n"
2017
- "4:" // Row loop skip
2018
- "cbz x10, 9f\n"
2019
- "5:" // Row tail: Row loop
2020
- "add x24, %x[b_ptr], #0x8\n"
2021
- "mov x23, %x[nc]\n"
2022
- "add x22, %x[res_ptr], %x[res_stride], LSL #2\n"
2023
- "6:" // Row tail: Column loop
2024
- "movi v15.16b, #0x0\n"
2025
- "movi v19.16b, #0x0\n"
2026
- "add x25, %x[a_ptr], #0x8\n"
2027
- "mov x21, %x[nb]\n"
2028
- "movi v18.16b, #0x0\n"
2029
- "movi v14.16b, #0x0\n"
2030
- "7:" // Row tail: Block loop
2031
- "ldr q7, [x24, #0x0]\n"
2032
- "ldr q5, [x25, #0x0]\n"
2033
- "movi v9.16b, #0x4\n"
2034
- "movi v4.4s, #0x0\n"
2035
- "ldr q3, [x24, #0x10]\n"
2036
- "ldr q2, [x25, #0x10]\n"
2037
- "movi v1.4s, #0x0\n"
2038
- "movi v0.4s, #0x0\n"
2039
- "ldr q13, [x24, #0x20]\n"
2040
- "ldr q31, [x25, #0x20]\n"
2041
- "movi v30.4s, #0x0\n"
2042
- "movi v29.16b, #0xf0\n"
2043
- "ldr q28, [x24, #0x30]\n"
2044
- "ldr q27, [x25, #0x30]\n"
2045
- "sshl v20.16b, v7.16b, v9.16b\n"
2046
- "sub x20, x24, #0x8\n"
2047
- "ldr q26, [x25, #0x40]\n"
2048
- "ldr q25, [x25, #0x50]\n"
2049
- "sshl v17.16b, v3.16b, v9.16b\n"
2050
- "and v7.16b, v7.16b, v29.16b\n"
2051
- "ldr q24, [x25, #0x60]\n"
2052
- "ldr q16, [x25, #0x70]\n"
2053
- "sshl v22.16b, v13.16b, v9.16b\n"
2054
- "and v3.16b, v3.16b, v29.16b\n"
2055
- "ldr d21, [x20, #0x0]\n"
2056
- "ldr d12, [x25, #-0x8]\n"
2057
- ".inst 0x4f85e284 // sdot v4.4s, v20.16b, v5.4b[0]\n"
2058
- ".inst 0x4fa5e281 // sdot v1.4s, v20.16b, v5.4b[1]\n"
2059
- ".inst 0x4f85ea80 // sdot v0.4s, v20.16b, v5.4b[2]\n"
2060
- ".inst 0x4fa5ea9e // sdot v30.4s, v20.16b, v5.4b[3]\n"
2061
- "sshl v9.16b, v28.16b, v9.16b\n"
2062
- "subs x21, x21, #0x1\n"
2063
- "and v13.16b, v13.16b, v29.16b\n"
2064
- "and v28.16b, v28.16b, v29.16b\n"
2065
- "add x25, x25, #0x88\n"
2066
- "add x24, x24, #0x48\n"
2067
- "fcvtl v21.4s, v21.4h\n"
2068
- "fcvtl v12.4s, v12.4h\n"
2069
- ".inst 0x4f82e224 // sdot v4.4s, v17.16b, v2.4b[0]\n"
2070
- ".inst 0x4fa2e221 // sdot v1.4s, v17.16b, v2.4b[1]\n"
2071
- ".inst 0x4f82ea20 // sdot v0.4s, v17.16b, v2.4b[2]\n"
2072
- ".inst 0x4fa2ea3e // sdot v30.4s, v17.16b, v2.4b[3]\n"
2073
- "fmul v11.4s, v21.4s, v12.s[0]\n"
2074
- "fmul v23.4s, v21.4s, v12.s[1]\n"
2075
- "fmul v17.4s, v21.4s, v12.s[2]\n"
2076
- ".inst 0x4f9fe2c4 // sdot v4.4s, v22.16b, v31.4b[0]\n"
2077
- "fmul v6.4s, v21.4s, v12.s[3]\n"
2078
- ".inst 0x4fbfe2c1 // sdot v1.4s, v22.16b, v31.4b[1]\n"
2079
- ".inst 0x4f9feac0 // sdot v0.4s, v22.16b, v31.4b[2]\n"
2080
- ".inst 0x4fbfeade // sdot v30.4s, v22.16b, v31.4b[3]\n"
2081
- ".inst 0x4f9be124 // sdot v4.4s, v9.16b, v27.4b[0]\n"
2082
- ".inst 0x4fbbe121 // sdot v1.4s, v9.16b, v27.4b[1]\n"
2083
- ".inst 0x4f9be920 // sdot v0.4s, v9.16b, v27.4b[2]\n"
2084
- ".inst 0x4fbbe93e // sdot v30.4s, v9.16b, v27.4b[3]\n"
2085
- ".inst 0x4f9ae0e4 // sdot v4.4s, v7.16b, v26.4b[0]\n"
2086
- ".inst 0x4fbae0e1 // sdot v1.4s, v7.16b, v26.4b[1]\n"
2087
- ".inst 0x4f9ae8e0 // sdot v0.4s, v7.16b, v26.4b[2]\n"
2088
- ".inst 0x4fbae8fe // sdot v30.4s, v7.16b, v26.4b[3]\n"
2089
- ".inst 0x4f99e064 // sdot v4.4s, v3.16b, v25.4b[0]\n"
2090
- ".inst 0x4fb9e061 // sdot v1.4s, v3.16b, v25.4b[1]\n"
2091
- ".inst 0x4f99e860 // sdot v0.4s, v3.16b, v25.4b[2]\n"
2092
- ".inst 0x4fb9e87e // sdot v30.4s, v3.16b, v25.4b[3]\n"
2093
- ".inst 0x4f98e1a4 // sdot v4.4s, v13.16b, v24.4b[0]\n"
2094
- ".inst 0x4fb8e1a1 // sdot v1.4s, v13.16b, v24.4b[1]\n"
2095
- ".inst 0x4f98e9a0 // sdot v0.4s, v13.16b, v24.4b[2]\n"
2096
- ".inst 0x4fb8e9be // sdot v30.4s, v13.16b, v24.4b[3]\n"
2097
- ".inst 0x4f90e384 // sdot v4.4s, v28.16b, v16.4b[0]\n"
2098
- ".inst 0x4fb0e381 // sdot v1.4s, v28.16b, v16.4b[1]\n"
2099
- ".inst 0x4f90eb80 // sdot v0.4s, v28.16b, v16.4b[2]\n"
2100
- ".inst 0x4fb0eb9e // sdot v30.4s, v28.16b, v16.4b[3]\n"
2101
- "scvtf v4.4s, v4.4s, #0x4\n"
2102
- "scvtf v1.4s, v1.4s, #0x4\n"
2103
- "scvtf v0.4s, v0.4s, #0x4\n"
2104
- "fmla v15.4s, v4.4s, v11.4s\n"
2105
- "scvtf v30.4s, v30.4s, #0x4\n"
2106
- "fmla v19.4s, v1.4s, v23.4s\n"
2107
- "fmla v18.4s, v0.4s, v17.4s\n"
2108
- "fmla v14.4s, v30.4s, v6.4s\n"
2109
- "bgt 7b\n"
2110
- "mov x20, %x[res_ptr]\n"
2111
- "cmp x10, #0x1\n"
2112
- "str q15, [x20, #0x0]\n"
2113
- "add x20, x20, %x[res_stride]\n"
2114
- "ble 8f\n"
2115
- "cmp x10, #0x2\n"
2116
- "str q19, [x20, #0x0]\n"
2117
- "add x20, x20, %x[res_stride]\n"
2118
- "ble 8f\n"
2119
- "cmp x10, #0x3\n"
2120
- "str q18, [x20, #0x0]\n"
2121
- "add x20, x20, %x[res_stride]\n"
2122
- "ble 8f\n"
2123
- "str q14, [x20, #0x0]\n"
2124
- "8:" // Row tail: Accumulator store skip
2125
- "subs x23, x23, #0x4\n"
2126
- "add %x[res_ptr], %x[res_ptr], #0x10\n"
2127
- "bne 6b\n"
2128
- "subs x10, x10, #0x4\n"
2129
- "add %x[a_ptr], %x[a_ptr], x9\n"
2130
- "mov %x[res_ptr], x22\n"
2131
- "bgt 5b\n"
2132
- "9:" // Row tail: Row loop skip
2133
- : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr)
2134
- : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc)
2135
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
2136
- );
2137
- return;
2138
- }
2139
- #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
2140
- {
2141
- float sumf[4][4];
2142
- int sumi;
2143
-
2144
- for (int y = 0; y < nr / 4; y++) {
2145
- const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
2146
- for (int x = 0; x < nc / ncols_interleaved; x++) {
2147
- const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
2148
- for (int m = 0; m < 4; m++) {
2149
- for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
2150
- }
2151
- for (int l = 0; l < nb; l++) {
2152
- for (int k = 0; k < (qk / (2 * blocklen)); k++) {
2153
- for (int m = 0; m < 4; m++) {
2154
- for (int j = 0; j < ncols_interleaved; j++) {
2155
- sumi = 0;
2156
- for (int i = 0; i < blocklen; ++i) {
2157
- const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
2158
- const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
2159
- sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
2160
- (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
2161
- }
2162
- sumf[m][j] += sumi * LM_GGML_FP16_TO_FP32(b_ptr[l].d[j]) * LM_GGML_FP16_TO_FP32(a_ptr[l].d[m]);
2163
- }
2164
- }
2165
- }
2166
- }
2167
- for (int m = 0; m < 4; m++) {
2168
- for (int j = 0; j < ncols_interleaved; j++)
2169
- s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
2170
- }
2171
- }
2172
- }
2173
- }
2174
- }
2175
-
2176
- static void lm_ggml_gemm_q4_0_4x8_q8_0(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc) {
2177
- const int qk = QK8_0;
2178
- const int nb = n / qk;
2179
- const int ncols_interleaved = 4;
2180
- const int blocklen = 8;
2181
-
2182
- assert (n % qk == 0);
2183
- assert (nr % 4 == 0);
2184
- assert (nc % ncols_interleaved == 0);
2185
-
2186
- UNUSED(s);
2187
- UNUSED(bs);
2188
- UNUSED(vx);
2189
- UNUSED(vy);
2190
- UNUSED(nr);
2191
- UNUSED(nc);
2192
- UNUSED(nb);
2193
- UNUSED(ncols_interleaved);
2194
- UNUSED(blocklen);
2195
-
2196
- #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
2197
- if (lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_matmul_int8()) {
2198
- const void * b_ptr = vx;
2199
- const void * a_ptr = vy;
2200
- float * res_ptr = s;
2201
- size_t res_stride = bs * sizeof(float);
2202
-
2203
- __asm__ __volatile__(
2204
- "mov x10, %x[nr]\n"
2205
- "mov x9, #0x88\n"
2206
- "cmp x10, #0x10\n"
2207
- "mul x9, %x[nb], x9\n"
2208
- "blt 4f\n"
2209
- "1:" // Row loop
2210
- "add x28, %x[b_ptr], #0x8\n"
2211
- "mov x27, %x[nc]\n"
2212
- "add x26, %x[res_ptr], %x[res_stride], LSL #4\n"
2213
- "2:" // Column loop
2214
- "add x25, %x[a_ptr], #0x8\n"
2215
- "movi v2.16b, #0x0\n"
2216
- "movi v10.16b, #0x0\n"
2217
- "mov x24, %x[nb]\n"
2218
- "add x23, x25, x9\n"
2219
- "movi v12.16b, #0x0\n"
2220
- "movi v28.16b, #0x0\n"
2221
- "add x22, x23, x9\n"
2222
- "movi v11.16b, #0x0\n"
2223
- "movi v13.16b, #0x0\n"
2224
- "add x21, x22, x9\n"
2225
- "movi v22.16b, #0x0\n"
2226
- "movi v23.16b, #0x0\n"
2227
- "movi v25.16b, #0x0\n"
2228
- "movi v5.16b, #0x0\n"
2229
- "movi v7.16b, #0x0\n"
2230
- "movi v4.16b, #0x0\n"
2231
- "movi v6.16b, #0x0\n"
2232
- "movi v30.16b, #0x0\n"
2233
- "movi v24.16b, #0x0\n"
2234
- "movi v14.16b, #0x0\n"
2235
- "3:" // Block loop
2236
- "ldr q21, [x28, #0x0]\n"
2237
- "ldr q16, [x28, #0x10]\n"
2238
- "movi v1.16b, #0x4\n"
2239
- "movi v19.4s, #0x0\n"
2240
- "ldr q27, [x25, #0x0]\n"
2241
- "ldr q15, [x25, #0x10]\n"
2242
- "movi v26.4s, #0x0\n"
2243
- "movi v18.4s, #0x0\n"
2244
- "ldr q29, [x28, #0x20]\n"
2245
- "ldr q3, [x28, #0x30]\n"
2246
- "movi v17.4s, #0x0\n"
2247
- "movi v0.16b, #0xf0\n"
2248
- "ldr d20, [x25, #-0x8]\n"
2249
- "ldr d9, [x23, #-0x8]\n"
2250
- "sshl v8.16b, v21.16b, v1.16b\n"
2251
- "sshl v31.16b, v16.16b, v1.16b\n"
2252
- "and v21.16b, v21.16b, v0.16b\n"
2253
- "and v16.16b, v16.16b, v0.16b\n"
2254
- "sub x20, x28, #0x8\n"
2255
- "subs x24, x24, #0x1\n"
2256
- "add x28, x28, #0x48\n"
2257
- ".inst 0x4e88a773 // smmla v19.4s, v27.16b, v8.16b\n"
2258
- ".inst 0x4e9fa77a // smmla v26.4s, v27.16b, v31.16b\n"
2259
- "ldr q27, [x25, #0x20]\n"
2260
- ".inst 0x4e88a5f2 // smmla v18.4s, v15.16b, v8.16b\n"
2261
- ".inst 0x4e9fa5f1 // smmla v17.4s, v15.16b, v31.16b\n"
2262
- "sshl v15.16b, v29.16b, v1.16b\n"
2263
- "sshl v1.16b, v3.16b, v1.16b\n"
2264
- "and v29.16b, v29.16b, v0.16b\n"
2265
- "and v3.16b, v3.16b, v0.16b\n"
2266
- "ldr q0, [x25, #0x30]\n"
2267
- "fcvtl v20.4s, v20.4h\n"
2268
- ".inst 0x4e8fa773 // smmla v19.4s, v27.16b, v15.16b\n"
2269
- "fcvtl v9.4s, v9.4h\n"
2270
- ".inst 0x4e81a77a // smmla v26.4s, v27.16b, v1.16b\n"
2271
- "ldr q27, [x25, #0x40]\n"
2272
- ".inst 0x4e8fa412 // smmla v18.4s, v0.16b, v15.16b\n"
2273
- ".inst 0x4e81a411 // smmla v17.4s, v0.16b, v1.16b\n"
2274
- "ldr q0, [x25, #0x50]\n"
2275
- ".inst 0x4e95a773 // smmla v19.4s, v27.16b, v21.16b\n"
2276
- ".inst 0x4e90a77a // smmla v26.4s, v27.16b, v16.16b\n"
2277
- "ldr q27, [x25, #0x60]\n"
2278
- ".inst 0x4e95a412 // smmla v18.4s, v0.16b, v21.16b\n"
2279
- ".inst 0x4e90a411 // smmla v17.4s, v0.16b, v16.16b\n"
2280
- "ldr q0, [x25, #0x70]\n"
2281
- "add x25, x25, #0x88\n"
2282
- ".inst 0x4e9da773 // smmla v19.4s, v27.16b, v29.16b\n"
2283
- ".inst 0x4e83a77a // smmla v26.4s, v27.16b, v3.16b\n"
2284
- "ldr d27, [x20, #0x0]\n"
2285
- ".inst 0x4e9da412 // smmla v18.4s, v0.16b, v29.16b\n"
2286
- ".inst 0x4e83a411 // smmla v17.4s, v0.16b, v3.16b\n"
2287
- "fcvtl v27.4s, v27.4h\n"
2288
- "uzp1 v0.2d, v19.2d, v26.2d\n"
2289
- "uzp2 v26.2d, v19.2d, v26.2d\n"
2290
- "fmul v19.4s, v27.4s, v20.s[0]\n"
2291
- "scvtf v0.4s, v0.4s, #0x4\n"
2292
- "scvtf v26.4s, v26.4s, #0x4\n"
2293
- "fmla v2.4s, v0.4s, v19.4s\n"
2294
- "ldr q19, [x23, #0x0]\n"
2295
- "uzp1 v0.2d, v18.2d, v17.2d\n"
2296
- "uzp2 v18.2d, v18.2d, v17.2d\n"
2297
- "fmul v17.4s, v27.4s, v20.s[1]\n"
2298
- "scvtf v0.4s, v0.4s, #0x4\n"
2299
- "scvtf v18.4s, v18.4s, #0x4\n"
2300
- "fmla v10.4s, v26.4s, v17.4s\n"
2301
- "ldr q17, [x23, #0x10]\n"
2302
- "fmul v26.4s, v27.4s, v20.s[2]\n"
2303
- "fmul v20.4s, v27.4s, v20.s[3]\n"
2304
- "fmla v12.4s, v0.4s, v26.4s\n"
2305
- "ldr d0, [x22, #-0x8]\n"
2306
- "ldr d26, [x21, #-0x8]\n"
2307
- "fcvtl v0.4s, v0.4h\n"
2308
- "fmla v28.4s, v18.4s, v20.4s\n"
2309
- "movi v20.4s, #0x0\n"
2310
- "movi v18.4s, #0x0\n"
2311
- ".inst 0x4e88a674 // smmla v20.4s, v19.16b, v8.16b\n"
2312
- ".inst 0x4e9fa672 // smmla v18.4s, v19.16b, v31.16b\n"
2313
- "ldr q19, [x23, #0x20]\n"
2314
- "fcvtl v26.4s, v26.4h\n"
2315
- ".inst 0x4e8fa674 // smmla v20.4s, v19.16b, v15.16b\n"
2316
- ".inst 0x4e81a672 // smmla v18.4s, v19.16b, v1.16b\n"
2317
- "ldr q19, [x23, #0x40]\n"
2318
- ".inst 0x4e95a674 // smmla v20.4s, v19.16b, v21.16b\n"
2319
- ".inst 0x4e90a672 // smmla v18.4s, v19.16b, v16.16b\n"
2320
- "ldr q19, [x23, #0x60]\n"
2321
- ".inst 0x4e9da674 // smmla v20.4s, v19.16b, v29.16b\n"
2322
- ".inst 0x4e83a672 // smmla v18.4s, v19.16b, v3.16b\n"
2323
- "uzp1 v19.2d, v20.2d, v18.2d\n"
2324
- "scvtf v19.4s, v19.4s, #0x4\n"
2325
- "uzp2 v20.2d, v20.2d, v18.2d\n"
2326
- "fmul v18.4s, v27.4s, v9.s[0]\n"
2327
- "scvtf v20.4s, v20.4s, #0x4\n"
2328
- "fmla v11.4s, v19.4s, v18.4s\n"
2329
- "ldr q18, [x22, #0x0]\n"
2330
- "fmul v19.4s, v27.4s, v9.s[1]\n"
2331
- "fmla v13.4s, v20.4s, v19.4s\n"
2332
- "movi v19.4s, #0x0\n"
2333
- "movi v20.4s, #0x0\n"
2334
- ".inst 0x4e88a633 // smmla v19.4s, v17.16b, v8.16b\n"
2335
- ".inst 0x4e9fa634 // smmla v20.4s, v17.16b, v31.16b\n"
2336
- "ldr q17, [x23, #0x30]\n"
2337
- ".inst 0x4e8fa633 // smmla v19.4s, v17.16b, v15.16b\n"
2338
- ".inst 0x4e81a634 // smmla v20.4s, v17.16b, v1.16b\n"
2339
- "ldr q17, [x23, #0x50]\n"
2340
- ".inst 0x4e95a633 // smmla v19.4s, v17.16b, v21.16b\n"
2341
- ".inst 0x4e90a634 // smmla v20.4s, v17.16b, v16.16b\n"
2342
- "ldr q17, [x23, #0x70]\n"
2343
- "add x23, x23, #0x88\n"
2344
- ".inst 0x4e9da633 // smmla v19.4s, v17.16b, v29.16b\n"
2345
- ".inst 0x4e83a634 // smmla v20.4s, v17.16b, v3.16b\n"
2346
- "uzp1 v17.2d, v19.2d, v20.2d\n"
2347
- "scvtf v17.4s, v17.4s, #0x4\n"
2348
- "uzp2 v20.2d, v19.2d, v20.2d\n"
2349
- "fmul v19.4s, v27.4s, v9.s[2]\n"
2350
- "fmul v9.4s, v27.4s, v9.s[3]\n"
2351
- "scvtf v20.4s, v20.4s, #0x4\n"
2352
- "fmla v22.4s, v17.4s, v19.4s\n"
2353
- "ldr q17, [x22, #0x10]\n"
2354
- "movi v19.4s, #0x0\n"
2355
- ".inst 0x4e88a653 // smmla v19.4s, v18.16b, v8.16b\n"
2356
- "fmla v23.4s, v20.4s, v9.4s\n"
2357
- "movi v20.4s, #0x0\n"
2358
- "movi v9.4s, #0x0\n"
2359
- ".inst 0x4e9fa654 // smmla v20.4s, v18.16b, v31.16b\n"
2360
- "ldr q18, [x22, #0x20]\n"
2361
- ".inst 0x4e88a629 // smmla v9.4s, v17.16b, v8.16b\n"
2362
- ".inst 0x4e8fa653 // smmla v19.4s, v18.16b, v15.16b\n"
2363
- ".inst 0x4e81a654 // smmla v20.4s, v18.16b, v1.16b\n"
2364
- "ldr q18, [x22, #0x40]\n"
2365
- ".inst 0x4e95a653 // smmla v19.4s, v18.16b, v21.16b\n"
2366
- ".inst 0x4e90a654 // smmla v20.4s, v18.16b, v16.16b\n"
2367
- "ldr q18, [x22, #0x60]\n"
2368
- ".inst 0x4e9da653 // smmla v19.4s, v18.16b, v29.16b\n"
2369
- ".inst 0x4e83a654 // smmla v20.4s, v18.16b, v3.16b\n"
2370
- "movi v18.4s, #0x0\n"
2371
- ".inst 0x4e9fa632 // smmla v18.4s, v17.16b, v31.16b\n"
2372
- "ldr q17, [x22, #0x30]\n"
2373
- ".inst 0x4e8fa629 // smmla v9.4s, v17.16b, v15.16b\n"
2374
- ".inst 0x4e81a632 // smmla v18.4s, v17.16b, v1.16b\n"
2375
- "ldr q17, [x22, #0x50]\n"
2376
- ".inst 0x4e95a629 // smmla v9.4s, v17.16b, v21.16b\n"
2377
- ".inst 0x4e90a632 // smmla v18.4s, v17.16b, v16.16b\n"
2378
- "ldr q17, [x22, #0x70]\n"
2379
- "add x22, x22, #0x88\n"
2380
- ".inst 0x4e9da629 // smmla v9.4s, v17.16b, v29.16b\n"
2381
- ".inst 0x4e83a632 // smmla v18.4s, v17.16b, v3.16b\n"
2382
- "uzp1 v17.2d, v19.2d, v20.2d\n"
2383
- "uzp2 v20.2d, v19.2d, v20.2d\n"
2384
- "fmul v19.4s, v27.4s, v0.s[0]\n"
2385
- "scvtf v17.4s, v17.4s, #0x4\n"
2386
- "scvtf v20.4s, v20.4s, #0x4\n"
2387
- "fmla v25.4s, v17.4s, v19.4s\n"
2388
- "ldr q19, [x21, #0x0]\n"
2389
- "fmul v17.4s, v27.4s, v0.s[1]\n"
2390
- "fmla v5.4s, v20.4s, v17.4s\n"
2391
- "ldr q17, [x21, #0x10]\n"
2392
- "uzp1 v20.2d, v9.2d, v18.2d\n"
2393
- "uzp2 v9.2d, v9.2d, v18.2d\n"
2394
- "fmul v18.4s, v27.4s, v0.s[2]\n"
2395
- "fmul v0.4s, v27.4s, v0.s[3]\n"
2396
- "scvtf v20.4s, v20.4s, #0x4\n"
2397
- "scvtf v9.4s, v9.4s, #0x4\n"
2398
- "fmla v7.4s, v20.4s, v18.4s\n"
2399
- "movi v20.4s, #0x0\n"
2400
- "movi v18.4s, #0x0\n"
2401
- ".inst 0x4e88a674 // smmla v20.4s, v19.16b, v8.16b\n"
2402
- ".inst 0x4e9fa672 // smmla v18.4s, v19.16b, v31.16b\n"
2403
- "ldr q19, [x21, #0x20]\n"
2404
- "fmla v4.4s, v9.4s, v0.4s\n"
2405
- "movi v9.4s, #0x0\n"
2406
- "movi v0.4s, #0x0\n"
2407
- ".inst 0x4e88a629 // smmla v9.4s, v17.16b, v8.16b\n"
2408
- "fmul v8.4s, v27.4s, v26.s[0]\n"
2409
- ".inst 0x4e9fa620 // smmla v0.4s, v17.16b, v31.16b\n"
2410
- "ldr q17, [x21, #0x30]\n"
2411
- ".inst 0x4e8fa674 // smmla v20.4s, v19.16b, v15.16b\n"
2412
- "fmul v31.4s, v27.4s, v26.s[1]\n"
2413
- ".inst 0x4e81a672 // smmla v18.4s, v19.16b, v1.16b\n"
2414
- "ldr q19, [x21, #0x40]\n"
2415
- ".inst 0x4e8fa629 // smmla v9.4s, v17.16b, v15.16b\n"
2416
- "fmul v15.4s, v27.4s, v26.s[2]\n"
2417
- "fmul v27.4s, v27.4s, v26.s[3]\n"
2418
- ".inst 0x4e81a620 // smmla v0.4s, v17.16b, v1.16b\n"
2419
- "ldr q1, [x21, #0x50]\n"
2420
- ".inst 0x4e95a674 // smmla v20.4s, v19.16b, v21.16b\n"
2421
- ".inst 0x4e90a672 // smmla v18.4s, v19.16b, v16.16b\n"
2422
- "ldr q26, [x21, #0x60]\n"
2423
- ".inst 0x4e95a429 // smmla v9.4s, v1.16b, v21.16b\n"
2424
- ".inst 0x4e90a420 // smmla v0.4s, v1.16b, v16.16b\n"
2425
- "ldr q21, [x21, #0x70]\n"
2426
- "add x21, x21, #0x88\n"
2427
- ".inst 0x4e9da754 // smmla v20.4s, v26.16b, v29.16b\n"
2428
- ".inst 0x4e83a752 // smmla v18.4s, v26.16b, v3.16b\n"
2429
- ".inst 0x4e9da6a9 // smmla v9.4s, v21.16b, v29.16b\n"
2430
- ".inst 0x4e83a6a0 // smmla v0.4s, v21.16b, v3.16b\n"
2431
- "uzp1 v29.2d, v20.2d, v18.2d\n"
2432
- "uzp2 v21.2d, v20.2d, v18.2d\n"
2433
- "scvtf v29.4s, v29.4s, #0x4\n"
2434
- "uzp1 v18.2d, v9.2d, v0.2d\n"
2435
- "uzp2 v16.2d, v9.2d, v0.2d\n"
2436
- "scvtf v21.4s, v21.4s, #0x4\n"
2437
- "fmla v6.4s, v29.4s, v8.4s\n"
2438
- "scvtf v18.4s, v18.4s, #0x4\n"
2439
- "scvtf v16.4s, v16.4s, #0x4\n"
2440
- "fmla v30.4s, v21.4s, v31.4s\n"
2441
- "fmla v24.4s, v18.4s, v15.4s\n"
2442
- "fmla v14.4s, v16.4s, v27.4s\n"
2443
- "bgt 3b\n"
2444
- "mov x20, %x[res_ptr]\n"
2445
- "subs x27, x27, #0x4\n"
2446
- "add %x[res_ptr], %x[res_ptr], #0x10\n"
2447
- "str q2, [x20, #0x0]\n"
2448
- "add x20, x20, %x[res_stride]\n"
2449
- "str q10, [x20, #0x0]\n"
2450
- "add x20, x20, %x[res_stride]\n"
2451
- "str q12, [x20, #0x0]\n"
2452
- "add x20, x20, %x[res_stride]\n"
2453
- "str q28, [x20, #0x0]\n"
2454
- "add x20, x20, %x[res_stride]\n"
2455
- "str q11, [x20, #0x0]\n"
2456
- "add x20, x20, %x[res_stride]\n"
2457
- "str q13, [x20, #0x0]\n"
2458
- "add x20, x20, %x[res_stride]\n"
2459
- "str q22, [x20, #0x0]\n"
2460
- "add x20, x20, %x[res_stride]\n"
2461
- "str q23, [x20, #0x0]\n"
2462
- "add x20, x20, %x[res_stride]\n"
2463
- "str q25, [x20, #0x0]\n"
2464
- "add x20, x20, %x[res_stride]\n"
2465
- "str q5, [x20, #0x0]\n"
2466
- "add x20, x20, %x[res_stride]\n"
2467
- "str q7, [x20, #0x0]\n"
2468
- "add x20, x20, %x[res_stride]\n"
2469
- "str q4, [x20, #0x0]\n"
2470
- "add x20, x20, %x[res_stride]\n"
2471
- "str q6, [x20, #0x0]\n"
2472
- "add x20, x20, %x[res_stride]\n"
2473
- "str q30, [x20, #0x0]\n"
2474
- "add x20, x20, %x[res_stride]\n"
2475
- "str q24, [x20, #0x0]\n"
2476
- "add x20, x20, %x[res_stride]\n"
2477
- "str q14, [x20, #0x0]\n"
2478
- "bne 2b\n"
2479
- "mov x20, #0x4\n"
2480
- "sub x10, x10, #0x10\n"
2481
- "cmp x10, #0x10\n"
2482
- "mov %x[res_ptr], x26\n"
2483
- "madd %x[a_ptr], x20, x9, %x[a_ptr]\n"
2484
- "bge 1b\n"
2485
- "4:" // Row loop skip
2486
- "cbz x10, 9f\n"
2487
- "5:" // Row tail: Row loop
2488
- "add x24, %x[b_ptr], #0x8\n"
2489
- "mov x23, %x[nc]\n"
2490
- "add x22, %x[res_ptr], %x[res_stride], LSL #2\n"
2491
- "6:" // Row tail: Column loop
2492
- "movi v2.16b, #0x0\n"
2493
- "movi v10.16b, #0x0\n"
2494
- "add x25, %x[a_ptr], #0x8\n"
2495
- "mov x21, %x[nb]\n"
2496
- "movi v12.16b, #0x0\n"
2497
- "movi v28.16b, #0x0\n"
2498
- "7:" // Row tail: Block loop
2499
- "ldr q6, [x24, #0x0]\n"
2500
- "ldr q5, [x24, #0x10]\n"
2501
- "movi v17.16b, #0x4\n"
2502
- "movi v8.4s, #0x0\n"
2503
- "ldr q4, [x25, #0x0]\n"
2504
- "ldr q13, [x25, #0x10]\n"
2505
- "movi v27.4s, #0x0\n"
2506
- "movi v0.4s, #0x0\n"
2507
- "ldr q31, [x24, #0x20]\n"
2508
- "ldr q14, [x24, #0x30]\n"
2509
- "movi v29.4s, #0x0\n"
2510
- "movi v22.16b, #0xf0\n"
2511
- "ldr q11, [x25, #0x20]\n"
2512
- "ldr q23, [x25, #0x30]\n"
2513
- "sshl v21.16b, v6.16b, v17.16b\n"
2514
- "sshl v16.16b, v5.16b, v17.16b\n"
2515
- "ldr q20, [x25, #0x40]\n"
2516
- "ldr q26, [x25, #0x50]\n"
2517
- "and v6.16b, v6.16b, v22.16b\n"
2518
- "and v5.16b, v5.16b, v22.16b\n"
2519
- "ldr q25, [x25, #0x60]\n"
2520
- "ldr q3, [x25, #0x70]\n"
2521
- "sshl v19.16b, v31.16b, v17.16b\n"
2522
- "sshl v18.16b, v14.16b, v17.16b\n"
2523
- "ldr d17, [x25, #-0x8]\n"
2524
- ".inst 0x4e95a488 // smmla v8.4s, v4.16b, v21.16b\n"
2525
- ".inst 0x4e90a49b // smmla v27.4s, v4.16b, v16.16b\n"
2526
- "and v31.16b, v31.16b, v22.16b\n"
2527
- ".inst 0x4e95a5a0 // smmla v0.4s, v13.16b, v21.16b\n"
2528
- ".inst 0x4e90a5bd // smmla v29.4s, v13.16b, v16.16b\n"
2529
- "and v14.16b, v14.16b, v22.16b\n"
2530
- "sub x20, x24, #0x8\n"
2531
- "ldr d16, [x20, #0x0]\n"
2532
- "subs x21, x21, #0x1\n"
2533
- "add x25, x25, #0x88\n"
2534
- "fcvtl v17.4s, v17.4h\n"
2535
- "add x24, x24, #0x48\n"
2536
- ".inst 0x4e93a568 // smmla v8.4s, v11.16b, v19.16b\n"
2537
- ".inst 0x4e92a57b // smmla v27.4s, v11.16b, v18.16b\n"
2538
- ".inst 0x4e93a6e0 // smmla v0.4s, v23.16b, v19.16b\n"
2539
- ".inst 0x4e92a6fd // smmla v29.4s, v23.16b, v18.16b\n"
2540
- "fcvtl v16.4s, v16.4h\n"
2541
- ".inst 0x4e86a688 // smmla v8.4s, v20.16b, v6.16b\n"
2542
- ".inst 0x4e85a69b // smmla v27.4s, v20.16b, v5.16b\n"
2543
- "fmul v23.4s, v16.4s, v17.s[0]\n"
2544
- "fmul v21.4s, v16.4s, v17.s[1]\n"
2545
- "fmul v1.4s, v16.4s, v17.s[2]\n"
2546
- "fmul v20.4s, v16.4s, v17.s[3]\n"
2547
- ".inst 0x4e86a740 // smmla v0.4s, v26.16b, v6.16b\n"
2548
- ".inst 0x4e85a75d // smmla v29.4s, v26.16b, v5.16b\n"
2549
- ".inst 0x4e9fa728 // smmla v8.4s, v25.16b, v31.16b\n"
2550
- ".inst 0x4e8ea73b // smmla v27.4s, v25.16b, v14.16b\n"
2551
- ".inst 0x4e9fa460 // smmla v0.4s, v3.16b, v31.16b\n"
2552
- ".inst 0x4e8ea47d // smmla v29.4s, v3.16b, v14.16b\n"
2553
- "uzp1 v19.2d, v8.2d, v27.2d\n"
2554
- "uzp2 v18.2d, v8.2d, v27.2d\n"
2555
- "scvtf v19.4s, v19.4s, #0x4\n"
2556
- "uzp1 v17.2d, v0.2d, v29.2d\n"
2557
- "uzp2 v16.2d, v0.2d, v29.2d\n"
2558
- "scvtf v18.4s, v18.4s, #0x4\n"
2559
- "fmla v2.4s, v19.4s, v23.4s\n"
2560
- "scvtf v17.4s, v17.4s, #0x4\n"
2561
- "scvtf v16.4s, v16.4s, #0x4\n"
2562
- "fmla v10.4s, v18.4s, v21.4s\n"
2563
- "fmla v12.4s, v17.4s, v1.4s\n"
2564
- "fmla v28.4s, v16.4s, v20.4s\n"
2565
- "bgt 7b\n"
2566
- "mov x20, %x[res_ptr]\n"
2567
- "cmp x10, #0x1\n"
2568
- "str q2, [x20, #0x0]\n"
2569
- "add x20, x20, %x[res_stride]\n"
2570
- "ble 8f\n"
2571
- "cmp x10, #0x2\n"
2572
- "str q10, [x20, #0x0]\n"
2573
- "add x20, x20, %x[res_stride]\n"
2574
- "ble 8f\n"
2575
- "cmp x10, #0x3\n"
2576
- "str q12, [x20, #0x0]\n"
2577
- "add x20, x20, %x[res_stride]\n"
2578
- "ble 8f\n"
2579
- "str q28, [x20, #0x0]\n"
2580
- "8:" // Row tail: Accumulator store skip
2581
- "subs x23, x23, #0x4\n"
2582
- "add %x[res_ptr], %x[res_ptr], #0x10\n"
2583
- "bne 6b\n"
2584
- "subs x10, x10, #0x4\n"
2585
- "add %x[a_ptr], %x[a_ptr], x9\n"
2586
- "mov %x[res_ptr], x22\n"
2587
- "bgt 5b\n"
2588
- "9:" // Row tail: Row loop skip
2589
- : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr)
2590
- : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc)
2591
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
2592
- );
2593
- return;
2594
- }
2595
- #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
2596
- float sumf[4][4];
2597
- int sumi;
2598
-
2599
- for (int y = 0; y < nr / 4; y++) {
2600
- const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
2601
- for (int x = 0; x < nc / ncols_interleaved; x++) {
2602
- const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
2603
- for (int m = 0; m < 4; m++) {
2604
- for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
2605
- }
2606
- for (int l = 0; l < nb; l++) {
2607
- for (int k = 0; k < (qk / (2 * blocklen)); k++) {
2608
- for (int m = 0; m < 4; m++) {
2609
- for (int j = 0; j < ncols_interleaved; j++) {
2610
- sumi = 0;
2611
- for (int i = 0; i < blocklen; ++i) {
2612
- const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
2613
- const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
2614
- sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
2615
- (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
2616
- }
2617
- sumf[m][j] += sumi * LM_GGML_FP16_TO_FP32(b_ptr[l].d[j]) * LM_GGML_FP16_TO_FP32(a_ptr[l].d[m]);
2618
- }
2619
- }
2620
- }
2621
- }
2622
- for (int m = 0; m < 4; m++) {
2623
- for (int j = 0; j < ncols_interleaved; j++)
2624
- s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
2625
- }
2626
- }
2627
- }
2628
- }
2629
-
2630
- static void lm_ggml_gemm_q4_0_8x8_q8_0(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc) {
2631
- const int qk = QK8_0;
2632
- const int nb = n / qk;
2633
- const int ncols_interleaved = 8;
2634
- const int blocklen = 8;
2635
-
2636
- assert (n % qk == 0);
2637
- assert (nr % 4 == 0);
2638
- assert (nc % ncols_interleaved == 0);
2639
-
2640
- UNUSED(s);
2641
- UNUSED(bs);
2642
- UNUSED(vx);
2643
- UNUSED(vy);
2644
- UNUSED(nr);
2645
- UNUSED(nc);
2646
- UNUSED(nb);
2647
- UNUSED(ncols_interleaved);
2648
- UNUSED(blocklen);
2649
-
2650
- #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
2651
- #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
2652
- if (lm_ggml_cpu_has_sve() && lm_ggml_cpu_has_matmul_int8() && lm_ggml_cpu_get_sve_cnt() == QK8_0) {
2653
- const void * b_ptr = vx;
2654
- const void * a_ptr = vy;
2655
- float * res_ptr = s;
2656
- size_t res_stride = bs * sizeof(float);
2657
-
2658
- __asm__ __volatile__(
2659
- "mov x20, #0x4\n"
2660
- "mov x13, %x[nr]\n"
2661
- "mov z28.s, #-0x4\n"
2662
- "mov x12, #0x88\n"
2663
- "ptrue p1.b\n"
2664
- "whilelt p0.s, XZR, x20\n"
2665
- "cmp x13, #0x10\n"
2666
- "mul x12, %x[nb], x12\n"
2667
- "blt 4f\n"
2668
- "1:" // Row loop
2669
- "add x11, %x[b_ptr], #0x10\n"
2670
- "mov x10, %x[nc]\n"
2671
- "add x9, %x[res_ptr], %x[res_stride], LSL #4\n"
2672
- "2:" // Column loop
2673
- "add x28, %x[a_ptr], #0x8\n"
2674
- "mov z24.b, #0x0\n"
2675
- "mov z15.b, #0x0\n"
2676
- "mov x27, %x[nb]\n"
2677
- "add x26, x28, x12\n"
2678
- "mov z12.b, #0x0\n"
2679
- "mov z0.b, #0x0\n"
2680
- "add x25, x26, x12\n"
2681
- "mov z13.b, #0x0\n"
2682
- "mov z1.b, #0x0\n"
2683
- "add x24, x25, x12\n"
2684
- "mov z20.b, #0x0\n"
2685
- "mov z25.b, #0x0\n"
2686
- "mov z11.b, #0x0\n"
2687
- "mov z16.b, #0x0\n"
2688
- "mov z19.b, #0x0\n"
2689
- "mov z26.b, #0x0\n"
2690
- "mov z8.b, #0x0\n"
2691
- "mov z29.b, #0x0\n"
2692
- "mov z27.b, #0x0\n"
2693
- "mov z10.b, #0x0\n"
2694
- "3:" // Block loop
2695
- "ld1b { z30.b }, p1/Z, [x11]\n"
2696
- "ld1b { z21.b }, p1/Z, [x11, #1, MUL VL]\n"
2697
- "mov z18.s, #0x0\n"
2698
- "mov z7.s, #0x0\n"
2699
- "ld1rqb { z3.b }, p1/Z, [x28]\n"
2700
- "ld1rqb { z5.b }, p1/Z, [x28, #16]\n"
2701
- "mov z9.s, #0x0\n"
2702
- "mov z22.s, #0x0\n"
2703
- "ld1b { z4.b }, p1/Z, [x11, #2, MUL VL]\n"
2704
- "ld1b { z17.b }, p1/Z, [x11, #3, MUL VL]\n"
2705
- "sub x20, x11, #0x10\n"
2706
- "sub x23, x28, #0x8\n"
2707
- "lsl z31.b, z30.b, #0x4\n"
2708
- "lsl z6.b, z21.b, #0x4\n"
2709
- "ld1h { z23.s }, p1/Z, [x20]\n"
2710
- "sub x22, x26, #0x8\n"
2711
- "and z30.b, z30.b, #0xf0\n"
2712
- "and z21.b, z21.b, #0xf0\n"
2713
- "sub x21, x25, #0x8\n"
2714
- "sub x20, x24, #0x8\n"
2715
- "lsl z14.b, z4.b, #0x4\n"
2716
- "lsl z2.b, z17.b, #0x4\n"
2717
- "subs x27, x27, #0x1\n"
2718
- "add x11, x11, #0x90\n"
2719
- ".inst 0x451f9872 // smmla z18.s, z3.b, z31.b\n"
2720
- ".inst 0x45069867 // smmla z7.s, z3.b, z6.b\n"
2721
- "ld1rqb { z3.b }, p1/Z, [x28, #32]\n"
2722
- "and z4.b, z4.b, #0xf0\n"
2723
- ".inst 0x451f98a9 // smmla z9.s, z5.b, z31.b\n"
2724
- ".inst 0x450698b6 // smmla z22.s, z5.b, z6.b\n"
2725
- "ld1rqb { z5.b }, p1/Z, [x28, #48]\n"
2726
- "and z17.b, z17.b, #0xf0\n"
2727
- "fcvt z23.s, p1/m, z23.h\n"
2728
- ".inst 0x450e9872 // smmla z18.s, z3.b, z14.b\n"
2729
- ".inst 0x45029867 // smmla z7.s, z3.b, z2.b\n"
2730
- "ld1rqb { z3.b }, p1/Z, [x28, #64]\n"
2731
- ".inst 0x450e98a9 // smmla z9.s, z5.b, z14.b\n"
2732
- ".inst 0x450298b6 // smmla z22.s, z5.b, z2.b\n"
2733
- "ld1rqb { z5.b }, p1/Z, [x28, #80]\n"
2734
- "fscale z23.s, p1/m, z23.s, z28.s\n"
2735
- ".inst 0x451e9872 // smmla z18.s, z3.b, z30.b\n"
2736
- ".inst 0x45159867 // smmla z7.s, z3.b, z21.b\n"
2737
- "ld1rqb { z3.b }, p1/Z, [x28, #96]\n"
2738
- ".inst 0x451e98a9 // smmla z9.s, z5.b, z30.b\n"
2739
- ".inst 0x451598b6 // smmla z22.s, z5.b, z21.b\n"
2740
- "ld1rqb { z5.b }, p1/Z, [x28, #112]\n"
2741
- "add x28, x28, #0x88\n"
2742
- ".inst 0x45049872 // smmla z18.s, z3.b, z4.b\n"
2743
- ".inst 0x45119867 // smmla z7.s, z3.b, z17.b\n"
2744
- "ld1h { z3.s }, p0/Z, [x23]\n"
2745
- ".inst 0x450498a9 // smmla z9.s, z5.b, z4.b\n"
2746
- ".inst 0x451198b6 // smmla z22.s, z5.b, z17.b\n"
2747
- "fcvt z3.s, p1/m, z3.h\n"
2748
- "uzp1 z5.d, z18.d, z7.d\n"
2749
- "uzp2 z18.d, z18.d, z7.d\n"
2750
- "mov z3.q, z3.q[0]\n"
2751
- "uzp1 z7.d, z9.d, z22.d\n"
2752
- "uzp2 z22.d, z9.d, z22.d\n"
2753
- "fmul z9.s, z23.s, z3.s[0]\n"
2754
- "scvtf z5.s, p1/m, z5.s\n"
2755
- "scvtf z18.s, p1/m, z18.s\n"
2756
- "scvtf z7.s, p1/m, z7.s\n"
2757
- "scvtf z22.s, p1/m, z22.s\n"
2758
- "fmla z24.s, p1/M, z5.s, z9.s\n"
2759
- "ld1rqb { z5.b }, p1/Z, [x26]\n"
2760
- "fmul z9.s, z23.s, z3.s[1]\n"
2761
- "fmla z15.s, p1/M, z18.s, z9.s\n"
2762
- "ld1rqb { z18.b }, p1/Z, [x26, #16]\n"
2763
- "fmul z9.s, z23.s, z3.s[2]\n"
2764
- "fmul z3.s, z23.s, z3.s[3]\n"
2765
- "fmla z12.s, p1/M, z7.s, z9.s\n"
2766
- "mov z9.s, #0x0\n"
2767
- "ld1h { z7.s }, p0/Z, [x22]\n"
2768
- ".inst 0x451f98a9 // smmla z9.s, z5.b, z31.b\n"
2769
- "fmla z0.s, p1/M, z22.s, z3.s\n"
2770
- "mov z22.s, #0x0\n"
2771
- "ld1h { z3.s }, p0/Z, [x21]\n"
2772
- ".inst 0x450698b6 // smmla z22.s, z5.b, z6.b\n"
2773
- "ld1rqb { z5.b }, p1/Z, [x26, #32]\n"
2774
- "fcvt z7.s, p1/m, z7.h\n"
2775
- "fcvt z3.s, p1/m, z3.h\n"
2776
- ".inst 0x450e98a9 // smmla z9.s, z5.b, z14.b\n"
2777
- ".inst 0x450298b6 // smmla z22.s, z5.b, z2.b\n"
2778
- "ld1rqb { z5.b }, p1/Z, [x26, #64]\n"
2779
- "mov z7.q, z7.q[0]\n"
2780
- "mov z3.q, z3.q[0]\n"
2781
- ".inst 0x451e98a9 // smmla z9.s, z5.b, z30.b\n"
2782
- ".inst 0x451598b6 // smmla z22.s, z5.b, z21.b\n"
2783
- "ld1rqb { z5.b }, p1/Z, [x26, #96]\n"
2784
- ".inst 0x450498a9 // smmla z9.s, z5.b, z4.b\n"
2785
- ".inst 0x451198b6 // smmla z22.s, z5.b, z17.b\n"
2786
- "uzp1 z5.d, z9.d, z22.d\n"
2787
- "scvtf z5.s, p1/m, z5.s\n"
2788
- "uzp2 z22.d, z9.d, z22.d\n"
2789
- "fmul z9.s, z23.s, z7.s[0]\n"
2790
- "scvtf z22.s, p1/m, z22.s\n"
2791
- "fmla z13.s, p1/M, z5.s, z9.s\n"
2792
- "ld1rqb { z9.b }, p1/Z, [x25]\n"
2793
- "fmul z5.s, z23.s, z7.s[1]\n"
2794
- "fmla z1.s, p1/M, z22.s, z5.s\n"
2795
- "mov z5.s, #0x0\n"
2796
- "mov z22.s, #0x0\n"
2797
- ".inst 0x451f9a45 // smmla z5.s, z18.b, z31.b\n"
2798
- ".inst 0x45069a56 // smmla z22.s, z18.b, z6.b\n"
2799
- "ld1rqb { z18.b }, p1/Z, [x26, #48]\n"
2800
- ".inst 0x450e9a45 // smmla z5.s, z18.b, z14.b\n"
2801
- ".inst 0x45029a56 // smmla z22.s, z18.b, z2.b\n"
2802
- "ld1rqb { z18.b }, p1/Z, [x26, #80]\n"
2803
- ".inst 0x451e9a45 // smmla z5.s, z18.b, z30.b\n"
2804
- ".inst 0x45159a56 // smmla z22.s, z18.b, z21.b\n"
2805
- "ld1rqb { z18.b }, p1/Z, [x26, #112]\n"
2806
- "add x26, x26, #0x88\n"
2807
- ".inst 0x45049a45 // smmla z5.s, z18.b, z4.b\n"
2808
- ".inst 0x45119a56 // smmla z22.s, z18.b, z17.b\n"
2809
- "uzp1 z18.d, z5.d, z22.d\n"
2810
- "scvtf z18.s, p1/m, z18.s\n"
2811
- "uzp2 z22.d, z5.d, z22.d\n"
2812
- "fmul z5.s, z23.s, z7.s[2]\n"
2813
- "fmul z7.s, z23.s, z7.s[3]\n"
2814
- "scvtf z22.s, p1/m, z22.s\n"
2815
- "fmla z20.s, p1/M, z18.s, z5.s\n"
2816
- "ld1rqb { z18.b }, p1/Z, [x25, #16]\n"
2817
- "ld1h { z5.s }, p0/Z, [x20]\n"
2818
- "fcvt z5.s, p1/m, z5.h\n"
2819
- "fmla z25.s, p1/M, z22.s, z7.s\n"
2820
- "mov z22.s, #0x0\n"
2821
- "mov z7.s, #0x0\n"
2822
- ".inst 0x451f9936 // smmla z22.s, z9.b, z31.b\n"
2823
- ".inst 0x45069927 // smmla z7.s, z9.b, z6.b\n"
2824
- "ld1rqb { z9.b }, p1/Z, [x25, #32]\n"
2825
- "mov z5.q, z5.q[0]\n"
2826
- ".inst 0x450e9936 // smmla z22.s, z9.b, z14.b\n"
2827
- ".inst 0x45029927 // smmla z7.s, z9.b, z2.b\n"
2828
- "ld1rqb { z9.b }, p1/Z, [x25, #64]\n"
2829
- ".inst 0x451e9936 // smmla z22.s, z9.b, z30.b\n"
2830
- ".inst 0x45159927 // smmla z7.s, z9.b, z21.b\n"
2831
- "ld1rqb { z9.b }, p1/Z, [x25, #96]\n"
2832
- ".inst 0x45049936 // smmla z22.s, z9.b, z4.b\n"
2833
- ".inst 0x45119927 // smmla z7.s, z9.b, z17.b\n"
2834
- "uzp1 z9.d, z22.d, z7.d\n"
2835
- "scvtf z9.s, p1/m, z9.s\n"
2836
- "uzp2 z22.d, z22.d, z7.d\n"
2837
- "fmul z7.s, z23.s, z3.s[0]\n"
2838
- "scvtf z22.s, p1/m, z22.s\n"
2839
- "fmla z11.s, p1/M, z9.s, z7.s\n"
2840
- "ld1rqb { z9.b }, p1/Z, [x24]\n"
2841
- "fmul z7.s, z23.s, z3.s[1]\n"
2842
- "fmla z16.s, p1/M, z22.s, z7.s\n"
2843
- "mov z22.s, #0x0\n"
2844
- "mov z7.s, #0x0\n"
2845
- ".inst 0x451f9a56 // smmla z22.s, z18.b, z31.b\n"
2846
- ".inst 0x45069a47 // smmla z7.s, z18.b, z6.b\n"
2847
- "ld1rqb { z18.b }, p1/Z, [x25, #48]\n"
2848
- ".inst 0x450e9a56 // smmla z22.s, z18.b, z14.b\n"
2849
- ".inst 0x45029a47 // smmla z7.s, z18.b, z2.b\n"
2850
- "ld1rqb { z18.b }, p1/Z, [x25, #80]\n"
2851
- ".inst 0x451e9a56 // smmla z22.s, z18.b, z30.b\n"
2852
- ".inst 0x45159a47 // smmla z7.s, z18.b, z21.b\n"
2853
- "ld1rqb { z18.b }, p1/Z, [x25, #112]\n"
2854
- "add x25, x25, #0x88\n"
2855
- ".inst 0x45049a56 // smmla z22.s, z18.b, z4.b\n"
2856
- ".inst 0x45119a47 // smmla z7.s, z18.b, z17.b\n"
2857
- "uzp1 z18.d, z22.d, z7.d\n"
2858
- "scvtf z18.s, p1/m, z18.s\n"
2859
- "uzp2 z7.d, z22.d, z7.d\n"
2860
- "fmul z22.s, z23.s, z3.s[2]\n"
2861
- "fmul z3.s, z23.s, z3.s[3]\n"
2862
- "scvtf z7.s, p1/m, z7.s\n"
2863
- "fmla z19.s, p1/M, z18.s, z22.s\n"
2864
- "ld1rqb { z18.b }, p1/Z, [x24, #16]\n"
2865
- "fmul z22.s, z23.s, z5.s[0]\n"
2866
- "fmla z26.s, p1/M, z7.s, z3.s\n"
2867
- "mov z3.s, #0x0\n"
2868
- "mov z7.s, #0x0\n"
2869
- ".inst 0x451f9923 // smmla z3.s, z9.b, z31.b\n"
2870
- ".inst 0x45069927 // smmla z7.s, z9.b, z6.b\n"
2871
- "ld1rqb { z9.b }, p1/Z, [x24, #32]\n"
2872
- ".inst 0x450e9923 // smmla z3.s, z9.b, z14.b\n"
2873
- ".inst 0x45029927 // smmla z7.s, z9.b, z2.b\n"
2874
- "mov z9.s, #0x0\n"
2875
- ".inst 0x451f9a49 // smmla z9.s, z18.b, z31.b\n"
2876
- "mov z31.s, #0x0\n"
2877
- ".inst 0x45069a5f // smmla z31.s, z18.b, z6.b\n"
2878
- "ld1rqb { z6.b }, p1/Z, [x24, #48]\n"
2879
- "ld1rqb { z18.b }, p1/Z, [x24, #64]\n"
2880
- ".inst 0x450e98c9 // smmla z9.s, z6.b, z14.b\n"
2881
- "fmul z14.s, z23.s, z5.s[1]\n"
2882
- ".inst 0x450298df // smmla z31.s, z6.b, z2.b\n"
2883
- "ld1rqb { z6.b }, p1/Z, [x24, #80]\n"
2884
- "fmul z2.s, z23.s, z5.s[2]\n"
2885
- "fmul z23.s, z23.s, z5.s[3]\n"
2886
- ".inst 0x451e9a43 // smmla z3.s, z18.b, z30.b\n"
2887
- ".inst 0x45159a47 // smmla z7.s, z18.b, z21.b\n"
2888
- "ld1rqb { z5.b }, p1/Z, [x24, #96]\n"
2889
- ".inst 0x451e98c9 // smmla z9.s, z6.b, z30.b\n"
2890
- ".inst 0x451598df // smmla z31.s, z6.b, z21.b\n"
2891
- "ld1rqb { z18.b }, p1/Z, [x24, #112]\n"
2892
- "add x24, x24, #0x88\n"
2893
- ".inst 0x450498a3 // smmla z3.s, z5.b, z4.b\n"
2894
- ".inst 0x451198a7 // smmla z7.s, z5.b, z17.b\n"
2895
- ".inst 0x45049a49 // smmla z9.s, z18.b, z4.b\n"
2896
- ".inst 0x45119a5f // smmla z31.s, z18.b, z17.b\n"
2897
- "uzp1 z18.d, z3.d, z7.d\n"
2898
- "uzp2 z5.d, z3.d, z7.d\n"
2899
- "scvtf z18.s, p1/m, z18.s\n"
2900
- "uzp1 z6.d, z9.d, z31.d\n"
2901
- "uzp2 z9.d, z9.d, z31.d\n"
2902
- "scvtf z5.s, p1/m, z5.s\n"
2903
- "fmla z8.s, p1/M, z18.s, z22.s\n"
2904
- "scvtf z6.s, p1/m, z6.s\n"
2905
- "scvtf z9.s, p1/m, z9.s\n"
2906
- "fmla z29.s, p1/M, z5.s, z14.s\n"
2907
- "fmla z27.s, p1/M, z6.s, z2.s\n"
2908
- "fmla z10.s, p1/M, z9.s, z23.s\n"
2909
- "bgt 3b\n"
2910
- "mov x20, %x[res_ptr]\n"
2911
- "subs x10, x10, #0x8\n"
2912
- "add %x[res_ptr], %x[res_ptr], #0x20\n"
2913
- "st1w { z24.s }, p1, [x20]\n"
2914
- "add x20, x20, %x[res_stride]\n"
2915
- "st1w { z15.s }, p1, [x20]\n"
2916
- "add x20, x20, %x[res_stride]\n"
2917
- "st1w { z12.s }, p1, [x20]\n"
2918
- "add x20, x20, %x[res_stride]\n"
2919
- "st1w { z0.s }, p1, [x20]\n"
2920
- "add x20, x20, %x[res_stride]\n"
2921
- "st1w { z13.s }, p1, [x20]\n"
2922
- "add x20, x20, %x[res_stride]\n"
2923
- "st1w { z1.s }, p1, [x20]\n"
2924
- "add x20, x20, %x[res_stride]\n"
2925
- "st1w { z20.s }, p1, [x20]\n"
2926
- "add x20, x20, %x[res_stride]\n"
2927
- "st1w { z25.s }, p1, [x20]\n"
2928
- "add x20, x20, %x[res_stride]\n"
2929
- "st1w { z11.s }, p1, [x20]\n"
2930
- "add x20, x20, %x[res_stride]\n"
2931
- "st1w { z16.s }, p1, [x20]\n"
2932
- "add x20, x20, %x[res_stride]\n"
2933
- "st1w { z19.s }, p1, [x20]\n"
2934
- "add x20, x20, %x[res_stride]\n"
2935
- "st1w { z26.s }, p1, [x20]\n"
2936
- "add x20, x20, %x[res_stride]\n"
2937
- "st1w { z8.s }, p1, [x20]\n"
2938
- "add x20, x20, %x[res_stride]\n"
2939
- "st1w { z29.s }, p1, [x20]\n"
2940
- "add x20, x20, %x[res_stride]\n"
2941
- "st1w { z27.s }, p1, [x20]\n"
2942
- "add x20, x20, %x[res_stride]\n"
2943
- "st1w { z10.s }, p1, [x20]\n"
2944
- "bne 2b\n"
2945
- "mov x20, #0x4\n"
2946
- "sub x13, x13, #0x10\n"
2947
- "cmp x13, #0x10\n"
2948
- "mov %x[res_ptr], x9\n"
2949
- "madd %x[a_ptr], x20, x12, %x[a_ptr]\n"
2950
- "bge 1b\n"
2951
- "4:" // Row loop skip
2952
- "cbz x13, 9f\n"
2953
- "5:" // Row tail: Row loop
2954
- "add x25, %x[b_ptr], #0x10\n"
2955
- "mov x24, %x[nc]\n"
2956
- "add x23, %x[res_ptr], %x[res_stride], LSL #2\n"
2957
- "6:" // Row tail: Column loop
2958
- "mov z24.b, #0x0\n"
2959
- "mov z15.b, #0x0\n"
2960
- "add x28, %x[a_ptr], #0x8\n"
2961
- "mov x22, %x[nb]\n"
2962
- "mov z12.b, #0x0\n"
2963
- "mov z0.b, #0x0\n"
2964
- "7:" // Row tail: Block loop
2965
- "ld1b { z3.b }, p1/Z, [x25]\n"
2966
- "ld1b { z6.b }, p1/Z, [x25, #1, MUL VL]\n"
2967
- "mov z2.s, #0x0\n"
2968
- "mov z25.s, #0x0\n"
2969
- "ld1rqb { z26.b }, p1/Z, [x28]\n"
2970
- "ld1rqb { z21.b }, p1/Z, [x28, #16]\n"
2971
- "mov z27.s, #0x0\n"
2972
- "mov z19.s, #0x0\n"
2973
- "ld1b { z29.b }, p1/Z, [x25, #2, MUL VL]\n"
2974
- "ld1b { z16.b }, p1/Z, [x25, #3, MUL VL]\n"
2975
- "sub x21, x25, #0x10\n"
2976
- "sub x20, x28, #0x8\n"
2977
- "lsl z20.b, z3.b, #0x4\n"
2978
- "lsl z4.b, z6.b, #0x4\n"
2979
- "ld1rqb { z10.b }, p1/Z, [x28, #32]\n"
2980
- "ld1rqb { z23.b }, p1/Z, [x28, #48]\n"
2981
- "and z3.b, z3.b, #0xf0\n"
2982
- "and z6.b, z6.b, #0xf0\n"
2983
- "ld1rqb { z11.b }, p1/Z, [x28, #64]\n"
2984
- "ld1rqb { z7.b }, p1/Z, [x28, #80]\n"
2985
- "lsl z8.b, z29.b, #0x4\n"
2986
- "lsl z14.b, z16.b, #0x4\n"
2987
- "ld1rqb { z18.b }, p1/Z, [x28, #96]\n"
2988
- "ld1rqb { z30.b }, p1/Z, [x28, #112]\n"
2989
- ".inst 0x45149b42 // smmla z2.s, z26.b, z20.b\n"
2990
- ".inst 0x45049b59 // smmla z25.s, z26.b, z4.b\n"
2991
- "and z29.b, z29.b, #0xf0\n"
2992
- "ld1h { z17.s }, p1/Z, [x21]\n"
2993
- ".inst 0x45149abb // smmla z27.s, z21.b, z20.b\n"
2994
- ".inst 0x45049ab3 // smmla z19.s, z21.b, z4.b\n"
2995
- "and z16.b, z16.b, #0xf0\n"
2996
- "ld1h { z4.s }, p0/Z, [x20]\n"
2997
- "subs x22, x22, #0x1\n"
2998
- "add x28, x28, #0x88\n"
2999
- "fcvt z17.s, p1/m, z17.h\n"
3000
- "add x25, x25, #0x90\n"
3001
- ".inst 0x45089942 // smmla z2.s, z10.b, z8.b\n"
3002
- ".inst 0x450e9959 // smmla z25.s, z10.b, z14.b\n"
3003
- "fcvt z4.s, p1/m, z4.h\n"
3004
- ".inst 0x45089afb // smmla z27.s, z23.b, z8.b\n"
3005
- ".inst 0x450e9af3 // smmla z19.s, z23.b, z14.b\n"
3006
- "fscale z17.s, p1/m, z17.s, z28.s\n"
3007
- "mov z4.q, z4.q[0]\n"
3008
- ".inst 0x45039962 // smmla z2.s, z11.b, z3.b\n"
3009
- ".inst 0x45069979 // smmla z25.s, z11.b, z6.b\n"
3010
- "fmul z23.s, z17.s, z4.s[0]\n"
3011
- "fmul z9.s, z17.s, z4.s[1]\n"
3012
- "fmul z21.s, z17.s, z4.s[2]\n"
3013
- "fmul z4.s, z17.s, z4.s[3]\n"
3014
- ".inst 0x450398fb // smmla z27.s, z7.b, z3.b\n"
3015
- ".inst 0x450698f3 // smmla z19.s, z7.b, z6.b\n"
3016
- ".inst 0x451d9a42 // smmla z2.s, z18.b, z29.b\n"
3017
- ".inst 0x45109a59 // smmla z25.s, z18.b, z16.b\n"
3018
- ".inst 0x451d9bdb // smmla z27.s, z30.b, z29.b\n"
3019
- ".inst 0x45109bd3 // smmla z19.s, z30.b, z16.b\n"
3020
- "uzp1 z31.d, z2.d, z25.d\n"
3021
- "uzp2 z13.d, z2.d, z25.d\n"
3022
- "scvtf z31.s, p1/m, z31.s\n"
3023
- "uzp1 z17.d, z27.d, z19.d\n"
3024
- "uzp2 z18.d, z27.d, z19.d\n"
3025
- "scvtf z13.s, p1/m, z13.s\n"
3026
- "fmla z24.s, p1/M, z31.s, z23.s\n"
3027
- "scvtf z17.s, p1/m, z17.s\n"
3028
- "scvtf z18.s, p1/m, z18.s\n"
3029
- "fmla z15.s, p1/M, z13.s, z9.s\n"
3030
- "fmla z12.s, p1/M, z17.s, z21.s\n"
3031
- "fmla z0.s, p1/M, z18.s, z4.s\n"
3032
- "bgt 7b\n"
3033
- "mov x20, %x[res_ptr]\n"
3034
- "cmp x13, #0x1\n"
3035
- "st1w { z24.s }, p1, [x20]\n"
3036
- "add x20, x20, %x[res_stride]\n"
3037
- "ble 8f\n"
3038
- "cmp x13, #0x2\n"
3039
- "st1w { z15.s }, p1, [x20]\n"
3040
- "add x20, x20, %x[res_stride]\n"
3041
- "ble 8f\n"
3042
- "cmp x13, #0x3\n"
3043
- "st1w { z12.s }, p1, [x20]\n"
3044
- "add x20, x20, %x[res_stride]\n"
3045
- "ble 8f\n"
3046
- "st1w { z0.s }, p1, [x20]\n"
3047
- "8:" // Row tail: Accumulator store skip
3048
- "subs x24, x24, #0x8\n"
3049
- "add %x[res_ptr], %x[res_ptr], #0x20\n"
3050
- "bne 6b\n"
3051
- "subs x13, x13, #0x4\n"
3052
- "add %x[a_ptr], %x[a_ptr], x12\n"
3053
- "mov %x[res_ptr], x23\n"
3054
- "bgt 5b\n"
3055
- "9:" // Row tail: Row loop skip
3056
- : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr)
3057
- : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc)
3058
- : "cc", "memory", "p0", "p1", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
3059
- );
3060
- return;
3061
- }
3062
- #endif // #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
3063
- #elif defined(__AVX2__) || defined(__AVX512F__)
3064
- {
3065
- const block_q4_0x8 * b_ptr_start = (const block_q4_0x8 *)vx;
3066
- const block_q8_0x4 * a_ptr_start = (const block_q8_0x4 *)vy;
3067
- int64_t b_nb = n / QK4_0;
3068
- int64_t y = 0;
3069
- // Mask to mask out nibbles from packed bytes
3070
- const __m256i m4b = _mm256_set1_epi8(0x0F);
3071
- const __m128i loadMask = _mm_blend_epi32(_mm_setzero_si128(), _mm_set1_epi32(0xFFFFFFFF), 3);
3072
- // Lookup table to convert signed nibbles to signed bytes
3073
- __m256i signextendlut = _mm256_castsi128_si256(_mm_set_epi8(-1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0));
3074
- signextendlut = _mm256_permute2f128_si256(signextendlut, signextendlut, 0);
3075
- // Permute mask used for easier vector processing at later stages
3076
- __m256i requiredOrder = _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4);
3077
- int64_t xstart = 0;
3078
- int anr = nr - nr%16; // Used to align nr with boundary of 16
3079
- #ifdef __AVX512F__
3080
- int anc = nc - nc%16; // Used to align nc with boundary of 16
3081
- // Mask to mask out nibbles from packed bytes expanded to 512 bit length
3082
- const __m512i m4bexpanded = _mm512_set1_epi8(0x0F);
3083
- // Lookup table to convert signed nibbles to signed bytes expanded to 512 bit length
3084
- __m512i signextendlutexpanded = _mm512_inserti32x8(_mm512_castsi256_si512(signextendlut), signextendlut, 1);
3085
-
3086
- // Take group of four block_q8_0x4 structures at each pass of the loop and perform dot product operation
3087
- for (; y < anr / 4; y += 4) {
3088
-
3089
- const block_q8_0x4 * a_ptrs[4];
3090
-
3091
- a_ptrs[0] = a_ptr_start + (y * nb);
3092
- for (int i = 0; i < 3; ++i) {
3093
- a_ptrs[i + 1] = a_ptrs[i] + nb;
3094
- }
3095
-
3096
- // Take group of two block_q4_0x8 structures at each pass of the loop and perform dot product operation
3097
- for (int64_t x = 0; x < anc / 8; x += 2) {
3098
-
3099
- const block_q4_0x8 * b_ptr_0 = b_ptr_start + ((x) * b_nb);
3100
- const block_q4_0x8 * b_ptr_1 = b_ptr_start + ((x + 1) * b_nb);
3101
-
3102
- // Master FP accumulators
3103
- __m512 acc_rows[16];
3104
- for (int i = 0; i < 16; i++) {
3105
- acc_rows[i] = _mm512_setzero_ps();
1052
+ // Master FP accumulators
1053
+ __m512 acc_rows[16];
1054
+ for (int i = 0; i < 16; i++) {
1055
+ acc_rows[i] = _mm512_setzero_ps();
3106
1056
  }
3107
1057
 
3108
1058
  for (int64_t b = 0; b < nb; b++) {
@@ -3783,207 +1733,7 @@ static void lm_ggml_gemm_q4_0_8x8_q8_0(int n, float * LM_GGML_RESTRICT s, size_t
3783
1733
  }
3784
1734
  return;
3785
1735
  }
3786
- #elif defined(__riscv_v_intrinsic)
3787
- if (__riscv_vlenb() >= QK4_0) {
3788
- const size_t vl = QK4_0;
3789
-
3790
- for (int y = 0; y < nr / 4; y++) {
3791
- const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
3792
- for (int x = 0; x < nc / ncols_interleaved; x++) {
3793
- const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
3794
- vfloat32m1_t sumf0 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
3795
- vfloat32m1_t sumf1 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
3796
- vfloat32m1_t sumf2 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
3797
- vfloat32m1_t sumf3 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
3798
- for (int l = 0; l < nb; l++) {
3799
- const vint8m4_t rhs_raw_vec = __riscv_vle8_v_i8m4((const int8_t *)b_ptr[l].qs, vl * 4);
3800
- const vint8m4_t rhs_vec_lo = __riscv_vsra_vx_i8m4(__riscv_vsll_vx_i8m4(rhs_raw_vec, 4, vl * 4), 4, vl * 4);
3801
- const vint8m4_t rhs_vec_hi = __riscv_vsra_vx_i8m4(rhs_raw_vec, 4, vl * 4);
3802
- const vint8m2_t rhs_vec_lo_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 0);
3803
- const vint8m2_t rhs_vec_lo_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 1);
3804
- const vint8m2_t rhs_vec_hi_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 0);
3805
- const vint8m2_t rhs_vec_hi_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 1);
3806
-
3807
- // vector version needs Zvfhmin extension
3808
- const float a_scales[4] = {
3809
- LM_GGML_FP16_TO_FP32(a_ptr[l].d[0]),
3810
- LM_GGML_FP16_TO_FP32(a_ptr[l].d[1]),
3811
- LM_GGML_FP16_TO_FP32(a_ptr[l].d[2]),
3812
- LM_GGML_FP16_TO_FP32(a_ptr[l].d[3])
3813
- };
3814
- const float b_scales[8] = {
3815
- LM_GGML_FP16_TO_FP32(b_ptr[l].d[0]),
3816
- LM_GGML_FP16_TO_FP32(b_ptr[l].d[1]),
3817
- LM_GGML_FP16_TO_FP32(b_ptr[l].d[2]),
3818
- LM_GGML_FP16_TO_FP32(b_ptr[l].d[3]),
3819
- LM_GGML_FP16_TO_FP32(b_ptr[l].d[4]),
3820
- LM_GGML_FP16_TO_FP32(b_ptr[l].d[5]),
3821
- LM_GGML_FP16_TO_FP32(b_ptr[l].d[6]),
3822
- LM_GGML_FP16_TO_FP32(b_ptr[l].d[7])
3823
- };
3824
- const vfloat32m1_t b_scales_vec = __riscv_vle32_v_f32m1(b_scales, vl / 4);
3825
-
3826
- const int64_t A0 = *(const int64_t *)&a_ptr[l].qs[0];
3827
- const int64_t A4 = *(const int64_t *)&a_ptr[l].qs[32];
3828
- const int64_t A8 = *(const int64_t *)&a_ptr[l].qs[64];
3829
- const int64_t Ac = *(const int64_t *)&a_ptr[l].qs[96];
3830
- __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment
3831
- vint16m4_t sumi_l0;
3832
- {
3833
- const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A0, vl / 4));
3834
- const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A4, vl / 4));
3835
- const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A8, vl / 4));
3836
- const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ac, vl / 4));
3837
- const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
3838
- const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
3839
- const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
3840
- const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
3841
-
3842
- sumi_l0 = sumi_hi_m;
3843
- }
3844
-
3845
- {
3846
- const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l0));
3847
- const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
3848
- const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
3849
- const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
3850
- const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
3851
- const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
3852
- const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
3853
- const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
3854
- const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
3855
- const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
3856
- const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
3857
- const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
3858
- const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
3859
-
3860
- const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[0], vl / 4);
3861
- sumf0 = __riscv_vfmacc_vv_f32m1(sumf0, tmp1, b_scales_vec, vl / 4);
3862
- }
3863
-
3864
- const int64_t A1 = *(const int64_t *)&a_ptr[l].qs[8];
3865
- const int64_t A5 = *(const int64_t *)&a_ptr[l].qs[40];
3866
- const int64_t A9 = *(const int64_t *)&a_ptr[l].qs[72];
3867
- const int64_t Ad = *(const int64_t *)&a_ptr[l].qs[104];
3868
- __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment
3869
- vint16m4_t sumi_l1;
3870
- {
3871
- const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A1, vl / 4));
3872
- const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A5, vl / 4));
3873
- const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A9, vl / 4));
3874
- const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ad, vl / 4));
3875
- const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
3876
- const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
3877
- const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
3878
- const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
3879
-
3880
- sumi_l1 = sumi_hi_m;
3881
- }
3882
-
3883
- {
3884
- const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l1));
3885
- const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
3886
- const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
3887
- const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
3888
- const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
3889
- const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
3890
- const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
3891
- const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
3892
- const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
3893
- const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
3894
- const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
3895
- const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
3896
- const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
3897
-
3898
- const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[1], vl / 4);
3899
- sumf1 = __riscv_vfmacc_vv_f32m1(sumf1, tmp1, b_scales_vec, vl / 4);
3900
- }
3901
-
3902
- const int64_t A2 = *(const int64_t *)&a_ptr[l].qs[16];
3903
- const int64_t A6 = *(const int64_t *)&a_ptr[l].qs[48];
3904
- const int64_t Aa = *(const int64_t *)&a_ptr[l].qs[80];
3905
- const int64_t Ae = *(const int64_t *)&a_ptr[l].qs[112];
3906
- __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment
3907
- vint16m4_t sumi_l2;
3908
- {
3909
- const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A2, vl / 4));
3910
- const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A6, vl / 4));
3911
- const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Aa, vl / 4));
3912
- const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ae, vl / 4));
3913
- const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
3914
- const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
3915
- const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
3916
- const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
3917
-
3918
- sumi_l2 = sumi_hi_m;
3919
- }
3920
-
3921
- {
3922
- const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l2));
3923
- const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
3924
- const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
3925
- const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
3926
- const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
3927
- const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
3928
- const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
3929
- const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
3930
- const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
3931
- const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
3932
- const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
3933
- const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
3934
- const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
3935
-
3936
- const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[2], vl / 4);
3937
- sumf2 = __riscv_vfmacc_vv_f32m1(sumf2, tmp1, b_scales_vec, vl / 4);
3938
- }
3939
-
3940
- const int64_t A3 = *(const int64_t *)&a_ptr[l].qs[24];
3941
- const int64_t A7 = *(const int64_t *)&a_ptr[l].qs[56];
3942
- const int64_t Ab = *(const int64_t *)&a_ptr[l].qs[88];
3943
- const int64_t Af = *(const int64_t *)&a_ptr[l].qs[120];
3944
- __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment
3945
- vint16m4_t sumi_l3;
3946
- {
3947
- const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A3, vl / 4));
3948
- const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A7, vl / 4));
3949
- const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ab, vl / 4));
3950
- const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Af, vl / 4));
3951
- const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
3952
- const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
3953
- const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
3954
- const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
3955
-
3956
- sumi_l3 = sumi_hi_m;
3957
- }
3958
1736
 
3959
- {
3960
- const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l3));
3961
- const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
3962
- const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
3963
- const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
3964
- const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
3965
- const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
3966
- const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
3967
- const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
3968
- const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
3969
- const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
3970
- const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
3971
- const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
3972
- const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
3973
-
3974
- const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[3], vl / 4);
3975
- sumf3 = __riscv_vfmacc_vv_f32m1(sumf3, tmp1, b_scales_vec, vl / 4);
3976
- }
3977
- }
3978
- __riscv_vse32_v_f32m1(&s[(y * 4 + 0) * bs + x * ncols_interleaved], sumf0, vl / 4);
3979
- __riscv_vse32_v_f32m1(&s[(y * 4 + 1) * bs + x * ncols_interleaved], sumf1, vl / 4);
3980
- __riscv_vse32_v_f32m1(&s[(y * 4 + 2) * bs + x * ncols_interleaved], sumf2, vl / 4);
3981
- __riscv_vse32_v_f32m1(&s[(y * 4 + 3) * bs + x * ncols_interleaved], sumf3, vl / 4);
3982
- }
3983
- }
3984
-
3985
- return;
3986
- }
3987
1737
  #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
3988
1738
  float sumf[4][8];
3989
1739
  int sumi;
@@ -4006,7 +1756,7 @@ static void lm_ggml_gemm_q4_0_8x8_q8_0(int n, float * LM_GGML_RESTRICT s, size_t
4006
1756
  sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
4007
1757
  (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
4008
1758
  }
4009
- sumf[m][j] += sumi * LM_GGML_FP16_TO_FP32(b_ptr[l].d[j]) * LM_GGML_FP16_TO_FP32(a_ptr[l].d[m]);
1759
+ sumf[m][j] += sumi * LM_GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * LM_GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
4010
1760
  }
4011
1761
  }
4012
1762
  }
@@ -4019,7 +1769,7 @@ static void lm_ggml_gemm_q4_0_8x8_q8_0(int n, float * LM_GGML_RESTRICT s, size_t
4019
1769
  }
4020
1770
  }
4021
1771
 
4022
- static void lm_ggml_gemm_q4_K_8x8_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc) {
1772
+ void lm_ggml_gemm_q4_K_8x8_q8_K(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc) {
4023
1773
  const int qk = QK_K;
4024
1774
  const int nb = n / qk;
4025
1775
  const int ncols_interleaved = 8;
@@ -5510,7 +3260,7 @@ static void lm_ggml_gemm_q4_K_8x8_q8_K(int n, float * LM_GGML_RESTRICT s, size_t
5510
3260
  sumi2 = sumi2 * scales_1[j];
5511
3261
  sumi += sumi1 + sumi2;
5512
3262
  }
5513
- sumf[m][j] += sumi * LM_GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
3263
+ sumf[m][j] += sumi * LM_GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
5514
3264
  }
5515
3265
  }
5516
3266
  }
@@ -5519,7 +3269,7 @@ static void lm_ggml_gemm_q4_K_8x8_q8_K(int n, float * LM_GGML_RESTRICT s, size_t
5519
3269
  for(int m = 0; m < 4; m++) {
5520
3270
  const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
5521
3271
  for(int j = 0; j < ncols_interleaved; j++) {
5522
- sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * LM_GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
3272
+ sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * LM_GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
5523
3273
  }
5524
3274
  }
5525
3275
  }
@@ -5533,899 +3283,3 @@ static void lm_ggml_gemm_q4_K_8x8_q8_K(int n, float * LM_GGML_RESTRICT s, size_t
5533
3283
  }
5534
3284
  #endif
5535
3285
  }
5536
-
5537
- static void lm_ggml_gemm_iq4_nl_4x4_q8_0(int n, float * LM_GGML_RESTRICT s, size_t bs, const void * LM_GGML_RESTRICT vx, const void * LM_GGML_RESTRICT vy, int nr, int nc) {
5538
- const int qk = QK8_0;
5539
- const int nb = n / qk;
5540
- const int ncols_interleaved = 4;
5541
- const int blocklen = 4;
5542
-
5543
- assert (n % qk == 0);
5544
- assert (nr % 4 == 0);
5545
- assert (nc % ncols_interleaved == 0);
5546
-
5547
- UNUSED(s);
5548
- UNUSED(bs);
5549
- UNUSED(vx);
5550
- UNUSED(vy);
5551
- UNUSED(nr);
5552
- UNUSED(nc);
5553
- UNUSED(nb);
5554
- UNUSED(ncols_interleaved);
5555
- UNUSED(blocklen);
5556
-
5557
- #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
5558
- if (lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_dotprod()) {
5559
- const int8x16_t kvalues = vld1q_s8(kvalues_iq4nl);
5560
-
5561
- for (int y = 0; y < nr / 4; y++) {
5562
- const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
5563
- for (int x = 0; x < nc / ncols_interleaved; x++) {
5564
- const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
5565
-
5566
- float32x4_t sumf[4];
5567
- for (int m = 0; m < 4; m++) {
5568
- sumf[m] = vdupq_n_f32(0);
5569
- }
5570
-
5571
- for (int l = 0; l < nb; l++) {
5572
- float32x4_t a_d = vcvt_f32_f16(vld1_f16((const float16_t *)a_ptr[l].d));
5573
- float32x4_t b_d = vcvt_f32_f16(vld1_f16((const float16_t *)b_ptr[l].d));
5574
-
5575
- int32x4_t sumi_0 = vdupq_n_s32(0);
5576
- int32x4_t sumi_1 = vdupq_n_s32(0);
5577
- int32x4_t sumi_2 = vdupq_n_s32(0);
5578
- int32x4_t sumi_3 = vdupq_n_s32(0);
5579
-
5580
- for (int k = 0; k < 4; k++) {
5581
- int8x16_t a_0 = vld1q_s8(a_ptr[l].qs + 16 * k + 0);
5582
- int8x16_t a_1 = vld1q_s8(a_ptr[l].qs + 16 * k + 64);
5583
-
5584
- uint8x16_t b = vld1q_u8(b_ptr[l].qs + 16 * k);
5585
- int8x16_t b_hi = vqtbl1q_s8(kvalues, b >> 4);
5586
- int8x16_t b_lo = vqtbl1q_s8(kvalues, b & 0xF);
5587
-
5588
- sumi_0 = vdotq_laneq_s32(sumi_0, b_lo, a_0, 0);
5589
- sumi_1 = vdotq_laneq_s32(sumi_1, b_lo, a_0, 1);
5590
- sumi_2 = vdotq_laneq_s32(sumi_2, b_lo, a_0, 2);
5591
- sumi_3 = vdotq_laneq_s32(sumi_3, b_lo, a_0, 3);
5592
- sumi_0 = vdotq_laneq_s32(sumi_0, b_hi, a_1, 0);
5593
- sumi_1 = vdotq_laneq_s32(sumi_1, b_hi, a_1, 1);
5594
- sumi_2 = vdotq_laneq_s32(sumi_2, b_hi, a_1, 2);
5595
- sumi_3 = vdotq_laneq_s32(sumi_3, b_hi, a_1, 3);
5596
- }
5597
-
5598
- sumf[0] = vmlaq_f32(sumf[0], vmulq_laneq_f32(b_d, a_d, 0), vcvtq_f32_s32(sumi_0));
5599
- sumf[1] = vmlaq_f32(sumf[1], vmulq_laneq_f32(b_d, a_d, 1), vcvtq_f32_s32(sumi_1));
5600
- sumf[2] = vmlaq_f32(sumf[2], vmulq_laneq_f32(b_d, a_d, 2), vcvtq_f32_s32(sumi_2));
5601
- sumf[3] = vmlaq_f32(sumf[3], vmulq_laneq_f32(b_d, a_d, 3), vcvtq_f32_s32(sumi_3));
5602
- }
5603
-
5604
- for (int m = 0; m < 4; m++) {
5605
- vst1q_f32(s + (y * 4 + m) * bs + x * 4, sumf[m]);
5606
- }
5607
- }
5608
- }
5609
- return;
5610
- }
5611
- #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
5612
- {
5613
- float sumf[4][4];
5614
- int sumi;
5615
-
5616
- for (int y = 0; y < nr / 4; y++) {
5617
- const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
5618
- for (int x = 0; x < nc / ncols_interleaved; x++) {
5619
- const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
5620
- for (int m = 0; m < 4; m++) {
5621
- for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
5622
- }
5623
- for (int l = 0; l < nb; l++) {
5624
- for (int k = 0; k < (qk / (2 * blocklen)); k++) {
5625
- for (int m = 0; m < 4; m++) {
5626
- for (int j = 0; j < ncols_interleaved; j++) {
5627
- sumi = 0;
5628
- for (int i = 0; i < blocklen; ++i) {
5629
- const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
5630
- const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
5631
- sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
5632
- (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
5633
- }
5634
- sumf[m][j] += sumi * LM_GGML_FP16_TO_FP32(b_ptr[l].d[j]) * LM_GGML_FP16_TO_FP32(a_ptr[l].d[m]);
5635
- }
5636
- }
5637
- }
5638
- }
5639
- for (int m = 0; m < 4; m++) {
5640
- for (int j = 0; j < ncols_interleaved; j++)
5641
- s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
5642
- }
5643
- }
5644
- }
5645
- }
5646
- }
5647
-
5648
- static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
5649
- block_q4_0x4 out;
5650
-
5651
- for (int i = 0; i < 4; i++) {
5652
- out.d[i] = in[i].d;
5653
- }
5654
-
5655
- const int end = QK4_0 * 2 / blck_size_interleave;
5656
-
5657
- if (blck_size_interleave == 8) {
5658
- const uint64_t xor_mask = 0x8888888888888888ULL;
5659
- for (int i = 0; i < end; ++i) {
5660
- int src_id = i % 4;
5661
- int src_offset = (i / 4) * blck_size_interleave;
5662
- int dst_offset = i * blck_size_interleave;
5663
-
5664
- uint64_t elems;
5665
- // Using memcpy to avoid unaligned memory accesses
5666
- memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
5667
- elems ^= xor_mask;
5668
- memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
5669
- }
5670
- } else if (blck_size_interleave == 4) {
5671
- const uint32_t xor_mask = 0x88888888;
5672
- for (int i = 0; i < end; ++i) {
5673
- int src_id = i % 4;
5674
- int src_offset = (i / 4) * blck_size_interleave;
5675
- int dst_offset = i * blck_size_interleave;
5676
-
5677
- uint32_t elems;
5678
- memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint32_t));
5679
- elems ^= xor_mask;
5680
- memcpy(&out.qs[dst_offset], &elems, sizeof(uint32_t));
5681
- }
5682
- } else {
5683
- LM_GGML_ASSERT(false);
5684
- }
5685
-
5686
- return out;
5687
- }
5688
-
5689
- // interleave 8 block_q4_0s in blocks of blck_size_interleave
5690
- // returns an interleaved block_q4_0x8
5691
- // in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
5692
- // first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
5693
- static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave) {
5694
- block_q4_0x8 out;
5695
-
5696
- for (int i = 0; i < 8; i++) {
5697
- out.d[i] = in[i].d;
5698
- }
5699
-
5700
- const int end = QK4_0 * 4 / blck_size_interleave;
5701
- const uint64_t xor_mask = 0x8888888888888888ULL;
5702
-
5703
- for (int i = 0; i < end; ++i) {
5704
- int src_id = i % 8;
5705
- int src_offset = (i / 8) * blck_size_interleave;
5706
- int dst_offset = i * blck_size_interleave;
5707
-
5708
- uint64_t elems;
5709
- memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
5710
- elems ^= xor_mask;
5711
- memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
5712
- }
5713
-
5714
- return out;
5715
- }
5716
-
5717
- static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_interleave) {
5718
- block_q4_Kx8 out;
5719
- //Delta(scale) and dmin values of the eight Q4_K structures are copied onto the output interleaved structure
5720
- for (int i = 0; i < 8; i++) {
5721
- out.d[i] = in[i].LM_GGML_COMMON_AGGR_U.LM_GGML_COMMON_AGGR_S.d;
5722
- }
5723
-
5724
- for (int i = 0; i < 8; i++) {
5725
- out.dmin[i] = in[i].LM_GGML_COMMON_AGGR_U.LM_GGML_COMMON_AGGR_S.dmin;
5726
- }
5727
-
5728
- const int end = QK_K * 4 / blck_size_interleave;
5729
-
5730
- // Interleave Q4_K quants by taking 8 bytes at a time
5731
- for (int i = 0; i < end; ++i) {
5732
- int src_id = i % 8;
5733
- int src_offset = (i / 8) * blck_size_interleave;
5734
- int dst_offset = i * blck_size_interleave;
5735
-
5736
- uint64_t elems;
5737
- memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
5738
- memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
5739
- }
5740
-
5741
- // The below logic is designed so as to unpack and rearrange scales and mins values in Q4_K
5742
- // Currently the Q4_K structure has 8 scales and 8 mins packed in 12 bytes ( 6 bits for each value)
5743
- // The output Q4_Kx8 structure has 96 bytes
5744
- // Every 12 byte is packed such that it contains scales and mins for corresponding sub blocks from Q4_K structure
5745
- // For eg - First 12 bytes contains 8 scales and 8 mins - each of first sub block from different Q4_K structures
5746
- uint8_t s[8], m[8];
5747
-
5748
- for (int i = 0; i < 4; i++) {
5749
- for (int j = 0; j < 8; j++) {
5750
- s[j] = in[j].scales[i] & 63;
5751
- m[j] = in[j].scales[i + 4] & 63;
5752
- }
5753
-
5754
- out.scales[i * 12] = (s[0] & 63) + ((s[4] & 48) << 2);
5755
- out.scales[i * 12 + 1] = (s[1] & 63) + ((s[5] & 48) << 2);
5756
- out.scales[i * 12 + 2] = (s[2] & 63) + ((s[6] & 48) << 2);
5757
- out.scales[i * 12 + 3] = (s[3] & 63) + ((s[7] & 48) << 2);
5758
- out.scales[i * 12 + 4] = (m[0] & 63) + ((m[4] & 48) << 2);
5759
- out.scales[i * 12 + 5] = (m[1] & 63) + ((m[5] & 48) << 2);
5760
- out.scales[i * 12 + 6] = (m[2] & 63) + ((m[6] & 48) << 2);
5761
- out.scales[i * 12 + 7] = (m[3] & 63) + ((m[7] & 48) << 2);
5762
- out.scales[i * 12 + 8] = (s[4] & 15) + ((m[4] & 15) << 4);
5763
- out.scales[i * 12 + 9] = (s[5] & 15) + ((m[5] & 15) << 4);
5764
- out.scales[i * 12 + 10] = (s[6] & 15) + ((m[6] & 15) << 4);
5765
- out.scales[i * 12 + 11] = (s[7] & 15) + ((m[7] & 15) << 4);
5766
-
5767
- }
5768
-
5769
- for (int i = 0; i < 4; i++) {
5770
- for (int j = 0; j < 8; j++) {
5771
- s[j] = ((in[j].scales[i] & 192) >> 2) | (in[j].scales[i+8] & 15);
5772
- m[j] = ((in[j].scales[i + 4] & 192) >> 2) | ((in[j].scales[i+8] & 240) >> 4);
5773
- }
5774
-
5775
- out.scales[i * 12 + 48] = (s[0] & 63) + ((s[4] & 48) << 2);
5776
- out.scales[i * 12 + 49] = (s[1] & 63) + ((s[5] & 48) << 2);
5777
- out.scales[i * 12 + 50] = (s[2] & 63) + ((s[6] & 48) << 2);
5778
- out.scales[i * 12 + 51] = (s[3] & 63) + ((s[7] & 48) << 2);
5779
- out.scales[i * 12 + 52] = (m[0] & 63) + ((m[4] & 48) << 2);
5780
- out.scales[i * 12 + 53] = (m[1] & 63) + ((m[5] & 48) << 2);
5781
- out.scales[i * 12 + 54] = (m[2] & 63) + ((m[6] & 48) << 2);
5782
- out.scales[i * 12 + 55] = (m[3] & 63) + ((m[7] & 48) << 2);
5783
- out.scales[i * 12 + 56] = (s[4] & 15) + ((m[4] & 15) << 4);
5784
- out.scales[i * 12 + 57] = (s[5] & 15) + ((m[5] & 15) << 4);
5785
- out.scales[i * 12 + 58] = (s[6] & 15) + ((m[6] & 15) << 4);
5786
- out.scales[i * 12 + 59] = (s[7] & 15) + ((m[7] & 15) << 4);
5787
-
5788
- }
5789
-
5790
- return out;
5791
- }
5792
-
5793
- static int repack_q4_0_to_q4_0_4_bl(struct lm_ggml_tensor * t, int interleave_block, const void * LM_GGML_RESTRICT data, size_t data_size) {
5794
- LM_GGML_ASSERT(t->type == LM_GGML_TYPE_Q4_0);
5795
- LM_GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
5796
- constexpr int nrows_interleaved = 4;
5797
-
5798
- block_q4_0x4 * dst = (block_q4_0x4 *)t->data;
5799
- const block_q4_0 * src = (const block_q4_0 *)data;
5800
- block_q4_0 dst_tmp[4];
5801
- int nrow = lm_ggml_nrows(t);
5802
- int nblocks = t->ne[0] / QK4_0;
5803
-
5804
- LM_GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
5805
-
5806
- if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
5807
- return -1;
5808
- }
5809
-
5810
- for (int b = 0; b < nrow; b += nrows_interleaved) {
5811
- for (int64_t x = 0; x < nblocks; x++) {
5812
- for (int i = 0; i < nrows_interleaved; i++) {
5813
- dst_tmp[i] = src[x + i * nblocks];
5814
- }
5815
- *dst++ = make_block_q4_0x4(dst_tmp, interleave_block);
5816
- }
5817
- src += nrows_interleaved * nblocks;
5818
- }
5819
- return 0;
5820
-
5821
- LM_GGML_UNUSED(data_size);
5822
- }
5823
- static int repack_q4_K_to_q4_K_8_bl(struct lm_ggml_tensor * t, int interleave_block, const void * LM_GGML_RESTRICT data, size_t data_size) {
5824
- LM_GGML_ASSERT(t->type == LM_GGML_TYPE_Q4_K);
5825
- LM_GGML_ASSERT(interleave_block == 8);
5826
- constexpr int nrows_interleaved = 8;
5827
-
5828
- block_q4_Kx8 * dst = (block_q4_Kx8*)t->data;
5829
- const block_q4_K * src = (const block_q4_K*) data;
5830
- block_q4_K dst_tmp[8];
5831
- int nrow = lm_ggml_nrows(t);
5832
- int nblocks = t->ne[0] / QK_K;
5833
-
5834
- LM_GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_K));
5835
-
5836
- if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
5837
- return -1;
5838
- }
5839
-
5840
- for (int b = 0; b < nrow; b += nrows_interleaved) {
5841
- for (int64_t x = 0; x < nblocks; x++) {
5842
- for (int i = 0; i < nrows_interleaved; i++ ) {
5843
- dst_tmp[i] = src[x + i * nblocks];
5844
- }
5845
- *dst++ = make_block_q4_Kx8(dst_tmp, interleave_block);
5846
- }
5847
- src += nrows_interleaved * nblocks;
5848
- }
5849
- return 0;
5850
-
5851
- LM_GGML_UNUSED(data_size);
5852
- }
5853
-
5854
- static int repack_q4_0_to_q4_0_8_bl(struct lm_ggml_tensor * t, int interleave_block, const void * LM_GGML_RESTRICT data, size_t data_size) {
5855
- LM_GGML_ASSERT(t->type == LM_GGML_TYPE_Q4_0);
5856
- LM_GGML_ASSERT(interleave_block == 8);
5857
- constexpr int nrows_interleaved = 8;
5858
-
5859
- block_q4_0x8 * dst = (block_q4_0x8*)t->data;
5860
- const block_q4_0 * src = (const block_q4_0*) data;
5861
- block_q4_0 dst_tmp[8];
5862
- int nrow = lm_ggml_nrows(t);
5863
- int nblocks = t->ne[0] / QK4_0;
5864
-
5865
- LM_GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
5866
-
5867
- if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
5868
- return -1;
5869
- }
5870
-
5871
- for (int b = 0; b < nrow; b += nrows_interleaved) {
5872
- for (int64_t x = 0; x < nblocks; x++) {
5873
- for (int i = 0; i < nrows_interleaved; i++ ) {
5874
- dst_tmp[i] = src[x + i * nblocks];
5875
- }
5876
- *dst++ = make_block_q4_0x8(dst_tmp, interleave_block);
5877
- }
5878
- src += nrows_interleaved * nblocks;
5879
- }
5880
- return 0;
5881
-
5882
- LM_GGML_UNUSED(data_size);
5883
- }
5884
-
5885
- static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_size_interleave) {
5886
- block_iq4_nlx4 out;
5887
-
5888
- for (int i = 0; i < 4; i++) {
5889
- out.d[i] = in[i].d;
5890
- }
5891
-
5892
- const int end = QK4_NL * 2 / blck_size_interleave;
5893
-
5894
- // TODO: this branch seems wrong
5895
- //if (blck_size_interleave == 8) {
5896
- // for (int i = 0; i < end; ++i) {
5897
- // int src_id = i % 4;
5898
- // int src_offset = (i / 4) * blck_size_interleave;
5899
- // int dst_offset = i * blck_size_interleave;
5900
-
5901
- // // Using memcpy to avoid unaligned memory accesses
5902
- // memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
5903
- // }
5904
- //} else
5905
- if (blck_size_interleave == 4) {
5906
- for (int i = 0; i < end; ++i) {
5907
- int src_id = i % 4;
5908
- int src_offset = (i / 4) * blck_size_interleave;
5909
- int dst_offset = i * blck_size_interleave;
5910
-
5911
- memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint32_t));
5912
- }
5913
- } else {
5914
- LM_GGML_ASSERT(false);
5915
- }
5916
-
5917
- return out;
5918
- }
5919
-
5920
- static int repack_iq4_nl_to_iq4_nl_4_bl(struct lm_ggml_tensor * t, int interleave_block, const void * LM_GGML_RESTRICT data, size_t data_size) {
5921
- LM_GGML_ASSERT(t->type == LM_GGML_TYPE_IQ4_NL);
5922
- //LM_GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
5923
- LM_GGML_ASSERT(interleave_block == 4);
5924
-
5925
- block_iq4_nlx4 * dst = (block_iq4_nlx4 *)t->data;
5926
- const block_iq4_nl * src = (const block_iq4_nl *)data;
5927
- block_iq4_nl dst_tmp[4];
5928
- int nrow = lm_ggml_nrows(t);
5929
- int nrows_interleaved = 4;
5930
- int nblocks = t->ne[0] / QK4_0;
5931
-
5932
- LM_GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
5933
-
5934
- if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
5935
- return -1;
5936
- }
5937
-
5938
- for (int b = 0; b < nrow; b += nrows_interleaved) {
5939
- for (int64_t x = 0; x < nblocks; x++) {
5940
- for (int i = 0; i < nrows_interleaved; i++) {
5941
- dst_tmp[i] = src[x + i * nblocks];
5942
- }
5943
- *dst++ = make_block_iq4_nlx4(dst_tmp, interleave_block);
5944
- }
5945
- src += nrows_interleaved * nblocks;
5946
- }
5947
- return 0;
5948
-
5949
- LM_GGML_UNUSED(data_size);
5950
- }
5951
-
5952
- namespace ggml::cpu::aarch64 {
5953
- // repack
5954
- template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
5955
- int repack(struct lm_ggml_tensor *, const void *, size_t);
5956
-
5957
- // TODO: generalise.
5958
- template <> int repack<block_q4_0, 4, 4>(struct lm_ggml_tensor * t, const void * data, size_t data_size) {
5959
- return repack_q4_0_to_q4_0_4_bl(t, 4, data, data_size);
5960
- }
5961
-
5962
- template <> int repack<block_q4_0, 8, 4>(struct lm_ggml_tensor * t, const void * data, size_t data_size) {
5963
- return repack_q4_0_to_q4_0_4_bl(t, 8, data, data_size);
5964
- }
5965
-
5966
- template <> int repack<block_q4_0, 8, 8>(struct lm_ggml_tensor * t, const void * data, size_t data_size) {
5967
- return repack_q4_0_to_q4_0_8_bl(t, 8, data, data_size);
5968
- }
5969
-
5970
- template <> int repack<block_q4_K, 8, 8>(struct lm_ggml_tensor * t, const void * data, size_t data_size) {
5971
- return repack_q4_K_to_q4_K_8_bl(t, 8, data, data_size);
5972
- }
5973
-
5974
- template <> int repack<block_iq4_nl, 4, 4>(struct lm_ggml_tensor * t, const void * data, size_t data_size) {
5975
- return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size);
5976
- }
5977
-
5978
- // TODO: needs to be revisited
5979
- //template <> int repack<block_iq4_nl, 8, 4>(struct lm_ggml_tensor * t, const void * data, size_t data_size) {
5980
- // return repack_iq4_nl_to_iq4_nl_4_bl(t, 8, data, data_size);
5981
- //}
5982
-
5983
- // gemv
5984
- template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, lm_ggml_type PARAM_TYPE>
5985
- void gemv(int, float *, size_t, const void *, const void *, int, int);
5986
-
5987
- template <> void gemv<block_q4_0, 4, 4, LM_GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
5988
- lm_ggml_gemv_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
5989
- }
5990
-
5991
- template <> void gemv<block_q4_0, 8, 4, LM_GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
5992
- lm_ggml_gemv_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
5993
- }
5994
-
5995
- template <> void gemv<block_q4_0, 8, 8, LM_GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
5996
- lm_ggml_gemv_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
5997
- }
5998
-
5999
- template <> void gemv<block_q4_K, 8, 8, LM_GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
6000
- lm_ggml_gemv_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
6001
- }
6002
-
6003
- template <> void gemv<block_iq4_nl, 4, 4, LM_GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
6004
- lm_ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
6005
- }
6006
-
6007
- // gemm
6008
- template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, lm_ggml_type PARAM_TYPE>
6009
- void gemm(int, float *, size_t, const void *, const void *, int, int);
6010
-
6011
- template <> void gemm<block_q4_0, 4, 4, LM_GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
6012
- lm_ggml_gemm_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
6013
- }
6014
-
6015
- template <> void gemm<block_q4_0, 8, 4, LM_GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
6016
- lm_ggml_gemm_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
6017
- }
6018
-
6019
- template <> void gemm<block_q4_0, 8, 8, LM_GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
6020
- lm_ggml_gemm_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
6021
- }
6022
-
6023
- template <> void gemm<block_q4_K, 8, 8, LM_GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
6024
- lm_ggml_gemm_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
6025
- }
6026
-
6027
- template <> void gemm<block_iq4_nl, 4, 4, LM_GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
6028
- lm_ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
6029
- }
6030
-
6031
- class tensor_traits_base : public ggml::cpu::tensor_traits {
6032
- public:
6033
- virtual int repack(struct lm_ggml_tensor * t, const void * data, size_t data_size) = 0;
6034
- };
6035
-
6036
- template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, lm_ggml_type PARAM_TYPE> class tensor_traits : public tensor_traits_base {
6037
-
6038
- bool work_size(int /* n_threads */, const struct lm_ggml_tensor * op, size_t & size) override {
6039
- // not realy a LM_GGML_TYPE_Q8_0 but same size.
6040
- switch (op->op) {
6041
- case LM_GGML_OP_MUL_MAT:
6042
- size = lm_ggml_row_size(PARAM_TYPE, lm_ggml_nelements(op->src[1]));
6043
- return true;
6044
- case LM_GGML_OP_MUL_MAT_ID:
6045
- size = lm_ggml_row_size(PARAM_TYPE, lm_ggml_nelements(op->src[1]));
6046
- size = LM_GGML_PAD(size, sizeof(int64_t)); // + padding for next bloc.
6047
- size += sizeof(int64_t) * (1+op->src[0]->ne[2]) * op->src[1]->ne[2];
6048
- return true;
6049
- default:
6050
- // LM_GGML_ABORT("fatal error");
6051
- break;
6052
- }
6053
- return false;
6054
- }
6055
-
6056
- bool compute_forward(struct lm_ggml_compute_params * params, struct lm_ggml_tensor * op) override {
6057
- switch (op->op) {
6058
- case LM_GGML_OP_MUL_MAT:
6059
- forward_mul_mat(params, op);
6060
- return true;
6061
- case LM_GGML_OP_MUL_MAT_ID:
6062
- forward_mul_mat_id(params, op);
6063
- return true;
6064
- default:
6065
- // LM_GGML_ABORT("fatal error");
6066
- break;
6067
- }
6068
- return false;
6069
- }
6070
-
6071
- void forward_mul_mat(lm_ggml_compute_params * params, lm_ggml_tensor * op) {
6072
- const lm_ggml_tensor * src0 = op->src[0];
6073
- const lm_ggml_tensor * src1 = op->src[1];
6074
- lm_ggml_tensor * dst = op;
6075
-
6076
- LM_GGML_TENSOR_BINARY_OP_LOCALS
6077
-
6078
- const int ith = params->ith;
6079
- const int nth = params->nth;
6080
-
6081
- LM_GGML_ASSERT(ne0 == ne01);
6082
- LM_GGML_ASSERT(ne1 == ne11);
6083
- LM_GGML_ASSERT(ne2 == ne12);
6084
- LM_GGML_ASSERT(ne3 == ne13);
6085
-
6086
- // dst cannot be transposed or permuted
6087
- LM_GGML_ASSERT(nb0 == sizeof(float));
6088
- LM_GGML_ASSERT(nb0 <= nb1);
6089
- LM_GGML_ASSERT(nb1 <= nb2);
6090
- LM_GGML_ASSERT(nb2 <= nb3);
6091
-
6092
- LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F32);
6093
-
6094
- LM_GGML_ASSERT(lm_ggml_n_dims(op->src[0]) == 2);
6095
- // LM_GGML_ASSERT(lm_ggml_n_dims(op->src[1]) == 2);
6096
-
6097
- char * wdata = static_cast<char *>(params->wdata);
6098
- const size_t nbw1 = lm_ggml_row_size(PARAM_TYPE, ne10);
6099
-
6100
- assert(params->wsize >= nbw1 * ne11);
6101
-
6102
- const lm_ggml_from_float_t from_float = lm_ggml_get_type_traits_cpu(PARAM_TYPE)->from_float;
6103
-
6104
- int64_t i11_processed = 0;
6105
- for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
6106
- lm_ggml_quantize_mat_t<INTER_SIZE, PARAM_TYPE>((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), 4, ne10);
6107
- }
6108
-
6109
- i11_processed = ne11 - ne11 % 4;
6110
- for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
6111
- from_float((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), ne10);
6112
- }
6113
-
6114
- lm_ggml_barrier(params->threadpool);
6115
-
6116
- const void * src1_wdata = params->wdata;
6117
- const size_t src1_col_stride = lm_ggml_row_size(PARAM_TYPE, ne10);
6118
- int64_t src0_start = (ith * ne01) / nth;
6119
- int64_t src0_end = ((ith + 1) * ne01) / nth;
6120
- src0_start = (src0_start % NB_COLS) ? src0_start + NB_COLS - (src0_start % NB_COLS) : src0_start;
6121
- src0_end = (src0_end % NB_COLS) ? src0_end + NB_COLS - (src0_end % NB_COLS) : src0_end;
6122
- if (src0_start >= src0_end) {
6123
- return;
6124
- }
6125
-
6126
- // If there are more than three rows in src1, use gemm; otherwise, use gemv.
6127
- if (ne11 > 3) {
6128
- gemm<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
6129
- (float *) ((char *) dst->data) + src0_start, ne01,
6130
- (const char *) src0->data + src0_start * nb01,
6131
- (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start);
6132
- }
6133
- for (int iter = ne11 - ne11 % 4; iter < ne11; iter++) {
6134
- gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
6135
- (float *) ((char *) dst->data + (iter * nb1)) + src0_start, ne01,
6136
- (const char *) src0->data + src0_start * nb01,
6137
- (const char *) src1_wdata + (src1_col_stride * iter), 1,
6138
- src0_end - src0_start);
6139
- }
6140
- }
6141
-
6142
- void forward_mul_mat_id(lm_ggml_compute_params * params, lm_ggml_tensor * op) {
6143
- const lm_ggml_tensor * src0 = op->src[0];
6144
- const lm_ggml_tensor * src1 = op->src[1];
6145
- const lm_ggml_tensor * ids = op->src[2];
6146
- lm_ggml_tensor * dst = op;
6147
-
6148
- LM_GGML_TENSOR_BINARY_OP_LOCALS
6149
-
6150
- const int ith = params->ith;
6151
- const int nth = params->nth;
6152
-
6153
- const lm_ggml_from_float_t from_float = lm_ggml_get_type_traits_cpu(PARAM_TYPE)->from_float;
6154
-
6155
- // we don't support permuted src0 or src1
6156
- LM_GGML_ASSERT(nb00 == lm_ggml_type_size(src0->type));
6157
- LM_GGML_ASSERT(nb10 == lm_ggml_type_size(src1->type));
6158
-
6159
- // dst cannot be transposed or permuted
6160
- LM_GGML_ASSERT(nb0 == sizeof(float));
6161
- LM_GGML_ASSERT(nb0 <= nb1);
6162
- LM_GGML_ASSERT(nb1 <= nb2);
6163
- LM_GGML_ASSERT(nb2 <= nb3);
6164
-
6165
- LM_GGML_ASSERT(ne03 == 1);
6166
- LM_GGML_ASSERT(ne13 == 1);
6167
- LM_GGML_ASSERT(ne3 == 1);
6168
-
6169
- LM_GGML_ASSERT(src1->type == LM_GGML_TYPE_F32);
6170
-
6171
- // row groups
6172
- const int n_ids = ids->ne[0]; // n_expert_used
6173
- const int n_as = ne02; // n_expert
6174
-
6175
- const size_t nbw1 = lm_ggml_row_size(PARAM_TYPE, ne10);
6176
- const size_t nbw2 = nbw1*ne11;
6177
- const size_t nbw3 = nbw2*ne12;
6178
-
6179
- struct mmid_row_mapping {
6180
- int32_t i1;
6181
- int32_t i2;
6182
- };
6183
-
6184
- LM_GGML_ASSERT(params->wsize >= (LM_GGML_PAD(nbw3, sizeof(int64_t)) + n_as * sizeof(int64_t) +
6185
- n_as * ne12 * sizeof(mmid_row_mapping)));
6186
-
6187
- auto * wdata = (char *) params->wdata;
6188
- auto * wdata_src1_end = (char *) wdata + LM_GGML_PAD(nbw3, sizeof(int64_t));
6189
- auto * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
6190
-
6191
- struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *) (matrix_row_counts + n_as); // [n_as][ne12]
6192
-
6193
- // src1: float32 => param type
6194
- for (int64_t i12 = 0; i12 < ne12; ++i12) {
6195
- for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
6196
- from_float((float *)((char *) src1->data + i12 * nb12 + i11 * nb11),
6197
- (void *) (wdata + i12 * nbw2 + i11 * nbw1),
6198
- ne10);
6199
- }
6200
- }
6201
-
6202
- #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id) * ne12 + (i1)]
6203
-
6204
- if (ith == 0) {
6205
- // initialize matrix_row_counts
6206
- memset(matrix_row_counts, 0, n_as * sizeof(int64_t));
6207
-
6208
- // group rows by src0 matrix
6209
- for (int32_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) {
6210
- for (int32_t id = 0; id < n_ids; ++id) {
6211
- const int32_t i02 =
6212
- *(const int32_t *) ((const char *) ids->data + iid1 * ids->nb[1] + id * ids->nb[0]);
6213
-
6214
- LM_GGML_ASSERT(i02 >= 0 && i02 < n_as);
6215
-
6216
- MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = { id, iid1 };
6217
- matrix_row_counts[i02] += 1;
6218
- }
6219
- }
6220
- }
6221
-
6222
- lm_ggml_barrier(params->threadpool);
6223
-
6224
- // compute each matrix multiplication in sequence
6225
- for (int cur_a = 0; cur_a < n_as; ++cur_a) {
6226
- const int64_t cne1 = matrix_row_counts[cur_a];
6227
-
6228
- if (cne1 == 0) {
6229
- continue;
6230
- }
6231
-
6232
- const auto * src0_cur = (const char *) src0->data + cur_a*nb02;
6233
-
6234
- //const int64_t nr0 = ne01; // src0 rows
6235
- const int64_t nr1 = cne1; // src1 rows
6236
-
6237
- int64_t src0_cur_start = (ith * ne01) / nth;
6238
- int64_t src0_cur_end = ((ith + 1) * ne01) / nth;
6239
-
6240
- src0_cur_start = (src0_cur_start % NB_COLS) ? src0_cur_start + NB_COLS - (src0_cur_start % NB_COLS) : src0_cur_start;
6241
- src0_cur_end = (src0_cur_end % NB_COLS) ? src0_cur_end + NB_COLS - (src0_cur_end % NB_COLS) : src0_cur_end;
6242
-
6243
- if (src0_cur_start >= src0_cur_end) {
6244
- return;
6245
- }
6246
-
6247
- for (int ir1 = 0; ir1 < nr1; ir1++) {
6248
- struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, ir1);
6249
-
6250
- const int id = row_mapping.i1; // selected expert index
6251
-
6252
- const int64_t i11 = id % ne11;
6253
- const int64_t i12 = row_mapping.i2; // row index in src1
6254
-
6255
- const int64_t i1 = id; // selected expert index
6256
- const int64_t i2 = i12; // row
6257
-
6258
- const auto * src1_col = (const char *) wdata + (i11 * nbw1 + i12 * nbw2);
6259
-
6260
- gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
6261
- (float *)((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, ne01,
6262
- src0_cur + src0_cur_start * nb01,
6263
- src1_col, 1, src0_cur_end - src0_cur_start);
6264
- }
6265
- }
6266
- #undef MMID_MATRIX_ROW
6267
- }
6268
-
6269
- int repack(struct lm_ggml_tensor * t, const void * data, size_t data_size) override {
6270
- LM_GGML_LOG_DEBUG("%s: repack tensor %s with %s_%dx%d\n", __func__, t->name, lm_ggml_type_name(t->type),
6271
- (int) NB_COLS, (int) INTER_SIZE);
6272
- return ggml::cpu::aarch64::repack<BLOC_TYPE, INTER_SIZE, NB_COLS>(t, data, data_size);
6273
- }
6274
- };
6275
-
6276
- // instance for Q4
6277
- static const tensor_traits<block_q4_0, 4, 4, LM_GGML_TYPE_Q8_0> q4_0_4x4_q8_0;
6278
- static const tensor_traits<block_q4_0, 8, 4, LM_GGML_TYPE_Q8_0> q4_0_4x8_q8_0;
6279
- static const tensor_traits<block_q4_0, 8, 8, LM_GGML_TYPE_Q8_0> q4_0_8x8_q8_0;
6280
- static const tensor_traits<block_q4_K, 8, 8, LM_GGML_TYPE_Q8_K> q4_K_8x8_q8_K;
6281
-
6282
- // instance for IQ4
6283
- static const tensor_traits<block_iq4_nl, 4, 4, LM_GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
6284
-
6285
- } // namespace ggml::cpu::aarch64
6286
-
6287
- static const ggml::cpu::tensor_traits * lm_ggml_aarch64_get_optimal_repack_type(const struct lm_ggml_tensor * cur) {
6288
- if (cur->type == LM_GGML_TYPE_Q4_0) {
6289
- if (lm_ggml_cpu_has_avx2() || (lm_ggml_cpu_has_sve() && lm_ggml_cpu_has_matmul_int8() && lm_ggml_cpu_get_sve_cnt() == QK8_0)) {
6290
- if (cur->ne[1] % 8 == 0) {
6291
- return &ggml::cpu::aarch64::q4_0_8x8_q8_0;
6292
- }
6293
- }
6294
- if (lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_matmul_int8()) {
6295
- if (cur->ne[1] % 4 == 0) {
6296
- return &ggml::cpu::aarch64::q4_0_4x8_q8_0;
6297
- }
6298
- }
6299
- if (lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_dotprod()) {
6300
- if (cur->ne[1] % 4 == 0) {
6301
- return &ggml::cpu::aarch64::q4_0_4x4_q8_0;
6302
- }
6303
- }
6304
- } else if (cur->type == LM_GGML_TYPE_Q4_K) {
6305
- if (lm_ggml_cpu_has_avx2()) {
6306
- if (cur->ne[1] % 8 == 0) {
6307
- return &ggml::cpu::aarch64::q4_K_8x8_q8_K;
6308
- }
6309
- }
6310
- } else if (cur->type == LM_GGML_TYPE_IQ4_NL) {
6311
- if (lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_dotprod()) {
6312
- if (cur->ne[1] % 4 == 0) {
6313
- return &ggml::cpu::aarch64::iq4_nl_4x4_q8_0;
6314
- }
6315
- }
6316
- }
6317
-
6318
- return nullptr;
6319
- }
6320
-
6321
- static enum lm_ggml_status lm_ggml_backend_cpu_aarch64_buffer_init_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor) {
6322
- tensor->extra = (void *) const_cast<ggml::cpu::tensor_traits *>(lm_ggml_aarch64_get_optimal_repack_type(tensor));
6323
-
6324
- LM_GGML_UNUSED(buffer);
6325
- return LM_GGML_STATUS_SUCCESS;
6326
- }
6327
-
6328
- static void lm_ggml_backend_cpu_aarch64_buffer_set_tensor(lm_ggml_backend_buffer_t buffer, struct lm_ggml_tensor * tensor,
6329
- const void * data, size_t offset, size_t size) {
6330
- LM_GGML_ASSERT(offset == 0);
6331
- LM_GGML_ASSERT(size == lm_ggml_nbytes(tensor));
6332
-
6333
- auto tensor_traits = (ggml::cpu::aarch64::tensor_traits_base *) tensor->extra;
6334
- auto OK = tensor_traits->repack(tensor, data, size);
6335
-
6336
- LM_GGML_ASSERT(OK == 0);
6337
- LM_GGML_UNUSED(buffer);
6338
- }
6339
-
6340
- static const char * lm_ggml_backend_cpu_aarch64_buffer_type_get_name(lm_ggml_backend_buffer_type_t buft) {
6341
- return "CPU_AARCH64";
6342
-
6343
- LM_GGML_UNUSED(buft);
6344
- }
6345
-
6346
- static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_aarch64_buffer_type_alloc_buffer(lm_ggml_backend_buffer_type_t buft, size_t size) {
6347
- lm_ggml_backend_buffer_t buffer = lm_ggml_backend_buft_alloc_buffer(lm_ggml_backend_cpu_buffer_type(), size);
6348
-
6349
- if (buffer == nullptr) {
6350
- return nullptr;
6351
- }
6352
-
6353
- buffer->buft = buft;
6354
- buffer->iface.init_tensor = lm_ggml_backend_cpu_aarch64_buffer_init_tensor;
6355
- buffer->iface.set_tensor = lm_ggml_backend_cpu_aarch64_buffer_set_tensor;
6356
- buffer->iface.get_tensor = nullptr;
6357
- buffer->iface.cpy_tensor = nullptr;
6358
- return buffer;
6359
- }
6360
-
6361
- static size_t lm_ggml_backend_cpu_aarch64_buffer_type_get_alignment(lm_ggml_backend_buffer_type_t buft) {
6362
- return TENSOR_ALIGNMENT;
6363
-
6364
- LM_GGML_UNUSED(buft);
6365
- }
6366
-
6367
- namespace ggml::cpu::aarch64 {
6368
- class extra_buffer_type : ggml::cpu::extra_buffer_type {
6369
- bool supports_op(lm_ggml_backend_dev_t, const struct lm_ggml_tensor * op) override {
6370
- if ( op->op == LM_GGML_OP_MUL_MAT &&
6371
- op->src[0]->buffer &&
6372
- (lm_ggml_n_dims(op->src[0]) == 2) &&
6373
- op->src[0]->buffer->buft == lm_ggml_backend_cpu_aarch64_buffer_type() &&
6374
- lm_ggml_aarch64_get_optimal_repack_type(op->src[0])
6375
- ) {
6376
- if (op->src[1]->buffer && !lm_ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
6377
- return false;
6378
- }
6379
- if (op->src[1]->type == LM_GGML_TYPE_F32) {
6380
- return true;
6381
- }
6382
- //if (op->src[1]->type == LM_GGML_TYPE_Q8_0) {
6383
- // return true;
6384
- //}
6385
- // may be possible if Q8_0 packed...
6386
- } else if (op->op == LM_GGML_OP_MUL_MAT_ID
6387
- && op->src[0]->buffer
6388
- && (lm_ggml_n_dims(op->src[0]) == 3)
6389
- && op->src[0]->buffer->buft == lm_ggml_backend_cpu_aarch64_buffer_type()
6390
- && lm_ggml_aarch64_get_optimal_repack_type(op->src[0])
6391
- ) {
6392
- if (op->src[1]->buffer && !lm_ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
6393
- return false;
6394
- }
6395
- if (op->src[1]->type == LM_GGML_TYPE_F32) {
6396
- return true;
6397
- }
6398
- //if (op->src[1]->type == LM_GGML_TYPE_Q8_0) {
6399
- // return true;
6400
- //}
6401
- }
6402
- return false;
6403
- }
6404
-
6405
- ggml::cpu::tensor_traits * get_tensor_traits(const struct lm_ggml_tensor * op) override {
6406
- if (op->op == LM_GGML_OP_MUL_MAT || op->op == LM_GGML_OP_MUL_MAT_ID) {
6407
- if (op->src[0]->buffer && op->src[0]->buffer->buft == lm_ggml_backend_cpu_aarch64_buffer_type()) {
6408
- return (ggml::cpu::tensor_traits *) op->src[0]->extra;
6409
- }
6410
- }
6411
- return nullptr;
6412
- }
6413
- };
6414
- } // namespace ggml::cpu::aarch64
6415
-
6416
- lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_aarch64_buffer_type(void) {
6417
- static struct lm_ggml_backend_buffer_type lm_ggml_backend_cpu_buffer_type_aarch64 = {
6418
- /* .iface = */ {
6419
- /* .get_name = */ lm_ggml_backend_cpu_aarch64_buffer_type_get_name,
6420
- /* .alloc_buffer = */ lm_ggml_backend_cpu_aarch64_buffer_type_alloc_buffer,
6421
- /* .get_alignment = */ lm_ggml_backend_cpu_aarch64_buffer_type_get_alignment,
6422
- /* .get_max_size = */ nullptr, // defaults to SIZE_MAX
6423
- /* .get_alloc_size = */ nullptr, // defaults to lm_ggml_nbytes
6424
- /* .is_host = */ nullptr,
6425
- },
6426
- /* .device = */ lm_ggml_backend_reg_dev_get(lm_ggml_backend_cpu_reg(), 0),
6427
- /* .context = */ new ggml::cpu::aarch64::extra_buffer_type(),
6428
- };
6429
-
6430
- return &lm_ggml_backend_cpu_buffer_type_aarch64;
6431
- }