cui-llama.rn 1.4.4 → 1.4.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197) hide show
  1. package/android/src/main/CMakeLists.txt +2 -2
  2. package/android/src/main/jni.cpp +12 -10
  3. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  10. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  11. package/cpp/chat-template.hpp +529 -529
  12. package/cpp/chat.cpp +959 -265
  13. package/cpp/chat.h +135 -0
  14. package/cpp/common.cpp +2064 -1996
  15. package/cpp/common.h +700 -744
  16. package/cpp/ggml-alloc.c +1039 -1030
  17. package/cpp/ggml-alloc.h +1 -1
  18. package/cpp/ggml-backend-impl.h +255 -255
  19. package/cpp/ggml-backend-reg.cpp +586 -582
  20. package/cpp/ggml-backend.cpp +2004 -2002
  21. package/cpp/ggml-backend.h +354 -354
  22. package/cpp/ggml-common.h +1851 -1851
  23. package/cpp/ggml-cpp.h +39 -39
  24. package/cpp/ggml-cpu-aarch64.cpp +4248 -4247
  25. package/cpp/ggml-cpu-aarch64.h +8 -8
  26. package/cpp/ggml-cpu-impl.h +531 -380
  27. package/cpp/ggml-cpu-quants.c +12527 -11517
  28. package/cpp/ggml-cpu-traits.cpp +36 -36
  29. package/cpp/ggml-cpu-traits.h +38 -38
  30. package/cpp/ggml-cpu.c +15766 -14485
  31. package/cpp/ggml-cpu.cpp +655 -633
  32. package/cpp/ggml-cpu.h +138 -135
  33. package/cpp/ggml-impl.h +567 -567
  34. package/cpp/ggml-metal-impl.h +235 -0
  35. package/cpp/ggml-metal.h +66 -66
  36. package/cpp/ggml-metal.m +5146 -5002
  37. package/cpp/ggml-opt.cpp +854 -854
  38. package/cpp/ggml-opt.h +216 -216
  39. package/cpp/ggml-quants.c +5238 -5238
  40. package/cpp/ggml-threading.h +14 -14
  41. package/cpp/ggml.c +6529 -6524
  42. package/cpp/ggml.h +2198 -2194
  43. package/cpp/gguf.cpp +1329 -1329
  44. package/cpp/gguf.h +202 -202
  45. package/cpp/json-schema-to-grammar.cpp +1024 -1025
  46. package/cpp/json-schema-to-grammar.h +21 -22
  47. package/cpp/json.hpp +24766 -24766
  48. package/cpp/llama-adapter.cpp +347 -347
  49. package/cpp/llama-adapter.h +74 -74
  50. package/cpp/llama-arch.cpp +1513 -1492
  51. package/cpp/llama-arch.h +403 -402
  52. package/cpp/llama-batch.cpp +368 -368
  53. package/cpp/llama-batch.h +88 -88
  54. package/cpp/llama-chat.cpp +588 -587
  55. package/cpp/llama-chat.h +53 -53
  56. package/cpp/llama-context.cpp +1775 -1775
  57. package/cpp/llama-context.h +128 -128
  58. package/cpp/llama-cparams.cpp +1 -1
  59. package/cpp/llama-cparams.h +37 -37
  60. package/cpp/llama-cpp.h +30 -30
  61. package/cpp/llama-grammar.cpp +1219 -1219
  62. package/cpp/llama-grammar.h +173 -164
  63. package/cpp/llama-hparams.cpp +71 -71
  64. package/cpp/llama-hparams.h +139 -139
  65. package/cpp/llama-impl.cpp +167 -167
  66. package/cpp/llama-impl.h +61 -61
  67. package/cpp/llama-kv-cache.cpp +718 -718
  68. package/cpp/llama-kv-cache.h +219 -218
  69. package/cpp/llama-mmap.cpp +600 -590
  70. package/cpp/llama-mmap.h +68 -68
  71. package/cpp/llama-model-loader.cpp +1124 -1124
  72. package/cpp/llama-model-loader.h +167 -167
  73. package/cpp/llama-model.cpp +4087 -4023
  74. package/cpp/llama-model.h +370 -370
  75. package/cpp/llama-sampling.cpp +2558 -2525
  76. package/cpp/llama-sampling.h +32 -32
  77. package/cpp/llama-vocab.cpp +3264 -3252
  78. package/cpp/llama-vocab.h +125 -125
  79. package/cpp/llama.cpp +10284 -10137
  80. package/cpp/llama.h +1354 -1340
  81. package/cpp/log.cpp +393 -423
  82. package/cpp/log.h +132 -132
  83. package/cpp/minja/chat-template.hpp +529 -0
  84. package/cpp/minja/minja.hpp +2915 -0
  85. package/cpp/minja.hpp +2915 -2883
  86. package/cpp/rn-llama.cpp +20 -37
  87. package/cpp/rn-llama.h +12 -2
  88. package/cpp/sampling.cpp +570 -532
  89. package/cpp/sgemm.cpp +2598 -2598
  90. package/cpp/sgemm.h +14 -14
  91. package/cpp/speculative.cpp +278 -277
  92. package/cpp/speculative.h +28 -28
  93. package/package.json +1 -1
  94. package/android/src/main/build-arm64/CMakeCache.txt +0 -429
  95. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeCCompiler.cmake +0 -81
  96. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeCXXCompiler.cmake +0 -101
  97. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeDetermineCompilerABI_C.bin +0 -0
  98. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeDetermineCompilerABI_CXX.bin +0 -0
  99. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeSystem.cmake +0 -15
  100. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdC/CMakeCCompilerId.c +0 -904
  101. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdC/CMakeCCompilerId.o +0 -0
  102. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdCXX/CMakeCXXCompilerId.cpp +0 -919
  103. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdCXX/CMakeCXXCompilerId.o +0 -0
  104. package/android/src/main/build-arm64/CMakeFiles/CMakeConfigureLog.yaml +0 -431
  105. package/android/src/main/build-arm64/CMakeFiles/CMakeDirectoryInformation.cmake +0 -16
  106. package/android/src/main/build-arm64/CMakeFiles/Makefile.cmake +0 -165
  107. package/android/src/main/build-arm64/CMakeFiles/Makefile2 +0 -297
  108. package/android/src/main/build-arm64/CMakeFiles/Progress/1 +0 -1
  109. package/android/src/main/build-arm64/CMakeFiles/Progress/2 +0 -1
  110. package/android/src/main/build-arm64/CMakeFiles/Progress/3 +0 -1
  111. package/android/src/main/build-arm64/CMakeFiles/Progress/4 +0 -1
  112. package/android/src/main/build-arm64/CMakeFiles/Progress/5 +0 -1
  113. package/android/src/main/build-arm64/CMakeFiles/Progress/6 +0 -1
  114. package/android/src/main/build-arm64/CMakeFiles/Progress/count.txt +0 -1
  115. package/android/src/main/build-arm64/CMakeFiles/TargetDirectories.txt +0 -8
  116. package/android/src/main/build-arm64/CMakeFiles/cmake.check_cache +0 -1
  117. package/android/src/main/build-arm64/CMakeFiles/progress.marks +0 -1
  118. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-alloc.c.o +0 -0
  119. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-alloc.c.o.d +0 -58
  120. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend-reg.cpp.o +0 -0
  121. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend-reg.cpp.o.d +0 -756
  122. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend.cpp.o +0 -0
  123. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend.cpp.o.d +0 -709
  124. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-aarch64.cpp.o +0 -0
  125. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-aarch64.cpp.o.d +0 -714
  126. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-quants.c.o +0 -0
  127. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-quants.c.o.d +0 -62
  128. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-traits.cpp.o +0 -0
  129. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-traits.cpp.o.d +0 -708
  130. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.c.o +0 -0
  131. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.c.o.d +0 -113
  132. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.cpp.o +0 -0
  133. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.cpp.o.d +0 -713
  134. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-opt.cpp.o +0 -0
  135. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-opt.cpp.o.d +0 -763
  136. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-quants.c.o +0 -0
  137. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-quants.c.o.d +0 -61
  138. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-threading.cpp.o +0 -0
  139. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-threading.cpp.o.d +0 -707
  140. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml.c.o +0 -0
  141. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml.c.o.d +0 -104
  142. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/gguf.cpp.o +0 -0
  143. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/gguf.cpp.o.d +0 -714
  144. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/log.cpp.o +0 -0
  145. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/log.cpp.o.d +0 -723
  146. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/DependInfo.cmake +0 -62
  147. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/build.make +0 -722
  148. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/cmake_clean.cmake +0 -89
  149. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/compiler_depend.make +0 -2
  150. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/compiler_depend.ts +0 -2
  151. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/depend.make +0 -2
  152. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/flags.make +0 -17
  153. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/progress.make +0 -41
  154. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/DependInfo.cmake +0 -62
  155. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/build.make +0 -722
  156. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/cmake_clean.cmake +0 -89
  157. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/compiler_depend.make +0 -2
  158. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/compiler_depend.ts +0 -2
  159. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/depend.make +0 -2
  160. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/flags.make +0 -17
  161. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/progress.make +0 -41
  162. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/DependInfo.cmake +0 -62
  163. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/build.make +0 -722
  164. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/cmake_clean.cmake +0 -89
  165. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/compiler_depend.make +0 -2
  166. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/compiler_depend.ts +0 -2
  167. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/depend.make +0 -2
  168. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/flags.make +0 -17
  169. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/progress.make +0 -41
  170. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/DependInfo.cmake +0 -62
  171. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/build.make +0 -722
  172. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/cmake_clean.cmake +0 -89
  173. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/compiler_depend.make +0 -2
  174. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/compiler_depend.ts +0 -2
  175. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/depend.make +0 -2
  176. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/flags.make +0 -17
  177. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/progress.make +0 -41
  178. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/DependInfo.cmake +0 -62
  179. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/build.make +0 -722
  180. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/cmake_clean.cmake +0 -89
  181. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/compiler_depend.make +0 -2
  182. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/compiler_depend.ts +0 -2
  183. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/depend.make +0 -2
  184. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/flags.make +0 -17
  185. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/progress.make +0 -41
  186. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/DependInfo.cmake +0 -62
  187. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/build.make +0 -722
  188. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/cmake_clean.cmake +0 -89
  189. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/compiler_depend.make +0 -2
  190. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/compiler_depend.ts +0 -2
  191. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/depend.make +0 -2
  192. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/flags.make +0 -17
  193. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/progress.make +0 -41
  194. package/android/src/main/build-arm64/Makefile +0 -1862
  195. package/android/src/main/build-arm64/cmake_install.cmake +0 -66
  196. package/cpp/chat.hpp +0 -55
  197. package/cpp/rn-llama.hpp +0 -913
package/cpp/sgemm.cpp CHANGED
@@ -1,2598 +1,2598 @@
1
- // Copyright 2024 Mozilla Foundation
2
- //
3
- // Permission is hereby granted, free of charge, to any person obtaining
4
- // a copy of this software and associated documentation files (the
5
- // "Software"), to deal in the Software without restriction, including
6
- // without limitation the rights to use, copy, modify, merge, publish,
7
- // distribute, sublicense, and/or sell copies of the Software, and to
8
- // permit persons to whom the Software is furnished to do so, subject to
9
- // the following conditions:
10
- //
11
- // The above copyright notice and this permission notice shall be
12
- // included in all copies or substantial portions of the Software.
13
- //
14
- // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
- // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
- // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
- // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
18
- // BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
19
- // ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20
- // CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
- // SOFTWARE.
22
-
23
- //
24
- // _ _ ___ _ _ ___
25
- // | |_(_)_ _ _ _| _ ) | /_\ / __|
26
- // | _| | ' \ || | _ \ |__ / _ \\__ \.
27
- // \__|_|_||_\_, |___/____/_/ \_\___/
28
- // |__/
29
- //
30
- // BASIC LINEAR ALGEBRA SUBPROGRAMS
31
- //
32
- //
33
- // This file implements multithreaded CPU matrix multiplication for the
34
- // common contiguous use case C = Aᵀ * B. These kernels are designed to
35
- // have excellent performance[1] for matrices that fit in the CPU cache
36
- // without imposing any overhead such as cache filling or malloc calls.
37
- //
38
- // This implementation does not guarantee any upper bound with rounding
39
- // errors, which grow along with k. Our goal's to maximally exploit the
40
- // hardware for performance, and then use whatever resources remain for
41
- // improving numerical accuracy.
42
- //
43
- // [1] J. Tunney, ‘LLaMA Now Goes Faster on CPUs’, Mar. 2024. [Online].
44
- // Available: https://justine.lol/matmul/. [Accessed: 29-Mar-2024].
45
-
46
- #if defined(__GNUC__)
47
- #pragma GCC diagnostic ignored "-Wpedantic"
48
- #pragma GCC diagnostic ignored "-Wignored-attributes"
49
- #endif
50
-
51
- #include "sgemm.h"
52
- #include "ggml-impl.h"
53
- #include "ggml-cpu-impl.h"
54
- #include "ggml-quants.h"
55
-
56
- #include <atomic>
57
- #include <array>
58
-
59
- #ifdef _MSC_VER
60
- #define NOINLINE __declspec(noinline)
61
- #else
62
- #define NOINLINE __attribute__((__noinline__))
63
- #endif
64
-
65
- #if defined(__ARM_NEON) || defined(__AVX512F__)
66
- #define VECTOR_REGISTERS 32
67
- #else
68
- #define VECTOR_REGISTERS 16
69
- #endif
70
-
71
- #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
72
-
73
- namespace {
74
-
75
- inline float unhalf(lm_ggml_fp16_t d) {
76
- return LM_GGML_FP16_TO_FP32(d);
77
- }
78
-
79
- ////////////////////////////////////////////////////////////////////////////////////////////////////
80
- // VECTORIZED ARITHMETIC OPERATIONS
81
-
82
- #if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
83
- inline __m128 add(__m128 x, __m128 y) { return _mm_add_ps(x, y); }
84
- inline __m128 sub(__m128 x, __m128 y) { return _mm_sub_ps(x, y); }
85
- inline __m128 mul(__m128 x, __m128 y) { return _mm_mul_ps(x, y); }
86
- #endif // __SSE__
87
-
88
- #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
89
- inline __m256 add(__m256 x, __m256 y) { return _mm256_add_ps(x, y); }
90
- inline __m256 sub(__m256 x, __m256 y) { return _mm256_sub_ps(x, y); }
91
- inline __m256 mul(__m256 x, __m256 y) { return _mm256_mul_ps(x, y); }
92
- #endif // __AVX__
93
-
94
- #if defined(__AVX512F__)
95
- inline __m512 add(__m512 x, __m512 y) { return _mm512_add_ps(x, y); }
96
- inline __m512 sub(__m512 x, __m512 y) { return _mm512_sub_ps(x, y); }
97
- inline __m512 mul(__m512 x, __m512 y) { return _mm512_mul_ps(x, y); }
98
- #endif // __AVX512F__
99
-
100
- #if defined(__ARM_NEON)
101
- inline float32x4_t add(float32x4_t x, float32x4_t y) { return vaddq_f32(x, y); }
102
- inline float32x4_t sub(float32x4_t x, float32x4_t y) { return vsubq_f32(x, y); }
103
- inline float32x4_t mul(float32x4_t x, float32x4_t y) { return vmulq_f32(x, y); }
104
- #endif // __ARM_NEON
105
-
106
- #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
107
- inline float16x8_t add(float16x8_t x, float16x8_t y) { return vaddq_f16(x, y); }
108
- inline float16x8_t sub(float16x8_t x, float16x8_t y) { return vsubq_f16(x, y); }
109
- inline float16x8_t mul(float16x8_t x, float16x8_t y) { return vmulq_f16(x, y); }
110
- #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
111
-
112
- #if defined(__MMA__)
113
- typedef vector unsigned char vec_t;
114
- typedef __vector_quad acc_t;
115
- #endif
116
- ////////////////////////////////////////////////////////////////////////////////////////////////////
117
- // VECTORIZED FUSED MULTIPLY ADD
118
-
119
- /**
120
- * Computes a * b + c.
121
- */
122
- template <typename T, typename U>
123
- inline U madd(T a, T b, U c) {
124
- return add(mul(a, b), c);
125
- }
126
-
127
- #if defined(__FMA__)
128
- #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
129
- template <>
130
- inline __m256 madd(__m256 a, __m256 b, __m256 c) {
131
- return _mm256_fmadd_ps(a, b, c);
132
- }
133
- #endif
134
- #if defined(__AVX512F__)
135
- template <>
136
- inline __m512 madd(__m512 a, __m512 b, __m512 c) {
137
- return _mm512_fmadd_ps(a, b, c);
138
- }
139
- #endif
140
- #if defined(__AVX512BF16__)
141
- template <>
142
- inline __m512 madd(__m512bh a, __m512bh b, __m512 c) {
143
- return _mm512_dpbf16_ps(c, a, b);
144
- }
145
- template <>
146
- inline __m256 madd(__m256bh a, __m256bh b, __m256 c) {
147
- return _mm256_dpbf16_ps(c, a, b);
148
- }
149
- #endif
150
- #endif
151
-
152
- #if defined(__ARM_FEATURE_FMA)
153
- template <>
154
- inline float32x4_t madd(float32x4_t a, float32x4_t b, float32x4_t c) {
155
- return vfmaq_f32(c, b, a);
156
- }
157
- #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
158
- template <>
159
- inline float16x8_t madd(float16x8_t a, float16x8_t b, float16x8_t c) {
160
- return vfmaq_f16(c, b, a);
161
- }
162
- #endif
163
- #endif
164
-
165
- ////////////////////////////////////////////////////////////////////////////////////////////////////
166
- // VECTORIZED HORIZONTAL SUM
167
-
168
- #if defined(__ARM_NEON)
169
- inline float hsum(float32x4_t x) {
170
- return vaddvq_f32(x);
171
- }
172
- #endif // __ARM_NEON
173
-
174
- #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
175
- inline float hsum(float16x8_t x) {
176
- return vaddvq_f32(vaddq_f32(vcvt_f32_f16(vget_low_f16(x)),
177
- vcvt_f32_f16(vget_high_f16(x))));
178
- }
179
- #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
180
-
181
- #if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
182
- inline float hsum(__m128 x) {
183
- #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
184
- x = _mm_add_ps(x, _mm_movehl_ps(x, x));
185
- x = _mm_add_ss(x, _mm_movehdup_ps(x));
186
- #else
187
- __m128 t;
188
- t = _mm_shuffle_ps(x, x, _MM_SHUFFLE(2, 3, 0, 1));
189
- x = _mm_add_ps(x, t);
190
- t = _mm_movehl_ps(t, x);
191
- x = _mm_add_ss(x, t);
192
- #endif
193
- return _mm_cvtss_f32(x);
194
- }
195
- #endif
196
-
197
- #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
198
- inline float hsum(__m256 x) {
199
- return hsum(_mm_add_ps(_mm256_extractf128_ps(x, 1),
200
- _mm256_castps256_ps128(x)));
201
- }
202
- #endif // __AVX__
203
-
204
- #if defined(__AVX512F__)
205
- inline float hsum(__m512 x) {
206
- return _mm512_reduce_add_ps(x);
207
- }
208
- #endif // __AVX512F__
209
-
210
- ////////////////////////////////////////////////////////////////////////////////////////////////////
211
- // VECTORIZED MEMORY LOADING
212
-
213
- template <typename T, typename U> T load(const U *);
214
-
215
- #if defined(__ARM_NEON)
216
- template <> inline float32x4_t load(const float *p) {
217
- return vld1q_f32(p);
218
- }
219
- #if !defined(_MSC_VER)
220
- // FIXME: this should check for __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
221
- template <> inline float16x8_t load(const lm_ggml_fp16_t *p) {
222
- return vld1q_f16((const float16_t *)p);
223
- }
224
- template <> inline float32x4_t load(const lm_ggml_fp16_t *p) {
225
- return vcvt_f32_f16(vld1_f16((const float16_t *)p));
226
- }
227
- #endif // _MSC_VER
228
- #endif // __ARM_NEON
229
-
230
- #if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
231
- template <> inline __m128 load(const float *p) {
232
- return _mm_loadu_ps(p);
233
- }
234
- #endif // __SSE__
235
-
236
- #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
237
- template <> inline __m256 load(const float *p) {
238
- return _mm256_loadu_ps(p);
239
- }
240
- #endif // __AVX__
241
-
242
- #if defined(__AVX2__) || defined(__AVX512F__)
243
- template <> inline __m256 load(const lm_ggml_bf16_t *p) {
244
- return _mm256_castsi256_ps(
245
- _mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)p)), 16));
246
- }
247
- #endif // __AVX2__
248
-
249
- #if defined(__F16C__)
250
- template <> inline __m256 load(const lm_ggml_fp16_t *p) {
251
- return _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)p));
252
- }
253
- #endif // __F16C__
254
-
255
- #if defined(__AVX512F__)
256
- template <> inline __m512 load(const float *p) {
257
- return _mm512_loadu_ps(p);
258
- }
259
- template <> inline __m512 load(const lm_ggml_fp16_t *p) {
260
- return _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)p));
261
- }
262
- template <> inline __m512 load(const lm_ggml_bf16_t *p) {
263
- return _mm512_castsi512_ps(
264
- _mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)p)), 16));
265
- }
266
- #endif // __AVX512F__
267
-
268
- #if defined(__AVX512BF16__)
269
- template <> inline __m512bh load(const lm_ggml_bf16_t *p) {
270
- return (__m512bh)_mm512_loadu_ps((const float *)p);
271
- }
272
- template <> inline __m256bh load(const lm_ggml_bf16_t *p) {
273
- return (__m256bh)_mm256_loadu_ps((const float *)p);
274
- }
275
- template <> inline __m512bh load(const float *p) {
276
- return _mm512_cvtne2ps_pbh(_mm512_loadu_ps(p + 16), _mm512_loadu_ps(p));
277
- }
278
- template <> inline __m256bh load(const float *p) {
279
- return _mm512_cvtneps_pbh(_mm512_loadu_ps(p));
280
- }
281
- #endif
282
-
283
- ////////////////////////////////////////////////////////////////////////////////////////////////////
284
- // FLOATING POINT MATRIX MULTIPLICATION
285
-
286
- template <int M>
287
- static inline int64_t BLOCK_SIZE(size_t m) {
288
- const int64_t NB_BLOC_M = (m + M - 1) / M;
289
- return (m % NB_BLOC_M == 0) ? m / NB_BLOC_M : (m / NB_BLOC_M) + 1;
290
- }
291
-
292
- static constexpr inline int64_t BLOC_POS(int64_t ib, int64_t ibN, int64_t bloc_size) {
293
- return ib < ibN ? ib * bloc_size : ibN * bloc_size + (ib - ibN) * (bloc_size - 1);
294
- }
295
-
296
- template <int KN, typename D, typename V, typename TA, typename TB, typename TC>
297
- class tinyBLAS {
298
- public:
299
- tinyBLAS(const lm_ggml_compute_params * params, int64_t k,
300
- const TA *A, int64_t lda,
301
- const TB *B, int64_t ldb,
302
- TC *C, int64_t ldc)
303
- : params(params), A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc) {
304
- }
305
-
306
- bool matmul(int64_t m, int64_t n) {
307
- if (k % KN != 0)
308
- return false;
309
- // compute RM for only need tile with size RM&RM-1
310
- #if VECTOR_REGISTERS == 32
311
- if (m % 16 == 0 && (m/16 >= params->nth)) {
312
- const int64_t SIZE_N = BLOCK_SIZE<6>(n);
313
- mnpack<4, 6, 4>(m, n, SIZE_N, 12);
314
- return true;
315
- }
316
- if (m % 8 == 0 ) {
317
- const int64_t SIZE_N = BLOCK_SIZE<6>(n);
318
- mnpack<4, 6, 2>(m, n, SIZE_N, 12);
319
- return true;
320
- }
321
- if (m % 4 == 0) {
322
- const int64_t SIZE_N = BLOCK_SIZE<6>(n);
323
- mnpack<4, 6, 1>(m, n, SIZE_N, 12);
324
- return true;
325
- }
326
- #else // VECTOR_REGISTERS == 16
327
- if (m % 16 == 0 && (m/16 >= params->nth)) {
328
- const int64_t SIZE_N = BLOCK_SIZE<3>(n);
329
- mnpack<4, 3, 4>(m, n, SIZE_N, 24);
330
- return true;
331
- }
332
- if (m % 8 == 0 ) {
333
- const int64_t SIZE_N = BLOCK_SIZE<3>(n);
334
- mnpack<4, 3, 2>(m, n, SIZE_N, 24);
335
- return true;
336
- }
337
- if (m % 4 == 0) {
338
- const int64_t SIZE_N = BLOCK_SIZE<3>(n);
339
- mnpack<4, 3, 1>(m, n, SIZE_N, 24);
340
- return true;
341
- }
342
- #endif
343
- return false;
344
- }
345
-
346
- private:
347
- template <int RM, int RN, int BM>
348
- inline void mnpack(int64_t m, int64_t n, int64_t SIZE_N, int64_t BN) {
349
- if (SIZE_N == RN) {
350
- return gemm<RM, RN, BM>(m, n, BN);
351
- }
352
- if constexpr (RN > 1) {
353
- return mnpack<RM, RN-1, BM>(m, n, SIZE_N, BN);
354
- } else {
355
- LM_GGML_LOG_ERROR("mnpack<%d, %d> bloc size not supported\n", RM, (int)SIZE_N);
356
- LM_GGML_ASSERT(false); // we have miss something.
357
- }
358
- }
359
-
360
- template <int RM, int RN>
361
- inline void gemm_bloc(int64_t ii, int64_t jj) {
362
- D Cv[RN][RM] = {};
363
- for (int64_t l = 0; l < k; l += KN) {
364
- // help compiler for op order.
365
- if constexpr (RM <= RN) {
366
- V Av[RM];
367
- for (int64_t i = 0; i < RM; ++i) {
368
- Av[i] = load<V>(A + lda * (ii + i) + l);
369
- }
370
- for (int64_t j = 0; j < RN; ++j) {
371
- V Bv = load<V>(B + ldb * (jj + j) + l);
372
- for (int64_t i = 0; i < RM; ++i) {
373
- Cv[j][i] = madd(Av[i], Bv, Cv[j][i]);
374
- }
375
- }
376
- } else {
377
- V Bv[RN];
378
- for (int64_t j = 0; j < RN; ++j) {
379
- Bv[j] = load<V>(B + ldb * (jj + j) + l);
380
- }
381
- for (int64_t i = 0; i < RM; ++i) {
382
- V Av = load<V>(A + lda * (ii + i) + l);
383
- for (int64_t j = 0; j < RN; ++j) {
384
- Cv[j][i] = madd(Av, Bv[j], Cv[j][i]);
385
- }
386
- }
387
- }
388
- }
389
- for (int64_t j = 0; j < RN; ++j)
390
- for (int64_t i = 0; i < RM; ++i)
391
- C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
392
- }
393
-
394
- template <int RM, int RN, int BM>
395
- NOINLINE void gemm(int64_t m, int64_t n, int64_t BN) {
396
- static std::atomic<int64_t> current_chunk;
397
-
398
- LM_GGML_ASSERT(m % (RM * BM) == 0);
399
- const int64_t ytiles = m / (RM * BM);
400
- const int64_t xtiles = (n + RN -1) / RN;
401
- const int64_t jj_RN = (xtiles - (xtiles * RN - n));
402
-
403
- // "round" bloc_size to "nearest" BN
404
- const int64_t NB_BN = xtiles < BN ? 1 : (xtiles + BN / 2) / BN;
405
- const int64_t SIZE_BN = xtiles % NB_BN == 0 ? xtiles / NB_BN : xtiles / NB_BN + 1;
406
- const int64_t jj_BN = (NB_BN - (NB_BN * SIZE_BN - xtiles));
407
- const int64_t nb_job = ytiles * NB_BN;
408
-
409
- if (params->ith == 0) {
410
- LM_GGML_ASSERT( jj_BN * SIZE_BN + (NB_BN - jj_BN) * (SIZE_BN - 1) == xtiles);
411
- // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
412
- std::atomic_store_explicit(&current_chunk, (int64_t)params->nth, std::memory_order_relaxed);
413
- }
414
-
415
- lm_ggml_barrier(params->threadpool);
416
-
417
- int64_t job = params->ith;
418
- while (job < nb_job) {
419
- const int64_t ii = (job % ytiles) * RM * BM;
420
- const int64_t jb = job / ytiles;
421
- const int64_t jr0 = BLOC_POS(jb , jj_BN, SIZE_BN);
422
- const int64_t jrN = BLOC_POS(jb+1, jj_BN, SIZE_BN);
423
-
424
- const int64_t jj0 = BLOC_POS(jr0, jj_RN, RN);
425
- const int64_t jj2 = BLOC_POS(jrN, jj_RN, RN);
426
- const int64_t jj1 = jj2 < jj_RN * RN ? jj2 : jj_RN * RN;
427
-
428
- for (int64_t bi = 0; bi < BM * RM; bi += RM) {
429
- int64_t jj = jj0;
430
- for (; jj < jj1; jj += RN) {
431
- gemm_bloc<RM, RN>(ii + bi, jj);
432
- }
433
- if constexpr (RN > 1) {
434
- for (; jj < jj2; jj += RN - 1) {
435
- gemm_bloc<RM, RN-1>(ii + bi, jj);
436
- }
437
- }
438
- LM_GGML_ASSERT(jj == jj2);
439
- }
440
-
441
- // next step.
442
- job = std::atomic_fetch_add_explicit(&current_chunk, (int64_t)1, std::memory_order_relaxed);
443
- }
444
-
445
- lm_ggml_barrier(params->threadpool);
446
- return;
447
- }
448
-
449
- const lm_ggml_compute_params * params;
450
- const TA *const A;
451
- const TB *const B;
452
- TC *const C;
453
- const int64_t k;
454
- const int64_t lda;
455
- const int64_t ldb;
456
- const int64_t ldc;
457
- };
458
-
459
- //////////////////////////////////////////////////////////////////////////////////////////
460
- // QUANT ZERO MATRIX MULTIPLICATION
461
-
462
#if defined(__ARM_FEATURE_DOTPROD)
// Quantized matmul kernel for ARM CPUs with the SDOT (dot product) extension.
// A holds quantized blocks of type TA (block_q8_0 or block_q4_0 -- see the
// load_lo/load_hi overloads below), B holds block_q8_0, and the output C is
// float. Work is split statically across threads by tile index.
template <typename TA>
class tinyBLAS_Q0_ARM {
  public:
    // k            number of quantized blocks along the shared (K) dimension
    // lda/ldb/ldc  leading dimensions of A, B and C, in elements
    // ith/nth      this thread's index and the total thread count
    tinyBLAS_Q0_ARM(int64_t k,
                    const TA *A, int64_t lda,
                    const block_q8_0 *B, int64_t ldb,
                    float *C, int64_t ldc,
                    int ith, int nth)
        : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
    }

    // Compute the full m x n output by recursively tiling from the origin.
    void matmul(int64_t m, int64_t n) {
        mnpack(0, m, 0, n);
    }

  private:
    // Pick the largest kernel shape (up to 3x3) that fits the remaining
    // m x n region, run it over the divisible part, then recurse on the
    // right (mp..m) and bottom (np..n) remainders.
    // The switch key packs MIN(rows,3) in the high nibble and MIN(cols,3)
    // in the low nibble.
    NOINLINE void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
        int64_t mc, nc, mp, np;
        switch ((MIN(m - m0, 3) << 4) | MIN(n - n0, 3ll)) {
        case 0x33:
            mc = 3;
            nc = 3;
            gemm<3, 3>(m0, m, n0, n);
            break;
        case 0x32:
            mc = 3;
            nc = 2;
            gemm<3, 2>(m0, m, n0, n);
            break;
        case 0x23:
            mc = 2;
            nc = 3;
            gemm<2, 3>(m0, m, n0, n);
            break;
        case 0x22:
            mc = 2;
            nc = 2;
            gemm<2, 2>(m0, m, n0, n);
            break;
        case 0x31:
            mc = 3;
            nc = 1;
            gemm<3, 1>(m0, m, n0, n);
            break;
        case 0x13:
            mc = 1;
            nc = 3;
            gemm<1, 3>(m0, m, n0, n);
            break;
        case 0x21:
            mc = 2;
            nc = 1;
            gemm<2, 1>(m0, m, n0, n);
            break;
        case 0x12:
            mc = 1;
            nc = 2;
            gemm<1, 2>(m0, m, n0, n);
            break;
        case 0x11:
            mc = 1;
            nc = 1;
            gemm<1, 1>(m0, m, n0, n);
            break;
        default:
            return;
        }
        mp = m0 + (m - m0) / mc * mc;
        np = n0 + (n - n0) / nc * nc;
        mnpack(mp, m, n0, np);
        mnpack(m0, m, np, n);
    }

    // RM x RN register-tiled kernel. Tiles are numbered row-major and divided
    // evenly among the nth threads (each thread takes a contiguous range of
    // `duty` tiles). For each tile, accumulate over all k blocks: two chained
    // vdotq_s32 calls cover the low and high 16-byte halves of a 32-element
    // block, and the integer dot product is scaled by the product of the two
    // blocks' fp16 deltas (unhalf) into float accumulators Cv.
    // NOTE(review): unhalf() and hsum() are helpers defined earlier in this
    // file (outside this chunk).
    template <int RM, int RN>
    NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
        int64_t ytiles = (m - m0) / RM;
        int64_t xtiles = (n - n0) / RN;
        int64_t tiles = xtiles * ytiles;
        int64_t duty = (tiles + nth - 1) / nth;
        int64_t start = duty * ith;
        int64_t end = start + duty;
        if (end > tiles)
            end = tiles;
        for (int64_t job = start; job < end; ++job) {
            int64_t ii = m0 + job / xtiles * RM;
            int64_t jj = n0 + job % xtiles * RN;
            float32x4_t Cv[RN][RM] = {};
            for (int64_t l = 0; l < k; ++l)
                for (int64_t j = 0; j < RN; ++j)
                    for (int64_t i = 0; i < RM; ++i)
                        Cv[j][i] = vmlaq_n_f32(Cv[j][i],
                                               vcvtq_f32_s32(vdotq_s32(
                                                   vdotq_s32(vdupq_n_s32(0),
                                                             load_lo(A + lda * (ii + i) + l),
                                                             load_lo(B + ldb * (jj + j) + l)),
                                                   load_hi(A + lda * (ii + i) + l),
                                                   load_hi(B + ldb * (jj + j) + l))),
                                               unhalf(A[lda * (ii + i) + l].d) *
                                               unhalf(B[ldb * (jj + j) + l].d));
            // Horizontally reduce each accumulator into the output cell.
            for (int64_t j = 0; j < RN; ++j)
                for (int64_t i = 0; i < RM; ++i)
                    C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
        }
    }

    // q8_0: bytes are stored directly; low/high 16 of the 32 quants.
    inline int8x16_t load_lo(const block_q8_0 *b) {
        return vld1q_s8(b->qs);
    }

    inline int8x16_t load_hi(const block_q8_0 *b) {
        return vld1q_s8(b->qs + 16);
    }

    // q4_0: each byte packs two nibbles. Low half = low nibbles - 8,
    // high half = high nibbles - 8 (recentering the unsigned 4-bit quants).
    inline int8x16_t load_lo(const block_q4_0 *b) {
        return vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vld1q_u8(b->qs),
                                                     vdupq_n_u8(0x0f))),
                        vdupq_n_s8(0x8));
    }

    inline int8x16_t load_hi(const block_q4_0 *b) {
        return vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(vld1q_u8(b->qs), 4)),
                        vdupq_n_s8(0x8));
    }

    const TA *const A;            // left operand, quantized blocks
    const block_q8_0 *const B;    // right operand, q8_0 blocks
    float *const C;               // output matrix
    const int64_t k;              // shared-dimension block count
    const int64_t lda;
    const int64_t ldb;
    const int64_t ldc;
    const int ith;                // this thread
    const int nth;                // thread count
};
#endif // __ARM_FEATURE_DOTPROD
598
-
599
#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
// Quantized matmul kernel for x86 with AVX/AVX2/AVX-512. TA/TB are quantized
// block types (q8_0, q4_0, q5_0, iq4_nl -- see the load() overloads below),
// TC is the float output type. Tiles are split statically across threads.
template <typename TA, typename TB, typename TC>
class tinyBLAS_Q0_AVX {
  public:
    // k            number of quantized blocks along the shared (K) dimension
    // lda/ldb/ldc  leading dimensions of A, B and C, in elements
    // ith/nth      this thread's index and the total thread count
    tinyBLAS_Q0_AVX(int64_t k,
                    const TA *A, int64_t lda,
                    const TB *B, int64_t ldb,
                    TC *C, int64_t ldc,
                    int ith, int nth)
        : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
        // Dequantization codebook for IQ4_NL, kept in a register-width
        // lookup table used via pshufb in the block_iq4_nl loaders.
        const int8_t kvalues_iq4nl[16] = {
            -127, -104, -83, -65,
            -49, -35, -22, -10,
            1, 13, 25, 38,
            53, 69, 89, 113
        };

        iq4nlt = _mm_loadu_si128((const __m128i *)kvalues_iq4nl);
    }

    // Compute the full m x n output by recursively tiling from the origin.
    void matmul(int64_t m, int64_t n) {
        mnpack(0, m, 0, n);
    }

  private:
    // Pick the largest kernel shape (up to 4x4) that fits the remaining
    // m x n region, run it, then recurse on the right and bottom remainders.
    // With 32 vector registers (AVX-512) the big 4x4/4x3/3x4 shapes are
    // enabled; with 16 registers those cases degrade to 4x2/2x4. When
    // AVX2+F16C are available the specialized gemm4xN/gemmMx4 kernels are
    // used, otherwise the generic gemm<RM, RN>.
    void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
        int64_t mc, nc, mp, np;
        switch ((MIN(m - m0, 4) << 4) | MIN(n - n0, 4)) {
#if VECTOR_REGISTERS == 32
        case 0x44:
            mc = 4;
            nc = 4;
#if defined(__AVX2__) && defined(__F16C__)
            gemm4xN<4>(m0, m, n0, n);
#else
            gemm<4, 4>(m0, m, n0, n);
#endif
            break;
        case 0x43:
            mc = 4;
            nc = 3;
#if defined(__AVX2__) && defined(__F16C__)
            gemm4xN<3>(m0, m, n0, n);
#else
            gemm<4, 3>(m0, m, n0, n);
#endif
            break;
        case 0x34:
            mc = 3;
            nc = 4;
#if defined(__AVX2__) && defined(__F16C__)
            gemmMx4<3>(m0, m, n0, n);
#else
            gemm<3, 4>(m0, m, n0, n);
#endif
            break;
        case 0x33:
            mc = 3;
            nc = 3;
            gemm<3, 3>(m0, m, n0, n);
            break;
        case 0x42:
            mc = 4;
            nc = 2;
#if defined(__AVX2__) && defined(__F16C__)
            gemm4xN<2>(m0, m, n0, n);
#else
            gemm<4, 2>(m0, m, n0, n);
#endif
            break;
        case 0x24:
            mc = 2;
            nc = 4;
#if defined(__AVX2__) && defined(__F16C__)
            gemmMx4<2>(m0, m, n0, n);
#else
            gemm<2, 4>(m0, m, n0, n);
#endif
            break;
#else
        // Only 16 vector registers: cap register tiles at 4x2 / 2x4.
        case 0x44:
        case 0x43:
        case 0x42:
            mc = 4;
            nc = 2;
#if defined(__AVX2__) && defined(__F16C__)
            gemm4xN<2>(m0, m, n0, n);
#else
            gemm<4, 2>(m0, m, n0, n);
#endif
            break;
        case 0x34:
        case 0x24:
            mc = 2;
            nc = 4;
#if defined(__AVX2__) && defined(__F16C__)
            gemmMx4<2>(m0, m, n0, n);
#else
            gemm<2, 4>(m0, m, n0, n);
#endif
            break;
        case 0x33:  // falls through to the 3x2 kernel in the 16-register build
#endif
        case 0x32:
            mc = 3;
            nc = 2;
            gemm<3, 2>(m0, m, n0, n);
            break;
        case 0x23:
            mc = 2;
            nc = 3;
            gemm<2, 3>(m0, m, n0, n);
            break;
        case 0x41:
            mc = 4;
            nc = 1;
#if defined(__AVX2__) && defined(__F16C__)
            gemm4xN<1>(m0, m, n0, n);
#else
            gemm<4, 1>(m0, m, n0, n);
#endif
            break;
        case 0x22:
            mc = 2;
            nc = 2;
            gemm<2, 2>(m0, m, n0, n);
            break;
        case 0x14:
            mc = 1;
            nc = 4;
#if defined(__AVX2__) && defined(__F16C__)
            gemmMx4<1>(m0, m, n0, n);
#else
            gemm<1, 4>(m0, m, n0, n);
#endif
            break;
        case 0x31:
            mc = 3;
            nc = 1;
            gemm<3, 1>(m0, m, n0, n);
            break;
        case 0x13:
            mc = 1;
            nc = 3;
            gemm<1, 3>(m0, m, n0, n);
            break;
        case 0x21:
            mc = 2;
            nc = 1;
            gemm<2, 1>(m0, m, n0, n);
            break;
        case 0x12:
            mc = 1;
            nc = 2;
            gemm<1, 2>(m0, m, n0, n);
            break;
        case 0x11:
            mc = 1;
            nc = 1;
            gemm<1, 1>(m0, m, n0, n);
            break;
        default:
            return;
        }
        mp = m0 + (m - m0) / mc * mc;
        np = n0 + (n - n0) / nc * nc;
        mnpack(mp, m, n0, np);
        mnpack(m0, m, np, n);
    }

#if defined(__AVX2__) && defined(__F16C__)
    // Templated functions for gemm of dimensions 4xN.
    // Processes 4 rows of A per tile: the four fp16 deltas of rows
    // ii..ii+3 are packed into one 64-bit word and converted to float with
    // a single _mm_cvtph_ps, then broadcast per-row via shuffle immediates
    // (0/85/170/255 select lane 0/1/2/3).
    // _mm256_sign_epi8 makes the first maddubs operand non-negative by
    // moving A's sign onto B, which the updot() helper requires.
    template <int RN>
    NOINLINE void gemm4xN(int64_t m0, int64_t m, int64_t n0, int64_t n) {
        int64_t ytiles = (m - m0) / 4;
        int64_t xtiles = (n - n0) / RN;
        int64_t tiles = xtiles * ytiles;
        int64_t duty = (tiles + nth - 1) / nth;
        int64_t start = duty * ith;
        int64_t end = start + duty;
        if (end > tiles)
            end = tiles;
        for (int64_t job = start; job < end; ++job) {
            int64_t ii = m0 + job / xtiles * 4;
            int64_t jj = n0 + job % xtiles * RN;
            __m256 Cv[RN][4] = {};
            for (int64_t l = 0; l < k; ++l) {
                uint64_t a_delta = ((uint64_t)A[lda * (ii + 3) + l].d << 48) | ((uint64_t)A[lda * (ii + 2) + l].d << 32) | ((uint64_t)A[lda * (ii + 1) + l].d << 16) | (A[lda * (ii + 0) + l].d);
                // Convert delta values for four blocks to float values
                __m128 da = _mm_cvtph_ps(_mm_set_epi64x(0, a_delta));
                __m256i avec0 = load(A + lda * (ii + 0) + l);
                __m256i avec1 = load(A + lda * (ii + 1) + l);
                __m256i avec2 = load(A + lda * (ii + 2) + l);
                __m256i avec3 = load(A + lda * (ii + 3) + l);
                for (int64_t j = 0; j < RN; ++j) {
                    __m128 db = _mm_set1_ps(unhalf(B[ldb * (jj + j) + l].d));
                    // Computation of product of delta values for four blocks and replicate it across 256 bit lane
                    __m256 dvec = _mm256_castps128_ps256(_mm_mul_ps(da, db));
                    dvec = _mm256_permute2f128_ps(dvec ,dvec, 0);
                    // Computation of dot product and multiplication with appropriate delta value products
                    Cv[j][0] = madd(_mm256_shuffle_ps(dvec, dvec, 0),
                                    updot(_mm256_sign_epi8(avec0, avec0),
                                          _mm256_sign_epi8(load(B + ldb * (jj + j) + l), avec0)),
                                    Cv[j][0]);
                    Cv[j][1] = madd(_mm256_shuffle_ps(dvec, dvec, 85),
                                    updot(_mm256_sign_epi8(avec1, avec1),
                                          _mm256_sign_epi8(load(B + ldb * (jj + j) + l), avec1)),
                                    Cv[j][1]);
                    Cv[j][2] = madd(_mm256_shuffle_ps(dvec, dvec, 170),
                                    updot(_mm256_sign_epi8(avec2, avec2),
                                          _mm256_sign_epi8(load(B + ldb * (jj + j) + l), avec2)),
                                    Cv[j][2]);
                    Cv[j][3] = madd(_mm256_shuffle_ps(dvec, dvec, 255),
                                    updot(_mm256_sign_epi8(avec3, avec3),
                                          _mm256_sign_epi8(load(B + ldb * (jj + j) + l), avec3)),
                                    Cv[j][3]);
                }
            }

            // Horizontally reduce each accumulator into the output cell.
            for (int64_t j = 0; j < RN; ++j)
                for (int64_t i = 0; i < 4; ++i)
                    C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
        }
    }

    // Templated functions for gemm of dimensions Mx4.
    // Mirror image of gemm4xN: processes 4 columns of B per tile, packing
    // B's four fp16 deltas into one word and broadcasting per-column.
    template <int RM>
    NOINLINE void gemmMx4(int64_t m0, int64_t m, int64_t n0, int64_t n) {
        int64_t ytiles = (m - m0) / RM;
        int64_t xtiles = (n - n0) / 4;
        int64_t tiles = xtiles * ytiles;
        int64_t duty = (tiles + nth - 1) / nth;
        int64_t start = duty * ith;
        int64_t end = start + duty;
        if (end > tiles)
            end = tiles;
        for (int64_t job = start; job < end; ++job) {
            int64_t ii = m0 + job / xtiles * RM;
            int64_t jj = n0 + job % xtiles * 4;
            __m256 Cv[4][RM] = {};
            for (int64_t l = 0; l < k; ++l) {
                uint64_t b_delta = ((uint64_t)B[ldb * (jj + 3) + l].d << 48) | ((uint64_t)B[ldb * (jj + 2) + l].d << 32) | ((uint64_t)B[ldb * (jj + 1) + l].d << 16) | (B[ldb * (jj + 0) + l].d);
                // Convert delta values for four blocks to float values
                __m128 db = _mm_cvtph_ps(_mm_set_epi64x(0, b_delta));
                __m256i bvec0 = load(B + ldb * (jj + 0) + l);
                __m256i bvec1 = load(B + ldb * (jj + 1) + l);
                __m256i bvec2 = load(B + ldb * (jj + 2) + l);
                __m256i bvec3 = load(B + ldb * (jj + 3) + l);
                for (int64_t i = 0; i < RM; ++i) {
                    __m128 da = _mm_set1_ps(unhalf((A[lda * (ii + i) + l].d)));
                    // Computation of product of delta values for four blocks and replicate it across 256 bit lane
                    __m256 dvec = _mm256_castps128_ps256(_mm_mul_ps(da, db));
                    dvec = _mm256_permute2f128_ps(dvec ,dvec, 0);
                    // Computation of dot product and multiplication with appropriate delta value products
                    Cv[0][i] = madd(_mm256_shuffle_ps(dvec, dvec, 0),
                                    updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
                                                           load(A + lda * (ii + i) + l)),
                                          _mm256_sign_epi8(bvec0, load(A + lda * (ii + i) + l))),
                                    Cv[0][i]);
                    Cv[1][i] = madd(_mm256_shuffle_ps(dvec, dvec, 85),
                                    updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
                                                           load(A + lda * (ii + i) + l)),
                                          _mm256_sign_epi8(bvec1, load(A + lda * (ii + i) + l))),
                                    Cv[1][i]);
                    Cv[2][i] = madd(_mm256_shuffle_ps(dvec, dvec, 170),
                                    updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
                                                           load(A + lda * (ii + i) + l)),
                                          _mm256_sign_epi8(bvec2, load(A + lda * (ii + i) + l))),
                                    Cv[2][i]);
                    Cv[3][i] = madd(_mm256_shuffle_ps(dvec, dvec, 255),
                                    updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
                                                           load(A + lda * (ii + i) + l)),
                                          _mm256_sign_epi8(bvec3, load(A + lda * (ii + i) + l))),
                                    Cv[3][i]);
                }
            }
            for (int64_t j = 0; j < 4; ++j)
                for (int64_t i = 0; i < RM; ++i)
                    C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
        }
    }
#endif

    // Generic RM x RN register-tiled kernel. Tiles are numbered row-major
    // and divided evenly among the nth threads. The AVX2 path uses updot()
    // on sign-adjusted operands; the plain-AVX path does the same dot
    // product on two 128-bit halves with maddubs/madd. Each integer dot
    // product is scaled by the product of the two blocks' fp16 deltas.
    // NOTE(review): unhalf(), hsum(), madd() and MM256_SET_M128I are
    // helpers defined earlier in this file (outside this chunk).
    template <int RM, int RN>
    NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
        int64_t ytiles = (m - m0) / RM;
        int64_t xtiles = (n - n0) / RN;
        int64_t tiles = xtiles * ytiles;
        int64_t duty = (tiles + nth - 1) / nth;
        int64_t start = duty * ith;
        int64_t end = start + duty;
        if (end > tiles)
            end = tiles;
        for (int64_t job = start; job < end; ++job) {
            int64_t ii = m0 + job / xtiles * RM;
            int64_t jj = n0 + job % xtiles * RN;
            __m256 Cv[RN][RM] = {};
            for (int64_t l = 0; l < k; ++l)
                for (int64_t j = 0; j < RN; ++j)
                    for (int64_t i = 0; i < RM; ++i) {
#if defined(__AVX2__)
                        // maddubs needs an unsigned first operand: use |a|
                        // and transfer a's sign onto b.
                        __m256 udTmp = updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
                                                              load(A + lda * (ii + i) + l)),
                                             _mm256_sign_epi8(load(B + ldb * (jj + j) + l),
                                                              load(A + lda * (ii + i) + l)));
#else
                        __m128i ali0 = load0(A + lda * (ii + i) + l);
                        __m128i ali1 = load1(A + lda * (ii + i) + l);
                        __m128i blj0 = load0(B + ldb * (jj + j) + l);
                        __m128i blj1 = load1(B + ldb * (jj + j) + l);

                        __m128i sepAA0 = _mm_sign_epi8(ali0, ali0);
                        __m128i sepAA1 = _mm_sign_epi8(ali1, ali1);
                        __m128i sepBA0 = _mm_sign_epi8(blj0, ali0);
                        __m128i sepBA1 = _mm_sign_epi8(blj1, ali1);

                        // updot
                        const __m128i oneFill = _mm_set1_epi16(1);
                        __m128i mad0 = _mm_maddubs_epi16(sepAA0, sepBA0);
                        __m128i mad1 = _mm_maddubs_epi16(sepAA1, sepBA1);
                        __m256 udTmp = _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_madd_epi16(oneFill, mad1), _mm_madd_epi16(oneFill, mad0)));
#endif
                        Cv[j][i] = madd(_mm256_set1_ps(unhalf(A[lda * (ii + i) + l].d) *
                                                       unhalf(B[ldb * (jj + j) + l].d)),
                                        udTmp,
                                        Cv[j][i]);
                    }
            // Horizontally reduce each accumulator into the output cell.
            for (int64_t j = 0; j < RN; ++j)
                for (int64_t i = 0; i < RM; ++i)
                    C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
        }
    }

    // --- per-quant-type loaders -------------------------------------------
    // load()  : all 32 quants of a block as one 256-bit vector of int8.
    // load0() : the first 16 quants (128-bit), load1() : the last 16.

    // q8_0: bytes are stored directly.
    inline __m256i load(const block_q8_0 *b) {
        return _mm256_loadu_si256((const __m256i *)b->qs);
    }

    inline __m128i load0(const block_q8_0 *b) {
        return _mm_loadu_si128((const __m128i *)b->qs);
    }

    inline __m128i load1(const block_q8_0 *b) {
        return _mm_loadu_si128(((const __m128i *)b->qs) + 1);
    }

    // q4_0: unpack nibbles and recenter by subtracting 8.
    inline __m256i load(const block_q4_0 *b) {
        return _mm256_sub_epi8(denibble(b->qs), _mm256_set1_epi8(8));
    }

    inline __m128i load0(const block_q4_0 *b) {
        const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
        return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), x), _mm_set1_epi8(8));
    }

    inline __m128i load1(const block_q4_0 *b) {
        const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
        return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)), _mm_set1_epi8(8));
    }

    // q5_0: combine the low nibbles with the 32 high bits from qh.
    // bittobyte() yields 0xF0 for bytes whose qh bit is clear, so the OR
    // sign-extends those nibbles -- producing (nibble | bit<<4) - 16
    // directly as signed bytes.
    inline __m256i load(const block_q5_0 *b) {
        return _mm256_or_si256(denibble(b->qs), bittobyte(b->qh));
    }

    inline __m128i load0(const block_q5_0* b) {
        const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
        uint32_t x32;
        memcpy(&x32, b->qh, sizeof(uint32_t));
        __m128i qxl = _mm_and_si128(_mm_set1_epi8(15), x);
        // Broadcast the qh bits per byte: OR-ing with the complement mask
        // makes a byte all-ones exactly when its bit is set.
        __m128i bytesl = _mm_cmpeq_epi8(_mm_set1_epi64x(-1),
                                        _mm_or_si128(_mm_set1_epi64x(0x7fbfdfeff7fbfdfe),
                                                     _mm_shuffle_epi8(_mm_set1_epi32(x32),
                                                                      _mm_set_epi64x(0x0101010101010101, 0x0000000000000000))));
        bytesl = _mm_andnot_si128(bytesl, _mm_set1_epi8((char)0xF0));
        return _mm_or_si128(qxl, bytesl);
    }

    inline __m128i load1(const block_q5_0* b) {
        const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
        uint32_t x32;
        memcpy(&x32, b->qh, sizeof(uint32_t));
        __m128i qxh = _mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4));
        __m128i bytesh = _mm_cmpeq_epi8(_mm_set1_epi64x(-1),
                                        _mm_or_si128(_mm_set1_epi64x(0x7fbfdfeff7fbfdfe),
                                                     _mm_shuffle_epi8(_mm_set1_epi32(x32),
                                                                      _mm_set_epi64x(0x0303030303030303, 0x0202020202020202))));
        bytesh = _mm_andnot_si128(bytesh, _mm_set1_epi8((char)0xF0));
        return _mm_or_si128(qxh, bytesh);
    }

    // iq4_nl: look each nibble up in the iq4nlt codebook via pshufb.
    inline __m256i load(const block_iq4_nl *b) {
        return MM256_SET_M128I(load1(b), load0(b));
    }

    inline __m128i load0(const block_iq4_nl *b) {
        const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
        return _mm_shuffle_epi8(iq4nlt, _mm_and_si128(_mm_set1_epi8(15), x));
    }

    inline __m128i load1(const block_iq4_nl *b) {
        const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
        return _mm_shuffle_epi8(iq4nlt, _mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)));
    }

    // Dot product of unsigned u with signed s: widen byte products into
    // 32-bit lanes and convert to float. Uses VNNI instructions when the
    // target supports them, otherwise the classic maddubs+madd sequence.
    inline __m256 updot(__m256i u, __m256i s) {
        __m256i res;
#if defined(__AVX512VNNI__) && defined(__AVX512VL__)
        res = _mm256_dpbusd_epi32(_mm256_setzero_si256(), u, s);
#elif defined(__AVXVNNI__)
        res = _mm256_dpbusd_avx_epi32(_mm256_setzero_si256(), u, s);
#else
        res = _mm256_madd_epi16(_mm256_set1_epi16(1), _mm256_maddubs_epi16(u, s));
#endif
        return _mm256_cvtepi32_ps(res);
    }

    // Expand 16 packed bytes (32 nibbles) into 32 bytes: low nibbles in the
    // low 128-bit lane, high nibbles in the high lane.
    static inline __m256i denibble(const uint8_t *p) {
        __m128i x = _mm_loadu_si128((const __m128i *)p);
        return _mm256_and_si256(_mm256_set1_epi8(15),
                                _mm256_insertf128_si256(_mm256_castsi128_si256(x),
                                                        _mm_srli_epi16(x, 4), 1));
    }

    // Expand the 32 bits of qh into bytes: 0x00 where the bit is set,
    // 0xF0 where it is clear (used to sign-extend q5_0 nibbles above).
    static inline __m256i bittobyte(const uint8_t *p) {
        uint32_t x32;
        memcpy(&x32, p, sizeof(uint32_t));
        __m256i bytes = _mm256_cmpeq_epi8(_mm256_set1_epi64x(-1),
                                          _mm256_or_si256(_mm256_set1_epi64x(0x7fbfdfeff7fbfdfe),
                                                          _mm256_shuffle_epi8(_mm256_set1_epi32(x32),
                                                                              _mm256_set_epi64x(0x0303030303030303, 0x0202020202020202,
                                                                                                0x0101010101010101, 0x0000000000000000))));
        return _mm256_andnot_si256(bytes, _mm256_set1_epi8((char)0xF0));
    }

    const TA *const A;        // left operand, quantized blocks
    const TB *const B;        // right operand, quantized blocks
    TC *const C;              // output matrix
    const int64_t k;          // shared-dimension block count
    const int64_t lda;
    const int64_t ldb;
    const int64_t ldc;
    const int ith;            // this thread
    const int nth;            // thread count
    __m128i iq4nlt;           // IQ4_NL dequantization lookup table
};
#endif // __AVX__
1044
-
1045
//PPC Implementation
#if defined(__MMA__)

// Store a 4x4 float tile from MMA accumulator ACC into C at row ii,
// column jj (columns strided by ldc). Relies on a `vec_C` array being
// declared in the invoking kernel; kept as a macro so it expands in the
// caller's scope.
#define SAVE_ACC(ACC, ii, jj) \
   __builtin_mma_disassemble_acc(vec_C, ACC); \
   for (int I = 0; I < 4; I++) { \
      for (int J = 0; J < 4; J++) { \
         *((float*)(C+ii+((jj+J)*ldc)+I)) = *((float*)&vec_C[I]+J); \
      } \
   } \
1055
-
1056
- template <typename TA, typename TB, typename TC>
1057
- class tinyBLAS_Q0_PPC {
1058
- public:
1059
- tinyBLAS_Q0_PPC(int64_t k,
1060
- const TA *A, int64_t lda,
1061
- const TB *B, int64_t ldb,
1062
- TC *C, int64_t ldc,
1063
- int ith, int nth)
1064
- : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
1065
- }
1066
-
1067
- void matmul(int64_t m, int64_t n) {
1068
- mnpack(0, m, 0, n);
1069
- }
1070
-
1071
- private:
1072
-
1073
- template<int RM, int RN>
1074
- inline void save_res(int ii, int jj, int idx, vector float* fin_res) {
1075
- for (int I = 0; I < RM; I++) {
1076
- for (int J = 0; J < RN; J++) {
1077
- *((float*)(C+ii+((jj+J)*ldc)+I)) = *((float*)&fin_res[idx+I]+J);
1078
- }
1079
- }
1080
- }
1081
-
1082
- template<int size>
1083
- inline void compute(acc_t* ACC, int c_idx, int s_idx, std::array<int, size>& comparray, vector float* vs, vector float* fin_res) {
1084
- vector signed int vec_C[4];
1085
- vector float CA[4] = {0};
1086
- vector float res[4] = {0};
1087
- __builtin_mma_disassemble_acc(vec_C, ACC);
1088
- for (int i = 0; i < 4; i++) {
1089
- CA[i] = vec_splats((float)(((double)comparray[c_idx+i]) * -128.0));
1090
- res[i] = vec_add(vec_ctf(vec_C[i], 0), CA[i]);
1091
- fin_res[s_idx+i] = vec_madd(res[i], vs[s_idx+i], fin_res[s_idx+i]);
1092
- }
1093
- }
1094
-
1095
- template<typename VA, typename VB>
1096
- void packNormal(const TA* a, int64_t lda, int rows, int cols, VA* vec, bool flip) {
1097
- int64_t i, j;
1098
- TA *aoffset = NULL;
1099
- VA *vecOffset = NULL;
1100
- TA *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL;
1101
- TA *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL;
1102
- __vector_pair C1, C2, C3, C4, C5, C6, C7, C8;
1103
- VB c1[2] = {0}, c2[2] = {0}, c3[2] = {0}, c4[2]={0};
1104
- VB c5[2] = {0}, c6[2] = {0}, c7[2] = {0}, c8[2]={0};
1105
- VB t1, t2, t3, t4, t5, t6, t7, t8;
1106
- vector unsigned char xor_vector;
1107
- uint8_t flip_vec = 0x80;
1108
- xor_vector = vec_splats(flip_vec);
1109
- vector unsigned char swiz1 = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
1110
- vector unsigned char swiz2 = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
1111
- vector unsigned char swiz3 = {0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27};
1112
- vector unsigned char swiz4 = {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31};
1113
-
1114
- aoffset = const_cast<TA*>(a);
1115
- vecOffset = vec;
1116
- j = (rows >> 3);
1117
- if (j > 0) {
1118
- do {
1119
- aoffset1 = aoffset;
1120
- aoffset2 = aoffset1 + lda;
1121
- aoffset3 = aoffset2 + lda;
1122
- aoffset4 = aoffset3 + lda;
1123
- aoffset5 = aoffset4 + lda;
1124
- aoffset6 = aoffset5 + lda;
1125
- aoffset7 = aoffset6 + lda;
1126
- aoffset8 = aoffset7 + lda;
1127
- aoffset += 8 * lda;
1128
-
1129
- i = (cols >> 3);
1130
- if (i > 0) {
1131
- do {
1132
- C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1->qs);
1133
- C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2->qs);
1134
- C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3->qs);
1135
- C4 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset4->qs);
1136
- C5 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset5->qs);
1137
- C6 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset6->qs);
1138
- C7 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset7->qs);
1139
- C8 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset8->qs);
1140
-
1141
- __builtin_vsx_disassemble_pair(c1, &C1);
1142
- __builtin_vsx_disassemble_pair(c2, &C2);
1143
- __builtin_vsx_disassemble_pair(c3, &C3);
1144
- __builtin_vsx_disassemble_pair(c4, &C4);
1145
- __builtin_vsx_disassemble_pair(c5, &C5);
1146
- __builtin_vsx_disassemble_pair(c6, &C6);
1147
- __builtin_vsx_disassemble_pair(c7, &C7);
1148
- __builtin_vsx_disassemble_pair(c8, &C8);
1149
-
1150
- t1 = vec_perm(c1[0], c2[0], swiz1);
1151
- t2 = vec_perm(c1[0], c2[0], swiz2);
1152
- t3 = vec_perm(c3[0], c4[0], swiz1);
1153
- t4 = vec_perm(c3[0], c4[0], swiz2);
1154
- t5 = vec_perm(t1, t3, swiz3);
1155
- t6 = vec_perm(t1, t3, swiz4);
1156
- t7 = vec_perm(t2, t4, swiz3);
1157
- t8 = vec_perm(t2, t4, swiz4);
1158
- if (flip == true) {
1159
- t5 = vec_xor(t5, xor_vector);
1160
- t6 = vec_xor(t6, xor_vector);
1161
- t7 = vec_xor(t7, xor_vector);
1162
- t8 = vec_xor(t8, xor_vector);
1163
- }
1164
- vec_xst(t5, 0, vecOffset);
1165
- vec_xst(t6, 0, vecOffset+16);
1166
- vec_xst(t7, 0, vecOffset+32);
1167
- vec_xst(t8, 0, vecOffset+48);
1168
-
1169
- t1 = vec_perm(c1[1], c2[1], swiz1);
1170
- t2 = vec_perm(c1[1], c2[1], swiz2);
1171
- t3 = vec_perm(c3[1], c4[1], swiz1);
1172
- t4 = vec_perm(c3[1], c4[1], swiz2);
1173
- t5 = vec_perm(t1, t3, swiz3);
1174
- t6 = vec_perm(t1, t3, swiz4);
1175
- t7 = vec_perm(t2, t4, swiz3);
1176
- t8 = vec_perm(t2, t4, swiz4);
1177
- if (flip == true) {
1178
- t5 = vec_xor(t5, xor_vector);
1179
- t6 = vec_xor(t6, xor_vector);
1180
- t7 = vec_xor(t7, xor_vector);
1181
- t8 = vec_xor(t8, xor_vector);
1182
- }
1183
- vec_xst(t5, 0, vecOffset+64);
1184
- vec_xst(t6, 0, vecOffset+80);
1185
- vec_xst(t7, 0, vecOffset+96);
1186
- vec_xst(t8, 0, vecOffset+112);
1187
-
1188
- t1 = vec_perm(c5[0], c6[0], swiz1);
1189
- t2 = vec_perm(c5[0], c6[0], swiz2);
1190
- t3 = vec_perm(c7[0], c8[0], swiz1);
1191
- t4 = vec_perm(c7[0], c8[0], swiz2);
1192
- t5 = vec_perm(t1, t3, swiz3);
1193
- t6 = vec_perm(t1, t3, swiz4);
1194
- t7 = vec_perm(t2, t4, swiz3);
1195
- t8 = vec_perm(t2, t4, swiz4);
1196
- if (flip == true) {
1197
- t5 = vec_xor(t5, xor_vector);
1198
- t6 = vec_xor(t6, xor_vector);
1199
- t7 = vec_xor(t7, xor_vector);
1200
- t8 = vec_xor(t8, xor_vector);
1201
- }
1202
- vec_xst(t5, 0, vecOffset+128);
1203
- vec_xst(t6, 0, vecOffset+144);
1204
- vec_xst(t7, 0, vecOffset+160);
1205
- vec_xst(t8, 0, vecOffset+176);
1206
-
1207
- t1 = vec_perm(c5[1], c6[1], swiz1);
1208
- t2 = vec_perm(c5[1], c6[1], swiz2);
1209
- t3 = vec_perm(c7[1], c8[1], swiz1);
1210
- t4 = vec_perm(c7[1], c8[1], swiz2);
1211
- t5 = vec_perm(t1, t3, swiz3);
1212
- t6 = vec_perm(t1, t3, swiz4);
1213
- t7 = vec_perm(t2, t4, swiz3);
1214
- t8 = vec_perm(t2, t4, swiz4);
1215
- if (flip == true) {
1216
- t5 = vec_xor(t5, xor_vector);
1217
- t6 = vec_xor(t6, xor_vector);
1218
- t7 = vec_xor(t7, xor_vector);
1219
- t8 = vec_xor(t8, xor_vector);
1220
- }
1221
- vec_xst(t5, 0, vecOffset+192);
1222
- vec_xst(t6, 0, vecOffset+208);
1223
- vec_xst(t7, 0, vecOffset+224);
1224
- vec_xst(t8, 0, vecOffset+240);
1225
-
1226
- aoffset1 += lda;
1227
- aoffset2 += lda;
1228
- aoffset3 += lda;
1229
- aoffset4 += lda;
1230
- aoffset5 += lda;
1231
- aoffset6 += lda;
1232
- aoffset7 += lda;
1233
- aoffset8 += lda;
1234
- vecOffset += 256;
1235
- i--;
1236
- } while(i > 0);
1237
- }
1238
- j--;
1239
- } while(j > 0);
1240
- }
1241
-
1242
- if (rows & 4) {
1243
- aoffset1 = aoffset;
1244
- aoffset2 = aoffset1 + lda;
1245
- aoffset3 = aoffset2 + lda;
1246
- aoffset4 = aoffset3 + lda;
1247
- aoffset += 4 * lda;
1248
-
1249
- i = (cols >> 3);
1250
- if (i > 0) {
1251
- do {
1252
- C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1->qs);
1253
- C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2->qs);
1254
- C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3->qs);
1255
- C4 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset4->qs);
1256
-
1257
- __builtin_vsx_disassemble_pair(c1, &C1);
1258
- __builtin_vsx_disassemble_pair(c2, &C2);
1259
- __builtin_vsx_disassemble_pair(c3, &C3);
1260
- __builtin_vsx_disassemble_pair(c4, &C4);
1261
-
1262
- t1 = vec_perm(c1[0], c2[0], swiz1);
1263
- t2 = vec_perm(c1[0], c2[0], swiz2);
1264
- t3 = vec_perm(c3[0], c4[0], swiz1);
1265
- t4 = vec_perm(c3[0], c4[0], swiz2);
1266
- t5 = vec_perm(t1, t3, swiz3);
1267
- t6 = vec_perm(t1, t3, swiz4);
1268
- t7 = vec_perm(t2, t4, swiz3);
1269
- t8 = vec_perm(t2, t4, swiz4);
1270
- if (flip == true) {
1271
- t5 = vec_xor(t5, xor_vector);
1272
- t6 = vec_xor(t6, xor_vector);
1273
- t7 = vec_xor(t7, xor_vector);
1274
- t8 = vec_xor(t8, xor_vector);
1275
- }
1276
- vec_xst(t5, 0, vecOffset);
1277
- vec_xst(t6, 0, vecOffset+16);
1278
- vec_xst(t7, 0, vecOffset+32);
1279
- vec_xst(t8, 0, vecOffset+48);
1280
-
1281
- t1 = vec_perm(c1[1], c2[1], swiz1);
1282
- t2 = vec_perm(c1[1], c2[1], swiz2);
1283
- t3 = vec_perm(c3[1], c4[1], swiz1);
1284
- t4 = vec_perm(c3[1], c4[1], swiz2);
1285
- t5 = vec_perm(t1, t3, swiz3);
1286
- t6 = vec_perm(t1, t3, swiz4);
1287
- t7 = vec_perm(t2, t4, swiz3);
1288
- t8 = vec_perm(t2, t4, swiz4);
1289
- if (flip == true) {
1290
- t5 = vec_xor(t5, xor_vector);
1291
- t6 = vec_xor(t6, xor_vector);
1292
- t7 = vec_xor(t7, xor_vector);
1293
- t8 = vec_xor(t8, xor_vector);
1294
- }
1295
- vec_xst(t5, 0, vecOffset+64);
1296
- vec_xst(t6, 0, vecOffset+80);
1297
- vec_xst(t7, 0, vecOffset+96);
1298
- vec_xst(t8, 0, vecOffset+112);
1299
-
1300
- aoffset1 += lda;
1301
- aoffset2 += lda;
1302
- aoffset3 += lda;
1303
- aoffset4 += lda;
1304
- vecOffset += 128;
1305
- i--;
1306
- } while(i > 0);
1307
- }
1308
- }
1309
- if (rows & 3) {
1310
- aoffset1 = aoffset;
1311
- aoffset2 = aoffset1 + lda;
1312
- aoffset3 = aoffset2 + lda;
1313
- i = (cols >> 3);
1314
- if (i > 0) {
1315
- do {
1316
- switch(rows) {
1317
- case 3: C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3->qs);
1318
- __builtin_vsx_disassemble_pair(c3, &C3);
1319
- case 2: C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2->qs);
1320
- __builtin_vsx_disassemble_pair(c2, &C2);
1321
- case 1: C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1->qs);
1322
- __builtin_vsx_disassemble_pair(c1, &C1);
1323
- break;
1324
- }
1325
- t1 = vec_perm(c1[0], c2[0], swiz1);
1326
- t2 = vec_perm(c1[0], c2[0], swiz2);
1327
- t3 = vec_perm(c3[0], c4[0], swiz1);
1328
- t4 = vec_perm(c3[0], c4[0], swiz2);
1329
- t5 = vec_perm(t1, t3, swiz3);
1330
- t6 = vec_perm(t1, t3, swiz4);
1331
- t7 = vec_perm(t2, t4, swiz3);
1332
- t8 = vec_perm(t2, t4, swiz4);
1333
- if (flip == true) {
1334
- t5 = vec_xor(t5, xor_vector);
1335
- t6 = vec_xor(t6, xor_vector);
1336
- t7 = vec_xor(t7, xor_vector);
1337
- t8 = vec_xor(t8, xor_vector);
1338
- }
1339
- vec_xst(t5, 0, vecOffset);
1340
- vec_xst(t6, 0, vecOffset+16);
1341
- vec_xst(t7, 0, vecOffset+32);
1342
- vec_xst(t8, 0, vecOffset+48);
1343
-
1344
- t1 = vec_perm(c1[1], c2[1], swiz1);
1345
- t2 = vec_perm(c1[1], c2[1], swiz2);
1346
- t3 = vec_perm(c3[1], c4[1], swiz1);
1347
- t4 = vec_perm(c3[1], c4[1], swiz2);
1348
- t5 = vec_perm(t1, t3, swiz3);
1349
- t6 = vec_perm(t1, t3, swiz4);
1350
- t7 = vec_perm(t2, t4, swiz3);
1351
- t8 = vec_perm(t2, t4, swiz4);
1352
- if (flip == true) {
1353
- t5 = vec_xor(t5, xor_vector);
1354
- t6 = vec_xor(t6, xor_vector);
1355
- t7 = vec_xor(t7, xor_vector);
1356
- t8 = vec_xor(t8, xor_vector);
1357
- }
1358
- vec_xst(t5, 0, vecOffset+64);
1359
- vec_xst(t6, 0, vecOffset+80);
1360
- vec_xst(t7, 0, vecOffset+96);
1361
- vec_xst(t8, 0, vecOffset+112);
1362
-
1363
- aoffset1 += lda;
1364
- aoffset2 += lda;
1365
- aoffset3 += lda;
1366
- vecOffset += 128;
1367
- i--;
1368
- } while(i > 0);
1369
- }
1370
- }
1371
- }
1372
-
1373
- void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
1374
- int64_t mc, nc, mp, np;
1375
- int m_rem = MIN(m - m0, 8);
1376
- int n_rem = MIN(n - n0, 8);
1377
- // TO-DO: KERNEL_16x8 and KERNEL_8x16 are having some performance
1378
- // issues. After resolving them, below code will be enabled.
1379
- /*if (m_rem >= 16 && n_rem >= 8) {
1380
- mc = 16;
1381
- nc = 8;
1382
- gemm<16,8>(m0, m, n0, n);
1383
- } else if(m_rem >= 8 && n_rem >= 16) {
1384
- mc = 8;
1385
- nc = 16;
1386
- gemm<8,16>(m0, m, n0, n);
1387
- }*/
1388
- if (m_rem >= 8 && n_rem >= 8) {
1389
- mc = 8;
1390
- nc = 8;
1391
- gemm<8,8>(m0, m, n0, n);
1392
- } else if (m_rem >= 4 && n_rem >= 8) {
1393
- mc = 4;
1394
- nc = 8;
1395
- gemm<4,8>(m0, m, n0, n);
1396
- } else if (m_rem >= 8 && n_rem >= 4) {
1397
- mc = 8;
1398
- nc = 4;
1399
- gemm<8,4>(m0, m, n0, n);
1400
- } else if (m_rem >= 4 && n_rem >= 4) {
1401
- mc = 4;
1402
- nc = 4;
1403
- gemm_small<4, 4>(m0, m, n0, n);
1404
- } else if ((m_rem < 4) && (n_rem > 4)) {
1405
- nc = 4;
1406
- switch(m_rem) {
1407
- case 1:
1408
- mc = 1;
1409
- gemm_small<1, 4>(m0, m, n0, n);
1410
- break;
1411
- case 2:
1412
- mc = 2;
1413
- gemm_small<2, 4>(m0, m, n0, n);
1414
- break;
1415
- case 3:
1416
- mc = 3;
1417
- gemm_small<3, 4>(m0, m, n0, n);
1418
- break;
1419
- default:
1420
- return;
1421
- }
1422
- } else if ((m_rem > 4) && (n_rem < 4)) {
1423
- mc = 4;
1424
- switch(n_rem) {
1425
- case 1:
1426
- nc = 1;
1427
- gemm_small<4, 1>(m0, m, n0, n);
1428
- break;
1429
- case 2:
1430
- nc = 2;
1431
- gemm_small<4, 2>(m0, m, n0, n);
1432
- break;
1433
- case 3:
1434
- nc = 3;
1435
- gemm_small<4, 3>(m0, m, n0, n);
1436
- break;
1437
- default:
1438
- return;
1439
- }
1440
- } else {
1441
- switch((m_rem << 4) | n_rem) {
1442
- case 0x43:
1443
- mc = 4;
1444
- nc = 3;
1445
- gemm_small<4, 3>(m0, m, n0, n);
1446
- break;
1447
- case 0x42:
1448
- mc = 4;
1449
- nc = 2;
1450
- gemm_small<4, 2>(m0, m, n0, n);
1451
- break;
1452
- case 0x41:
1453
- mc = 4;
1454
- nc = 1;
1455
- gemm_small<4, 1>(m0, m, n0, n);
1456
- break;
1457
- case 0x34:
1458
- mc = 3;
1459
- nc = 4;
1460
- gemm_small<3, 4>(m0, m, n0, n);
1461
- break;
1462
- case 0x33:
1463
- mc = 3;
1464
- nc = 3;
1465
- gemm_small<3, 3>(m0, m, n0, n);
1466
- break;
1467
- case 0x32:
1468
- mc = 3;
1469
- nc = 2;
1470
- gemm_small<3, 2>(m0, m, n0, n);
1471
- break;
1472
- case 0x31:
1473
- mc = 3;
1474
- nc = 1;
1475
- gemm_small<3, 1>(m0, m, n0, n);
1476
- break;
1477
- case 0x24:
1478
- mc = 2;
1479
- nc = 4;
1480
- gemm_small<2, 4>(m0, m, n0, n);
1481
- break;
1482
- case 0x23:
1483
- mc = 2;
1484
- nc = 3;
1485
- gemm_small<2, 3>(m0, m, n0, n);
1486
- break;
1487
- case 0x22:
1488
- mc = 2;
1489
- nc = 2;
1490
- gemm_small<2, 2>(m0, m, n0, n);
1491
- break;
1492
- case 0x21:
1493
- mc = 2;
1494
- nc = 1;
1495
- gemm_small<2, 1>(m0, m, n0, n);
1496
- break;
1497
- case 0x14:
1498
- mc = 1;
1499
- nc = 4;
1500
- gemm_small<1, 4>(m0, m, n0, n);
1501
- break;
1502
- case 0x13:
1503
- mc = 1;
1504
- nc = 3;
1505
- gemm_small<1, 3>(m0, m, n0, n);
1506
- break;
1507
- case 0x12:
1508
- mc = 1;
1509
- nc = 2;
1510
- gemm_small<1, 2>(m0, m, n0, n);
1511
- break;
1512
- case 0x11:
1513
- mc = 1;
1514
- nc = 1;
1515
- gemm_small<1, 1>(m0, m, n0, n);
1516
- break;
1517
- default:
1518
- return;
1519
- }
1520
- }
1521
- mp = m0 + (m - m0) / mc * mc;
1522
- np = n0 + (n - n0) / nc * nc;
1523
- mnpack(mp, m, n0, np);
1524
- mnpack(m0, m, np, n);
1525
- }
1526
-
1527
- void KERNEL_4x8(int64_t ii, int64_t jj) {
1528
- vec_t vec_A[8], vec_B[16] = {0};
1529
- acc_t acc_0, acc_1;
1530
- std::array<int, 4> comparray;
1531
- vector float fin_res[8] = {0};
1532
- vector float vs[8] = {0};
1533
- for (int l = 0; l < k; l++) {
1534
- __builtin_mma_xxsetaccz(&acc_0);
1535
- __builtin_mma_xxsetaccz(&acc_1);
1536
- packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, 4, 8, (int8_t*)vec_A, false);
1537
- packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 8, 8, (uint8_t*)vec_B, true);
1538
- for(int x = 0; x < 8; x++) {
1539
- __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
1540
- __builtin_mma_xvi8ger4pp(&acc_1, vec_A[x], vec_B[x+8]);
1541
- }
1542
- for (int I = 0; I<4; I++) {
1543
- for (int J = 0; J<4; J++) {
1544
- *((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d));
1545
- *((float*)&vs[I+4]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J+4)*ldb)+l)->d));
1546
- }
1547
- }
1548
- auto aoffset = A+(ii*lda)+l;
1549
- for (int i = 0; i < 4; i++) {
1550
- comparray[i] = 0;
1551
- int ca = 0;
1552
- const int8_t *at = aoffset->qs;
1553
- for (int j = 0; j < 32; j++)
1554
- ca += (int)*at++;
1555
- comparray[i] = ca;
1556
- aoffset += lda;
1557
- }
1558
- compute<4>(&acc_0, 0, 0, comparray, vs, fin_res);
1559
- compute<4>(&acc_1, 0, 4, comparray, vs, fin_res);
1560
- }
1561
- save_res<4, 4>(ii, jj, 0, fin_res);
1562
- save_res<4, 4>(ii, jj+4, 4, fin_res);
1563
- }
1564
-
1565
- void KERNEL_8x4(int64_t ii, int64_t jj) {
1566
- vec_t vec_A[16], vec_B[8] = {0};
1567
- acc_t acc_0, acc_1;
1568
- std::array<int, 8> comparray;
1569
- vector float fin_res[8] = {0};
1570
- vector float vs[8] = {0};
1571
- for (int l = 0; l < k; l++) {
1572
- __builtin_mma_xxsetaccz(&acc_0);
1573
- __builtin_mma_xxsetaccz(&acc_1);
1574
- packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A, false);
1575
- packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 4, 8, (uint8_t*)vec_B, true);
1576
- for(int x = 0; x < 8; x++) {
1577
- __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
1578
- __builtin_mma_xvi8ger4pp(&acc_1, vec_A[x+8], vec_B[x]);
1579
- }
1580
- for (int I = 0; I<8; I++) {
1581
- for (int J = 0; J<4; J++) {
1582
- *((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d));
1583
- }
1584
- }
1585
- auto aoffset = A+(ii*lda)+l;
1586
- for (int i = 0; i < 8; i++) {
1587
- comparray[i] = 0;
1588
- int ca = 0;
1589
- const int8_t *at = aoffset->qs;
1590
- for (int j = 0; j < 32; j++)
1591
- ca += (int)*at++;
1592
- comparray[i] = ca;
1593
- aoffset += lda;
1594
- }
1595
- compute<8>(&acc_0, 0, 0, comparray, vs, fin_res);
1596
- compute<8>(&acc_1, 4, 4, comparray, vs, fin_res);
1597
- }
1598
- save_res<4, 4>(ii, jj, 0, fin_res);
1599
- save_res<4, 4>(ii+4, jj, 4, fin_res);
1600
- }
1601
-
1602
- void KERNEL_8x8(int64_t ii, int64_t jj) {
1603
- vec_t vec_A[16], vec_B[16] = {0};
1604
- acc_t acc_0, acc_1, acc_2, acc_3;
1605
- std::array<int, 8> comparray;
1606
- vector float fin_res[16] = {0};
1607
- vector float vs[16] = {0};
1608
- for (int l = 0; l < k; l++) {
1609
- __builtin_mma_xxsetaccz(&acc_0);
1610
- __builtin_mma_xxsetaccz(&acc_1);
1611
- __builtin_mma_xxsetaccz(&acc_2);
1612
- __builtin_mma_xxsetaccz(&acc_3);
1613
- packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A, false);
1614
- packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 8, 8, (uint8_t*)vec_B, true);
1615
- for(int x = 0; x < 8; x++) {
1616
- __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
1617
- __builtin_mma_xvi8ger4pp(&acc_1, vec_A[x+8], vec_B[x]);
1618
- __builtin_mma_xvi8ger4pp(&acc_2, vec_A[x], vec_B[x+8]);
1619
- __builtin_mma_xvi8ger4pp(&acc_3, vec_A[x+8], vec_B[x+8]);
1620
- }
1621
- for (int I = 0; I<8; I++) {
1622
- for (int J = 0; J<4; J++) {
1623
- *((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d));
1624
- *((float*)&vs[I+8]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J+4)*ldb)+l)->d));
1625
- }
1626
- }
1627
- auto aoffset = A+(ii*lda)+l;
1628
- for (int i = 0; i < 8; i++) {
1629
- comparray[i] = 0;
1630
- int ca = 0;
1631
- const int8_t *at = aoffset->qs;
1632
- for (int j = 0; j < 32; j++)
1633
- ca += (int)*at++;
1634
- comparray[i] = ca;
1635
- aoffset += lda;
1636
- }
1637
- compute<8>(&acc_0, 0, 0, comparray, vs, fin_res);
1638
- compute<8>(&acc_1, 4, 4, comparray, vs, fin_res);
1639
- compute<8>(&acc_2, 0, 8, comparray, vs, fin_res);
1640
- compute<8>(&acc_3, 4, 12, comparray, vs, fin_res);
1641
- }
1642
- save_res<4, 4>(ii, jj, 0, fin_res);
1643
- save_res<4, 4>(ii+4, jj, 4, fin_res);
1644
- save_res<4, 4>(ii, jj+4, 8, fin_res);
1645
- save_res<4, 4>(ii+4, jj+4, 12, fin_res);
1646
- }
1647
-
1648
- template<int RM, int RN>
1649
- void gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n) {
1650
- int64_t ytiles = (m - m0) / RM;
1651
- int64_t xtiles = (n - n0) / RN;
1652
- int64_t tiles = xtiles * ytiles;
1653
- int64_t duty = (tiles + nth - 1) / nth;
1654
- int64_t start = duty * ith;
1655
- int64_t end = start + duty;
1656
- vec_t vec_A[8], vec_B[8] = {0};
1657
- vector signed int vec_C[4];
1658
- acc_t acc_0;
1659
-
1660
- if (end > tiles)
1661
- end = tiles;
1662
- for (int64_t job = start; job < end; ++job) {
1663
- int64_t ii = m0 + job / xtiles * RM;
1664
- int64_t jj = n0 + job % xtiles * RN;
1665
- std::array<int, RM> comparray;
1666
- vector float res[4] = {0};
1667
- vector float fin_res[4] = {0};
1668
- vector float vs[4] = {0};
1669
- vector float CA[4] = {0};
1670
- __builtin_prefetch((A+(ii*lda)+0)->qs, 0, 1); // prefetch first value
1671
- __builtin_prefetch((B+(jj*ldb)+0)->qs, 0, 1); // prefetch first value
1672
- for (int l = 0; l < k; l++) {
1673
- __builtin_prefetch((A+(ii*lda)+(l+1))->qs, 0, 1); // prefetch one loop ahead
1674
- __builtin_prefetch((B+(jj*ldb)+(l+1))->qs, 0, 1); // prefetch one loop ahead
1675
- __builtin_mma_xxsetaccz(&acc_0);
1676
- packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, RM, 8, (int8_t*)vec_A, false);
1677
- packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, RN, 8, (uint8_t*)vec_B, true);
1678
- for(int x = 0; x < 8; x+=4) {
1679
- __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
1680
- __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x+1], vec_B[x+1]);
1681
- __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x+2], vec_B[x+2]);
1682
- __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x+3], vec_B[x+3]);
1683
- }
1684
- for (int I = 0; I<RM; I++) {
1685
- for (int J = 0; J<RN; J++) {
1686
- *((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d));
1687
- }
1688
- }
1689
- __builtin_mma_disassemble_acc(vec_C, &acc_0);
1690
- auto aoffset = A+(ii*lda)+l;
1691
- for (int i = 0; i < RM; i++) {
1692
- comparray[i] = 0;
1693
- int ca = 0;
1694
- const int8_t *at = aoffset->qs;
1695
- for (int j = 0; j < 32; j++)
1696
- ca += (int)*at++;
1697
- comparray[i] = ca;
1698
- aoffset += lda;
1699
- }
1700
-
1701
- for (int i = 0; i < RM; i++) {
1702
- CA[i] = vec_splats((float)(((double)comparray[i]) * -128.0));
1703
- res[i] = vec_add(vec_ctf(vec_C[i], 0), CA[i]);
1704
- fin_res[i] = vec_madd(res[i], vs[i], fin_res[i]);
1705
- }
1706
- }
1707
- save_res<RM, RN>(ii, jj, 0, fin_res);
1708
- }
1709
- }
1710
-
1711
- template<int RM, int RN>
1712
- inline void kernel(int64_t ii, int64_t jj) {
1713
- if constexpr(RM == 4 && RN == 8) {
1714
- KERNEL_4x8(ii,jj);
1715
- } else if constexpr(RM == 8 && RN == 4) {
1716
- KERNEL_8x4(ii,jj);
1717
- } else if constexpr(RM == 8 && RN == 8) {
1718
- KERNEL_8x8(ii,jj);
1719
- } else {
1720
- static_assert(false, "RN/RM values not supported");
1721
- }
1722
- }
1723
-
1724
- template <int RM, int RN>
1725
- NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
1726
- int64_t ytiles = (m - m0) / RM;
1727
- int64_t xtiles = (n - n0) / RN;
1728
- int64_t tiles = xtiles * ytiles;
1729
- int64_t duty = (tiles + nth - 1) / nth;
1730
- int64_t start = duty * ith;
1731
- int64_t end = start + duty;
1732
- if (end > tiles)
1733
- end = tiles;
1734
- for (int64_t job = start; job < end; ++job) {
1735
- int64_t ii = m0 + job / xtiles * RM;
1736
- int64_t jj = n0 + job % xtiles * RN;
1737
- kernel<RM, RN>(ii, jj);
1738
- }
1739
- }
1740
-
1741
- const TA *const A;
1742
- const TB *const B;
1743
- TC *C;
1744
- TA *At;
1745
- TB *Bt;
1746
- const int64_t k;
1747
- const int64_t lda;
1748
- const int64_t ldb;
1749
- const int64_t ldc;
1750
- const int ith;
1751
- const int nth;
1752
- };
1753
-
1754
- template <typename TA, typename TB, typename TC>
1755
- class tinyBLAS_PPC {
1756
- public:
1757
- tinyBLAS_PPC(int64_t k,
1758
- const TA *A, int64_t lda,
1759
- const TB *B, int64_t ldb,
1760
- TC *C, int64_t ldc,
1761
- int ith, int nth)
1762
- : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
1763
- }
1764
-
1765
- void matmul(int64_t m, int64_t n) {
1766
- mnpack(0, m, 0, n);
1767
- }
1768
-
1769
- private:
1770
-
1771
- void (tinyBLAS_PPC::*kernel)(int64_t, int64_t);
1772
-
1773
- template<typename VA>
1774
- void packTranspose(const TA* a, int64_t lda, int rows, int cols, TA* vec) {
1775
- int64_t i, j;
1776
- TA *aoffset = NULL, *boffset = NULL;
1777
- TA *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL;
1778
- TA *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL;
1779
- __vector_pair C1, C2, C3, C4, C5, C6, C7, C8;
1780
- VA c1[2] = {0}, c2[2] = {0}, c3[2] = {0}, c4[2] = {0};
1781
- VA c5[2] = {0}, c6[2] = {0}, c7[2] = {0}, c8[2] = {0};
1782
- VA t1, t2, t3, t4, t5, t6, t7, t8;
1783
- aoffset = const_cast<TA*>(a);
1784
- boffset = vec;
1785
- j = (rows >> 3);
1786
- if (j > 0) {
1787
- do {
1788
- aoffset1 = aoffset;
1789
- aoffset2 = aoffset1 + lda;
1790
- aoffset3 = aoffset2 + lda;
1791
- aoffset4 = aoffset3 + lda;
1792
- aoffset5 = aoffset4 + lda;
1793
- aoffset6 = aoffset5 + lda;
1794
- aoffset7 = aoffset6 + lda;
1795
- aoffset8 = aoffset7 + lda;
1796
- aoffset += 8 * lda;
1797
- i = (cols >> 3);
1798
- if (i > 0) {
1799
- do {
1800
- C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1);
1801
- C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2);
1802
- C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3);
1803
- C4 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset4);
1804
- C5 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset5);
1805
- C6 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset6);
1806
- C7 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset7);
1807
- C8 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset8);
1808
- __builtin_vsx_disassemble_pair(c1, &C1);
1809
- __builtin_vsx_disassemble_pair(c2, &C2);
1810
- __builtin_vsx_disassemble_pair(c3, &C3);
1811
- __builtin_vsx_disassemble_pair(c4, &C4);
1812
- __builtin_vsx_disassemble_pair(c5, &C5);
1813
- __builtin_vsx_disassemble_pair(c6, &C6);
1814
- __builtin_vsx_disassemble_pair(c7, &C7);
1815
- __builtin_vsx_disassemble_pair(c8, &C8);
1816
-
1817
- t1 = vec_mergeh(c1[0], c2[0]);
1818
- t2 = vec_mergeh(c3[0], c4[0]);
1819
- t3 = vec_mergeh(c5[0], c6[0]);
1820
- t4 = vec_mergeh(c7[0], c8[0]);
1821
- t5 = vec_xxpermdi(t1, t2, 0);
1822
- t6 = vec_xxpermdi(t3, t4, 0);
1823
- t7 = vec_xxpermdi(t1, t2, 3);
1824
- t8 = vec_xxpermdi(t3, t4, 3);
1825
- vec_xst(t5, 0, boffset);
1826
- vec_xst(t6, 0, boffset+4);
1827
- vec_xst(t7, 0, boffset+8);
1828
- vec_xst(t8, 0, boffset+12);
1829
-
1830
- t1 = vec_mergel(c1[0], c2[0]);
1831
- t2 = vec_mergel(c3[0], c4[0]);
1832
- t3 = vec_mergel(c5[0], c6[0]);
1833
- t4 = vec_mergel(c7[0], c8[0]);
1834
- t5 = vec_xxpermdi(t1, t2, 0);
1835
- t6 = vec_xxpermdi(t3, t4, 0);
1836
- t7 = vec_xxpermdi(t1, t2, 3);
1837
- t8 = vec_xxpermdi(t3, t4, 3);
1838
- vec_xst(t5, 0, boffset+16);
1839
- vec_xst(t6, 0, boffset+20);
1840
- vec_xst(t7, 0, boffset+24);
1841
- vec_xst(t8, 0, boffset+28);
1842
-
1843
- t1 = vec_mergeh(c1[1], c2[1]);
1844
- t2 = vec_mergeh(c3[1], c4[1]);
1845
- t3 = vec_mergeh(c5[1], c6[1]);
1846
- t4 = vec_mergeh(c7[1], c8[1]);
1847
- t5 = vec_xxpermdi(t1, t2, 0);
1848
- t6 = vec_xxpermdi(t3, t4, 0);
1849
- t7 = vec_xxpermdi(t1, t2, 3);
1850
- t8 = vec_xxpermdi(t3, t4, 3);
1851
- vec_xst(t5, 0, boffset+32);
1852
- vec_xst(t6, 0, boffset+36);
1853
- vec_xst(t7, 0, boffset+40);
1854
- vec_xst(t8, 0, boffset+44);
1855
-
1856
- t1 = vec_mergel(c1[1], c2[1]);
1857
- t2 = vec_mergel(c3[1], c4[1]);
1858
- t3 = vec_mergel(c5[1], c6[1]);
1859
- t4 = vec_mergel(c7[1], c8[1]);
1860
- t5 = vec_xxpermdi(t1, t2, 0);
1861
- t6 = vec_xxpermdi(t3, t4, 0);
1862
- t7 = vec_xxpermdi(t1, t2, 3);
1863
- t8 = vec_xxpermdi(t3, t4, 3);
1864
- vec_xst(t5, 0, boffset+48);
1865
- vec_xst(t6, 0, boffset+52);
1866
- vec_xst(t7, 0, boffset+56);
1867
- vec_xst(t8, 0, boffset+60);
1868
-
1869
- aoffset1 += 8*lda;
1870
- aoffset2 += 8*lda;
1871
- aoffset3 += 8*lda;
1872
- aoffset4 += 8*lda;
1873
- boffset += 64;
1874
- i--;
1875
- } while(i > 0);
1876
- }
1877
- if (cols & 4) {
1878
- c1[0] = vec_xl(0, aoffset1);
1879
- c2[0] = vec_xl(0, aoffset2);
1880
- c3[0] = vec_xl(0, aoffset3);
1881
- c4[0] = vec_xl(0, aoffset4);
1882
- c5[0] = vec_xl(0, aoffset5);
1883
- c6[0] = vec_xl(0, aoffset6);
1884
- c7[0] = vec_xl(0, aoffset7);
1885
- c8[0] = vec_xl(0, aoffset8);
1886
-
1887
- t1 = vec_mergeh(c1[0], c2[0]);
1888
- t2 = vec_mergeh(c3[0], c4[0]);
1889
- t3 = vec_mergeh(c5[0], c6[0]);
1890
- t4 = vec_mergeh(c7[0], c8[0]);
1891
- t5 = vec_xxpermdi(t1, t2, 0);
1892
- t6 = vec_xxpermdi(t3, t4, 0);
1893
- t7 = vec_xxpermdi(t1, t2, 3);
1894
- t8 = vec_xxpermdi(t3, t4, 3);
1895
- vec_xst(t5, 0, boffset);
1896
- vec_xst(t6, 0, boffset+4);
1897
- vec_xst(t7, 0, boffset+8);
1898
- vec_xst(t8, 0, boffset+12);
1899
-
1900
- t1 = vec_mergel(c1[0], c2[0]);
1901
- t2 = vec_mergel(c3[0], c4[0]);
1902
- t3 = vec_mergel(c5[0], c6[0]);
1903
- t4 = vec_mergel(c7[0], c8[0]);
1904
- t5 = vec_xxpermdi(t1, t2, 0);
1905
- t6 = vec_xxpermdi(t3, t4, 0);
1906
- t7 = vec_xxpermdi(t1, t2, 3);
1907
- t8 = vec_xxpermdi(t3, t4, 3);
1908
- vec_xst(t5, 0, boffset+16);
1909
- vec_xst(t6, 0, boffset+20);
1910
- vec_xst(t7, 0, boffset+24);
1911
- vec_xst(t8, 0, boffset+28);
1912
- }
1913
- j--;
1914
- } while(j > 0);
1915
- }
1916
-
1917
- if (rows & 4) {
1918
- aoffset1 = aoffset;
1919
- aoffset2 = aoffset1 + lda;
1920
- aoffset3 = aoffset2 + lda;
1921
- aoffset4 = aoffset3 + lda;
1922
- aoffset += 4 * lda;
1923
- i = (cols >> 3);
1924
- if (i > 0) {
1925
- do {
1926
- C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1);
1927
- C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2);
1928
- C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3);
1929
- C4 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset4);
1930
- __builtin_vsx_disassemble_pair(c1, &C1);
1931
- __builtin_vsx_disassemble_pair(c2, &C2);
1932
- __builtin_vsx_disassemble_pair(c3, &C3);
1933
- __builtin_vsx_disassemble_pair(c4, &C4);
1934
-
1935
- t1 = vec_mergeh(c1[0], c2[0]);
1936
- t2 = vec_mergeh(c3[0], c4[0]);
1937
- t3 = vec_mergel(c1[0], c2[0]);
1938
- t4 = vec_mergel(c3[0], c4[0]);
1939
- t5 = vec_xxpermdi(t1, t2, 0);
1940
- t6 = vec_xxpermdi(t1, t2, 3);
1941
- t7 = vec_xxpermdi(t3, t4, 0);
1942
- t8 = vec_xxpermdi(t3, t4, 3);
1943
- vec_xst(t5, 0, boffset);
1944
- vec_xst(t6, 0, boffset+4);
1945
- vec_xst(t7, 0, boffset+8);
1946
- vec_xst(t8, 0, boffset+12);
1947
-
1948
- t1 = vec_mergeh(c1[1], c2[1]);
1949
- t2 = vec_mergeh(c3[1], c4[1]);
1950
- t3 = vec_mergel(c1[1], c2[1]);
1951
- t4 = vec_mergel(c3[1], c4[1]);
1952
- t5 = vec_xxpermdi(t1, t2, 0);
1953
- t6 = vec_xxpermdi(t1, t2, 3);
1954
- t7 = vec_xxpermdi(t3, t4, 0);
1955
- t8 = vec_xxpermdi(t3, t4, 3);
1956
- vec_xst(t5, 0, boffset+16);
1957
- vec_xst(t6, 0, boffset+20);
1958
- vec_xst(t7, 0, boffset+24);
1959
- vec_xst(t8, 0, boffset+28);
1960
-
1961
- aoffset1 += 8*lda;
1962
- aoffset2 += 8*lda;
1963
- aoffset3 += 8*lda;
1964
- aoffset4 += 8*lda;
1965
- boffset += 32;
1966
- i--;
1967
- } while(i > 0);
1968
- }
1969
-
1970
- if (cols & 4) {
1971
- c1[0] = vec_xl(0, aoffset1);
1972
- c2[0] = vec_xl(0, aoffset2);
1973
- c3[0] = vec_xl(0, aoffset3);
1974
- c4[0] = vec_xl(0, aoffset4);
1975
-
1976
- t1 = vec_mergeh(c1[0], c2[0]);
1977
- t2 = vec_mergeh(c3[0], c4[0]);
1978
- t3 = vec_xxpermdi(t1, t2, 0);
1979
- t4 = vec_xxpermdi(t1, t2, 3);
1980
- vec_xst(t3, 0, boffset);
1981
- vec_xst(t4, 0, boffset+4);
1982
-
1983
- t1 = vec_mergel(c1[0], c2[0]);
1984
- t2 = vec_mergel(c3[0], c4[0]);
1985
- t3 = vec_xxpermdi(t1, t2, 0);
1986
- t4 = vec_xxpermdi(t1, t2, 3);
1987
- vec_xst(t3, 0, boffset+8);
1988
- vec_xst(t4, 0, boffset+12);
1989
- }
1990
- }
1991
- if (rows & 3) {
1992
- aoffset1 = aoffset;
1993
- aoffset2 = aoffset1 + lda;
1994
- aoffset3 = aoffset2 + lda;
1995
- if (cols & 4) {
1996
- c1[0] = vec_xl(0, aoffset1);
1997
- c2[0] = vec_xl(0, aoffset2);
1998
- c3[0] = vec_xl(0, aoffset3);
1999
-
2000
- t1 = vec_mergeh(c1[0], c2[0]);
2001
- t2 = vec_mergeh(c3[0], c4[0]);
2002
- t3 = vec_xxpermdi(t1, t2, 0);
2003
- t4 = vec_xxpermdi(t1, t2, 3);
2004
- vec_xst(t3, 0, boffset);
2005
- vec_xst(t4, 0, boffset+4);
2006
-
2007
- t1 = vec_mergel(c1[0], c2[0]);
2008
- t2 = vec_mergel(c3[0], c4[0]);
2009
- t3 = vec_xxpermdi(t1, t2, 0);
2010
- t4 = vec_xxpermdi(t1, t2, 3);
2011
- vec_xst(t3, 0, boffset+8);
2012
- vec_xst(t4, 0, boffset+12);
2013
- }
2014
- }
2015
- }
2016
- void KERNEL_4x4(int64_t ii, int64_t jj) {
2017
- vec_t vec_A[4], vec_B[4], vec_C[4];
2018
- acc_t acc_0;
2019
- __builtin_mma_xxsetaccz(&acc_0);
2020
- for (int l = 0; l < k; l+=4) {
2021
- packTranspose<vector float>(A+(ii*lda)+l, lda, 4, 4, (TA*)vec_A);
2022
- packTranspose<vector float>(B+(jj*ldb)+l, ldb, 4, 4, (TA*)vec_B);
2023
- __builtin_mma_xvf32gerpp(&acc_0, vec_A[0], vec_B[0]);
2024
- __builtin_mma_xvf32gerpp(&acc_0, vec_A[1], vec_B[1]);
2025
- __builtin_mma_xvf32gerpp(&acc_0, vec_A[2], vec_B[2]);
2026
- __builtin_mma_xvf32gerpp(&acc_0, vec_A[3], vec_B[3]);
2027
- }
2028
- SAVE_ACC(&acc_0, ii, jj);
2029
- }
2030
-
2031
- void KERNEL_4x8(int64_t ii, int64_t jj) {
2032
- vec_t vec_A[4], vec_B[8], vec_C[4];
2033
- acc_t acc_0, acc_1;
2034
- __builtin_mma_xxsetaccz(&acc_0);
2035
- __builtin_mma_xxsetaccz(&acc_1);
2036
- for (int64_t l = 0; l < k; l+=4) {
2037
- packTranspose<vector float>(A+(ii*lda)+l, lda, 4, 4, (TA*)vec_A);
2038
- packTranspose<vector float>(B+(jj*ldb)+l, ldb, 8, 4, (TA*)vec_B);
2039
- __builtin_mma_xvf32gerpp(&acc_0, vec_A[0], (vec_t)vec_B[0]);
2040
- __builtin_mma_xvf32gerpp(&acc_1, vec_A[0], (vec_t)vec_B[1]);
2041
- __builtin_mma_xvf32gerpp(&acc_0, vec_A[1], (vec_t)vec_B[2]);
2042
- __builtin_mma_xvf32gerpp(&acc_1, vec_A[1], (vec_t)vec_B[3]);
2043
- __builtin_mma_xvf32gerpp(&acc_0, vec_A[2], (vec_t)vec_B[4]);
2044
- __builtin_mma_xvf32gerpp(&acc_1, vec_A[2], (vec_t)vec_B[5]);
2045
- __builtin_mma_xvf32gerpp(&acc_0, vec_A[3], (vec_t)vec_B[6]);
2046
- __builtin_mma_xvf32gerpp(&acc_1, vec_A[3], (vec_t)vec_B[7]);
2047
- }
2048
- SAVE_ACC(&acc_0, ii, jj);
2049
- SAVE_ACC(&acc_1, ii, jj+4);
2050
- }
2051
-
2052
- void KERNEL_8x4(int64_t ii, int64_t jj) {
2053
- vec_t vec_A[8], vec_B[4], vec_C[4];
2054
- acc_t acc_0, acc_1;
2055
- __builtin_mma_xxsetaccz(&acc_0);
2056
- __builtin_mma_xxsetaccz(&acc_1);
2057
- for (int64_t l = 0; l < k; l+=4) {
2058
- packTranspose<vector float>(A+(ii*lda)+l, lda, 8, 4, (TA*)vec_A);
2059
- packTranspose<vector float>(B+(jj*ldb)+l, ldb, 4, 4, (TA*)vec_B);
2060
- __builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[0], vec_B[0]);
2061
- __builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[1], vec_B[0]);
2062
- __builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[2], vec_B[1]);
2063
- __builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[3], vec_B[1]);
2064
- __builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[4], vec_B[2]);
2065
- __builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[5], vec_B[2]);
2066
- __builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[6], vec_B[3]);
2067
- __builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[7], vec_B[3]);
2068
- }
2069
- SAVE_ACC(&acc_0, ii, jj);
2070
- SAVE_ACC(&acc_1, ii+4, jj);
2071
- }
2072
-
2073
- void KERNEL_8x8(int64_t ii, int64_t jj) {
2074
- vec_t vec_A[16], vec_B[16], vec_C[4];
2075
- acc_t acc_0, acc_1, acc_2, acc_3;
2076
- __builtin_mma_xxsetaccz(&acc_0);
2077
- __builtin_mma_xxsetaccz(&acc_1);
2078
- __builtin_mma_xxsetaccz(&acc_2);
2079
- __builtin_mma_xxsetaccz(&acc_3);
2080
- for (int l = 0; l < k; l+=8) {
2081
- packTranspose<vector float>(A+(ii*lda)+l, lda, 8, 8, (TA*)vec_A);
2082
- packTranspose<vector float>(B+(jj*ldb)+l, ldb, 8, 8, (TA*)vec_B);
2083
- for(int x = 0; x < 16; x+=2) {
2084
- __builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[x], vec_B[x]);
2085
- __builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[x], vec_B[x+1]);
2086
- __builtin_mma_xvf32gerpp(&acc_2, (vec_t)vec_A[x+1], vec_B[x]);
2087
- __builtin_mma_xvf32gerpp(&acc_3, (vec_t)vec_A[x+1], vec_B[x+1]);
2088
- }
2089
- }
2090
- SAVE_ACC(&acc_0, ii, jj);
2091
- SAVE_ACC(&acc_1, ii, jj+4);
2092
- SAVE_ACC(&acc_2, ii+4, jj);
2093
- SAVE_ACC(&acc_3, ii+4, jj+4);
2094
- }
2095
-
2096
- void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
2097
- int64_t mc, nc, mp, np;
2098
- int m_rem = MIN(m - m0, 16);
2099
- int n_rem = MIN(n - n0, 16);
2100
- if (m_rem >= 16 && n_rem >= 8) {
2101
- mc = 8;
2102
- nc = 8;
2103
- gemm<8,8>(m0, m, n0, n);
2104
- } else if(m_rem >= 8 && n_rem >= 16) {
2105
- mc = 8;
2106
- nc = 8;
2107
- gemm<8,8>(m0, m, n0, n);
2108
- } else if (m_rem >= 8 && n_rem >= 8) {
2109
- mc = 8;
2110
- nc = 8;
2111
- gemm<8,8>(m0, m, n0, n);
2112
- } else if (m_rem >= 4 && n_rem >= 8) {
2113
- mc = 4;
2114
- nc = 8;
2115
- gemm<4,8>(m0, m, n0, n);
2116
- } else if (m_rem >= 8 && n_rem >= 4) {
2117
- mc = 8;
2118
- nc = 4;
2119
- gemm<8,4>(m0, m, n0, n);
2120
- } else if (m_rem >= 4 && n_rem >= 4) {
2121
- mc = 4;
2122
- nc = 4;
2123
- gemm<4,4>(m0, m, n0, n);
2124
- } else if ((m_rem < 4) && (n_rem > 4)) {
2125
- nc = 4;
2126
- switch(m_rem) {
2127
- case 1:
2128
- mc = 1;
2129
- gemm_small(m0, m, n0, n, mc, nc);
2130
- break;
2131
- case 2:
2132
- mc = 2;
2133
- gemm_small(m0, m, n0, n, mc, nc);
2134
- break;
2135
- case 3:
2136
- mc = 3;
2137
- gemm_small(m0, m, n0, n, mc, nc);
2138
- break;
2139
- default:
2140
- return;
2141
- }
2142
- } else if ((m_rem > 4) && (n_rem < 4)) {
2143
- mc = 4;
2144
- switch(n_rem) {
2145
- case 1:
2146
- nc = 1;
2147
- gemm_small(m0, m, n0, n, mc, nc);
2148
- break;
2149
- case 2:
2150
- nc = 2;
2151
- gemm_small(m0, m, n0, n, mc, nc);
2152
- break;
2153
- case 3:
2154
- nc = 3;
2155
- gemm_small(m0, m, n0, n, mc, nc);
2156
- break;
2157
- default:
2158
- return;
2159
- }
2160
- } else {
2161
- switch((m_rem << 4) | n_rem) {
2162
- case 0x43:
2163
- mc = 4;
2164
- nc = 3;
2165
- gemm_small(m0, m, n0, n, mc, nc);
2166
- break;
2167
- case 0x42:
2168
- mc = 4;
2169
- nc = 2;
2170
- gemm_small(m0, m, n0, n, mc, nc);
2171
- break;
2172
- case 0x41:
2173
- mc = 4;
2174
- nc = 1;
2175
- gemm_small(m0, m, n0, n, mc, nc);
2176
- break;
2177
- case 0x34:
2178
- mc = 3;
2179
- nc = 4;
2180
- gemm_small(m0, m, n0, n, mc, nc);
2181
- break;
2182
- case 0x33:
2183
- mc = 3;
2184
- nc = 3;
2185
- gemm_small(m0, m, n0, n, mc, nc);
2186
- break;
2187
- case 0x32:
2188
- mc = 3;
2189
- nc = 2;
2190
- gemm_small(m0, m, n0, n, mc, nc);
2191
- break;
2192
- case 0x31:
2193
- mc = 3;
2194
- nc = 1;
2195
- gemm_small(m0, m, n0, n, mc, nc);
2196
- break;
2197
- case 0x24:
2198
- mc = 2;
2199
- nc = 4;
2200
- gemm_small(m0, m, n0, n, mc, nc);
2201
- break;
2202
- case 0x23:
2203
- mc = 2;
2204
- nc = 3;
2205
- gemm_small(m0, m, n0, n, mc, nc);
2206
- break;
2207
- case 0x22:
2208
- mc = 2;
2209
- nc = 2;
2210
- gemm_small(m0, m, n0, n, mc, nc);
2211
- break;
2212
- case 0x21:
2213
- mc = 2;
2214
- nc = 1;
2215
- gemm_small(m0, m, n0, n, mc, nc);
2216
- break;
2217
- case 0x14:
2218
- mc = 1;
2219
- nc = 4;
2220
- gemm_small(m0, m, n0, n, mc, nc);
2221
- break;
2222
- case 0x13:
2223
- mc = 1;
2224
- nc = 3;
2225
- gemm_small(m0, m, n0, n, mc, nc);
2226
- break;
2227
- case 0x12:
2228
- mc = 1;
2229
- nc = 2;
2230
- gemm_small(m0, m, n0, n, mc, nc);
2231
- break;
2232
- case 0x11:
2233
- mc = 1;
2234
- nc = 1;
2235
- gemm_small(m0, m, n0, n, mc, nc);
2236
- break;
2237
- default:
2238
- return;
2239
- }
2240
- }
2241
- mp = m0 + (m - m0) / mc * mc;
2242
- np = n0 + (n - n0) / nc * nc;
2243
- mnpack(mp, m, n0, np);
2244
- mnpack(m0, m, np, n);
2245
- }
2246
-
2247
- void gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n, int RM, int RN) {
2248
- int64_t ytiles = (m - m0) / RM;
2249
- int64_t xtiles = (n - n0) / RN;
2250
- int64_t tiles = xtiles * ytiles;
2251
- int64_t duty = (tiles + nth - 1) / nth;
2252
- int64_t start = duty * ith;
2253
- int64_t end = start + duty;
2254
- if (end > tiles)
2255
- end = tiles;
2256
- for (int64_t job = start; job < end; ++job) {
2257
- int64_t ii = m0 + job / xtiles * RM;
2258
- int64_t jj = n0 + job % xtiles * RN;
2259
- vec_t vec_C[4];
2260
- acc_t acc_0;
2261
- __builtin_mma_xxsetaccz(&acc_0);
2262
- vec_t vec_A[4], vec_B[4];
2263
- for (int l=0; l<k; l+=4) {
2264
- if (RN >= 4 && RM == 1) {
2265
- TA* a = const_cast<TA*>(A+(ii)*lda+l);
2266
- packTranspose<vector float>(B+(jj*ldb)+l, ldb, 4, 4, (TA*)vec_B);
2267
- vec_A[0] = (vec_t)vec_xl(0,a);
2268
- vec_A[1] = (vec_t)vec_splats(*((TA*)&vec_A+1));
2269
- vec_A[2] = (vec_t)vec_splats(*((TA*)&vec_A+2));
2270
- vec_A[3] = (vec_t)vec_splats(*((TA*)&vec_A+3));
2271
- } else {
2272
- packTranspose<vector float>(A+(ii*lda)+l, lda, RM, 4, (TA*)vec_A);
2273
- packTranspose<vector float>(B+(jj*ldb)+l, ldb, RN, 4, (TA*)vec_B);
2274
- }
2275
- __builtin_mma_xvf32gerpp(&acc_0, vec_A[0], vec_B[0]);
2276
- __builtin_mma_xvf32gerpp(&acc_0, vec_A[1], vec_B[1]);
2277
- __builtin_mma_xvf32gerpp(&acc_0, vec_A[2], vec_B[2]);
2278
- __builtin_mma_xvf32gerpp(&acc_0, vec_A[3], vec_B[3]);
2279
- }
2280
- __builtin_mma_disassemble_acc(vec_C, &acc_0);
2281
- for (int I = 0; I < RM; I++) {
2282
- for (int J = 0; J < RN; J++) {
2283
- *((TC*)(C+ii+((jj+J)*ldc)+I)) = *((TC*)&vec_C[I]+J);
2284
- }
2285
- }
2286
- }
2287
- }
2288
-
2289
- template <int RM, int RN>
2290
- NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
2291
- int64_t ytiles = (m - m0) / RM;
2292
- int64_t xtiles = (n - n0) / RN;
2293
- int64_t tiles = xtiles * ytiles;
2294
- int64_t duty = (tiles + nth - 1) / nth;
2295
- int64_t start = duty * ith;
2296
- int64_t end = start + duty;
2297
- if (RM == 4 && RN == 4) {
2298
- kernel = &tinyBLAS_PPC::KERNEL_4x4;
2299
- } else if (RM == 4 && RN == 8) {
2300
- kernel = &tinyBLAS_PPC::KERNEL_4x8;
2301
- } else if (RM == 8 && RN == 4) {
2302
- kernel = &tinyBLAS_PPC::KERNEL_8x4;
2303
- } else if (RM == 8 && RN == 8) {
2304
- kernel = &tinyBLAS_PPC::KERNEL_8x8;
2305
- }
2306
- if (end > tiles)
2307
- end = tiles;
2308
- for (int64_t job = start; job < end; ++job) {
2309
- int64_t ii = m0 + job / xtiles * RM;
2310
- int64_t jj = n0 + job % xtiles * RN;
2311
- (this->*kernel)(ii, jj);
2312
- }
2313
- }
2314
-
2315
- const TA *const A;
2316
- const TB *const B;
2317
- TC *C;
2318
- TA *At;
2319
- TB *Bt;
2320
- const int64_t k;
2321
- const int64_t lda;
2322
- const int64_t ldb;
2323
- const int64_t ldc;
2324
- const int ith;
2325
- const int nth;
2326
- };
2327
- #endif
2328
- } // namespace
2329
-
2330
- /**
2331
- * Performs optimized matrix multiplication on CPU.
2332
- *
2333
- * This subroutine may compute C = Aᵀ * B with column major ordering.
2334
- * Despite its name, this isn't a generalized implementation. Work is
2335
- * only performed when a handwritten kernel is written and available.
2336
- * Otherwise the caller should fall back to a general matmul routine.
2337
- *
2338
- * For example, for single-threaded single-precision GEMM you can say
2339
- *
2340
- * llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc,
2341
- * 0, 1,
2342
- * LM_GGML_TYPE_F32, LM_GGML_TYPE_F32, LM_GGML_TYPE_F32);
2343
- *
2344
- * @param m is rows in `A` and `C`
2345
- * @param n is cols in `B` and `C`
2346
- * @param k is cols in `A` and rows in `B`
2347
- * @param A is first input matrix (always transposed)
2348
- * @param lda is row stride of `A`
2349
- * @param B is second input matrix (never transposed)
2350
- * @param ldb is row stride of `B`
2351
- * @param C is input/output array of output matrices
2352
- * @param ldc is row stride of `C`
2353
- * @param ith is thread id (must be less than `nth`)
2354
- * @param nth is number of threads (must be greater than zero)
2355
- * @param Atype is GGML data type of `A`
2356
- * @param Btype is GGML data type of `B`
2357
- * @param Ctype is GGML data type of `C`
2358
- * @return true if this function was able to service the matmul request
2359
- */
2360
/**
 * Performs optimized matrix multiplication on CPU.
 *
 * Dispatches to a SIMD tinyBLAS kernel chosen from the operand types
 * (Atype/Btype/Ctype) and the instruction sets enabled at compile time.
 * Returns false whenever no kernel supports the requested combination,
 * in which case the caller is expected to fall back to a generic path.
 *
 * @param params thread bookkeeping: params->ith (thread index, < nth) and
 *               params->nth (thread count, > 0)
 * @param m number of rows in the output, must be >= 0
 * @param n number of columns in the output, must be >= 0
 * @param k shared inner dimension, must be >= 0
 * @param A left operand, row stride lda (lda >= k)
 * @param B right operand, row stride ldb (ldb >= k)
 * @param C output matrix, column stride ldc (ldc >= m)
 * @param Atype LM_GGML_TYPE_* element type of A
 * @param Btype LM_GGML_TYPE_* element type of B
 * @param Ctype LM_GGML_TYPE_* element type of C (only LM_GGML_TYPE_F32 is handled)
 * @return true if the multiplication was performed here, false otherwise
 */
bool llamafile_sgemm(const struct lm_ggml_compute_params * params, int64_t m, int64_t n, int64_t k,
                     const void *A, int64_t lda, const void *B, int64_t ldb, void *C,
                     int64_t ldc, int Atype, int Btype, int Ctype) {

    assert(m >= 0);
    assert(n >= 0);
    assert(k >= 0);
    assert(lda >= k);
    assert(ldb >= k);
    assert(ldc >= m);
    assert(params->nth > 0);
    assert(params->ith < params->nth);

    // only enable sgemm for prompt processing; a single column (n < 2)
    // is better served by the regular dot-product path
    if (n < 2)
        return false;

    if (Ctype != LM_GGML_TYPE_F32)
        return false;

    switch (Atype) {

    case LM_GGML_TYPE_F32: {
        if (Btype != LM_GGML_TYPE_F32)
            return false;
#if defined(__AVX512F__)
        tinyBLAS<16, __m512, __m512, float, float, float> tb{ params,
            k, (const float *)A, lda,
            (const float *)B, ldb,
            (float *)C, ldc};
        return tb.matmul(m, n);
#elif defined(__AVX__) || defined(__AVX2__)
        tinyBLAS<8, __m256, __m256, float, float, float> tb{ params,
            k, (const float *)A, lda,
            (const float *)B, ldb,
            (float *)C, ldc};
        return tb.matmul(m, n);
#elif defined(__ARM_NEON)
        if (n < 4)
            return false;
        tinyBLAS<4, float32x4_t, float32x4_t, float, float, float> tb{ params,
            k, (const float *)A, lda,
            (const float *)B, ldb,
            (float *)C, ldc};
        return tb.matmul(m, n);
#elif defined(__MMA__)
        if (k % 8)
            return false;
        tinyBLAS_PPC<float, float, float> tb{
            k, (const float *)A, lda,
            (const float *)B, ldb,
            (float *)C, ldc,
            params->ith, params->nth};
        tb.matmul(m, n);
        return true;
#else
        return false;
#endif
    }

    case LM_GGML_TYPE_BF16: {
#if defined(__AVX512BF16__)
        if (Btype == LM_GGML_TYPE_BF16) {
            tinyBLAS<32, __m512, __m512bh, lm_ggml_bf16_t, lm_ggml_bf16_t, float> tb{ params, k,
                (const lm_ggml_bf16_t *)A, lda,
                (const lm_ggml_bf16_t *)B, ldb,
                (float *)C, ldc};
            return tb.matmul(m, n);
        }
#elif defined(__AVX512F__)
        if (Btype == LM_GGML_TYPE_BF16) {
            tinyBLAS<16, __m512, __m512, lm_ggml_bf16_t, lm_ggml_bf16_t, float> tb{ params, k,
                (const lm_ggml_bf16_t *)A, lda,
                (const lm_ggml_bf16_t *)B, ldb,
                (float *)C, ldc};
            return tb.matmul(m, n);
        }
#elif defined(__AVX2__)
        if (Btype == LM_GGML_TYPE_BF16) {
            tinyBLAS<8, __m256, __m256, lm_ggml_bf16_t, lm_ggml_bf16_t, float> tb{ params, k,
                (const lm_ggml_bf16_t *)A, lda,
                (const lm_ggml_bf16_t *)B, ldb,
                (float *)C, ldc};
            return tb.matmul(m, n);
        }
#endif
        return false;
    }
    case LM_GGML_TYPE_F16: {
#if defined(__AVX512F__)
        if (Btype == LM_GGML_TYPE_F16) {
            tinyBLAS<16, __m512, __m512, lm_ggml_fp16_t, lm_ggml_fp16_t, float> tb{ params, k,
                (const lm_ggml_fp16_t *)A, lda,
                (const lm_ggml_fp16_t *)B, ldb,
                (float *)C, ldc};
            return tb.matmul(m, n);
        }
#elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__)
        if (Btype == LM_GGML_TYPE_F16) {
            tinyBLAS<8, __m256, __m256, lm_ggml_fp16_t, lm_ggml_fp16_t, float> tb{ params, k,
                (const lm_ggml_fp16_t *)A, lda,
                (const lm_ggml_fp16_t *)B, ldb,
                (float *)C, ldc};
            return tb.matmul(m, n);
        }
#elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
        if (n < 8)
            return false;
        if (Btype == LM_GGML_TYPE_F16) {
            tinyBLAS<8, float16x8_t, float16x8_t, lm_ggml_fp16_t, lm_ggml_fp16_t, float> tb{ params,
                k, (const lm_ggml_fp16_t *)A, lda,
                (const lm_ggml_fp16_t *)B, ldb,
                (float *)C, ldc};
            return tb.matmul(m, n);
        }
#elif defined(__ARM_NEON) && !defined(_MSC_VER)
        // mixed-precision path: fp16 A against fp32 B, converting A on load
        if (Btype == LM_GGML_TYPE_F32) {
            tinyBLAS<4, float32x4_t, float32x4_t, lm_ggml_fp16_t, float, float> tb{ params,
                k, (const lm_ggml_fp16_t *)A, lda,
                (const float *)B, ldb,
                (float *)C, ldc};
            return tb.matmul(m, n);
        }
#endif
        return false;
    }

    case LM_GGML_TYPE_Q8_0: {
        if (Btype != LM_GGML_TYPE_Q8_0)
            return false;
#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
        tinyBLAS_Q0_AVX<block_q8_0, block_q8_0, float> tb{
            k, (const block_q8_0 *)A, lda,
            (const block_q8_0 *)B, ldb,
            (float *)C, ldc,
            params->ith, params->nth};
        tb.matmul(m, n);
        return true;
#elif defined(__ARM_FEATURE_DOTPROD)
        tinyBLAS_Q0_ARM<block_q8_0> tb{
            k, (const block_q8_0 *)A, lda,
            (const block_q8_0 *)B, ldb,
            (float *)C, ldc,
            params->ith, params->nth};
        tb.matmul(m, n);
        return true;

#elif defined(__MMA__)
        // the PPC quant kernel only supports tile dims of 4 or >= 8
        if (n < 8 && n != 4)
            return false;
        if (m < 8 && m != 4)
            return false;
        tinyBLAS_Q0_PPC<block_q8_0, block_q8_0, float> tb{
            k, (const block_q8_0 *)A, lda,
            (const block_q8_0 *)B, ldb,
            (float *)C, ldc,
            params->ith, params->nth};
        tb.matmul(m, n);
        return true;

#else
        return false;
#endif
    }

    case LM_GGML_TYPE_Q4_0: {
        if (Btype != LM_GGML_TYPE_Q8_0)
            return false;
#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
        tinyBLAS_Q0_AVX<block_q4_0, block_q8_0, float> tb{
            k, (const block_q4_0 *)A, lda,
            (const block_q8_0 *)B, ldb,
            (float *)C, ldc,
            params->ith, params->nth};
        tb.matmul(m, n);
        return true;
#elif defined(__ARM_FEATURE_DOTPROD)
        tinyBLAS_Q0_ARM<block_q4_0> tb{
            k, (const block_q4_0 *)A, lda,
            (const block_q8_0 *)B, ldb,
            (float *)C, ldc,
            params->ith, params->nth};
        tb.matmul(m, n);
        return true;
#else
        return false;
#endif
    }

    case LM_GGML_TYPE_Q5_0: {
        if (Btype != LM_GGML_TYPE_Q8_0)
            return false;
#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
        tinyBLAS_Q0_AVX<block_q5_0, block_q8_0, float> tb{
            k, (const block_q5_0 *)A, lda,
            (const block_q8_0 *)B, ldb,
            (float *)C, ldc,
            params->ith, params->nth};
        tb.matmul(m, n);
        return true;
#else
        return false;
#endif
    }

    case LM_GGML_TYPE_IQ4_NL: {
        if (Btype != LM_GGML_TYPE_Q8_0)
            return false;
#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
        tinyBLAS_Q0_AVX<block_iq4_nl, block_q8_0, float> tb{
            k, (const block_iq4_nl *)A, lda,
            (const block_q8_0 *)B, ldb,
            (float *)C, ldc,
            params->ith, params->nth};
        tb.matmul(m, n);
        return true;
#else
        return false;
#endif
    }

    default:
        return false;
    }

    // unreachable: every switch case returns. The casts below silence
    // unused-parameter warnings on targets where all #if branches
    // compile away.
    (void)params;
    (void)m;
    (void)n;
    (void)k;
    (void)A;
    (void)lda;
    (void)B;
    (void)ldb;
    (void)C;
    (void)ldc;
    (void)Atype;
    (void)Btype;
    (void)Ctype;
}
1
+ // Copyright 2024 Mozilla Foundation
2
+ //
3
+ // Permission is hereby granted, free of charge, to any person obtaining
4
+ // a copy of this software and associated documentation files (the
5
+ // "Software"), to deal in the Software without restriction, including
6
+ // without limitation the rights to use, copy, modify, merge, publish,
7
+ // distribute, sublicense, and/or sell copies of the Software, and to
8
+ // permit persons to whom the Software is furnished to do so, subject to
9
+ // the following conditions:
10
+ //
11
+ // The above copyright notice and this permission notice shall be
12
+ // included in all copies or substantial portions of the Software.
13
+ //
14
+ // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
18
+ // BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
19
+ // ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20
+ // CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ // SOFTWARE.
22
+
23
+ //
24
+ // _ _ ___ _ _ ___
25
+ // | |_(_)_ _ _ _| _ ) | /_\ / __|
26
+ // | _| | ' \ || | _ \ |__ / _ \\__ \.
27
+ // \__|_|_||_\_, |___/____/_/ \_\___/
28
+ // |__/
29
+ //
30
+ // BASIC LINEAR ALGEBRA SUBPROGRAMS
31
+ //
32
+ //
33
+ // This file implements multithreaded CPU matrix multiplication for the
34
+ // common contiguous use case C = Aᵀ * B. These kernels are designed to
35
+ // have excellent performance[1] for matrices that fit in the CPU cache
36
+ // without imposing any overhead such as cache filling or malloc calls.
37
+ //
38
+ // This implementation does not guarantee any upper bound with rounding
39
+ // errors, which grow along with k. Our goal's to maximally exploit the
40
+ // hardware for performance, and then use whatever resources remain for
41
+ // improving numerical accuracy.
42
+ //
43
+ // [1] J. Tunney, ‘LLaMA Now Goes Faster on CPUs’, Mar. 2024. [Online].
44
+ // Available: https://justine.lol/matmul/. [Accessed: 29-Mar-2024].
45
+
46
+ #if defined(__GNUC__)
47
+ #pragma GCC diagnostic ignored "-Wpedantic"
48
+ #pragma GCC diagnostic ignored "-Wignored-attributes"
49
+ #endif
50
+
51
+ #include "sgemm.h"
52
+ #include "ggml-impl.h"
53
+ #include "ggml-cpu-impl.h"
54
+ #include "ggml-quants.h"
55
+
56
+ #include <atomic>
57
+ #include <array>
58
+
59
+ #ifdef _MSC_VER
60
+ #define NOINLINE __declspec(noinline)
61
+ #else
62
+ #define NOINLINE __attribute__((__noinline__))
63
+ #endif
64
+
65
+ #if defined(__ARM_NEON) || defined(__AVX512F__)
66
+ #define VECTOR_REGISTERS 32
67
+ #else
68
+ #define VECTOR_REGISTERS 16
69
+ #endif
70
+
71
+ #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
72
+
73
+ namespace {
74
+
75
// Converts a half-precision (fp16) scale factor to a full float.
inline float unhalf(lm_ggml_fp16_t d) {
    return LM_GGML_FP16_TO_FP32(d);
}
78
+
79
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
80
+ // VECTORIZED ARITHMETIC OPERATIONS
81
+
82
// Element-wise vector add/sub/mul overloads, one set per SIMD ISA.
// These give the templated kernels below a uniform arithmetic
// vocabulary regardless of the underlying register type.

#if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
inline __m128 add(__m128 x, __m128 y) { return _mm_add_ps(x, y); }
inline __m128 sub(__m128 x, __m128 y) { return _mm_sub_ps(x, y); }
inline __m128 mul(__m128 x, __m128 y) { return _mm_mul_ps(x, y); }
#endif  // __SSE__

#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
inline __m256 add(__m256 x, __m256 y) { return _mm256_add_ps(x, y); }
inline __m256 sub(__m256 x, __m256 y) { return _mm256_sub_ps(x, y); }
inline __m256 mul(__m256 x, __m256 y) { return _mm256_mul_ps(x, y); }
#endif // __AVX__

#if defined(__AVX512F__)
inline __m512 add(__m512 x, __m512 y) { return _mm512_add_ps(x, y); }
inline __m512 sub(__m512 x, __m512 y) { return _mm512_sub_ps(x, y); }
inline __m512 mul(__m512 x, __m512 y) { return _mm512_mul_ps(x, y); }
#endif // __AVX512F__

#if defined(__ARM_NEON)
inline float32x4_t add(float32x4_t x, float32x4_t y) { return vaddq_f32(x, y); }
inline float32x4_t sub(float32x4_t x, float32x4_t y) { return vsubq_f32(x, y); }
inline float32x4_t mul(float32x4_t x, float32x4_t y) { return vmulq_f32(x, y); }
#endif // __ARM_NEON

#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
inline float16x8_t add(float16x8_t x, float16x8_t y) { return vaddq_f16(x, y); }
inline float16x8_t sub(float16x8_t x, float16x8_t y) { return vsubq_f16(x, y); }
inline float16x8_t mul(float16x8_t x, float16x8_t y) { return vmulq_f16(x, y); }
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

#if defined(__MMA__)
// POWER10 MMA: vec_t is the 16-byte input vector type and acc_t the
// accumulator quad used by the PPC kernels.
typedef vector unsigned char vec_t;
typedef __vector_quad acc_t;
#endif
116
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
117
+ // VECTORIZED FUSED MULTIPLY ADD
118
+
119
/**
 * Computes a * b + c.
 *
 * The generic version lowers to a separate multiply and add; the
 * specializations below use fused multiply-add (or bf16 dot-product)
 * instructions when the target ISA provides them.
 */
template <typename T, typename U>
inline U madd(T a, T b, U c) {
    return add(mul(a, b), c);
}

#if defined(__FMA__)
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
template <>
inline __m256 madd(__m256 a, __m256 b, __m256 c) {
    return _mm256_fmadd_ps(a, b, c);
}
#endif
#if defined(__AVX512F__)
template <>
inline __m512 madd(__m512 a, __m512 b, __m512 c) {
    return _mm512_fmadd_ps(a, b, c);
}
#endif
#if defined(__AVX512BF16__)
// bf16 inputs: dpbf16 accumulates pairwise bf16 products into fp32 lanes
template <>
inline __m512 madd(__m512bh a, __m512bh b, __m512 c) {
    return _mm512_dpbf16_ps(c, a, b);
}
template <>
inline __m256 madd(__m256bh a, __m256bh b, __m256 c) {
    return _mm256_dpbf16_ps(c, a, b);
}
#endif
#endif

#if defined(__ARM_FEATURE_FMA)
template <>
inline float32x4_t madd(float32x4_t a, float32x4_t b, float32x4_t c) {
    return vfmaq_f32(c, b, a);
}
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
template <>
inline float16x8_t madd(float16x8_t a, float16x8_t b, float16x8_t c) {
    return vfmaq_f16(c, b, a);
}
#endif
#endif
164
+
165
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
166
+ // VECTORIZED HORIZONTAL SUM
167
+
168
// Horizontal sum: reduce every lane of a vector accumulator to one float.

#if defined(__ARM_NEON)
inline float hsum(float32x4_t x) {
    return vaddvq_f32(x);
}
#endif // __ARM_NEON

#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
// widen the two fp16 halves to fp32 before reducing
inline float hsum(float16x8_t x) {
    return vaddvq_f32(vaddq_f32(vcvt_f32_f16(vget_low_f16(x)),
                                vcvt_f32_f16(vget_high_f16(x))));
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

#if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
inline float hsum(__m128 x) {
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
    x = _mm_add_ps(x, _mm_movehl_ps(x, x));
    x = _mm_add_ss(x, _mm_movehdup_ps(x));
#else
    // SSE1 fallback: movehdup is SSE3, so shuffle/movehl instead
    __m128 t;
    t = _mm_shuffle_ps(x, x, _MM_SHUFFLE(2, 3, 0, 1));
    x = _mm_add_ps(x, t);
    t = _mm_movehl_ps(t, x);
    x = _mm_add_ss(x, t);
#endif
    return _mm_cvtss_f32(x);
}
#endif

#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
// fold the upper 128-bit lane onto the lower one, then reuse the __m128 path
inline float hsum(__m256 x) {
    return hsum(_mm_add_ps(_mm256_extractf128_ps(x, 1),
                           _mm256_castps256_ps128(x)));
}
#endif // __AVX__

#if defined(__AVX512F__)
inline float hsum(__m512 x) {
    return _mm512_reduce_add_ps(x);
}
#endif // __AVX512F__
209
+
210
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
211
+ // VECTORIZED MEMORY LOADING
212
+
213
// load<V>(p): read one SIMD vector of type V from memory, converting the
// element type (fp16/bf16 -> fp32) where the destination register is wider.
template <typename T, typename U> T load(const U *);

#if defined(__ARM_NEON)
template <> inline float32x4_t load(const float *p) {
    return vld1q_f32(p);
}
#if !defined(_MSC_VER)
// FIXME: this should check for __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
template <> inline float16x8_t load(const lm_ggml_fp16_t *p) {
    return vld1q_f16((const float16_t *)p);
}
// loads 4 fp16 values and widens them to fp32
template <> inline float32x4_t load(const lm_ggml_fp16_t *p) {
    return vcvt_f32_f16(vld1_f16((const float16_t *)p));
}
#endif // _MSC_VER
#endif // __ARM_NEON

#if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
template <> inline __m128 load(const float *p) {
    return _mm_loadu_ps(p);
}
#endif // __SSE__

#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
template <> inline __m256 load(const float *p) {
    return _mm256_loadu_ps(p);
}
#endif // __AVX__

#if defined(__AVX2__) || defined(__AVX512F__)
// bf16 -> fp32: zero-extend each 16-bit value into the top half of a 32-bit lane
template <> inline __m256 load(const lm_ggml_bf16_t *p) {
    return _mm256_castsi256_ps(
        _mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)p)), 16));
}
#endif // __AVX2__

#if defined(__F16C__)
template <> inline __m256 load(const lm_ggml_fp16_t *p) {
    return _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)p));
}
#endif // __F16C__

#if defined(__AVX512F__)
template <> inline __m512 load(const float *p) {
    return _mm512_loadu_ps(p);
}
template <> inline __m512 load(const lm_ggml_fp16_t *p) {
    return _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)p));
}
template <> inline __m512 load(const lm_ggml_bf16_t *p) {
    return _mm512_castsi512_ps(
        _mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)p)), 16));
}
#endif // __AVX512F__

#if defined(__AVX512BF16__)
// native bf16 registers: plain reinterpreting loads for bf16 memory,
// narrowing conversions for fp32 memory
template <> inline __m512bh load(const lm_ggml_bf16_t *p) {
    return (__m512bh)_mm512_loadu_ps((const float *)p);
}
template <> inline __m256bh load(const lm_ggml_bf16_t *p) {
    return (__m256bh)_mm256_loadu_ps((const float *)p);
}
template <> inline __m512bh load(const float *p) {
    return _mm512_cvtne2ps_pbh(_mm512_loadu_ps(p + 16), _mm512_loadu_ps(p));
}
template <> inline __m256bh load(const float *p) {
    return _mm512_cvtneps_pbh(_mm512_loadu_ps(p));
}
#endif
282
+
283
+ ////////////////////////////////////////////////////////////////////////////////////////////////////
284
+ // FLOATING POINT MATRIX MULTIPLICATION
285
+
286
// Given m items and a maximum block height M, split m into
// ceil(m / M) blocks and return the (rounded-up) size of one block,
// so the work is spread as evenly as possible.
template <int M>
static inline int64_t BLOCK_SIZE(size_t m) {
    const int64_t nb = (m + M - 1) / M;  // number of blocks (ceil division)
    if (m % nb == 0) {
        return m / nb;
    }
    return m / nb + 1;
}
291
+
292
// Start offset of block ib when the first ibN blocks have size
// bloc_size and every later block has size bloc_size - 1.
static constexpr inline int64_t BLOC_POS(int64_t ib, int64_t ibN, int64_t bloc_size) {
    if (ib < ibN) {
        return ib * bloc_size;
    }
    return ibN * bloc_size + (ib - ibN) * (bloc_size - 1);
}
295
+
296
/**
 * Multithreaded tile-based GEMM for float-like element types.
 *
 * Template parameters:
 *   KN    - SIMD width in elements (k must be a multiple of KN)
 *   D     - accumulator vector type
 *   V     - vector type produced by load<V>()
 *   TA/TB/TC - element types of A, B and C
 *
 * Work is distributed over the ggml threadpool: threads grab jobs from a
 * shared atomic counter (see gemm()).
 */
template <int KN, typename D, typename V, typename TA, typename TB, typename TC>
class tinyBLAS {
  public:
    tinyBLAS(const lm_ggml_compute_params * params, int64_t k,
             const TA *A, int64_t lda,
             const TB *B, int64_t ldb,
             TC *C, int64_t ldc)
        : params(params), A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc) {
    }

    // Returns true if this kernel handled the multiplication.
    // Requires k % KN == 0 and m % 4 == 0; otherwise the caller falls back.
    bool matmul(int64_t m, int64_t n) {
        if (k % KN != 0)
            return false;
        // Pick the tile shape <RM, RN, BM>: RM = 4 rows per tile, RN from
        // the register budget, BM = how many row tiles one job batches.
        // Larger BM is used only when m is big relative to the thread count.
#if VECTOR_REGISTERS == 32
        if (m % 16 == 0 && (m/16 >= params->nth)) {
            const int64_t SIZE_N = BLOCK_SIZE<6>(n);
            mnpack<4, 6, 4>(m, n, SIZE_N, 12);
            return true;
        }
        if (m % 8 == 0 ) {
            const int64_t SIZE_N = BLOCK_SIZE<6>(n);
            mnpack<4, 6, 2>(m, n, SIZE_N, 12);
            return true;
        }
        if (m % 4 == 0) {
            const int64_t SIZE_N = BLOCK_SIZE<6>(n);
            mnpack<4, 6, 1>(m, n, SIZE_N, 12);
            return true;
        }
#else  // VECTOR_REGISTERS == 16
        if (m % 16 == 0 && (m/16 >= params->nth)) {
            const int64_t SIZE_N = BLOCK_SIZE<3>(n);
            mnpack<4, 3, 4>(m, n, SIZE_N, 24);
            return true;
        }
        if (m % 8 == 0 ) {
            const int64_t SIZE_N = BLOCK_SIZE<3>(n);
            mnpack<4, 3, 2>(m, n, SIZE_N, 24);
            return true;
        }
        if (m % 4 == 0) {
            const int64_t SIZE_N = BLOCK_SIZE<3>(n);
            mnpack<4, 3, 1>(m, n, SIZE_N, 24);
            return true;
        }
#endif
        return false;
    }

  private:
    // Compile-time search: instantiate gemm<RM, SIZE_N, BM> for the runtime
    // column-block size SIZE_N by peeling RN down one at a time.
    template <int RM, int RN, int BM>
    inline void mnpack(int64_t m, int64_t n, int64_t SIZE_N, int64_t BN) {
        if (SIZE_N == RN) {
            return gemm<RM, RN, BM>(m, n, BN);
        }
        if constexpr (RN > 1) {
            return mnpack<RM, RN-1, BM>(m, n, SIZE_N, BN);
        } else {
            // unreachable if matmul() computed SIZE_N with a matching BLOCK_SIZE
            LM_GGML_LOG_ERROR("mnpack<%d, %d> bloc size not supported\n", RM, (int)SIZE_N);
            LM_GGML_ASSERT(false); // we have miss something.
        }
    }

    // Computes one RM x RN output tile at (ii, jj), accumulating over the
    // full k dimension in SIMD registers before writing C.
    template <int RM, int RN>
    inline void gemm_bloc(int64_t ii, int64_t jj) {
        D Cv[RN][RM] = {};
        for (int64_t l = 0; l < k; l += KN) {
            // help the compiler order loads: reuse whichever operand has
            // fewer rows across the inner loop
            if constexpr (RM <= RN) {
                V Av[RM];
                for (int64_t i = 0; i < RM; ++i) {
                    Av[i] = load<V>(A + lda * (ii + i) + l);
                }
                for (int64_t j = 0; j < RN; ++j) {
                    V Bv = load<V>(B + ldb * (jj + j) + l);
                    for (int64_t i = 0; i < RM; ++i) {
                        Cv[j][i] = madd(Av[i], Bv, Cv[j][i]);
                    }
                }
            } else {
                V Bv[RN];
                for (int64_t j = 0; j < RN; ++j) {
                    Bv[j] = load<V>(B + ldb * (jj + j) + l);
                }
                for (int64_t i = 0; i < RM; ++i) {
                    V Av = load<V>(A + lda * (ii + i) + l);
                    for (int64_t j = 0; j < RN; ++j) {
                        Cv[j][i] = madd(Av, Bv[j], Cv[j][i]);
                    }
                }
            }
        }
        for (int64_t j = 0; j < RN; ++j)
            for (int64_t i = 0; i < RM; ++i)
                C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
    }

    // Runs all tiles: jobs are (row-block, column-block) pairs pulled from a
    // shared atomic counter so faster threads steal the remaining work.
    template <int RM, int RN, int BM>
    NOINLINE void gemm(int64_t m, int64_t n, int64_t BN) {
        // one counter per template instantiation, shared by all threads
        static std::atomic<int64_t> current_chunk;

        LM_GGML_ASSERT(m % (RM * BM) == 0);
        const int64_t ytiles = m / (RM * BM);
        const int64_t xtiles = (n + RN -1) / RN;
        // first jj_RN column tiles have width RN, the rest RN-1
        const int64_t jj_RN = (xtiles - (xtiles * RN - n));

        // "round" bloc_size to "nearest" BN
        const int64_t NB_BN = xtiles < BN ? 1 : (xtiles + BN / 2) / BN;
        const int64_t SIZE_BN = xtiles % NB_BN == 0 ? xtiles / NB_BN : xtiles / NB_BN + 1;
        const int64_t jj_BN = (NB_BN - (NB_BN * SIZE_BN - xtiles));
        const int64_t nb_job = ytiles * NB_BN;

        if (params->ith == 0) {
            LM_GGML_ASSERT( jj_BN * SIZE_BN + (NB_BN - jj_BN) * (SIZE_BN - 1) == xtiles);
            // Every thread starts at job index ith, so the first unclaimed
            // chunk is nth. This saves a bit of coordination at the start.
            std::atomic_store_explicit(&current_chunk, (int64_t)params->nth, std::memory_order_relaxed);
        }

        // make sure the counter reset above is visible before any thread
        // starts pulling jobs
        lm_ggml_barrier(params->threadpool);

        int64_t job = params->ith;
        while (job < nb_job) {
            const int64_t ii = (job % ytiles) * RM * BM;
            const int64_t jb = job / ytiles;
            const int64_t jr0 = BLOC_POS(jb , jj_BN, SIZE_BN);
            const int64_t jrN = BLOC_POS(jb+1, jj_BN, SIZE_BN);

            // [jj0, jj1) is covered with width-RN tiles, [jj1, jj2) with RN-1
            const int64_t jj0 = BLOC_POS(jr0, jj_RN, RN);
            const int64_t jj2 = BLOC_POS(jrN, jj_RN, RN);
            const int64_t jj1 = jj2 < jj_RN * RN ? jj2 : jj_RN * RN;

            for (int64_t bi = 0; bi < BM * RM; bi += RM) {
                int64_t jj = jj0;
                for (; jj < jj1; jj += RN) {
                    gemm_bloc<RM, RN>(ii + bi, jj);
                }
                if constexpr (RN > 1) {
                    for (; jj < jj2; jj += RN - 1) {
                        gemm_bloc<RM, RN-1>(ii + bi, jj);
                    }
                }
                LM_GGML_ASSERT(jj == jj2);
            }

            // claim the next unprocessed job
            job = std::atomic_fetch_add_explicit(&current_chunk, (int64_t)1, std::memory_order_relaxed);
        }

        // all threads must finish before the output is consumed (and before
        // current_chunk can be reset by a future call)
        lm_ggml_barrier(params->threadpool);
        return;
    }

    const lm_ggml_compute_params * params;
    const TA *const A;
    const TB *const B;
    TC *const C;
    const int64_t k;
    const int64_t lda;
    const int64_t ldb;
    const int64_t ldc;
};
458
+
459
+ //////////////////////////////////////////////////////////////////////////////////////////
460
+ // QUANT ZERO MATRIX MULTIPLICATION
461
+
462
#if defined(__ARM_FEATURE_DOTPROD)
/**
 * GEMM for quantized A (block_q8_0 or block_q4_0) against block_q8_0 B,
 * using the ARM dot-product extension (vdotq_s32).
 *
 * Each block holds 32 quantized values plus an fp16 scale `d`; the
 * integer dot product of a block pair is scaled by the product of the
 * two deltas and accumulated in fp32. Work is split statically across
 * nth threads by tile index.
 */
template <typename TA>
class tinyBLAS_Q0_ARM {
  public:
    tinyBLAS_Q0_ARM(int64_t k,
                    const TA *A, int64_t lda,
                    const block_q8_0 *B, int64_t ldb,
                    float *C, int64_t ldc,
                    int ith, int nth)
        : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
    }

    void matmul(int64_t m, int64_t n) {
        mnpack(0, m, 0, n);
    }

  private:
    // Recursively tiles the output: pick the largest kernel (up to 3x3)
    // that fits the remaining region, run it over the divisible part,
    // then recurse on the right and bottom remainders.
    NOINLINE void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
        int64_t mc, nc, mp, np;
        switch ((MIN(m - m0, 3) << 4) | MIN(n - n0, 3ll)) {
        case 0x33:
            mc = 3;
            nc = 3;
            gemm<3, 3>(m0, m, n0, n);
            break;
        case 0x32:
            mc = 3;
            nc = 2;
            gemm<3, 2>(m0, m, n0, n);
            break;
        case 0x23:
            mc = 2;
            nc = 3;
            gemm<2, 3>(m0, m, n0, n);
            break;
        case 0x22:
            mc = 2;
            nc = 2;
            gemm<2, 2>(m0, m, n0, n);
            break;
        case 0x31:
            mc = 3;
            nc = 1;
            gemm<3, 1>(m0, m, n0, n);
            break;
        case 0x13:
            mc = 1;
            nc = 3;
            gemm<1, 3>(m0, m, n0, n);
            break;
        case 0x21:
            mc = 2;
            nc = 1;
            gemm<2, 1>(m0, m, n0, n);
            break;
        case 0x12:
            mc = 1;
            nc = 2;
            gemm<1, 2>(m0, m, n0, n);
            break;
        case 0x11:
            mc = 1;
            nc = 1;
            gemm<1, 1>(m0, m, n0, n);
            break;
        default:
            // empty region: nothing left to compute
            return;
        }
        mp = m0 + (m - m0) / mc * mc;
        np = n0 + (n - n0) / nc * nc;
        mnpack(mp, m, n0, np);
        mnpack(m0, m, np, n);
    }

    // Computes all RM x RN tiles of the region, statically partitioned
    // across threads by tile index (this thread handles [start, end)).
    template <int RM, int RN>
    NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
        int64_t ytiles = (m - m0) / RM;
        int64_t xtiles = (n - n0) / RN;
        int64_t tiles = xtiles * ytiles;
        int64_t duty = (tiles + nth - 1) / nth;
        int64_t start = duty * ith;
        int64_t end = start + duty;
        if (end > tiles)
            end = tiles;
        for (int64_t job = start; job < end; ++job) {
            int64_t ii = m0 + job / xtiles * RM;
            int64_t jj = n0 + job % xtiles * RN;
            float32x4_t Cv[RN][RM] = {};
            for (int64_t l = 0; l < k; ++l)
                for (int64_t j = 0; j < RN; ++j)
                    for (int64_t i = 0; i < RM; ++i)
                        // two chained vdotq_s32 calls cover the 32 int8
                        // values of a block; the result is scaled by the
                        // product of the two block deltas
                        Cv[j][i] = vmlaq_n_f32(Cv[j][i],
                                               vcvtq_f32_s32(vdotq_s32(
                                                   vdotq_s32(vdupq_n_s32(0),
                                                             load_lo(A + lda * (ii + i) + l),
                                                             load_lo(B + ldb * (jj + j) + l)),
                                                   load_hi(A + lda * (ii + i) + l),
                                                   load_hi(B + ldb * (jj + j) + l))),
                                               unhalf(A[lda * (ii + i) + l].d) *
                                               unhalf(B[ldb * (jj + j) + l].d));
            for (int64_t j = 0; j < RN; ++j)
                for (int64_t i = 0; i < RM; ++i)
                    C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
        }
    }

    // q8_0: the 32 int8 values are stored directly
    inline int8x16_t load_lo(const block_q8_0 *b) {
        return vld1q_s8(b->qs);
    }

    inline int8x16_t load_hi(const block_q8_0 *b) {
        return vld1q_s8(b->qs + 16);
    }

    // q4_0: low nibbles hold the first 16 values; subtract the 8 offset
    // to recenter the unsigned nibble around zero
    inline int8x16_t load_lo(const block_q4_0 *b) {
        return vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vld1q_u8(b->qs),
                                                     vdupq_n_u8(0x0f))),
                        vdupq_n_s8(0x8));
    }

    // q4_0: high nibbles hold the last 16 values
    inline int8x16_t load_hi(const block_q4_0 *b) {
        return vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(vld1q_u8(b->qs), 4)),
                        vdupq_n_s8(0x8));
    }

    const TA *const A;
    const block_q8_0 *const B;
    float *const C;
    const int64_t k;
    const int64_t lda;
    const int64_t ldb;
    const int64_t ldc;
    const int ith;
    const int nth;
};
#endif // __ARM_FEATURE_DOTPROD
598
+
599
+ #if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
600
+ template <typename TA, typename TB, typename TC>
601
+ class tinyBLAS_Q0_AVX {
602
+ public:
603
+ tinyBLAS_Q0_AVX(int64_t k,
604
+ const TA *A, int64_t lda,
605
+ const TB *B, int64_t ldb,
606
+ TC *C, int64_t ldc,
607
+ int ith, int nth)
608
+ : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
609
+ const int8_t kvalues_iq4nl[16] = {
610
+ -127, -104, -83, -65,
611
+ -49, -35, -22, -10,
612
+ 1, 13, 25, 38,
613
+ 53, 69, 89, 113
614
+ };
615
+
616
+ iq4nlt = _mm_loadu_si128((const __m128i *)kvalues_iq4nl);
617
+ }
618
+
619
+ void matmul(int64_t m, int64_t n) {
620
+ mnpack(0, m, 0, n);
621
+ }
622
+
623
+ private:
624
+ void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
625
+ int64_t mc, nc, mp, np;
626
+ switch ((MIN(m - m0, 4) << 4) | MIN(n - n0, 4)) {
627
+ #if VECTOR_REGISTERS == 32
628
+ case 0x44:
629
+ mc = 4;
630
+ nc = 4;
631
+ #if defined(__AVX2__) && defined(__F16C__)
632
+ gemm4xN<4>(m0, m, n0, n);
633
+ #else
634
+ gemm<4, 4>(m0, m, n0, n);
635
+ #endif
636
+ break;
637
+ case 0x43:
638
+ mc = 4;
639
+ nc = 3;
640
+ #if defined(__AVX2__) && defined(__F16C__)
641
+ gemm4xN<3>(m0, m, n0, n);
642
+ #else
643
+ gemm<4, 3>(m0, m, n0, n);
644
+ #endif
645
+ break;
646
+ case 0x34:
647
+ mc = 3;
648
+ nc = 4;
649
+ #if defined(__AVX2__) && defined(__F16C__)
650
+ gemmMx4<3>(m0, m, n0, n);
651
+ #else
652
+ gemm<3, 4>(m0, m, n0, n);
653
+ #endif
654
+ break;
655
+ case 0x33:
656
+ mc = 3;
657
+ nc = 3;
658
+ gemm<3, 3>(m0, m, n0, n);
659
+ break;
660
+ case 0x42:
661
+ mc = 4;
662
+ nc = 2;
663
+ #if defined(__AVX2__) && defined(__F16C__)
664
+ gemm4xN<2>(m0, m, n0, n);
665
+ #else
666
+ gemm<4, 2>(m0, m, n0, n);
667
+ #endif
668
+ break;
669
+ case 0x24:
670
+ mc = 2;
671
+ nc = 4;
672
+ #if defined(__AVX2__) && defined(__F16C__)
673
+ gemmMx4<2>(m0, m, n0, n);
674
+ #else
675
+ gemm<2, 4>(m0, m, n0, n);
676
+ #endif
677
+ break;
678
+ #else
679
+ case 0x44:
680
+ case 0x43:
681
+ case 0x42:
682
+ mc = 4;
683
+ nc = 2;
684
+ #if defined(__AVX2__) && defined(__F16C__)
685
+ gemm4xN<2>(m0, m, n0, n);
686
+ #else
687
+ gemm<4, 2>(m0, m, n0, n);
688
+ #endif
689
+ break;
690
+ case 0x34:
691
+ case 0x24:
692
+ mc = 2;
693
+ nc = 4;
694
+ #if defined(__AVX2__) && defined(__F16C__)
695
+ gemmMx4<2>(m0, m, n0, n);
696
+ #else
697
+ gemm<2, 4>(m0, m, n0, n);
698
+ #endif
699
+ break;
700
+ case 0x33:
701
+ #endif
702
+ case 0x32:
703
+ mc = 3;
704
+ nc = 2;
705
+ gemm<3, 2>(m0, m, n0, n);
706
+ break;
707
+ case 0x23:
708
+ mc = 2;
709
+ nc = 3;
710
+ gemm<2, 3>(m0, m, n0, n);
711
+ break;
712
+ case 0x41:
713
+ mc = 4;
714
+ nc = 1;
715
+ #if defined(__AVX2__) && defined(__F16C__)
716
+ gemm4xN<1>(m0, m, n0, n);
717
+ #else
718
+ gemm<4, 1>(m0, m, n0, n);
719
+ #endif
720
+ break;
721
+ case 0x22:
722
+ mc = 2;
723
+ nc = 2;
724
+ gemm<2, 2>(m0, m, n0, n);
725
+ break;
726
+ case 0x14:
727
+ mc = 1;
728
+ nc = 4;
729
+ #if defined(__AVX2__) && defined(__F16C__)
730
+ gemmMx4<1>(m0, m, n0, n);
731
+ #else
732
+ gemm<1, 4>(m0, m, n0, n);
733
+ #endif
734
+ break;
735
+ case 0x31:
736
+ mc = 3;
737
+ nc = 1;
738
+ gemm<3, 1>(m0, m, n0, n);
739
+ break;
740
+ case 0x13:
741
+ mc = 1;
742
+ nc = 3;
743
+ gemm<1, 3>(m0, m, n0, n);
744
+ break;
745
+ case 0x21:
746
+ mc = 2;
747
+ nc = 1;
748
+ gemm<2, 1>(m0, m, n0, n);
749
+ break;
750
+ case 0x12:
751
+ mc = 1;
752
+ nc = 2;
753
+ gemm<1, 2>(m0, m, n0, n);
754
+ break;
755
+ case 0x11:
756
+ mc = 1;
757
+ nc = 1;
758
+ gemm<1, 1>(m0, m, n0, n);
759
+ break;
760
+ default:
761
+ return;
762
+ }
763
+ mp = m0 + (m - m0) / mc * mc;
764
+ np = n0 + (n - n0) / nc * nc;
765
+ mnpack(mp, m, n0, np);
766
+ mnpack(m0, m, np, n);
767
+ }
768
+
769
+ #if defined(__AVX2__) && defined(__F16C__)
770
+ // Templated functions for gemm of dimensions 4xN
771
+ template <int RN>
772
+ NOINLINE void gemm4xN(int64_t m0, int64_t m, int64_t n0, int64_t n) {
773
+ int64_t ytiles = (m - m0) / 4;
774
+ int64_t xtiles = (n - n0) / RN;
775
+ int64_t tiles = xtiles * ytiles;
776
+ int64_t duty = (tiles + nth - 1) / nth;
777
+ int64_t start = duty * ith;
778
+ int64_t end = start + duty;
779
+ if (end > tiles)
780
+ end = tiles;
781
+ for (int64_t job = start; job < end; ++job) {
782
+ int64_t ii = m0 + job / xtiles * 4;
783
+ int64_t jj = n0 + job % xtiles * RN;
784
+ __m256 Cv[RN][4] = {};
785
+ for (int64_t l = 0; l < k; ++l) {
786
+ uint64_t a_delta = ((uint64_t)A[lda * (ii + 3) + l].d << 48) | ((uint64_t)A[lda * (ii + 2) + l].d << 32) | ((uint64_t)A[lda * (ii + 1) + l].d << 16) | (A[lda * (ii + 0) + l].d);
787
+ // Convert delta values for four blocks to float values
788
+ __m128 da = _mm_cvtph_ps(_mm_set_epi64x(0, a_delta));
789
+ __m256i avec0 = load(A + lda * (ii + 0) + l);
790
+ __m256i avec1 = load(A + lda * (ii + 1) + l);
791
+ __m256i avec2 = load(A + lda * (ii + 2) + l);
792
+ __m256i avec3 = load(A + lda * (ii + 3) + l);
793
+ for (int64_t j = 0; j < RN; ++j) {
794
+ __m128 db = _mm_set1_ps(unhalf(B[ldb * (jj + j) + l].d));
795
+ // Computation of product of delta values for four blocks and replicate it across 256 bit lane
796
+ __m256 dvec = _mm256_castps128_ps256(_mm_mul_ps(da, db));
797
+ dvec = _mm256_permute2f128_ps(dvec ,dvec, 0);
798
+ // Computation of dot product and multiplication with appropriate delta value products
799
+ Cv[j][0] = madd(_mm256_shuffle_ps(dvec, dvec, 0),
800
+ updot(_mm256_sign_epi8(avec0, avec0),
801
+ _mm256_sign_epi8(load(B + ldb * (jj + j) + l), avec0)),
802
+ Cv[j][0]);
803
+ Cv[j][1] = madd(_mm256_shuffle_ps(dvec, dvec, 85),
804
+ updot(_mm256_sign_epi8(avec1, avec1),
805
+ _mm256_sign_epi8(load(B + ldb * (jj + j) + l), avec1)),
806
+ Cv[j][1]);
807
+ Cv[j][2] = madd(_mm256_shuffle_ps(dvec, dvec, 170),
808
+ updot(_mm256_sign_epi8(avec2, avec2),
809
+ _mm256_sign_epi8(load(B + ldb * (jj + j) + l), avec2)),
810
+ Cv[j][2]);
811
+ Cv[j][3] = madd(_mm256_shuffle_ps(dvec, dvec, 255),
812
+ updot(_mm256_sign_epi8(avec3, avec3),
813
+ _mm256_sign_epi8(load(B + ldb * (jj + j) + l), avec3)),
814
+ Cv[j][3]);
815
+ }
816
+ }
817
+
818
+ for (int64_t j = 0; j < RN; ++j)
819
+ for (int64_t i = 0; i < 4; ++i)
820
+ C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
821
+ }
822
+ }
823
+
824
+ // Templated functions for gemm of dimensions Mx4
825
+ template <int RM>
826
+ NOINLINE void gemmMx4(int64_t m0, int64_t m, int64_t n0, int64_t n) {
827
+ int64_t ytiles = (m - m0) / RM;
828
+ int64_t xtiles = (n - n0) / 4;
829
+ int64_t tiles = xtiles * ytiles;
830
+ int64_t duty = (tiles + nth - 1) / nth;
831
+ int64_t start = duty * ith;
832
+ int64_t end = start + duty;
833
+ if (end > tiles)
834
+ end = tiles;
835
+ for (int64_t job = start; job < end; ++job) {
836
+ int64_t ii = m0 + job / xtiles * RM;
837
+ int64_t jj = n0 + job % xtiles * 4;
838
+ __m256 Cv[4][RM] = {};
839
+ for (int64_t l = 0; l < k; ++l) {
840
+ uint64_t b_delta = ((uint64_t)B[ldb * (jj + 3) + l].d << 48) | ((uint64_t)B[ldb * (jj + 2) + l].d << 32) | ((uint64_t)B[ldb * (jj + 1) + l].d << 16) | (B[ldb * (jj + 0) + l].d);
841
+ // Convert delta values for four blocks to float values
842
+ __m128 db = _mm_cvtph_ps(_mm_set_epi64x(0, b_delta));
843
+ __m256i bvec0 = load(B + ldb * (jj + 0) + l);
844
+ __m256i bvec1 = load(B + ldb * (jj + 1) + l);
845
+ __m256i bvec2 = load(B + ldb * (jj + 2) + l);
846
+ __m256i bvec3 = load(B + ldb * (jj + 3) + l);
847
+ for (int64_t i = 0; i < RM; ++i) {
848
+ __m128 da = _mm_set1_ps(unhalf((A[lda * (ii + i) + l].d)));
849
+ // Computation of product of delta values for four blocks and replicate it across 256 bit lane
850
+ __m256 dvec = _mm256_castps128_ps256(_mm_mul_ps(da, db));
851
+ dvec = _mm256_permute2f128_ps(dvec ,dvec, 0);
852
+ // Computation of dot product and multiplication with appropriate delta value products
853
+ Cv[0][i] = madd(_mm256_shuffle_ps(dvec, dvec, 0),
854
+ updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
855
+ load(A + lda * (ii + i) + l)),
856
+ _mm256_sign_epi8(bvec0, load(A + lda * (ii + i) + l))),
857
+ Cv[0][i]);
858
+ Cv[1][i] = madd(_mm256_shuffle_ps(dvec, dvec, 85),
859
+ updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
860
+ load(A + lda * (ii + i) + l)),
861
+ _mm256_sign_epi8(bvec1, load(A + lda * (ii + i) + l))),
862
+ Cv[1][i]);
863
+ Cv[2][i] = madd(_mm256_shuffle_ps(dvec, dvec, 170),
864
+ updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
865
+ load(A + lda * (ii + i) + l)),
866
+ _mm256_sign_epi8(bvec2, load(A + lda * (ii + i) + l))),
867
+ Cv[2][i]);
868
+ Cv[3][i] = madd(_mm256_shuffle_ps(dvec, dvec, 255),
869
+ updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
870
+ load(A + lda * (ii + i) + l)),
871
+ _mm256_sign_epi8(bvec3, load(A + lda * (ii + i) + l))),
872
+ Cv[3][i]);
873
+ }
874
+ }
875
+ for (int64_t j = 0; j < 4; ++j)
876
+ for (int64_t i = 0; i < RM; ++i)
877
+ C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
878
+ }
879
+ }
880
+ #endif
881
+
882
+ template <int RM, int RN>
883
+ NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
884
+ int64_t ytiles = (m - m0) / RM;
885
+ int64_t xtiles = (n - n0) / RN;
886
+ int64_t tiles = xtiles * ytiles;
887
+ int64_t duty = (tiles + nth - 1) / nth;
888
+ int64_t start = duty * ith;
889
+ int64_t end = start + duty;
890
+ if (end > tiles)
891
+ end = tiles;
892
+ for (int64_t job = start; job < end; ++job) {
893
+ int64_t ii = m0 + job / xtiles * RM;
894
+ int64_t jj = n0 + job % xtiles * RN;
895
+ __m256 Cv[RN][RM] = {};
896
+ for (int64_t l = 0; l < k; ++l)
897
+ for (int64_t j = 0; j < RN; ++j)
898
+ for (int64_t i = 0; i < RM; ++i) {
899
+ #if defined(__AVX2__)
900
+ __m256 udTmp = updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
901
+ load(A + lda * (ii + i) + l)),
902
+ _mm256_sign_epi8(load(B + ldb * (jj + j) + l),
903
+ load(A + lda * (ii + i) + l)));
904
+ #else
905
+ __m128i ali0 = load0(A + lda * (ii + i) + l);
906
+ __m128i ali1 = load1(A + lda * (ii + i) + l);
907
+ __m128i blj0 = load0(B + ldb * (jj + j) + l);
908
+ __m128i blj1 = load1(B + ldb * (jj + j) + l);
909
+
910
+ __m128i sepAA0 = _mm_sign_epi8(ali0, ali0);
911
+ __m128i sepAA1 = _mm_sign_epi8(ali1, ali1);
912
+ __m128i sepBA0 = _mm_sign_epi8(blj0, ali0);
913
+ __m128i sepBA1 = _mm_sign_epi8(blj1, ali1);
914
+
915
+ // updot
916
+ const __m128i oneFill = _mm_set1_epi16(1);
917
+ __m128i mad0 = _mm_maddubs_epi16(sepAA0, sepBA0);
918
+ __m128i mad1 = _mm_maddubs_epi16(sepAA1, sepBA1);
919
+ __m256 udTmp = _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_madd_epi16(oneFill, mad1), _mm_madd_epi16(oneFill, mad0)));
920
+ #endif
921
+ Cv[j][i] = madd(_mm256_set1_ps(unhalf(A[lda * (ii + i) + l].d) *
922
+ unhalf(B[ldb * (jj + j) + l].d)),
923
+ udTmp,
924
+ Cv[j][i]);
925
+ }
926
+ for (int64_t j = 0; j < RN; ++j)
927
+ for (int64_t i = 0; i < RM; ++i)
928
+ C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
929
+ }
930
+ }
931
+
932
+ inline __m256i load(const block_q8_0 *b) {
933
+ return _mm256_loadu_si256((const __m256i *)b->qs);
934
+ }
935
+
936
+ inline __m128i load0(const block_q8_0 *b) {
937
+ return _mm_loadu_si128((const __m128i *)b->qs);
938
+ }
939
+
940
+ inline __m128i load1(const block_q8_0 *b) {
941
+ return _mm_loadu_si128(((const __m128i *)b->qs) + 1);
942
+ }
943
+
944
+ inline __m256i load(const block_q4_0 *b) {
945
+ return _mm256_sub_epi8(denibble(b->qs), _mm256_set1_epi8(8));
946
+ }
947
+
948
+ inline __m128i load0(const block_q4_0 *b) {
949
+ const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
950
+ return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), x), _mm_set1_epi8(8));
951
+ }
952
+
953
+ inline __m128i load1(const block_q4_0 *b) {
954
+ const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
955
+ return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)), _mm_set1_epi8(8));
956
+ }
957
+
958
+ inline __m256i load(const block_q5_0 *b) {
959
+ return _mm256_or_si256(denibble(b->qs), bittobyte(b->qh));
960
+ }
961
+
962
+ inline __m128i load0(const block_q5_0* b) {
963
+ const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
964
+ uint32_t x32;
965
+ memcpy(&x32, b->qh, sizeof(uint32_t));
966
+ __m128i qxl = _mm_and_si128(_mm_set1_epi8(15), x);
967
+ __m128i bytesl = _mm_cmpeq_epi8(_mm_set1_epi64x(-1),
968
+ _mm_or_si128(_mm_set1_epi64x(0x7fbfdfeff7fbfdfe),
969
+ _mm_shuffle_epi8(_mm_set1_epi32(x32),
970
+ _mm_set_epi64x(0x0101010101010101, 0x0000000000000000))));
971
+ bytesl = _mm_andnot_si128(bytesl, _mm_set1_epi8((char)0xF0));
972
+ return _mm_or_si128(qxl, bytesl);
973
+ }
974
+
975
+ inline __m128i load1(const block_q5_0* b) {
976
+ const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
977
+ uint32_t x32;
978
+ memcpy(&x32, b->qh, sizeof(uint32_t));
979
+ __m128i qxh = _mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4));
980
+ __m128i bytesh = _mm_cmpeq_epi8(_mm_set1_epi64x(-1),
981
+ _mm_or_si128(_mm_set1_epi64x(0x7fbfdfeff7fbfdfe),
982
+ _mm_shuffle_epi8(_mm_set1_epi32(x32),
983
+ _mm_set_epi64x(0x0303030303030303, 0x0202020202020202))));
984
+ bytesh = _mm_andnot_si128(bytesh, _mm_set1_epi8((char)0xF0));
985
+ return _mm_or_si128(qxh, bytesh);
986
+ }
987
+
988
+ inline __m256i load(const block_iq4_nl *b) {
989
+ return MM256_SET_M128I(load1(b), load0(b));
990
+ }
991
+
992
+ inline __m128i load0(const block_iq4_nl *b) {
993
+ const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
994
+ return _mm_shuffle_epi8(iq4nlt, _mm_and_si128(_mm_set1_epi8(15), x));
995
+ }
996
+
997
+ inline __m128i load1(const block_iq4_nl *b) {
998
+ const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
999
+ return _mm_shuffle_epi8(iq4nlt, _mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)));
1000
+ }
1001
+
1002
+ inline __m256 updot(__m256i u, __m256i s) {
1003
+ __m256i res;
1004
+ #if defined(__AVX512VNNI__) && defined(__AVX512VL__)
1005
+ res = _mm256_dpbusd_epi32(_mm256_setzero_si256(), u, s);
1006
+ #elif defined(__AVXVNNI__)
1007
+ res = _mm256_dpbusd_avx_epi32(_mm256_setzero_si256(), u, s);
1008
+ #else
1009
+ res = _mm256_madd_epi16(_mm256_set1_epi16(1), _mm256_maddubs_epi16(u, s));
1010
+ #endif
1011
+ return _mm256_cvtepi32_ps(res);
1012
+ }
1013
+
1014
+ static inline __m256i denibble(const uint8_t *p) {
1015
+ __m128i x = _mm_loadu_si128((const __m128i *)p);
1016
+ return _mm256_and_si256(_mm256_set1_epi8(15),
1017
+ _mm256_insertf128_si256(_mm256_castsi128_si256(x),
1018
+ _mm_srli_epi16(x, 4), 1));
1019
+ }
1020
+
1021
+ static inline __m256i bittobyte(const uint8_t *p) {
1022
+ uint32_t x32;
1023
+ memcpy(&x32, p, sizeof(uint32_t));
1024
+ __m256i bytes = _mm256_cmpeq_epi8(_mm256_set1_epi64x(-1),
1025
+ _mm256_or_si256(_mm256_set1_epi64x(0x7fbfdfeff7fbfdfe),
1026
+ _mm256_shuffle_epi8(_mm256_set1_epi32(x32),
1027
+ _mm256_set_epi64x(0x0303030303030303, 0x0202020202020202,
1028
+ 0x0101010101010101, 0x0000000000000000))));
1029
+ return _mm256_andnot_si256(bytes, _mm256_set1_epi8((char)0xF0));
1030
+ }
1031
+
1032
+ const TA *const A;
1033
+ const TB *const B;
1034
+ TC *const C;
1035
+ const int64_t k;
1036
+ const int64_t lda;
1037
+ const int64_t ldb;
1038
+ const int64_t ldc;
1039
+ const int ith;
1040
+ const int nth;
1041
+ __m128i iq4nlt;
1042
+ };
1043
+ #endif // __AVX__
1044
+
1045
+ //PPC Implementation
1046
+ #if defined(__MMA__)
1047
+
1048
+ #define SAVE_ACC(ACC, ii, jj) \
1049
+ __builtin_mma_disassemble_acc(vec_C, ACC); \
1050
+ for (int I = 0; I < 4; I++) { \
1051
+ for (int J = 0; J < 4; J++) { \
1052
+ *((float*)(C+ii+((jj+J)*ldc)+I)) = *((float*)&vec_C[I]+J); \
1053
+ } \
1054
+ } \
1055
+
1056
+ template <typename TA, typename TB, typename TC>
1057
+ class tinyBLAS_Q0_PPC {
1058
+ public:
1059
+ tinyBLAS_Q0_PPC(int64_t k,
1060
+ const TA *A, int64_t lda,
1061
+ const TB *B, int64_t ldb,
1062
+ TC *C, int64_t ldc,
1063
+ int ith, int nth)
1064
+ : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
1065
+ }
1066
+
1067
+ void matmul(int64_t m, int64_t n) {
1068
+ mnpack(0, m, 0, n);
1069
+ }
1070
+
1071
+ private:
1072
+
1073
+ template<int RM, int RN>
1074
+ inline void save_res(int ii, int jj, int idx, vector float* fin_res) {
1075
+ for (int I = 0; I < RM; I++) {
1076
+ for (int J = 0; J < RN; J++) {
1077
+ *((float*)(C+ii+((jj+J)*ldc)+I)) = *((float*)&fin_res[idx+I]+J);
1078
+ }
1079
+ }
1080
+ }
1081
+
1082
+ template<int size>
1083
+ inline void compute(acc_t* ACC, int c_idx, int s_idx, std::array<int, size>& comparray, vector float* vs, vector float* fin_res) {
1084
+ vector signed int vec_C[4];
1085
+ vector float CA[4] = {0};
1086
+ vector float res[4] = {0};
1087
+ __builtin_mma_disassemble_acc(vec_C, ACC);
1088
+ for (int i = 0; i < 4; i++) {
1089
+ CA[i] = vec_splats((float)(((double)comparray[c_idx+i]) * -128.0));
1090
+ res[i] = vec_add(vec_ctf(vec_C[i], 0), CA[i]);
1091
+ fin_res[s_idx+i] = vec_madd(res[i], vs[s_idx+i], fin_res[s_idx+i]);
1092
+ }
1093
+ }
1094
+
1095
+ template<typename VA, typename VB>
1096
+ void packNormal(const TA* a, int64_t lda, int rows, int cols, VA* vec, bool flip) {
1097
+ int64_t i, j;
1098
+ TA *aoffset = NULL;
1099
+ VA *vecOffset = NULL;
1100
+ TA *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL;
1101
+ TA *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL;
1102
+ __vector_pair C1, C2, C3, C4, C5, C6, C7, C8;
1103
+ VB c1[2] = {0}, c2[2] = {0}, c3[2] = {0}, c4[2]={0};
1104
+ VB c5[2] = {0}, c6[2] = {0}, c7[2] = {0}, c8[2]={0};
1105
+ VB t1, t2, t3, t4, t5, t6, t7, t8;
1106
+ vector unsigned char xor_vector;
1107
+ uint8_t flip_vec = 0x80;
1108
+ xor_vector = vec_splats(flip_vec);
1109
+ vector unsigned char swiz1 = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
1110
+ vector unsigned char swiz2 = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
1111
+ vector unsigned char swiz3 = {0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27};
1112
+ vector unsigned char swiz4 = {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31};
1113
+
1114
+ aoffset = const_cast<TA*>(a);
1115
+ vecOffset = vec;
1116
+ j = (rows >> 3);
1117
+ if (j > 0) {
1118
+ do {
1119
+ aoffset1 = aoffset;
1120
+ aoffset2 = aoffset1 + lda;
1121
+ aoffset3 = aoffset2 + lda;
1122
+ aoffset4 = aoffset3 + lda;
1123
+ aoffset5 = aoffset4 + lda;
1124
+ aoffset6 = aoffset5 + lda;
1125
+ aoffset7 = aoffset6 + lda;
1126
+ aoffset8 = aoffset7 + lda;
1127
+ aoffset += 8 * lda;
1128
+
1129
+ i = (cols >> 3);
1130
+ if (i > 0) {
1131
+ do {
1132
+ C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1->qs);
1133
+ C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2->qs);
1134
+ C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3->qs);
1135
+ C4 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset4->qs);
1136
+ C5 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset5->qs);
1137
+ C6 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset6->qs);
1138
+ C7 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset7->qs);
1139
+ C8 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset8->qs);
1140
+
1141
+ __builtin_vsx_disassemble_pair(c1, &C1);
1142
+ __builtin_vsx_disassemble_pair(c2, &C2);
1143
+ __builtin_vsx_disassemble_pair(c3, &C3);
1144
+ __builtin_vsx_disassemble_pair(c4, &C4);
1145
+ __builtin_vsx_disassemble_pair(c5, &C5);
1146
+ __builtin_vsx_disassemble_pair(c6, &C6);
1147
+ __builtin_vsx_disassemble_pair(c7, &C7);
1148
+ __builtin_vsx_disassemble_pair(c8, &C8);
1149
+
1150
+ t1 = vec_perm(c1[0], c2[0], swiz1);
1151
+ t2 = vec_perm(c1[0], c2[0], swiz2);
1152
+ t3 = vec_perm(c3[0], c4[0], swiz1);
1153
+ t4 = vec_perm(c3[0], c4[0], swiz2);
1154
+ t5 = vec_perm(t1, t3, swiz3);
1155
+ t6 = vec_perm(t1, t3, swiz4);
1156
+ t7 = vec_perm(t2, t4, swiz3);
1157
+ t8 = vec_perm(t2, t4, swiz4);
1158
+ if (flip == true) {
1159
+ t5 = vec_xor(t5, xor_vector);
1160
+ t6 = vec_xor(t6, xor_vector);
1161
+ t7 = vec_xor(t7, xor_vector);
1162
+ t8 = vec_xor(t8, xor_vector);
1163
+ }
1164
+ vec_xst(t5, 0, vecOffset);
1165
+ vec_xst(t6, 0, vecOffset+16);
1166
+ vec_xst(t7, 0, vecOffset+32);
1167
+ vec_xst(t8, 0, vecOffset+48);
1168
+
1169
+ t1 = vec_perm(c1[1], c2[1], swiz1);
1170
+ t2 = vec_perm(c1[1], c2[1], swiz2);
1171
+ t3 = vec_perm(c3[1], c4[1], swiz1);
1172
+ t4 = vec_perm(c3[1], c4[1], swiz2);
1173
+ t5 = vec_perm(t1, t3, swiz3);
1174
+ t6 = vec_perm(t1, t3, swiz4);
1175
+ t7 = vec_perm(t2, t4, swiz3);
1176
+ t8 = vec_perm(t2, t4, swiz4);
1177
+ if (flip == true) {
1178
+ t5 = vec_xor(t5, xor_vector);
1179
+ t6 = vec_xor(t6, xor_vector);
1180
+ t7 = vec_xor(t7, xor_vector);
1181
+ t8 = vec_xor(t8, xor_vector);
1182
+ }
1183
+ vec_xst(t5, 0, vecOffset+64);
1184
+ vec_xst(t6, 0, vecOffset+80);
1185
+ vec_xst(t7, 0, vecOffset+96);
1186
+ vec_xst(t8, 0, vecOffset+112);
1187
+
1188
+ t1 = vec_perm(c5[0], c6[0], swiz1);
1189
+ t2 = vec_perm(c5[0], c6[0], swiz2);
1190
+ t3 = vec_perm(c7[0], c8[0], swiz1);
1191
+ t4 = vec_perm(c7[0], c8[0], swiz2);
1192
+ t5 = vec_perm(t1, t3, swiz3);
1193
+ t6 = vec_perm(t1, t3, swiz4);
1194
+ t7 = vec_perm(t2, t4, swiz3);
1195
+ t8 = vec_perm(t2, t4, swiz4);
1196
+ if (flip == true) {
1197
+ t5 = vec_xor(t5, xor_vector);
1198
+ t6 = vec_xor(t6, xor_vector);
1199
+ t7 = vec_xor(t7, xor_vector);
1200
+ t8 = vec_xor(t8, xor_vector);
1201
+ }
1202
+ vec_xst(t5, 0, vecOffset+128);
1203
+ vec_xst(t6, 0, vecOffset+144);
1204
+ vec_xst(t7, 0, vecOffset+160);
1205
+ vec_xst(t8, 0, vecOffset+176);
1206
+
1207
+ t1 = vec_perm(c5[1], c6[1], swiz1);
1208
+ t2 = vec_perm(c5[1], c6[1], swiz2);
1209
+ t3 = vec_perm(c7[1], c8[1], swiz1);
1210
+ t4 = vec_perm(c7[1], c8[1], swiz2);
1211
+ t5 = vec_perm(t1, t3, swiz3);
1212
+ t6 = vec_perm(t1, t3, swiz4);
1213
+ t7 = vec_perm(t2, t4, swiz3);
1214
+ t8 = vec_perm(t2, t4, swiz4);
1215
+ if (flip == true) {
1216
+ t5 = vec_xor(t5, xor_vector);
1217
+ t6 = vec_xor(t6, xor_vector);
1218
+ t7 = vec_xor(t7, xor_vector);
1219
+ t8 = vec_xor(t8, xor_vector);
1220
+ }
1221
+ vec_xst(t5, 0, vecOffset+192);
1222
+ vec_xst(t6, 0, vecOffset+208);
1223
+ vec_xst(t7, 0, vecOffset+224);
1224
+ vec_xst(t8, 0, vecOffset+240);
1225
+
1226
+ aoffset1 += lda;
1227
+ aoffset2 += lda;
1228
+ aoffset3 += lda;
1229
+ aoffset4 += lda;
1230
+ aoffset5 += lda;
1231
+ aoffset6 += lda;
1232
+ aoffset7 += lda;
1233
+ aoffset8 += lda;
1234
+ vecOffset += 256;
1235
+ i--;
1236
+ } while(i > 0);
1237
+ }
1238
+ j--;
1239
+ } while(j > 0);
1240
+ }
1241
+
1242
+ if (rows & 4) {
1243
+ aoffset1 = aoffset;
1244
+ aoffset2 = aoffset1 + lda;
1245
+ aoffset3 = aoffset2 + lda;
1246
+ aoffset4 = aoffset3 + lda;
1247
+ aoffset += 4 * lda;
1248
+
1249
+ i = (cols >> 3);
1250
+ if (i > 0) {
1251
+ do {
1252
+ C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1->qs);
1253
+ C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2->qs);
1254
+ C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3->qs);
1255
+ C4 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset4->qs);
1256
+
1257
+ __builtin_vsx_disassemble_pair(c1, &C1);
1258
+ __builtin_vsx_disassemble_pair(c2, &C2);
1259
+ __builtin_vsx_disassemble_pair(c3, &C3);
1260
+ __builtin_vsx_disassemble_pair(c4, &C4);
1261
+
1262
+ t1 = vec_perm(c1[0], c2[0], swiz1);
1263
+ t2 = vec_perm(c1[0], c2[0], swiz2);
1264
+ t3 = vec_perm(c3[0], c4[0], swiz1);
1265
+ t4 = vec_perm(c3[0], c4[0], swiz2);
1266
+ t5 = vec_perm(t1, t3, swiz3);
1267
+ t6 = vec_perm(t1, t3, swiz4);
1268
+ t7 = vec_perm(t2, t4, swiz3);
1269
+ t8 = vec_perm(t2, t4, swiz4);
1270
+ if (flip == true) {
1271
+ t5 = vec_xor(t5, xor_vector);
1272
+ t6 = vec_xor(t6, xor_vector);
1273
+ t7 = vec_xor(t7, xor_vector);
1274
+ t8 = vec_xor(t8, xor_vector);
1275
+ }
1276
+ vec_xst(t5, 0, vecOffset);
1277
+ vec_xst(t6, 0, vecOffset+16);
1278
+ vec_xst(t7, 0, vecOffset+32);
1279
+ vec_xst(t8, 0, vecOffset+48);
1280
+
1281
+ t1 = vec_perm(c1[1], c2[1], swiz1);
1282
+ t2 = vec_perm(c1[1], c2[1], swiz2);
1283
+ t3 = vec_perm(c3[1], c4[1], swiz1);
1284
+ t4 = vec_perm(c3[1], c4[1], swiz2);
1285
+ t5 = vec_perm(t1, t3, swiz3);
1286
+ t6 = vec_perm(t1, t3, swiz4);
1287
+ t7 = vec_perm(t2, t4, swiz3);
1288
+ t8 = vec_perm(t2, t4, swiz4);
1289
+ if (flip == true) {
1290
+ t5 = vec_xor(t5, xor_vector);
1291
+ t6 = vec_xor(t6, xor_vector);
1292
+ t7 = vec_xor(t7, xor_vector);
1293
+ t8 = vec_xor(t8, xor_vector);
1294
+ }
1295
+ vec_xst(t5, 0, vecOffset+64);
1296
+ vec_xst(t6, 0, vecOffset+80);
1297
+ vec_xst(t7, 0, vecOffset+96);
1298
+ vec_xst(t8, 0, vecOffset+112);
1299
+
1300
+ aoffset1 += lda;
1301
+ aoffset2 += lda;
1302
+ aoffset3 += lda;
1303
+ aoffset4 += lda;
1304
+ vecOffset += 128;
1305
+ i--;
1306
+ } while(i > 0);
1307
+ }
1308
+ }
1309
+ if (rows & 3) {
1310
+ aoffset1 = aoffset;
1311
+ aoffset2 = aoffset1 + lda;
1312
+ aoffset3 = aoffset2 + lda;
1313
+ i = (cols >> 3);
1314
+ if (i > 0) {
1315
+ do {
1316
+ switch(rows) {
1317
+ case 3: C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3->qs);
1318
+ __builtin_vsx_disassemble_pair(c3, &C3);
1319
+ case 2: C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2->qs);
1320
+ __builtin_vsx_disassemble_pair(c2, &C2);
1321
+ case 1: C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1->qs);
1322
+ __builtin_vsx_disassemble_pair(c1, &C1);
1323
+ break;
1324
+ }
1325
+ t1 = vec_perm(c1[0], c2[0], swiz1);
1326
+ t2 = vec_perm(c1[0], c2[0], swiz2);
1327
+ t3 = vec_perm(c3[0], c4[0], swiz1);
1328
+ t4 = vec_perm(c3[0], c4[0], swiz2);
1329
+ t5 = vec_perm(t1, t3, swiz3);
1330
+ t6 = vec_perm(t1, t3, swiz4);
1331
+ t7 = vec_perm(t2, t4, swiz3);
1332
+ t8 = vec_perm(t2, t4, swiz4);
1333
+ if (flip == true) {
1334
+ t5 = vec_xor(t5, xor_vector);
1335
+ t6 = vec_xor(t6, xor_vector);
1336
+ t7 = vec_xor(t7, xor_vector);
1337
+ t8 = vec_xor(t8, xor_vector);
1338
+ }
1339
+ vec_xst(t5, 0, vecOffset);
1340
+ vec_xst(t6, 0, vecOffset+16);
1341
+ vec_xst(t7, 0, vecOffset+32);
1342
+ vec_xst(t8, 0, vecOffset+48);
1343
+
1344
+ t1 = vec_perm(c1[1], c2[1], swiz1);
1345
+ t2 = vec_perm(c1[1], c2[1], swiz2);
1346
+ t3 = vec_perm(c3[1], c4[1], swiz1);
1347
+ t4 = vec_perm(c3[1], c4[1], swiz2);
1348
+ t5 = vec_perm(t1, t3, swiz3);
1349
+ t6 = vec_perm(t1, t3, swiz4);
1350
+ t7 = vec_perm(t2, t4, swiz3);
1351
+ t8 = vec_perm(t2, t4, swiz4);
1352
+ if (flip == true) {
1353
+ t5 = vec_xor(t5, xor_vector);
1354
+ t6 = vec_xor(t6, xor_vector);
1355
+ t7 = vec_xor(t7, xor_vector);
1356
+ t8 = vec_xor(t8, xor_vector);
1357
+ }
1358
+ vec_xst(t5, 0, vecOffset+64);
1359
+ vec_xst(t6, 0, vecOffset+80);
1360
+ vec_xst(t7, 0, vecOffset+96);
1361
+ vec_xst(t8, 0, vecOffset+112);
1362
+
1363
+ aoffset1 += lda;
1364
+ aoffset2 += lda;
1365
+ aoffset3 += lda;
1366
+ vecOffset += 128;
1367
+ i--;
1368
+ } while(i > 0);
1369
+ }
1370
+ }
1371
+ }
1372
+
1373
+ void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
1374
+ int64_t mc, nc, mp, np;
1375
+ int m_rem = MIN(m - m0, 8);
1376
+ int n_rem = MIN(n - n0, 8);
1377
+ // TO-DO: KERNEL_16x8 and KERNEL_8x16 are having some performance
1378
+ // issues. After resolving them, below code will be enabled.
1379
+ /*if (m_rem >= 16 && n_rem >= 8) {
1380
+ mc = 16;
1381
+ nc = 8;
1382
+ gemm<16,8>(m0, m, n0, n);
1383
+ } else if(m_rem >= 8 && n_rem >= 16) {
1384
+ mc = 8;
1385
+ nc = 16;
1386
+ gemm<8,16>(m0, m, n0, n);
1387
+ }*/
1388
+ if (m_rem >= 8 && n_rem >= 8) {
1389
+ mc = 8;
1390
+ nc = 8;
1391
+ gemm<8,8>(m0, m, n0, n);
1392
+ } else if (m_rem >= 4 && n_rem >= 8) {
1393
+ mc = 4;
1394
+ nc = 8;
1395
+ gemm<4,8>(m0, m, n0, n);
1396
+ } else if (m_rem >= 8 && n_rem >= 4) {
1397
+ mc = 8;
1398
+ nc = 4;
1399
+ gemm<8,4>(m0, m, n0, n);
1400
+ } else if (m_rem >= 4 && n_rem >= 4) {
1401
+ mc = 4;
1402
+ nc = 4;
1403
+ gemm_small<4, 4>(m0, m, n0, n);
1404
+ } else if ((m_rem < 4) && (n_rem > 4)) {
1405
+ nc = 4;
1406
+ switch(m_rem) {
1407
+ case 1:
1408
+ mc = 1;
1409
+ gemm_small<1, 4>(m0, m, n0, n);
1410
+ break;
1411
+ case 2:
1412
+ mc = 2;
1413
+ gemm_small<2, 4>(m0, m, n0, n);
1414
+ break;
1415
+ case 3:
1416
+ mc = 3;
1417
+ gemm_small<3, 4>(m0, m, n0, n);
1418
+ break;
1419
+ default:
1420
+ return;
1421
+ }
1422
+ } else if ((m_rem > 4) && (n_rem < 4)) {
1423
+ mc = 4;
1424
+ switch(n_rem) {
1425
+ case 1:
1426
+ nc = 1;
1427
+ gemm_small<4, 1>(m0, m, n0, n);
1428
+ break;
1429
+ case 2:
1430
+ nc = 2;
1431
+ gemm_small<4, 2>(m0, m, n0, n);
1432
+ break;
1433
+ case 3:
1434
+ nc = 3;
1435
+ gemm_small<4, 3>(m0, m, n0, n);
1436
+ break;
1437
+ default:
1438
+ return;
1439
+ }
1440
+ } else {
1441
+ switch((m_rem << 4) | n_rem) {
1442
+ case 0x43:
1443
+ mc = 4;
1444
+ nc = 3;
1445
+ gemm_small<4, 3>(m0, m, n0, n);
1446
+ break;
1447
+ case 0x42:
1448
+ mc = 4;
1449
+ nc = 2;
1450
+ gemm_small<4, 2>(m0, m, n0, n);
1451
+ break;
1452
+ case 0x41:
1453
+ mc = 4;
1454
+ nc = 1;
1455
+ gemm_small<4, 1>(m0, m, n0, n);
1456
+ break;
1457
+ case 0x34:
1458
+ mc = 3;
1459
+ nc = 4;
1460
+ gemm_small<3, 4>(m0, m, n0, n);
1461
+ break;
1462
+ case 0x33:
1463
+ mc = 3;
1464
+ nc = 3;
1465
+ gemm_small<3, 3>(m0, m, n0, n);
1466
+ break;
1467
+ case 0x32:
1468
+ mc = 3;
1469
+ nc = 2;
1470
+ gemm_small<3, 2>(m0, m, n0, n);
1471
+ break;
1472
+ case 0x31:
1473
+ mc = 3;
1474
+ nc = 1;
1475
+ gemm_small<3, 1>(m0, m, n0, n);
1476
+ break;
1477
+ case 0x24:
1478
+ mc = 2;
1479
+ nc = 4;
1480
+ gemm_small<2, 4>(m0, m, n0, n);
1481
+ break;
1482
+ case 0x23:
1483
+ mc = 2;
1484
+ nc = 3;
1485
+ gemm_small<2, 3>(m0, m, n0, n);
1486
+ break;
1487
+ case 0x22:
1488
+ mc = 2;
1489
+ nc = 2;
1490
+ gemm_small<2, 2>(m0, m, n0, n);
1491
+ break;
1492
+ case 0x21:
1493
+ mc = 2;
1494
+ nc = 1;
1495
+ gemm_small<2, 1>(m0, m, n0, n);
1496
+ break;
1497
+ case 0x14:
1498
+ mc = 1;
1499
+ nc = 4;
1500
+ gemm_small<1, 4>(m0, m, n0, n);
1501
+ break;
1502
+ case 0x13:
1503
+ mc = 1;
1504
+ nc = 3;
1505
+ gemm_small<1, 3>(m0, m, n0, n);
1506
+ break;
1507
+ case 0x12:
1508
+ mc = 1;
1509
+ nc = 2;
1510
+ gemm_small<1, 2>(m0, m, n0, n);
1511
+ break;
1512
+ case 0x11:
1513
+ mc = 1;
1514
+ nc = 1;
1515
+ gemm_small<1, 1>(m0, m, n0, n);
1516
+ break;
1517
+ default:
1518
+ return;
1519
+ }
1520
+ }
1521
+ mp = m0 + (m - m0) / mc * mc;
1522
+ np = n0 + (n - n0) / nc * nc;
1523
+ mnpack(mp, m, n0, np);
1524
+ mnpack(m0, m, np, n);
1525
+ }
1526
+
1527
+ void KERNEL_4x8(int64_t ii, int64_t jj) {
1528
+ vec_t vec_A[8], vec_B[16] = {0};
1529
+ acc_t acc_0, acc_1;
1530
+ std::array<int, 4> comparray;
1531
+ vector float fin_res[8] = {0};
1532
+ vector float vs[8] = {0};
1533
+ for (int l = 0; l < k; l++) {
1534
+ __builtin_mma_xxsetaccz(&acc_0);
1535
+ __builtin_mma_xxsetaccz(&acc_1);
1536
+ packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, 4, 8, (int8_t*)vec_A, false);
1537
+ packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 8, 8, (uint8_t*)vec_B, true);
1538
+ for(int x = 0; x < 8; x++) {
1539
+ __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
1540
+ __builtin_mma_xvi8ger4pp(&acc_1, vec_A[x], vec_B[x+8]);
1541
+ }
1542
+ for (int I = 0; I<4; I++) {
1543
+ for (int J = 0; J<4; J++) {
1544
+ *((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d));
1545
+ *((float*)&vs[I+4]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J+4)*ldb)+l)->d));
1546
+ }
1547
+ }
1548
+ auto aoffset = A+(ii*lda)+l;
1549
+ for (int i = 0; i < 4; i++) {
1550
+ comparray[i] = 0;
1551
+ int ca = 0;
1552
+ const int8_t *at = aoffset->qs;
1553
+ for (int j = 0; j < 32; j++)
1554
+ ca += (int)*at++;
1555
+ comparray[i] = ca;
1556
+ aoffset += lda;
1557
+ }
1558
+ compute<4>(&acc_0, 0, 0, comparray, vs, fin_res);
1559
+ compute<4>(&acc_1, 0, 4, comparray, vs, fin_res);
1560
+ }
1561
+ save_res<4, 4>(ii, jj, 0, fin_res);
1562
+ save_res<4, 4>(ii, jj+4, 4, fin_res);
1563
+ }
1564
+
1565
+ void KERNEL_8x4(int64_t ii, int64_t jj) {
1566
+ vec_t vec_A[16], vec_B[8] = {0};
1567
+ acc_t acc_0, acc_1;
1568
+ std::array<int, 8> comparray;
1569
+ vector float fin_res[8] = {0};
1570
+ vector float vs[8] = {0};
1571
+ for (int l = 0; l < k; l++) {
1572
+ __builtin_mma_xxsetaccz(&acc_0);
1573
+ __builtin_mma_xxsetaccz(&acc_1);
1574
+ packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A, false);
1575
+ packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 4, 8, (uint8_t*)vec_B, true);
1576
+ for(int x = 0; x < 8; x++) {
1577
+ __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
1578
+ __builtin_mma_xvi8ger4pp(&acc_1, vec_A[x+8], vec_B[x]);
1579
+ }
1580
+ for (int I = 0; I<8; I++) {
1581
+ for (int J = 0; J<4; J++) {
1582
+ *((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d));
1583
+ }
1584
+ }
1585
+ auto aoffset = A+(ii*lda)+l;
1586
+ for (int i = 0; i < 8; i++) {
1587
+ comparray[i] = 0;
1588
+ int ca = 0;
1589
+ const int8_t *at = aoffset->qs;
1590
+ for (int j = 0; j < 32; j++)
1591
+ ca += (int)*at++;
1592
+ comparray[i] = ca;
1593
+ aoffset += lda;
1594
+ }
1595
+ compute<8>(&acc_0, 0, 0, comparray, vs, fin_res);
1596
+ compute<8>(&acc_1, 4, 4, comparray, vs, fin_res);
1597
+ }
1598
+ save_res<4, 4>(ii, jj, 0, fin_res);
1599
+ save_res<4, 4>(ii+4, jj, 4, fin_res);
1600
+ }
1601
+
1602
+ void KERNEL_8x8(int64_t ii, int64_t jj) {
1603
+ vec_t vec_A[16], vec_B[16] = {0};
1604
+ acc_t acc_0, acc_1, acc_2, acc_3;
1605
+ std::array<int, 8> comparray;
1606
+ vector float fin_res[16] = {0};
1607
+ vector float vs[16] = {0};
1608
+ for (int l = 0; l < k; l++) {
1609
+ __builtin_mma_xxsetaccz(&acc_0);
1610
+ __builtin_mma_xxsetaccz(&acc_1);
1611
+ __builtin_mma_xxsetaccz(&acc_2);
1612
+ __builtin_mma_xxsetaccz(&acc_3);
1613
+ packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A, false);
1614
+ packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 8, 8, (uint8_t*)vec_B, true);
1615
+ for(int x = 0; x < 8; x++) {
1616
+ __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
1617
+ __builtin_mma_xvi8ger4pp(&acc_1, vec_A[x+8], vec_B[x]);
1618
+ __builtin_mma_xvi8ger4pp(&acc_2, vec_A[x], vec_B[x+8]);
1619
+ __builtin_mma_xvi8ger4pp(&acc_3, vec_A[x+8], vec_B[x+8]);
1620
+ }
1621
+ for (int I = 0; I<8; I++) {
1622
+ for (int J = 0; J<4; J++) {
1623
+ *((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d));
1624
+ *((float*)&vs[I+8]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J+4)*ldb)+l)->d));
1625
+ }
1626
+ }
1627
+ auto aoffset = A+(ii*lda)+l;
1628
+ for (int i = 0; i < 8; i++) {
1629
+ comparray[i] = 0;
1630
+ int ca = 0;
1631
+ const int8_t *at = aoffset->qs;
1632
+ for (int j = 0; j < 32; j++)
1633
+ ca += (int)*at++;
1634
+ comparray[i] = ca;
1635
+ aoffset += lda;
1636
+ }
1637
+ compute<8>(&acc_0, 0, 0, comparray, vs, fin_res);
1638
+ compute<8>(&acc_1, 4, 4, comparray, vs, fin_res);
1639
+ compute<8>(&acc_2, 0, 8, comparray, vs, fin_res);
1640
+ compute<8>(&acc_3, 4, 12, comparray, vs, fin_res);
1641
+ }
1642
+ save_res<4, 4>(ii, jj, 0, fin_res);
1643
+ save_res<4, 4>(ii+4, jj, 4, fin_res);
1644
+ save_res<4, 4>(ii, jj+4, 8, fin_res);
1645
+ save_res<4, 4>(ii+4, jj+4, 12, fin_res);
1646
+ }
1647
+
1648
+ template<int RM, int RN>
1649
+ void gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n) {
1650
+ int64_t ytiles = (m - m0) / RM;
1651
+ int64_t xtiles = (n - n0) / RN;
1652
+ int64_t tiles = xtiles * ytiles;
1653
+ int64_t duty = (tiles + nth - 1) / nth;
1654
+ int64_t start = duty * ith;
1655
+ int64_t end = start + duty;
1656
+ vec_t vec_A[8], vec_B[8] = {0};
1657
+ vector signed int vec_C[4];
1658
+ acc_t acc_0;
1659
+
1660
+ if (end > tiles)
1661
+ end = tiles;
1662
+ for (int64_t job = start; job < end; ++job) {
1663
+ int64_t ii = m0 + job / xtiles * RM;
1664
+ int64_t jj = n0 + job % xtiles * RN;
1665
+ std::array<int, RM> comparray;
1666
+ vector float res[4] = {0};
1667
+ vector float fin_res[4] = {0};
1668
+ vector float vs[4] = {0};
1669
+ vector float CA[4] = {0};
1670
+ __builtin_prefetch((A+(ii*lda)+0)->qs, 0, 1); // prefetch first value
1671
+ __builtin_prefetch((B+(jj*ldb)+0)->qs, 0, 1); // prefetch first value
1672
+ for (int l = 0; l < k; l++) {
1673
+ __builtin_prefetch((A+(ii*lda)+(l+1))->qs, 0, 1); // prefetch one loop ahead
1674
+ __builtin_prefetch((B+(jj*ldb)+(l+1))->qs, 0, 1); // prefetch one loop ahead
1675
+ __builtin_mma_xxsetaccz(&acc_0);
1676
+ packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, RM, 8, (int8_t*)vec_A, false);
1677
+ packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, RN, 8, (uint8_t*)vec_B, true);
1678
+ for(int x = 0; x < 8; x+=4) {
1679
+ __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
1680
+ __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x+1], vec_B[x+1]);
1681
+ __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x+2], vec_B[x+2]);
1682
+ __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x+3], vec_B[x+3]);
1683
+ }
1684
+ for (int I = 0; I<RM; I++) {
1685
+ for (int J = 0; J<RN; J++) {
1686
+ *((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d));
1687
+ }
1688
+ }
1689
+ __builtin_mma_disassemble_acc(vec_C, &acc_0);
1690
+ auto aoffset = A+(ii*lda)+l;
1691
+ for (int i = 0; i < RM; i++) {
1692
+ comparray[i] = 0;
1693
+ int ca = 0;
1694
+ const int8_t *at = aoffset->qs;
1695
+ for (int j = 0; j < 32; j++)
1696
+ ca += (int)*at++;
1697
+ comparray[i] = ca;
1698
+ aoffset += lda;
1699
+ }
1700
+
1701
+ for (int i = 0; i < RM; i++) {
1702
+ CA[i] = vec_splats((float)(((double)comparray[i]) * -128.0));
1703
+ res[i] = vec_add(vec_ctf(vec_C[i], 0), CA[i]);
1704
+ fin_res[i] = vec_madd(res[i], vs[i], fin_res[i]);
1705
+ }
1706
+ }
1707
+ save_res<RM, RN>(ii, jj, 0, fin_res);
1708
+ }
1709
+ }
1710
+
1711
+ template<int RM, int RN>
1712
+ inline void kernel(int64_t ii, int64_t jj) {
1713
+ if constexpr(RM == 4 && RN == 8) {
1714
+ KERNEL_4x8(ii,jj);
1715
+ } else if constexpr(RM == 8 && RN == 4) {
1716
+ KERNEL_8x4(ii,jj);
1717
+ } else if constexpr(RM == 8 && RN == 8) {
1718
+ KERNEL_8x8(ii,jj);
1719
+ } else {
1720
+ static_assert(false, "RN/RM values not supported");
1721
+ }
1722
+ }
1723
+
1724
+ template <int RM, int RN>
1725
+ NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
1726
+ int64_t ytiles = (m - m0) / RM;
1727
+ int64_t xtiles = (n - n0) / RN;
1728
+ int64_t tiles = xtiles * ytiles;
1729
+ int64_t duty = (tiles + nth - 1) / nth;
1730
+ int64_t start = duty * ith;
1731
+ int64_t end = start + duty;
1732
+ if (end > tiles)
1733
+ end = tiles;
1734
+ for (int64_t job = start; job < end; ++job) {
1735
+ int64_t ii = m0 + job / xtiles * RM;
1736
+ int64_t jj = n0 + job % xtiles * RN;
1737
+ kernel<RM, RN>(ii, jj);
1738
+ }
1739
+ }
1740
+
1741
    const TA *const A;   // quantized input matrix A (stored transposed)
    const TB *const B;   // quantized input matrix B
    TC *C;               // output matrix
    TA *At;              // scratch pointer — not used in the code visible here
    TB *Bt;              // scratch pointer — not used in the code visible here
    const int64_t k;     // inner (reduction) dimension, in blocks
    const int64_t lda;   // row stride of A
    const int64_t ldb;   // row stride of B
    const int64_t ldc;   // row stride of C
    const int ith;       // this thread's id, 0 <= ith < nth
    const int nth;       // total number of worker threads
};
1753
+
1754
// tinyBLAS_PPC: FP32 matrix-multiplication kernels for PowerPC MMA
// (instantiated with float/float/float from llamafile_sgemm when
// __MMA__ is available). Computes C = A^T * B, column-major.
template <typename TA, typename TB, typename TC>
class tinyBLAS_PPC {
  public:
    // All pointers are borrowed; k is the reduction dimension, lda/ldb/ldc
    // are the row strides, ith/nth identify this worker thread.
    tinyBLAS_PPC(int64_t k,
                 const TA *A, int64_t lda,
                 const TB *B, int64_t ldb,
                 TC *C, int64_t ldc,
                 int ith, int nth)
        : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
    }

    // Entry point: tile the m x n output and run the matching kernels.
    void matmul(int64_t m, int64_t n) {
        mnpack(0, m, 0, n);
    }

  private:

    // Member-function pointer selected by gemm<RM,RN>() and invoked once per tile.
    void (tinyBLAS_PPC::*kernel)(int64_t, int64_t);
1772
+
1773
    // Load up to rows x cols elements of `a` (row stride lda) and store the
    // transpose into the contiguous buffer `vec`, using VSX paired loads and
    // merge/permute shuffles. Handles row groups of 8, then 4, then a 3-row
    // remainder; columns are processed 8 (paired vectors) or 4 at a time.
    template<typename VA>
    void packTranspose(const TA* a, int64_t lda, int rows, int cols, TA* vec) {
        int64_t i, j;
        TA *aoffset = NULL, *boffset = NULL;
        TA *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL;
        TA *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL;
        __vector_pair C1, C2, C3, C4, C5, C6, C7, C8;
        // c*[2] are zero-initialized so partially-filled paths pad with 0.
        VA c1[2] = {0}, c2[2] = {0}, c3[2] = {0}, c4[2] = {0};
        VA c5[2] = {0}, c6[2] = {0}, c7[2] = {0}, c8[2] = {0};
        VA t1, t2, t3, t4, t5, t6, t7, t8;
        aoffset = const_cast<TA*>(a);
        boffset = vec;
        j = (rows >> 3);
        if (j > 0) {
            // Process groups of 8 rows.
            do {
                aoffset1 = aoffset;
                aoffset2 = aoffset1 + lda;
                aoffset3 = aoffset2 + lda;
                aoffset4 = aoffset3 + lda;
                aoffset5 = aoffset4 + lda;
                aoffset6 = aoffset5 + lda;
                aoffset7 = aoffset6 + lda;
                aoffset8 = aoffset7 + lda;
                aoffset += 8 * lda;
                i = (cols >> 3);
                if (i > 0) {
                    // 8 columns at a time via paired (2-vector) loads.
                    do {
                        C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1);
                        C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2);
                        C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3);
                        C4 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset4);
                        C5 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset5);
                        C6 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset6);
                        C7 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset7);
                        C8 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset8);
                        __builtin_vsx_disassemble_pair(c1, &C1);
                        __builtin_vsx_disassemble_pair(c2, &C2);
                        __builtin_vsx_disassemble_pair(c3, &C3);
                        __builtin_vsx_disassemble_pair(c4, &C4);
                        __builtin_vsx_disassemble_pair(c5, &C5);
                        __builtin_vsx_disassemble_pair(c6, &C6);
                        __builtin_vsx_disassemble_pair(c7, &C7);
                        __builtin_vsx_disassemble_pair(c8, &C8);

                        // 4x4 transpose of the low half of each pair.
                        t1 = vec_mergeh(c1[0], c2[0]);
                        t2 = vec_mergeh(c3[0], c4[0]);
                        t3 = vec_mergeh(c5[0], c6[0]);
                        t4 = vec_mergeh(c7[0], c8[0]);
                        t5 = vec_xxpermdi(t1, t2, 0);
                        t6 = vec_xxpermdi(t3, t4, 0);
                        t7 = vec_xxpermdi(t1, t2, 3);
                        t8 = vec_xxpermdi(t3, t4, 3);
                        vec_xst(t5, 0, boffset);
                        vec_xst(t6, 0, boffset+4);
                        vec_xst(t7, 0, boffset+8);
                        vec_xst(t8, 0, boffset+12);

                        t1 = vec_mergel(c1[0], c2[0]);
                        t2 = vec_mergel(c3[0], c4[0]);
                        t3 = vec_mergel(c5[0], c6[0]);
                        t4 = vec_mergel(c7[0], c8[0]);
                        t5 = vec_xxpermdi(t1, t2, 0);
                        t6 = vec_xxpermdi(t3, t4, 0);
                        t7 = vec_xxpermdi(t1, t2, 3);
                        t8 = vec_xxpermdi(t3, t4, 3);
                        vec_xst(t5, 0, boffset+16);
                        vec_xst(t6, 0, boffset+20);
                        vec_xst(t7, 0, boffset+24);
                        vec_xst(t8, 0, boffset+28);

                        // Same for the high half of each pair.
                        t1 = vec_mergeh(c1[1], c2[1]);
                        t2 = vec_mergeh(c3[1], c4[1]);
                        t3 = vec_mergeh(c5[1], c6[1]);
                        t4 = vec_mergeh(c7[1], c8[1]);
                        t5 = vec_xxpermdi(t1, t2, 0);
                        t6 = vec_xxpermdi(t3, t4, 0);
                        t7 = vec_xxpermdi(t1, t2, 3);
                        t8 = vec_xxpermdi(t3, t4, 3);
                        vec_xst(t5, 0, boffset+32);
                        vec_xst(t6, 0, boffset+36);
                        vec_xst(t7, 0, boffset+40);
                        vec_xst(t8, 0, boffset+44);

                        t1 = vec_mergel(c1[1], c2[1]);
                        t2 = vec_mergel(c3[1], c4[1]);
                        t3 = vec_mergel(c5[1], c6[1]);
                        t4 = vec_mergel(c7[1], c8[1]);
                        t5 = vec_xxpermdi(t1, t2, 0);
                        t6 = vec_xxpermdi(t3, t4, 0);
                        t7 = vec_xxpermdi(t1, t2, 3);
                        t8 = vec_xxpermdi(t3, t4, 3);
                        vec_xst(t5, 0, boffset+48);
                        vec_xst(t6, 0, boffset+52);
                        vec_xst(t7, 0, boffset+56);
                        vec_xst(t8, 0, boffset+60);

                        // NOTE(review): only aoffset1..4 advance here;
                        // aoffset5..8 do not. This only matters when
                        // cols > 8 — the kernels in this file pass
                        // cols <= 8, so the loop body runs once. Confirm
                        // before reusing with wider inputs.
                        aoffset1 += 8*lda;
                        aoffset2 += 8*lda;
                        aoffset3 += 8*lda;
                        aoffset4 += 8*lda;
                        boffset += 64;
                        i--;
                    } while(i > 0);
                }
                if (cols & 4) {
                    // 4-column remainder: single-vector loads.
                    c1[0] = vec_xl(0, aoffset1);
                    c2[0] = vec_xl(0, aoffset2);
                    c3[0] = vec_xl(0, aoffset3);
                    c4[0] = vec_xl(0, aoffset4);
                    c5[0] = vec_xl(0, aoffset5);
                    c6[0] = vec_xl(0, aoffset6);
                    c7[0] = vec_xl(0, aoffset7);
                    c8[0] = vec_xl(0, aoffset8);

                    t1 = vec_mergeh(c1[0], c2[0]);
                    t2 = vec_mergeh(c3[0], c4[0]);
                    t3 = vec_mergeh(c5[0], c6[0]);
                    t4 = vec_mergeh(c7[0], c8[0]);
                    t5 = vec_xxpermdi(t1, t2, 0);
                    t6 = vec_xxpermdi(t3, t4, 0);
                    t7 = vec_xxpermdi(t1, t2, 3);
                    t8 = vec_xxpermdi(t3, t4, 3);
                    vec_xst(t5, 0, boffset);
                    vec_xst(t6, 0, boffset+4);
                    vec_xst(t7, 0, boffset+8);
                    vec_xst(t8, 0, boffset+12);

                    t1 = vec_mergel(c1[0], c2[0]);
                    t2 = vec_mergel(c3[0], c4[0]);
                    t3 = vec_mergel(c5[0], c6[0]);
                    t4 = vec_mergel(c7[0], c8[0]);
                    t5 = vec_xxpermdi(t1, t2, 0);
                    t6 = vec_xxpermdi(t3, t4, 0);
                    t7 = vec_xxpermdi(t1, t2, 3);
                    t8 = vec_xxpermdi(t3, t4, 3);
                    vec_xst(t5, 0, boffset+16);
                    vec_xst(t6, 0, boffset+20);
                    vec_xst(t7, 0, boffset+24);
                    vec_xst(t8, 0, boffset+28);
                }
                j--;
            } while(j > 0);
        }

        if (rows & 4) {
            // 4-row remainder group.
            aoffset1 = aoffset;
            aoffset2 = aoffset1 + lda;
            aoffset3 = aoffset2 + lda;
            aoffset4 = aoffset3 + lda;
            aoffset += 4 * lda;
            i = (cols >> 3);
            if (i > 0) {
                do {
                    C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1);
                    C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2);
                    C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3);
                    C4 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset4);
                    __builtin_vsx_disassemble_pair(c1, &C1);
                    __builtin_vsx_disassemble_pair(c2, &C2);
                    __builtin_vsx_disassemble_pair(c3, &C3);
                    __builtin_vsx_disassemble_pair(c4, &C4);

                    t1 = vec_mergeh(c1[0], c2[0]);
                    t2 = vec_mergeh(c3[0], c4[0]);
                    t3 = vec_mergel(c1[0], c2[0]);
                    t4 = vec_mergel(c3[0], c4[0]);
                    t5 = vec_xxpermdi(t1, t2, 0);
                    t6 = vec_xxpermdi(t1, t2, 3);
                    t7 = vec_xxpermdi(t3, t4, 0);
                    t8 = vec_xxpermdi(t3, t4, 3);
                    vec_xst(t5, 0, boffset);
                    vec_xst(t6, 0, boffset+4);
                    vec_xst(t7, 0, boffset+8);
                    vec_xst(t8, 0, boffset+12);

                    t1 = vec_mergeh(c1[1], c2[1]);
                    t2 = vec_mergeh(c3[1], c4[1]);
                    t3 = vec_mergel(c1[1], c2[1]);
                    t4 = vec_mergel(c3[1], c4[1]);
                    t5 = vec_xxpermdi(t1, t2, 0);
                    t6 = vec_xxpermdi(t1, t2, 3);
                    t7 = vec_xxpermdi(t3, t4, 0);
                    t8 = vec_xxpermdi(t3, t4, 3);
                    vec_xst(t5, 0, boffset+16);
                    vec_xst(t6, 0, boffset+20);
                    vec_xst(t7, 0, boffset+24);
                    vec_xst(t8, 0, boffset+28);

                    aoffset1 += 8*lda;
                    aoffset2 += 8*lda;
                    aoffset3 += 8*lda;
                    aoffset4 += 8*lda;
                    boffset += 32;
                    i--;
                } while(i > 0);
            }

            if (cols & 4) {
                c1[0] = vec_xl(0, aoffset1);
                c2[0] = vec_xl(0, aoffset2);
                c3[0] = vec_xl(0, aoffset3);
                c4[0] = vec_xl(0, aoffset4);

                t1 = vec_mergeh(c1[0], c2[0]);
                t2 = vec_mergeh(c3[0], c4[0]);
                t3 = vec_xxpermdi(t1, t2, 0);
                t4 = vec_xxpermdi(t1, t2, 3);
                vec_xst(t3, 0, boffset);
                vec_xst(t4, 0, boffset+4);

                t1 = vec_mergel(c1[0], c2[0]);
                t2 = vec_mergel(c3[0], c4[0]);
                t3 = vec_xxpermdi(t1, t2, 0);
                t4 = vec_xxpermdi(t1, t2, 3);
                vec_xst(t3, 0, boffset+8);
                vec_xst(t4, 0, boffset+12);
            }
        }
        if (rows & 3) {
            // Up-to-3-row remainder; the fourth lane pads with the
            // zero-initialized c4[0].
            aoffset1 = aoffset;
            aoffset2 = aoffset1 + lda;
            aoffset3 = aoffset2 + lda;
            if (cols & 4) {
                c1[0] = vec_xl(0, aoffset1);
                c2[0] = vec_xl(0, aoffset2);
                c3[0] = vec_xl(0, aoffset3);

                t1 = vec_mergeh(c1[0], c2[0]);
                t2 = vec_mergeh(c3[0], c4[0]);
                t3 = vec_xxpermdi(t1, t2, 0);
                t4 = vec_xxpermdi(t1, t2, 3);
                vec_xst(t3, 0, boffset);
                vec_xst(t4, 0, boffset+4);

                t1 = vec_mergel(c1[0], c2[0]);
                t2 = vec_mergel(c3[0], c4[0]);
                t3 = vec_xxpermdi(t1, t2, 0);
                t4 = vec_xxpermdi(t1, t2, 3);
                vec_xst(t3, 0, boffset+8);
                vec_xst(t4, 0, boffset+12);
            }
        }
    }
2016
    // Compute one 4x4 tile of C at (ii, jj), accumulating four rank-1-group
    // FP32 outer products per k-step into a single MMA accumulator.
    void KERNEL_4x4(int64_t ii, int64_t jj) {
        vec_t vec_A[4], vec_B[4], vec_C[4];  // vec_C is written by the SAVE_ACC macro
        acc_t acc_0;
        __builtin_mma_xxsetaccz(&acc_0);     // zero the accumulator
        for (int l = 0; l < k; l+=4) {
            packTranspose<vector float>(A+(ii*lda)+l, lda, 4, 4, (TA*)vec_A);
            packTranspose<vector float>(B+(jj*ldb)+l, ldb, 4, 4, (TA*)vec_B);
            __builtin_mma_xvf32gerpp(&acc_0, vec_A[0], vec_B[0]);
            __builtin_mma_xvf32gerpp(&acc_0, vec_A[1], vec_B[1]);
            __builtin_mma_xvf32gerpp(&acc_0, vec_A[2], vec_B[2]);
            __builtin_mma_xvf32gerpp(&acc_0, vec_A[3], vec_B[3]);
        }
        SAVE_ACC(&acc_0, ii, jj);
    }
2030
+
2031
    // Compute a 4x8 tile of C at (ii, jj) using two accumulators, one per
    // 4-column half of B.
    void KERNEL_4x8(int64_t ii, int64_t jj) {
        vec_t vec_A[4], vec_B[8], vec_C[4];  // vec_C is written by the SAVE_ACC macro
        acc_t acc_0, acc_1;
        __builtin_mma_xxsetaccz(&acc_0);
        __builtin_mma_xxsetaccz(&acc_1);
        for (int64_t l = 0; l < k; l+=4) {
            packTranspose<vector float>(A+(ii*lda)+l, lda, 4, 4, (TA*)vec_A);
            packTranspose<vector float>(B+(jj*ldb)+l, ldb, 8, 4, (TA*)vec_B);
            // Even vec_B entries feed acc_0 (cols jj..jj+3), odd feed acc_1.
            __builtin_mma_xvf32gerpp(&acc_0, vec_A[0], (vec_t)vec_B[0]);
            __builtin_mma_xvf32gerpp(&acc_1, vec_A[0], (vec_t)vec_B[1]);
            __builtin_mma_xvf32gerpp(&acc_0, vec_A[1], (vec_t)vec_B[2]);
            __builtin_mma_xvf32gerpp(&acc_1, vec_A[1], (vec_t)vec_B[3]);
            __builtin_mma_xvf32gerpp(&acc_0, vec_A[2], (vec_t)vec_B[4]);
            __builtin_mma_xvf32gerpp(&acc_1, vec_A[2], (vec_t)vec_B[5]);
            __builtin_mma_xvf32gerpp(&acc_0, vec_A[3], (vec_t)vec_B[6]);
            __builtin_mma_xvf32gerpp(&acc_1, vec_A[3], (vec_t)vec_B[7]);
        }
        SAVE_ACC(&acc_0, ii, jj);
        SAVE_ACC(&acc_1, ii, jj+4);
    }
2051
+
2052
    // Compute an 8x4 tile of C at (ii, jj) using two accumulators, one per
    // 4-row half of A.
    void KERNEL_8x4(int64_t ii, int64_t jj) {
        vec_t vec_A[8], vec_B[4], vec_C[4];  // vec_C is written by the SAVE_ACC macro
        acc_t acc_0, acc_1;
        __builtin_mma_xxsetaccz(&acc_0);
        __builtin_mma_xxsetaccz(&acc_1);
        for (int64_t l = 0; l < k; l+=4) {
            packTranspose<vector float>(A+(ii*lda)+l, lda, 8, 4, (TA*)vec_A);
            packTranspose<vector float>(B+(jj*ldb)+l, ldb, 4, 4, (TA*)vec_B);
            // Even vec_A entries feed acc_0 (rows ii..ii+3), odd feed acc_1.
            __builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[0], vec_B[0]);
            __builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[1], vec_B[0]);
            __builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[2], vec_B[1]);
            __builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[3], vec_B[1]);
            __builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[4], vec_B[2]);
            __builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[5], vec_B[2]);
            __builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[6], vec_B[3]);
            __builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[7], vec_B[3]);
        }
        SAVE_ACC(&acc_0, ii, jj);
        SAVE_ACC(&acc_1, ii+4, jj);
    }
2072
+
2073
    // Compute an 8x8 tile of C at (ii, jj) with four accumulators, one per
    // 4x4 quadrant; k advances 8 per iteration.
    void KERNEL_8x8(int64_t ii, int64_t jj) {
        vec_t vec_A[16], vec_B[16], vec_C[4];  // vec_C is written by the SAVE_ACC macro
        acc_t acc_0, acc_1, acc_2, acc_3;
        __builtin_mma_xxsetaccz(&acc_0);
        __builtin_mma_xxsetaccz(&acc_1);
        __builtin_mma_xxsetaccz(&acc_2);
        __builtin_mma_xxsetaccz(&acc_3);
        for (int l = 0; l < k; l+=8) {
            packTranspose<vector float>(A+(ii*lda)+l, lda, 8, 8, (TA*)vec_A);
            packTranspose<vector float>(B+(jj*ldb)+l, ldb, 8, 8, (TA*)vec_B);
            for(int x = 0; x < 16; x+=2) {
                __builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[x], vec_B[x]);
                __builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[x], vec_B[x+1]);
                __builtin_mma_xvf32gerpp(&acc_2, (vec_t)vec_A[x+1], vec_B[x]);
                __builtin_mma_xvf32gerpp(&acc_3, (vec_t)vec_A[x+1], vec_B[x+1]);
            }
        }
        SAVE_ACC(&acc_0, ii, jj);
        SAVE_ACC(&acc_1, ii, jj+4);
        SAVE_ACC(&acc_2, ii+4, jj);
        SAVE_ACC(&acc_3, ii+4, jj+4);
    }
2095
+
2096
+ void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
2097
+ int64_t mc, nc, mp, np;
2098
+ int m_rem = MIN(m - m0, 16);
2099
+ int n_rem = MIN(n - n0, 16);
2100
+ if (m_rem >= 16 && n_rem >= 8) {
2101
+ mc = 8;
2102
+ nc = 8;
2103
+ gemm<8,8>(m0, m, n0, n);
2104
+ } else if(m_rem >= 8 && n_rem >= 16) {
2105
+ mc = 8;
2106
+ nc = 8;
2107
+ gemm<8,8>(m0, m, n0, n);
2108
+ } else if (m_rem >= 8 && n_rem >= 8) {
2109
+ mc = 8;
2110
+ nc = 8;
2111
+ gemm<8,8>(m0, m, n0, n);
2112
+ } else if (m_rem >= 4 && n_rem >= 8) {
2113
+ mc = 4;
2114
+ nc = 8;
2115
+ gemm<4,8>(m0, m, n0, n);
2116
+ } else if (m_rem >= 8 && n_rem >= 4) {
2117
+ mc = 8;
2118
+ nc = 4;
2119
+ gemm<8,4>(m0, m, n0, n);
2120
+ } else if (m_rem >= 4 && n_rem >= 4) {
2121
+ mc = 4;
2122
+ nc = 4;
2123
+ gemm<4,4>(m0, m, n0, n);
2124
+ } else if ((m_rem < 4) && (n_rem > 4)) {
2125
+ nc = 4;
2126
+ switch(m_rem) {
2127
+ case 1:
2128
+ mc = 1;
2129
+ gemm_small(m0, m, n0, n, mc, nc);
2130
+ break;
2131
+ case 2:
2132
+ mc = 2;
2133
+ gemm_small(m0, m, n0, n, mc, nc);
2134
+ break;
2135
+ case 3:
2136
+ mc = 3;
2137
+ gemm_small(m0, m, n0, n, mc, nc);
2138
+ break;
2139
+ default:
2140
+ return;
2141
+ }
2142
+ } else if ((m_rem > 4) && (n_rem < 4)) {
2143
+ mc = 4;
2144
+ switch(n_rem) {
2145
+ case 1:
2146
+ nc = 1;
2147
+ gemm_small(m0, m, n0, n, mc, nc);
2148
+ break;
2149
+ case 2:
2150
+ nc = 2;
2151
+ gemm_small(m0, m, n0, n, mc, nc);
2152
+ break;
2153
+ case 3:
2154
+ nc = 3;
2155
+ gemm_small(m0, m, n0, n, mc, nc);
2156
+ break;
2157
+ default:
2158
+ return;
2159
+ }
2160
+ } else {
2161
+ switch((m_rem << 4) | n_rem) {
2162
+ case 0x43:
2163
+ mc = 4;
2164
+ nc = 3;
2165
+ gemm_small(m0, m, n0, n, mc, nc);
2166
+ break;
2167
+ case 0x42:
2168
+ mc = 4;
2169
+ nc = 2;
2170
+ gemm_small(m0, m, n0, n, mc, nc);
2171
+ break;
2172
+ case 0x41:
2173
+ mc = 4;
2174
+ nc = 1;
2175
+ gemm_small(m0, m, n0, n, mc, nc);
2176
+ break;
2177
+ case 0x34:
2178
+ mc = 3;
2179
+ nc = 4;
2180
+ gemm_small(m0, m, n0, n, mc, nc);
2181
+ break;
2182
+ case 0x33:
2183
+ mc = 3;
2184
+ nc = 3;
2185
+ gemm_small(m0, m, n0, n, mc, nc);
2186
+ break;
2187
+ case 0x32:
2188
+ mc = 3;
2189
+ nc = 2;
2190
+ gemm_small(m0, m, n0, n, mc, nc);
2191
+ break;
2192
+ case 0x31:
2193
+ mc = 3;
2194
+ nc = 1;
2195
+ gemm_small(m0, m, n0, n, mc, nc);
2196
+ break;
2197
+ case 0x24:
2198
+ mc = 2;
2199
+ nc = 4;
2200
+ gemm_small(m0, m, n0, n, mc, nc);
2201
+ break;
2202
+ case 0x23:
2203
+ mc = 2;
2204
+ nc = 3;
2205
+ gemm_small(m0, m, n0, n, mc, nc);
2206
+ break;
2207
+ case 0x22:
2208
+ mc = 2;
2209
+ nc = 2;
2210
+ gemm_small(m0, m, n0, n, mc, nc);
2211
+ break;
2212
+ case 0x21:
2213
+ mc = 2;
2214
+ nc = 1;
2215
+ gemm_small(m0, m, n0, n, mc, nc);
2216
+ break;
2217
+ case 0x14:
2218
+ mc = 1;
2219
+ nc = 4;
2220
+ gemm_small(m0, m, n0, n, mc, nc);
2221
+ break;
2222
+ case 0x13:
2223
+ mc = 1;
2224
+ nc = 3;
2225
+ gemm_small(m0, m, n0, n, mc, nc);
2226
+ break;
2227
+ case 0x12:
2228
+ mc = 1;
2229
+ nc = 2;
2230
+ gemm_small(m0, m, n0, n, mc, nc);
2231
+ break;
2232
+ case 0x11:
2233
+ mc = 1;
2234
+ nc = 1;
2235
+ gemm_small(m0, m, n0, n, mc, nc);
2236
+ break;
2237
+ default:
2238
+ return;
2239
+ }
2240
+ }
2241
+ mp = m0 + (m - m0) / mc * mc;
2242
+ np = n0 + (n - n0) / nc * nc;
2243
+ mnpack(mp, m, n0, np);
2244
+ mnpack(m0, m, np, n);
2245
+ }
2246
+
2247
    // Handle remainder tiles with runtime RM x RN (RM, RN <= 4): same
    // thread-partitioning scheme as gemm<RM,RN>, but stores only the
    // RM x RN corner of each 4x4 accumulator.
    void gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n, int RM, int RN) {
        int64_t ytiles = (m - m0) / RM;
        int64_t xtiles = (n - n0) / RN;
        int64_t tiles = xtiles * ytiles;
        int64_t duty = (tiles + nth - 1) / nth;
        int64_t start = duty * ith;
        int64_t end = start + duty;
        if (end > tiles)
            end = tiles;
        for (int64_t job = start; job < end; ++job) {
            int64_t ii = m0 + job / xtiles * RM;
            int64_t jj = n0 + job % xtiles * RN;
            vec_t vec_C[4];
            acc_t acc_0;
            __builtin_mma_xxsetaccz(&acc_0);
            vec_t vec_A[4], vec_B[4];
            for (int l=0; l<k; l+=4) {
                if (RN >= 4 && RM == 1) {
                    // Single-row fast path: load 4 elements of the A row and
                    // splat elements 1..3 into their own vectors.
                    // NOTE(review): vec_A[0] keeps the raw 4-element load
                    // rather than a splat of element 0 — confirm this is the
                    // intended pairing with vec_B[0] before modifying.
                    TA* a = const_cast<TA*>(A+(ii)*lda+l);
                    packTranspose<vector float>(B+(jj*ldb)+l, ldb, 4, 4, (TA*)vec_B);
                    vec_A[0] = (vec_t)vec_xl(0,a);
                    vec_A[1] = (vec_t)vec_splats(*((TA*)&vec_A+1));
                    vec_A[2] = (vec_t)vec_splats(*((TA*)&vec_A+2));
                    vec_A[3] = (vec_t)vec_splats(*((TA*)&vec_A+3));
                } else {
                    packTranspose<vector float>(A+(ii*lda)+l, lda, RM, 4, (TA*)vec_A);
                    packTranspose<vector float>(B+(jj*ldb)+l, ldb, RN, 4, (TA*)vec_B);
                }
                __builtin_mma_xvf32gerpp(&acc_0, vec_A[0], vec_B[0]);
                __builtin_mma_xvf32gerpp(&acc_0, vec_A[1], vec_B[1]);
                __builtin_mma_xvf32gerpp(&acc_0, vec_A[2], vec_B[2]);
                __builtin_mma_xvf32gerpp(&acc_0, vec_A[3], vec_B[3]);
            }
            __builtin_mma_disassemble_acc(vec_C, &acc_0);
            // Write back only the RM x RN portion of the accumulator.
            for (int I = 0; I < RM; I++) {
                for (int J = 0; J < RN; J++) {
                    *((TC*)(C+ii+((jj+J)*ldc)+I)) = *((TC*)&vec_C[I]+J);
                }
            }
        }
    }
2288
+
2289
+ template <int RM, int RN>
2290
+ NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
2291
+ int64_t ytiles = (m - m0) / RM;
2292
+ int64_t xtiles = (n - n0) / RN;
2293
+ int64_t tiles = xtiles * ytiles;
2294
+ int64_t duty = (tiles + nth - 1) / nth;
2295
+ int64_t start = duty * ith;
2296
+ int64_t end = start + duty;
2297
+ if (RM == 4 && RN == 4) {
2298
+ kernel = &tinyBLAS_PPC::KERNEL_4x4;
2299
+ } else if (RM == 4 && RN == 8) {
2300
+ kernel = &tinyBLAS_PPC::KERNEL_4x8;
2301
+ } else if (RM == 8 && RN == 4) {
2302
+ kernel = &tinyBLAS_PPC::KERNEL_8x4;
2303
+ } else if (RM == 8 && RN == 8) {
2304
+ kernel = &tinyBLAS_PPC::KERNEL_8x8;
2305
+ }
2306
+ if (end > tiles)
2307
+ end = tiles;
2308
+ for (int64_t job = start; job < end; ++job) {
2309
+ int64_t ii = m0 + job / xtiles * RM;
2310
+ int64_t jj = n0 + job % xtiles * RN;
2311
+ (this->*kernel)(ii, jj);
2312
+ }
2313
+ }
2314
+
2315
    const TA *const A;   // input matrix A (stored transposed)
    const TB *const B;   // input matrix B
    TC *C;               // output matrix
    TA *At;              // scratch pointer — not used in the code visible here
    TB *Bt;              // scratch pointer — not used in the code visible here
    const int64_t k;     // inner (reduction) dimension
    const int64_t lda;   // row stride of A
    const int64_t ldb;   // row stride of B
    const int64_t ldc;   // row stride of C
    const int ith;       // this thread's id, 0 <= ith < nth
    const int nth;       // total number of worker threads
};
#endif
} // namespace
2329
+
2330
/**
 * Performs optimized matrix multiplication on CPU.
 *
 * This subroutine may compute C = Aᵀ * B with column major ordering.
 * Despite its name, this isn't a generalized implementation. Work is
 * only performed when a handwritten kernel is written and available.
 * Otherwise the caller should fall back to a general matmul routine.
 *
 * For example, for single-precision GEMM you can say
 *
 *     llamafile_sgemm(params, m, n, k, A, lda, B, ldb, C, ldc,
 *                     LM_GGML_TYPE_F32, LM_GGML_TYPE_F32, LM_GGML_TYPE_F32);
 *
 * @param params holds the compute parameters, including the thread id
 *        (`ith`, which must be less than `nth`) and thread count (`nth`,
 *        which must be greater than zero)
 * @param m is rows in `A` and `C`
 * @param n is cols in `B` and `C`
 * @param k is cols in `A` and rows in `B`
 * @param A is first input matrix (always transposed)
 * @param lda is row stride of `A`
 * @param B is second input matrix (never transposed)
 * @param ldb is row stride of `B`
 * @param C is input/output array of output matrices
 * @param ldc is row stride of `C`
 * @param Atype is GGML data type of `A`
 * @param Btype is GGML data type of `B`
 * @param Ctype is GGML data type of `C`
 * @return true if this function was able to service the matmul request
 */
2360
+ bool llamafile_sgemm(const struct lm_ggml_compute_params * params, int64_t m, int64_t n, int64_t k,
2361
+ const void *A, int64_t lda, const void *B, int64_t ldb, void *C,
2362
+ int64_t ldc, int Atype, int Btype, int Ctype) {
2363
+
2364
+ assert(m >= 0);
2365
+ assert(n >= 0);
2366
+ assert(k >= 0);
2367
+ assert(lda >= k);
2368
+ assert(ldb >= k);
2369
+ assert(ldc >= m);
2370
+ assert(params->nth > 0);
2371
+ assert(params->ith < params->nth);
2372
+
2373
+ // only enable sgemm for prompt processing
2374
+ if (n < 2)
2375
+ return false;
2376
+
2377
+ if (Ctype != LM_GGML_TYPE_F32)
2378
+ return false;
2379
+
2380
+ switch (Atype) {
2381
+
2382
+ case LM_GGML_TYPE_F32: {
2383
+ if (Btype != LM_GGML_TYPE_F32)
2384
+ return false;
2385
+ #if defined(__AVX512F__)
2386
+ tinyBLAS<16, __m512, __m512, float, float, float> tb{ params,
2387
+ k, (const float *)A, lda,
2388
+ (const float *)B, ldb,
2389
+ (float *)C, ldc};
2390
+ return tb.matmul(m, n);
2391
+ #elif defined(__AVX__) || defined(__AVX2__)
2392
+ tinyBLAS<8, __m256, __m256, float, float, float> tb{ params,
2393
+ k, (const float *)A, lda,
2394
+ (const float *)B, ldb,
2395
+ (float *)C, ldc};
2396
+ return tb.matmul(m, n);
2397
+ #elif defined(__ARM_NEON)
2398
+ if (n < 4)
2399
+ return false;
2400
+ tinyBLAS<4, float32x4_t, float32x4_t, float, float, float> tb{ params,
2401
+ k, (const float *)A, lda,
2402
+ (const float *)B, ldb,
2403
+ (float *)C, ldc};
2404
+ return tb.matmul(m, n);
2405
+ #elif defined(__MMA__)
2406
+ if (k % 8)
2407
+ return false;
2408
+ tinyBLAS_PPC<float, float, float> tb{
2409
+ k, (const float *)A, lda,
2410
+ (const float *)B, ldb,
2411
+ (float *)C, ldc,
2412
+ params->ith, params->nth};
2413
+ tb.matmul(m, n);
2414
+ return true;
2415
+ #else
2416
+ return false;
2417
+ #endif
2418
+ }
2419
+
2420
+ case LM_GGML_TYPE_BF16: {
2421
+ #if defined(__AVX512BF16__)
2422
+ if (Btype == LM_GGML_TYPE_BF16) {
2423
+ tinyBLAS<32, __m512, __m512bh, lm_ggml_bf16_t, lm_ggml_bf16_t, float> tb{ params, k,
2424
+ (const lm_ggml_bf16_t *)A, lda,
2425
+ (const lm_ggml_bf16_t *)B, ldb,
2426
+ (float *)C, ldc};
2427
+ return tb.matmul(m, n);
2428
+ }
2429
+ #elif defined(__AVX512F__)
2430
+ if (Btype == LM_GGML_TYPE_BF16) {
2431
+ tinyBLAS<16, __m512, __m512, lm_ggml_bf16_t, lm_ggml_bf16_t, float> tb{ params, k,
2432
+ (const lm_ggml_bf16_t *)A, lda,
2433
+ (const lm_ggml_bf16_t *)B, ldb,
2434
+ (float *)C, ldc};
2435
+ return tb.matmul(m, n);
2436
+ }
2437
+ #elif defined(__AVX2__)
2438
+ if (Btype == LM_GGML_TYPE_BF16) {
2439
+ tinyBLAS<8, __m256, __m256, lm_ggml_bf16_t, lm_ggml_bf16_t, float> tb{ params, k,
2440
+ (const lm_ggml_bf16_t *)A, lda,
2441
+ (const lm_ggml_bf16_t *)B, ldb,
2442
+ (float *)C, ldc};
2443
+ return tb.matmul(m, n);
2444
+ }
2445
+ #endif
2446
+ return false;
2447
+ }
2448
+ case LM_GGML_TYPE_F16: {
2449
+ #if defined(__AVX512F__)
2450
+ if (Btype == LM_GGML_TYPE_F16) {
2451
+ tinyBLAS<16, __m512, __m512, lm_ggml_fp16_t, lm_ggml_fp16_t, float> tb{ params, k,
2452
+ (const lm_ggml_fp16_t *)A, lda,
2453
+ (const lm_ggml_fp16_t *)B, ldb,
2454
+ (float *)C, ldc};
2455
+ return tb.matmul(m, n);
2456
+ }
2457
+ #elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__)
2458
+ if (Btype == LM_GGML_TYPE_F16) {
2459
+ tinyBLAS<8, __m256, __m256, lm_ggml_fp16_t, lm_ggml_fp16_t, float> tb{ params, k,
2460
+ (const lm_ggml_fp16_t *)A, lda,
2461
+ (const lm_ggml_fp16_t *)B, ldb,
2462
+ (float *)C, ldc};
2463
+ return tb.matmul(m, n);
2464
+ }
2465
+ #elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
2466
+ if (n < 8)
2467
+ return false;
2468
+ if (Btype == LM_GGML_TYPE_F16) {
2469
+ tinyBLAS<8, float16x8_t, float16x8_t, lm_ggml_fp16_t, lm_ggml_fp16_t, float> tb{ params,
2470
+ k, (const lm_ggml_fp16_t *)A, lda,
2471
+ (const lm_ggml_fp16_t *)B, ldb,
2472
+ (float *)C, ldc};
2473
+ return tb.matmul(m, n);
2474
+ }
2475
+ #elif defined(__ARM_NEON) && !defined(_MSC_VER)
2476
+ if (Btype == LM_GGML_TYPE_F32) {
2477
+ tinyBLAS<4, float32x4_t, float32x4_t, lm_ggml_fp16_t, float, float> tb{ params,
2478
+ k, (const lm_ggml_fp16_t *)A, lda,
2479
+ (const float *)B, ldb,
2480
+ (float *)C, ldc};
2481
+ return tb.matmul(m, n);
2482
+ }
2483
+ #endif
2484
+ return false;
2485
+ }
2486
+
2487
+ case LM_GGML_TYPE_Q8_0: {
2488
+ if (Btype != LM_GGML_TYPE_Q8_0)
2489
+ return false;
2490
+ #if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
2491
+ tinyBLAS_Q0_AVX<block_q8_0, block_q8_0, float> tb{
2492
+ k, (const block_q8_0 *)A, lda,
2493
+ (const block_q8_0 *)B, ldb,
2494
+ (float *)C, ldc,
2495
+ params->ith, params->nth};
2496
+ tb.matmul(m, n);
2497
+ return true;
2498
+ #elif defined(__ARM_FEATURE_DOTPROD)
2499
+ tinyBLAS_Q0_ARM<block_q8_0> tb{
2500
+ k, (const block_q8_0 *)A, lda,
2501
+ (const block_q8_0 *)B, ldb,
2502
+ (float *)C, ldc,
2503
+ params->ith, params->nth};
2504
+ tb.matmul(m, n);
2505
+ return true;
2506
+
2507
+ #elif defined(__MMA__)
2508
+ if (n < 8 && n != 4)
2509
+ return false;
2510
+ if (m < 8 && m != 4)
2511
+ return false;
2512
+ tinyBLAS_Q0_PPC<block_q8_0, block_q8_0, float> tb{
2513
+ k, (const block_q8_0 *)A, lda,
2514
+ (const block_q8_0 *)B, ldb,
2515
+ (float *)C, ldc,
2516
+ params->ith, params->nth};
2517
+ tb.matmul(m, n);
2518
+ return true;
2519
+
2520
+ #else
2521
+ return false;
2522
+ #endif
2523
+ }
2524
+
2525
+ case LM_GGML_TYPE_Q4_0: {
2526
+ if (Btype != LM_GGML_TYPE_Q8_0)
2527
+ return false;
2528
+ #if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
2529
+ tinyBLAS_Q0_AVX<block_q4_0, block_q8_0, float> tb{
2530
+ k, (const block_q4_0 *)A, lda,
2531
+ (const block_q8_0 *)B, ldb,
2532
+ (float *)C, ldc,
2533
+ params->ith, params->nth};
2534
+ tb.matmul(m, n);
2535
+ return true;
2536
+ #elif defined(__ARM_FEATURE_DOTPROD)
2537
+ tinyBLAS_Q0_ARM<block_q4_0> tb{
2538
+ k, (const block_q4_0 *)A, lda,
2539
+ (const block_q8_0 *)B, ldb,
2540
+ (float *)C, ldc,
2541
+ params->ith, params->nth};
2542
+ tb.matmul(m, n);
2543
+ return true;
2544
+ #else
2545
+ return false;
2546
+ #endif
2547
+ }
2548
+
2549
+ case LM_GGML_TYPE_Q5_0: {
2550
+ if (Btype != LM_GGML_TYPE_Q8_0)
2551
+ return false;
2552
+ #if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
2553
+ tinyBLAS_Q0_AVX<block_q5_0, block_q8_0, float> tb{
2554
+ k, (const block_q5_0 *)A, lda,
2555
+ (const block_q8_0 *)B, ldb,
2556
+ (float *)C, ldc,
2557
+ params->ith, params->nth};
2558
+ tb.matmul(m, n);
2559
+ return true;
2560
+ #else
2561
+ return false;
2562
+ #endif
2563
+ }
2564
+
2565
+ case LM_GGML_TYPE_IQ4_NL: {
2566
+ if (Btype != LM_GGML_TYPE_Q8_0)
2567
+ return false;
2568
+ #if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
2569
+ tinyBLAS_Q0_AVX<block_iq4_nl, block_q8_0, float> tb{
2570
+ k, (const block_iq4_nl *)A, lda,
2571
+ (const block_q8_0 *)B, ldb,
2572
+ (float *)C, ldc,
2573
+ params->ith, params->nth};
2574
+ tb.matmul(m, n);
2575
+ return true;
2576
+ #else
2577
+ return false;
2578
+ #endif
2579
+ }
2580
+
2581
+ default:
2582
+ return false;
2583
+ }
2584
+
2585
+ (void)params;
2586
+ (void)m;
2587
+ (void)n;
2588
+ (void)k;
2589
+ (void)A;
2590
+ (void)lda;
2591
+ (void)B;
2592
+ (void)ldb;
2593
+ (void)C;
2594
+ (void)ldc;
2595
+ (void)Atype;
2596
+ (void)Btype;
2597
+ (void)Ctype;
2598
+ }