whispercpp 1.3.2 → 1.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (244) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +6 -3
  3. data/README.md +71 -14
  4. data/Rakefile +20 -7
  5. data/ext/.gitignore +4 -6
  6. data/ext/dependencies.rb +36 -24
  7. data/ext/extconf.rb +1 -1
  8. data/ext/options.rb +48 -184
  9. data/ext/ruby_whisper.c +18 -0
  10. data/ext/ruby_whisper_context.c +43 -12
  11. data/ext/ruby_whisper_model.c +1 -1
  12. data/ext/ruby_whisper_params.c +4 -2
  13. data/ext/ruby_whisper_segment.c +81 -4
  14. data/ext/ruby_whisper_transcribe.cpp +13 -7
  15. data/ext/ruby_whisper_vad_params.c +1 -1
  16. data/ext/sources/CMakeLists.txt +5 -1
  17. data/ext/sources/bindings/javascript/package.json +1 -1
  18. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +120 -24
  19. data/ext/sources/examples/addon.node/addon.cpp +150 -31
  20. data/ext/sources/examples/addon.node/index.js +3 -0
  21. data/ext/sources/examples/addon.node/vad-example.js +132 -0
  22. data/ext/sources/examples/bench/bench.cpp +3 -2
  23. data/ext/sources/examples/cli/cli.cpp +3 -2
  24. data/ext/sources/examples/command/command.cpp +32 -8
  25. data/ext/sources/examples/common-whisper.cpp +14 -7
  26. data/ext/sources/examples/lsp/lsp.cpp +2 -0
  27. data/ext/sources/examples/quantize/quantize.cpp +3 -0
  28. data/ext/sources/examples/server/CMakeLists.txt +3 -0
  29. data/ext/sources/examples/server/server.cpp +169 -22
  30. data/ext/sources/examples/stream/stream.cpp +6 -0
  31. data/ext/sources/examples/talk-llama/CMakeLists.txt +4 -1
  32. data/ext/sources/examples/talk-llama/llama-arch.cpp +171 -3
  33. data/ext/sources/examples/talk-llama/llama-arch.h +28 -1
  34. data/ext/sources/examples/talk-llama/llama-batch.cpp +741 -272
  35. data/ext/sources/examples/talk-llama/llama-batch.h +112 -54
  36. data/ext/sources/examples/talk-llama/llama-chat.cpp +30 -8
  37. data/ext/sources/examples/talk-llama/llama-chat.h +1 -0
  38. data/ext/sources/examples/talk-llama/llama-context.cpp +520 -351
  39. data/ext/sources/examples/talk-llama/llama-context.h +38 -17
  40. data/ext/sources/examples/talk-llama/llama-cparams.cpp +1 -1
  41. data/ext/sources/examples/talk-llama/llama-cparams.h +1 -1
  42. data/ext/sources/examples/talk-llama/llama-graph.cpp +447 -372
  43. data/ext/sources/examples/talk-llama/llama-graph.h +128 -58
  44. data/ext/sources/examples/talk-llama/llama-hparams.cpp +10 -2
  45. data/ext/sources/examples/talk-llama/llama-hparams.h +19 -2
  46. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +279 -0
  47. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.h +128 -0
  48. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.cpp +1841 -0
  49. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.h +303 -0
  50. data/ext/sources/examples/talk-llama/llama-kv-cache.h +14 -472
  51. data/ext/sources/examples/talk-llama/llama-kv-cells.h +86 -26
  52. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +246 -0
  53. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +138 -0
  54. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +1125 -0
  55. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +183 -0
  56. data/ext/sources/examples/talk-llama/llama-memory.cpp +58 -0
  57. data/ext/sources/examples/talk-llama/llama-memory.h +88 -4
  58. data/ext/sources/examples/talk-llama/llama-mmap.cpp +1 -1
  59. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +42 -17
  60. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +1 -0
  61. data/ext/sources/examples/talk-llama/llama-model.cpp +1863 -563
  62. data/ext/sources/examples/talk-llama/llama-model.h +27 -0
  63. data/ext/sources/examples/talk-llama/llama-quant.cpp +89 -6
  64. data/ext/sources/examples/talk-llama/llama-vocab.cpp +65 -28
  65. data/ext/sources/examples/talk-llama/llama-vocab.h +1 -0
  66. data/ext/sources/examples/talk-llama/llama.cpp +11 -7
  67. data/ext/sources/examples/talk-llama/llama.h +147 -40
  68. data/ext/sources/examples/talk-llama/talk-llama.cpp +2 -0
  69. data/ext/sources/examples/talk-llama/unicode.cpp +5 -0
  70. data/ext/sources/examples/vad-speech-segments/speech.cpp +6 -0
  71. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +2 -0
  72. data/ext/sources/ggml/CMakeLists.txt +48 -3
  73. data/ext/sources/ggml/cmake/common.cmake +24 -0
  74. data/ext/sources/ggml/include/ggml-backend.h +1 -1
  75. data/ext/sources/ggml/include/ggml-cpu.h +2 -0
  76. data/ext/sources/ggml/include/ggml.h +144 -5
  77. data/ext/sources/ggml/src/CMakeLists.txt +82 -24
  78. data/ext/sources/ggml/src/ggml-backend-reg.cpp +5 -0
  79. data/ext/sources/ggml/src/ggml-backend.cpp +46 -23
  80. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +3 -3
  81. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +1 -0
  82. data/ext/sources/ggml/src/ggml-cann/common.h +6 -1
  83. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
  84. data/ext/sources/ggml/src/ggml-common.h +4 -0
  85. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +133 -40
  86. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
  87. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
  88. data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  89. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
  90. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
  91. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
  92. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  93. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
  94. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
  95. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
  96. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
  97. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
  98. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
  99. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
  100. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  101. data/ext/sources/ggml/src/ggml-cpu/common.h +4 -3
  102. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +16 -7
  103. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +146 -105
  104. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +12 -8
  105. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  106. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
  107. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +58 -8
  108. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  109. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1057 -174
  110. data/ext/sources/ggml/src/ggml-cpu/ops.h +8 -0
  111. data/ext/sources/ggml/src/ggml-cpu/quants.c +1158 -0
  112. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  113. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1571 -0
  114. data/ext/sources/ggml/src/ggml-cpu/repack.h +98 -0
  115. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +330 -38
  116. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  117. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +111 -18
  118. data/ext/sources/ggml/src/ggml-cpu/vec.h +303 -94
  119. data/ext/sources/ggml/src/ggml-cuda/common.cuh +60 -37
  120. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  121. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  122. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  123. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  124. data/ext/sources/ggml/src/ggml-cuda/convert.cu +22 -0
  125. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +5 -0
  126. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +2 -2
  127. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +5 -2
  128. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
  129. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +265 -123
  130. data/ext/sources/ggml/src/ggml-cuda/mean.cu +19 -0
  131. data/ext/sources/ggml/src/ggml-cuda/mean.cuh +3 -0
  132. data/ext/sources/ggml/src/ggml-cuda/mmv.cu +257 -87
  133. data/ext/sources/ggml/src/ggml-cuda/mmv.cuh +2 -3
  134. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
  135. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +5 -18
  136. data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  137. data/ext/sources/ggml/src/ggml-cuda/unary.cu +89 -0
  138. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +7 -0
  139. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +4 -0
  140. data/ext/sources/ggml/src/ggml-impl.h +127 -183
  141. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +11 -10
  142. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +27 -0
  143. data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +331 -49
  144. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +564 -282
  145. data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +2 -2
  146. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +14 -0
  147. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +1859 -489
  148. data/ext/sources/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  149. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  150. data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +72 -0
  151. data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +201 -0
  152. data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
  153. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  154. data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  155. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  156. data/ext/sources/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  157. data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
  158. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
  159. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  160. data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  161. data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
  162. data/ext/sources/ggml/src/ggml-quants.c +6 -8
  163. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
  164. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +3 -3
  165. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +5 -6
  166. data/ext/sources/ggml/src/ggml-sycl/common.hpp +20 -48
  167. data/ext/sources/ggml/src/ggml-sycl/concat.cpp +28 -41
  168. data/ext/sources/ggml/src/ggml-sycl/conv.cpp +4 -10
  169. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +117 -165
  170. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +192 -53
  171. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  172. data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +49 -67
  173. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
  174. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +648 -1039
  175. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +18 -9
  176. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +3 -0
  177. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +8 -105
  178. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -100
  179. data/ext/sources/ggml/src/ggml-sycl/gla.cpp +2 -2
  180. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +1 -1
  181. data/ext/sources/ggml/src/ggml-sycl/mmq.cpp +60 -80
  182. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +158 -203
  183. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +55 -74
  184. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +38 -10
  185. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +138 -27
  186. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +3 -3
  187. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  188. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  189. data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +3 -8
  190. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
  191. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +12 -16
  192. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +36 -32
  193. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +726 -282
  194. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
  195. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  196. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  197. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +15 -0
  198. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
  199. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  200. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +12 -3
  201. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  202. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +10 -1
  203. data/ext/sources/ggml/src/ggml.c +328 -48
  204. data/ext/sources/ggml/src/ggml.cpp +26 -0
  205. data/ext/sources/ggml/src/gguf.cpp +24 -3
  206. data/ext/sources/include/whisper.h +2 -0
  207. data/ext/sources/src/CMakeLists.txt +2 -0
  208. data/ext/sources/src/coreml/whisper-compat.h +10 -0
  209. data/ext/sources/src/coreml/whisper-compat.m +35 -0
  210. data/ext/sources/src/coreml/whisper-decoder-impl.m +1 -0
  211. data/ext/sources/src/coreml/whisper-encoder-impl.m +1 -0
  212. data/ext/sources/src/whisper.cpp +218 -169
  213. data/extsources.rb +15 -9
  214. data/lib/whisper/context.rb +15 -0
  215. data/lib/whisper/model/uri.rb +56 -1
  216. data/lib/whisper/segment.rb +58 -0
  217. data/sig/whisper.rbs +68 -38
  218. data/{tests → test}/helper.rb +1 -12
  219. data/{tests → test}/test_model.rb +9 -0
  220. data/test/test_package.rb +51 -0
  221. data/test/test_segment.rb +146 -0
  222. data/{tests → test}/test_whisper.rb +70 -0
  223. data/whispercpp.gemspec +2 -3
  224. metadata +91 -43
  225. data/ext/sources/.dockerignore +0 -3
  226. data/ext/sources/.github/workflows/bindings-ruby.yml +0 -21
  227. data/ext/sources/ci/run.sh +0 -336
  228. data/ext/sources/close-issue.yml +0 -28
  229. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +0 -2739
  230. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  231. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13747
  232. data/tests/test_package.rb +0 -46
  233. data/tests/test_segment.rb +0 -74
  234. /data/ext/sources/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  235. /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  236. /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
  237. /data/{tests → test}/jfk_reader/.gitignore +0 -0
  238. /data/{tests → test}/jfk_reader/extconf.rb +0 -0
  239. /data/{tests → test}/jfk_reader/jfk_reader.c +0 -0
  240. /data/{tests → test}/test_callback.rb +0 -0
  241. /data/{tests → test}/test_error.rb +0 -0
  242. /data/{tests → test}/test_params.rb +0 -0
  243. /data/{tests → test}/test_vad.rb +0 -0
  244. /data/{tests → test}/test_vad_params.rb +0 -0
@@ -0,0 +1,2639 @@
1
+ #define GGML_COMMON_IMPL_C
2
+ #include "ggml-common.h"
3
+ #include "ggml-quants.h"
4
+ #include "ggml-impl.h"
5
+ #include "ggml-cpu.h"
6
+ #include "simd-mappings.h"
7
+
8
+ #include "../../quants.h"
9
+ #include "../../ggml-cpu-impl.h"
10
+
11
+ #include <math.h>
12
+ #include <string.h>
13
+ #include <assert.h>
14
+ #include <float.h>
15
+ #include <stdlib.h> // for qsort
16
+ #include <stdio.h> // for GGML_ASSERT
17
+
18
+ #define GROUP_MAX_EPS 1e-15f // NOTE(review): none of the GROUP_MAX_EPS* thresholds is referenced in the visible part of this file - confirm they are still needed here
19
+ #define GROUP_MAX_EPS_IQ3_XXS 1e-8f
20
+ #define GROUP_MAX_EPS_IQ2_S 1e-8f
21
+ #define GROUP_MAX_EPS_IQ1_M 1e-7f
22
+ #define GROUP_MAX_EPS_IQ1_S 1e-12f
23
+
24
+ #define UNUSED GGML_UNUSED // local shorthand for GGML_UNUSED
25
+
26
+ #if defined(__loongarch_sx)
27
+
28
+ static __m128i lsx_packs_w(__m128i a, __m128i b) {
29
+ __m128i tmp, tmp1;
30
+ tmp = __lsx_vsat_w(a, 15);
31
+ tmp1 = __lsx_vsat_w(b, 15);
32
+ return __lsx_vpickev_h(tmp1, tmp);
33
+ }
34
+
35
+ static __m128i lsx_packs_h(__m128i a, __m128i b) {
36
+ __m128i tmp, tmp1;
37
+ tmp = __lsx_vsat_h(a, 7);
38
+ tmp1 = __lsx_vsat_h(b, 7);
39
+ return __lsx_vpickev_b(tmp1, tmp);
40
+ }
41
+
42
+ static __m128i lsx_packus_h(__m128i a, __m128i b) {
43
+ __m128i tmp, tmp1;
44
+ tmp = __lsx_vsat_hu(a, 7);
45
+ tmp1 = __lsx_vsat_hu(b, 7);
46
+ return __lsx_vpickev_b(tmp1, tmp);
47
+ }
48
+
49
+ static __m128i lsx_maddubs_h(__m128i a, __m128i b) {
50
+ __m128i tmp1, tmp2;
51
+ tmp1 = __lsx_vmulwev_h_b(a, b);
52
+ tmp2 = __lsx_vmulwod_h_b(a, b);
53
+ return __lsx_vsadd_h(tmp1, tmp2);
54
+ }
55
+
56
+ static __m128i lsx_madd_h(__m128i a, __m128i b) {
57
+ __m128i tmp1, tmp2;
58
+ tmp1 = __lsx_vmulwev_w_h(a, b);
59
+ tmp2 = __lsx_vmulwod_w_h(a, b);
60
+ return __lsx_vadd_w(tmp1, tmp2);
61
+ }
62
+
63
+ static __m128i lsx_set_w(int32_t a, int32_t b, int32_t c, int32_t d) {
64
+ v4i32 __ret = {d, c, b, a};
65
+ return (__m128i)__ret;
66
+ }
67
+
68
+ static __m128i lsx_shuffle_b(__m128i a, __m128i b) {
69
+ __m128i mask_f, zero, tmp0, tmp2, mask;
70
+ int f = 0x8f;
71
+ mask_f = __lsx_vreplgr2vr_b(f);
72
+ zero = __lsx_vldi(0);
73
+ tmp0 = __lsx_vand_v(b, mask_f); // get mask with low 4 bit and sign bits
74
+ tmp0 = __lsx_vori_b(tmp0, 0x10); // make each mask or with 0x10 prepare for positive
75
+ mask = __lsx_vsle_b(zero, tmp0); // if mask >= 0, set mask
76
+ tmp2 = __lsx_vand_v(tmp0, mask); // maskout the in2 < ones
77
+ return __lsx_vshuf_b(a, zero, tmp2);
78
+ }
79
+
80
+ static __m128i lsx_hadd_h(__m128i a, __m128i b) {
81
+ __m128i tmp1 = __lsx_vpickev_h(b, a);
82
+ __m128i tmp2 = __lsx_vpickod_h(b, a);
83
+ return __lsx_vadd_h(tmp1, tmp2);
84
+ }
85
+
86
+ static __m128i lsx_hadd_w(__m128i a, __m128i b) {
87
+ __m128i tmp1 = __lsx_vpickev_w(b, a);
88
+ __m128i tmp2 = __lsx_vpickod_w(b, a);
89
+ return __lsx_vadd_w(tmp1, tmp2);
90
+ }
91
+
92
+ static __m128 lsx_hadd_s(__m128 a, __m128 b) {
93
+ __m128 tmp1 = (__m128)__lsx_vpickev_w((__m128i)b, (__m128i)a);
94
+ __m128 tmp2 = (__m128)__lsx_vpickod_w((__m128i)b, (__m128i)a);
95
+
96
+ return __lsx_vfadd_s(tmp1, tmp2);
97
+ }
98
+
99
+ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 c, const __m128 d) {
100
+ __m128 res_0 =lsx_hadd_s(a, b);
101
+ __m128 res_1 =lsx_hadd_s(c, d);
102
+ __m128 res =lsx_hadd_s(res_0, res_1);
103
+ res =lsx_hadd_s(res, res);
104
+ res =lsx_hadd_s(res, res);
105
+
106
+ return ((v4f32)res)[0];
107
+ }
108
+ #endif
109
+
110
+ #if defined(__loongarch_asx)
111
+
112
+ #ifdef __clang__ // the register-name prefixes used by the inline-asm helpers differ between clang and GCC
113
+ #define VREGS_PREFIX "$vr" // prefix the asm helpers match against __m128i operands
114
+ #define XREGS_PREFIX "$xr" // prefix the asm helpers match against __m256i operands
115
+ #else // GCC
116
+ #define VREGS_PREFIX "$f" // GCC build: both operand widths are matched with the same "$f" prefix
117
+ #define XREGS_PREFIX "$f"
118
+ #endif
119
+ #define __ALL_REGS "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31" // register numbers iterated by the .irp loops of the asm helpers
120
+ /* Convert __m128i to __m256i.
   *
   * `out` starts as __lasx_xvldi(0) and is then updated in place ("+f") by one
   * xvpermi.q instruction whose other source operand is `in`.
   * NOTE(review): with selector 0x20 the result presumably holds `in` in the
   * low 128 bits and zeros in the high 128 bits - confirm against the LASX
   * instruction reference.
   *
   * The nested .irp/.ifc directives compare the register names substituted for
   * %[out] and %[in] with XREGS_PREFIX<n> / VREGS_PREFIX<n> for every n in
   * __ALL_REGS, so that the instruction can be emitted with explicit $xr<n>
   * names for both operands.
   */
121
+ static inline __m256i ____m256i(__m128i in) {
122
+ __m256i out = __lasx_xvldi(0);
123
+ __asm__ volatile (
124
+ ".irp i," __ALL_REGS "\n\t" /* i: candidate register number for %[out] */
125
+ " .ifc %[out], " XREGS_PREFIX"\\i \n\t"
126
+ " .irp j," __ALL_REGS "\n\t" /* j: candidate register number for %[in] */
127
+ " .ifc %[in], " VREGS_PREFIX "\\j \n\t"
128
+ " xvpermi.q $xr\\i, $xr\\j, 0x20 \n\t" /* emitted once, for the matching (i, j) pair */
129
+ " .endif \n\t"
130
+ " .endr \n\t"
131
+ " .endif \n\t"
132
+ ".endr \n\t"
133
+ : [out] "+f" (out) : [in] "f" (in)
134
+ );
135
+ return out;
136
+ }
137
+ /* Convert two __m128i to __m256i.
   *
   * Step 1: resolve the register numbers of %[hi] and %[lo] with the .irp/.ifc
   *         search and emit  xvpermi.q $xr<hi>, $xr<lo>, 0x20 . This overwrites
   *         the register that holds `inhi`, which is why `inhi` is a read-write
   *         ("+f") operand.
   * Step 2: unless %[out] and %[hi] are printed as the same name (.ifnc), copy
   *         $xr<hi> into $xr<out> with  xvori.b ..., 0  (OR with an immediate 0).
   *
   * NOTE(review): the result presumably holds `inlo` in the low 128 bits and
   * `inhi` in the high 128 bits - confirm against the LASX instruction
   * reference.
   */
138
+ static inline __m256i lasx_set_q(__m128i inhi, __m128i inlo) {
139
+ __m256i out;
140
+ __asm__ volatile (
141
+ ".irp i," __ALL_REGS "\n\t" /* step 1: combine lo into the hi register */
142
+ " .ifc %[hi], " VREGS_PREFIX "\\i \n\t"
143
+ " .irp j," __ALL_REGS "\n\t"
144
+ " .ifc %[lo], " VREGS_PREFIX "\\j \n\t"
145
+ " xvpermi.q $xr\\i, $xr\\j, 0x20 \n\t"
146
+ " .endif \n\t"
147
+ " .endr \n\t"
148
+ " .endif \n\t"
149
+ ".endr \n\t"
150
+ ".ifnc %[out], %[hi] \n\t" /* step 2: move the combined value into out if the names differ */
151
+ ".irp i," __ALL_REGS "\n\t"
152
+ " .ifc %[out], " XREGS_PREFIX "\\i \n\t"
153
+ " .irp j," __ALL_REGS "\n\t"
154
+ " .ifc %[hi], " VREGS_PREFIX "\\j \n\t"
155
+ " xvori.b $xr\\i, $xr\\j, 0 \n\t"
156
+ " .endif \n\t"
157
+ " .endr \n\t"
158
+ " .endif \n\t"
159
+ ".endr \n\t"
160
+ ".endif \n\t"
161
+ : [out] "=f" (out), [hi] "+f" (inhi)
162
+ : [lo] "f" (inlo)
163
+ );
164
+ return out;
165
+ }
166
+ /* Convert __m256i low part to __m128i.
   *
   * Resolves the register numbers of %[out] and %[in] with the .irp/.ifc search
   * and emits  vori.b $vr<out>, $vr<in>, 0  (OR with an immediate 0, i.e. a
   * register copy). The outer .ifnc drops the copy when %[out] and %[in] are
   * printed as the same name.
   * NOTE(review): with the clang prefixes ("$vr" vs "$xr") the two names
   * presumably never compare equal, so the copy is always emitted there -
   * confirm.
   */
167
+ static inline __m128i lasx_extracti128_lo(__m256i in) {
168
+ __m128i out;
169
+ __asm__ volatile (
170
+ ".ifnc %[out], %[in] \n\t" /* nothing to do when both operands print identically */
171
+ ".irp i," __ALL_REGS "\n\t"
172
+ " .ifc %[out], " VREGS_PREFIX "\\i \n\t"
173
+ " .irp j," __ALL_REGS "\n\t"
174
+ " .ifc %[in], " XREGS_PREFIX "\\j \n\t"
175
+ " vori.b $vr\\i, $vr\\j, 0 \n\t"
176
+ " .endif \n\t"
177
+ " .endr \n\t"
178
+ " .endif \n\t"
179
+ ".endr \n\t"
180
+ ".endif \n\t"
181
+ : [out] "=f" (out) : [in] "f" (in)
182
+ );
183
+ return out;
184
+ }
185
+ /* Convert __m256i high part to __m128i.
   *
   * Resolves the register numbers of %[out] and %[in] with the .irp/.ifc search
   * and emits  xvpermi.q $xr<out>, $xr<in>, 0x11 ; the __m128i returned is
   * whatever that leaves in the register bound to `out`.
   * NOTE(review): selector 0x11 presumably places the high 128 bits of `in`
   * into the low 128 bits of the destination - confirm against the LASX
   * instruction reference.
   */
186
+ static inline __m128i lasx_extracti128_hi(__m256i in) {
187
+ __m128i out;
188
+ __asm__ volatile (
189
+ ".irp i," __ALL_REGS "\n\t" /* i: candidate register number for %[out] */
190
+ " .ifc %[out], " VREGS_PREFIX "\\i \n\t"
191
+ " .irp j," __ALL_REGS "\n\t" /* j: candidate register number for %[in] */
192
+ " .ifc %[in], " XREGS_PREFIX "\\j \n\t"
193
+ " xvpermi.q $xr\\i, $xr\\j, 0x11 \n\t"
194
+ " .endif \n\t"
195
+ " .endr \n\t"
196
+ " .endif \n\t"
197
+ ".endr \n\t"
198
+ : [out] "=f" (out) : [in] "f" (in)
199
+ );
200
+ return out;
201
+ }
202
+
203
+ static __m256i lasx_set_w(int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) {
204
+ v8i32 __ret = {e0, e1, e2, e3, e4, e5, e6, e7};
205
+ return (__m256i)__ret;
206
+ }
207
+
208
+ static __m256i lasx_set_d(int64_t a, int64_t b, int64_t c, int64_t d) {
209
+ v4i64 __ret = {d, c, b, a};
210
+ return (__m256i)__ret;
211
+ }
212
+
213
+ static __m256i lasx_insertf128( __m128i x, __m128i y) {
214
+ return lasx_set_q(x, y);
215
+ }
216
+
217
+ static __m256i lasx_shuffle_b(__m256i a, __m256i b) {
218
+ __m256i mask_f, zero, tmp0, tmp2, mask;
219
+ int f = 0x8f;
220
+ mask_f = __lasx_xvreplgr2vr_b(f);
221
+ zero = __lasx_xvldi(0);
222
+ tmp0 = __lasx_xvand_v(b, mask_f); // get mask with low 4 bit and sign bits
223
+ tmp0 = __lasx_xvori_b(tmp0, 0x10); // make each mask or with 0x10 prepare for positive
224
+ mask = __lasx_xvsle_b(zero, tmp0); // if mask >= 0, set mask
225
+ tmp2 = __lasx_xvand_v(tmp0, mask); // maskout the in2 < ones
226
+ return __lasx_xvshuf_b(a, zero, tmp2);
227
+ }
228
+
229
+ static __m256i lasx_extu8_16(__m128i a) {
230
+ return __lasx_vext2xv_hu_bu(____m256i(a));
231
+ }
232
+
233
+ static __m256i lasx_ext8_16(__m128i a) {
234
+ return __lasx_vext2xv_h_b(____m256i(a));
235
+ }
236
+
237
+ static __m256i lasx_ext16_32(__m128i a) {
238
+ return __lasx_vext2xv_w_h(____m256i(a));
239
+ }
240
+
241
+ static __m128i lasx_extracti128( __m256i a, int pos) {
242
+ __m128i ret;
243
+ if( pos == 0)
244
+ {
245
+ ret = lasx_extracti128_lo(a);
246
+ } else {
247
+ ret = lasx_extracti128_hi(a);
248
+ }
249
+ return ret;
250
+ }
251
+
252
+ static __m128 lasx_extractf128( __m256 a, int pos) {
253
+ __m128 ret;
254
+ if( pos == 0)
255
+ {
256
+ ret = (__m128)lasx_extracti128_lo((__m256i)a);
257
+ } else {
258
+ ret = (__m128)lasx_extracti128_hi((__m256i)a);
259
+ }
260
+ return ret;
261
+ }
262
+
263
+ static __m256i lasx_maddubs_h(__m256i a, __m256i b) {
264
+ __m256i tmp1, tmp2;
265
+ tmp1 = __lasx_xvmulwev_h_b(a, b);
266
+ tmp2 = __lasx_xvmulwod_h_b(a, b);
267
+ return __lasx_xvsadd_h(tmp1, tmp2);
268
+ }
269
+
270
+ static __m256i lasx_madd_h(__m256i a, __m256i b) {
271
+ __m256i tmp1, tmp2;
272
+ tmp1 = __lasx_xvmulwev_w_h(a, b);
273
+ tmp2 = __lasx_xvmulwod_w_h(a, b);
274
+ return __lasx_xvadd_w(tmp1, tmp2);
275
+ }
276
+
277
+ static __m256i lasx_packs_w(__m256i a, __m256i b) {
278
+ __m256i tmp, tmp1;
279
+ tmp = __lasx_xvsat_w(a, 15);
280
+ tmp1 = __lasx_xvsat_w(b, 15);
281
+ return __lasx_xvpickev_h(tmp1, tmp);
282
+ }
283
+
284
+ static __m256i lasx_packs_h(__m256i a, __m256i b) {
285
+ __m256i tmp, tmp1;
286
+ tmp = __lasx_xvsat_h(a, 7);
287
+ tmp1 = __lasx_xvsat_h(b, 7);
288
+ return __lasx_xvpickev_b(tmp1, tmp);
289
+ }
290
+
291
+ static inline __m256i lasx_madd_h_b(__m256i a, __m256i b) {
292
+ __m256i tmp1, tmp2;
293
+ tmp1 = __lasx_xvmulwev_h_b(a, b);
294
+ tmp2 = __lasx_xvmulwod_h_b(a, b);
295
+ return __lasx_xvadd_h(tmp1, tmp2);
296
+ }
297
+
298
+ static inline __m256i lasx_xvrepl128vei_h(__m256i a, const unsigned int b) {
299
+ switch (b) {
300
+ case 0: return __lasx_xvrepl128vei_h(a, 0);
301
+ case 1: return __lasx_xvrepl128vei_h(a, 1);
302
+ case 2: return __lasx_xvrepl128vei_h(a, 2);
303
+ case 3: return __lasx_xvrepl128vei_h(a, 3);
304
+ case 4: return __lasx_xvrepl128vei_h(a, 4);
305
+ case 5: return __lasx_xvrepl128vei_h(a, 5);
306
+ case 6: return __lasx_xvrepl128vei_h(a, 6);
307
+ case 7: return __lasx_xvrepl128vei_h(a, 7);
308
+ default: __builtin_unreachable();
309
+ }
310
+ }
311
+
312
+ static inline __m256i lasx_xvandi_b_bit(__m256i a, const unsigned int b) {
313
+ switch (b) {
314
+ case 0: return __lasx_xvandi_b(a, 1 << 0);
315
+ case 1: return __lasx_xvandi_b(a, 1 << 1);
316
+ case 2: return __lasx_xvandi_b(a, 1 << 2);
317
+ case 3: return __lasx_xvandi_b(a, 1 << 3);
318
+ case 4: return __lasx_xvandi_b(a, 1 << 4);
319
+ case 5: return __lasx_xvandi_b(a, 1 << 5);
320
+ case 6: return __lasx_xvandi_b(a, 1 << 6);
321
+ case 7: return __lasx_xvandi_b(a, 1 << 7);
322
+ default: __builtin_unreachable();
323
+ }
324
+ }
325
+
326
+ // multiply int8_t, add results pairwise twice
327
+ static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
328
+ // Get absolute values of x vectors
329
+ const __m128i ax = __lsx_vsigncov_b(x, x);
330
+ // Sign the values of the y vectors
331
+ const __m128i sy = __lsx_vsigncov_b(x, y);
332
+ // Perform multiplication and create 16-bit values
333
+ const __m128i dot = lsx_maddubs_h(ax, sy);
334
+ const __m128i ones = __lsx_vreplgr2vr_h(1);
335
+ return lsx_madd_h(ones, dot);
336
+ }
337
+
338
+ // horizontally add 8 floats
339
+ static inline float hsum_float_8(const __m256 x) {
340
+ __m128 res = lasx_extractf128(x, 1);
341
+ res = __lsx_vfadd_s(res, lasx_extractf128(x, 0));
342
+ res = __lsx_vfadd_s(res, (__m128)__lsx_vpickod_d((__m128i)res, (__m128i)res));
343
+ res = __lsx_vfadd_s(res, (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0), __lsx_vpickve2gr_w(res, 1), 0));
344
+ return ((v4f32)res)[0];
345
+ }
346
+
347
+ // horizontally add 8 int32_t
348
+ static inline int hsum_i32_8(const __m256i a) {
349
+
350
+ __m256i tmp1 = __lasx_xvpermi_q(a, a, 0x11);
351
+ __m256i tmp2 = __lasx_xvpermi_q(a, a, 0x00);
352
+
353
+ __m128i tmp1_128 = lasx_extracti128_lo(tmp1);
354
+ __m128i tmp2_128 = lasx_extracti128_lo(tmp2);
355
+
356
+ __m128i sum128 = __lsx_vadd_w(tmp1_128, tmp2_128);
357
+
358
+ __m128i ev = __lsx_vpickev_w(sum128, sum128);
359
+ __m128i od = __lsx_vpickod_w(sum128, sum128);
360
+ __m128i sum64 = __lsx_vadd_w(ev, od);
361
+
362
+ int sum64_1, sum64_2;
363
+ sum64_1 = __lsx_vpickve2gr_w(sum64, 0);
364
+ sum64_2 = __lsx_vpickve2gr_w(sum64, 1);
365
+
366
+ return sum64_1 + sum64_2;
367
+ }
368
+
369
+ // horizontally add 4 int32_t
370
+ static inline int hsum_i32_4(const __m128i a) {
371
+ __m128i ev = __lsx_vpickev_w(a, a);
372
+ __m128i od = __lsx_vpickod_w(a, a);
373
+ __m128i sum64 = __lsx_vadd_w(ev, od);
374
+
375
+ int sum64_1, sum64_2;
376
+ sum64_1 = __lsx_vpickve2gr_w(sum64, 0);
377
+ sum64_2 = __lsx_vpickve2gr_w(sum64, 1);
378
+
379
+ return sum64_1 + sum64_2;
380
+ }
381
+
382
+ // spread 32 bits to 32 bytes { 0x00, 0xFF }
383
+ static inline __m256i bytes_from_bits_32(const uint8_t * x) {
384
+
385
+ uint32_t x32;
386
+ memcpy(&x32, x, sizeof(uint32_t));
387
+ const __m256i shuf_mask = lasx_set_d(
388
+ 0x0303030303030303, 0x0202020202020202,
389
+ 0x0101010101010101, 0x0000000000000000);
390
+
391
+ __m256i bytes = lasx_shuffle_b(__lasx_xvreplgr2vr_w(x32), shuf_mask);
392
+ const __m256i bit_mask = __lasx_xvreplgr2vr_d(0x7fbfdfeff7fbfdfe);
393
+ bytes = __lasx_xvor_v(bytes, bit_mask);
394
+ return __lasx_xvseq_b(bytes, __lasx_xvreplgr2vr_d(-1));
395
+ }
396
+
397
+ // Unpack 32 4-bit fields into 32 bytes
398
+ // The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
399
+ static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) {
400
+ const __m128i lo = __lsx_vld((const __m128i *)rsi, 0);
401
+ __m128i hi = __lsx_vsrli_h(lo, 4);
402
+ return __lasx_xvandi_b(lasx_insertf128(hi, lo), 0xf);
403
+ }
404
+
405
+ // add int16_t pairwise and return as float vector
406
+ static inline __m256 sum_i16_pairs_float(const __m256i x) {
407
+ __m256i v = __lasx_xvpackod_h(x, x);
408
+ __m256i summed_pairs = __lasx_xvaddwev_w_h(x, v);
409
+ return __lasx_xvffint_s_w(summed_pairs);
410
+ }
411
+
412
+ static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
413
+ // Perform multiplication and create 16-bit values
414
+ const __m256i dot = lasx_maddubs_h(ax, sy);
415
+ return sum_i16_pairs_float(dot);
416
+ }
417
+
418
+ // multiply int8_t, add results pairwise twice and return as float vector
419
+ static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
420
+ const __m256i dot = lasx_madd_h_b(x, y);
421
+ return sum_i16_pairs_float(dot);
422
+ }
423
+
424
+ static inline __m128i packNibbles( __m256i bytes ) {
425
+ // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
426
+ const __m256i lowByte = __lasx_xvreplgr2vr_h(0xFF);
427
+ __m256i high = __lasx_xvandn_v(lowByte, bytes);
428
+ __m256i low = __lasx_xvand_v(lowByte, bytes);
429
+ high = __lasx_xvsrli_h(high, 4);
430
+ bytes = __lasx_xvor_v(low, high);
431
+ // Compress uint16_t lanes into bytes
432
+ __m128i *r0 = (__m128i *)&bytes;
433
+ __m256i tmp_h128 = __lasx_xvpermi_q(bytes, bytes, 0x11);
434
+ __m128i *r1 = (__m128i *)&tmp_h128;
435
+
436
+ __m128i zero = __lsx_vldi(0);
437
+ __m128i tmp, tmp2, tmp3;
438
+
439
+ tmp = __lsx_vmax_h(zero, *r0);
440
+ tmp2 = __lsx_vsat_hu(tmp, 7);
441
+
442
+ tmp = __lsx_vmax_h(zero, *r1);
443
+ tmp3 = __lsx_vsat_hu(tmp, 7);
444
+ return __lsx_vpickev_b(tmp3, tmp2);
445
+ }
446
+ #endif //__loongarch_asx
447
+
448
+ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
449
+ assert(QK8_0 == 32);
450
+ assert(k % QK8_0 == 0);
451
+ const int nb = k / QK8_0;
452
+
453
+ block_q8_0 * GGML_RESTRICT y = vy;
454
+
455
+ #if defined(__loongarch_asx)
456
+ for (int i = 0; i < nb; i++) {
457
+ __m256 v0 = (__m256)__lasx_xvld( x , 0);
458
+ __m256 v1 = (__m256)__lasx_xvld( x , 32);
459
+ __m256 v2 = (__m256)__lasx_xvld( x , 64);
460
+ __m256 v3 = (__m256)__lasx_xvld( x , 96);
461
+ x += 32;
462
+
463
+ // Compute max(abs(e)) for the block
464
+ const __m256 sign_bit = __lasx_xvreplfr2vr_s( -0.0f );
465
+ __m256 max_abs = (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v0 );
466
+ max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v1 ) );
467
+ max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v2 ) );
468
+ max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v3 ) );
469
+
470
+ __m128 max4 = __lsx_vfmax_s( lasx_extractf128( max_abs, 1 ), lasx_extractf128( max_abs , 0) );
471
+ max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) );
472
+ __m128 tmp = max4;
473
+ max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vinsgr2vr_w(tmp, __lsx_vpickve2gr_w( max4, 1 ), 0 ));
474
+ const float max_scalar = ((v4f32)max4)[0];
475
+
476
+ // Quantize these floats
477
+ const float d = max_scalar / 127.f;
478
+ y[i].d = GGML_CPU_FP32_TO_FP16(d);
479
+ const float id = ( max_scalar != 0.0f ) ? 127.f / max_scalar : 0.0f;
480
+ const __m256 mul = (__m256)__lasx_xvreplfr2vr_s( id );
481
+
482
+ // Apply the multiplier
483
+ v0 = __lasx_xvfmul_s( v0, mul );
484
+ v1 = __lasx_xvfmul_s( v1, mul );
485
+ v2 = __lasx_xvfmul_s( v2, mul );
486
+ v3 = __lasx_xvfmul_s( v3, mul );
487
+
488
+ // Round to nearest integer
489
+ __m256i i0 = __lasx_xvftintrne_w_s( v0 );
490
+ __m256i i1 = __lasx_xvftintrne_w_s( v1 );
491
+ __m256i i2 = __lasx_xvftintrne_w_s( v2 );
492
+ __m256i i3 = __lasx_xvftintrne_w_s( v3 );
493
+
494
+ __m128i ni0 = lasx_extracti128( i0, 0 );
495
+ __m128i ni1 = lasx_extracti128( i0, 1);
496
+ __m128i ni2 = lasx_extracti128( i1, 0);
497
+ __m128i ni3 = lasx_extracti128( i1, 1);
498
+ __m128i ni4 = lasx_extracti128( i2, 0);
499
+ __m128i ni5 = lasx_extracti128( i2, 1);
500
+ __m128i ni6 = lasx_extracti128( i3, 0);
501
+ __m128i ni7 = lasx_extracti128( i3, 1);
502
+
503
+ // Convert int32 to int16
504
+ ni0 = lsx_packs_w( ni0, ni1 );
505
+ ni2 = lsx_packs_w( ni2, ni3 );
506
+ ni4 = lsx_packs_w( ni4, ni5 );
507
+ ni6 = lsx_packs_w( ni6, ni7 );
508
+ // Convert int16 to int8
509
+ ni0 = lsx_packs_h( ni0, ni2 );
510
+ ni4 = lsx_packs_h( ni4, ni6 );
511
+
512
+ __lsx_vst(ni0, (__m128i *)(y[i].qs + 0), 0);
513
+ __lsx_vst(ni4, (__m128i *)(y[i].qs + 16), 0);
514
+
515
+ }
516
+ #else
517
+ GGML_UNUSED(nb);
518
+ // scalar
519
+ quantize_row_q8_0_ref(x, y, k);
520
+ #endif
521
+ }
522
+
523
+ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
524
+ assert(k % QK8_1 == 0);
525
+ const int nb = k / QK8_1;
526
+
527
+ block_q8_1 * GGML_RESTRICT y = vy;
528
+
529
+ #if defined(__loongarch_asx)
530
+ for (int i = 0; i < nb; i++) {
531
+ __m256 v0 = (__m256)__lasx_xvld( x , 0 );
532
+ __m256 v1 = (__m256)__lasx_xvld( x , 32 );
533
+ __m256 v2 = (__m256)__lasx_xvld( x , 64 );
534
+ __m256 v3 = (__m256)__lasx_xvld( x , 96 );
535
+ x += 32;
536
+
537
+ // Compute max(abs(e)) for the block
538
+ const __m256 sign_bit = __lasx_xvreplfr2vr_s( -0.0f );
539
+ __m256 max_abs = (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v0 );
540
+ max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v1 ) );
541
+ max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v2 ) );
542
+ max_abs = __lasx_xvfmax_s( max_abs, (__m256)__lasx_xvandn_v( (__m256i)sign_bit, (__m256i)v3 ) );
543
+
544
+ __m128 max4 = __lsx_vfmax_s( lasx_extractf128( max_abs, 1 ), lasx_extractf128( max_abs, 0) );
545
+ max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) );
546
+ __m128 tmp = max4;
547
+ max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vextrins_w((__m128i)tmp, (__m128i)max4, 0x10 ));
548
+ const float max_scalar = ((v4f32)max4)[0];
549
+
550
+ // Quantize these floats
551
+ const float d = max_scalar / 127.f;
552
+ y[i].d = GGML_CPU_FP32_TO_FP16(d);
553
+ const float id = ( max_scalar != 0.0f ) ? 127.f / max_scalar : 0.0f;
554
+ const __m256 mul = __lasx_xvreplfr2vr_s( id );
555
+
556
+ // Apply the multiplier
557
+ v0 = __lasx_xvfmul_s( v0, mul );
558
+ v1 = __lasx_xvfmul_s( v1, mul );
559
+ v2 = __lasx_xvfmul_s( v2, mul );
560
+ v3 = __lasx_xvfmul_s( v3, mul );
561
+
562
+ // Round to nearest integer
563
+ __m256i i0 = __lasx_xvftintrne_w_s( v0 );
564
+ __m256i i1 = __lasx_xvftintrne_w_s( v1 );
565
+ __m256i i2 = __lasx_xvftintrne_w_s( v2 );
566
+ __m256i i3 = __lasx_xvftintrne_w_s( v3 );
567
+
568
+ __m128i ni0 = lasx_extracti128(i0, 0);
569
+ __m128i ni1 = lasx_extracti128( i0, 1);
570
+ __m128i ni2 = lasx_extracti128( i1, 0);
571
+ __m128i ni3 = lasx_extracti128( i1, 1);
572
+ __m128i ni4 = lasx_extracti128( i2, 0 );
573
+ __m128i ni5 = lasx_extracti128( i2, 1);
574
+ __m128i ni6 = lasx_extracti128( i3, 0);
575
+ __m128i ni7 = lasx_extracti128( i3, 1);
576
+
577
+ // Compute the sum of the quants and set y[i].s
578
+ const __m128i s0 = __lsx_vadd_w(__lsx_vadd_w(ni0, ni1), __lsx_vadd_w(ni2, ni3));
579
+ const __m128i s1 = __lsx_vadd_w(__lsx_vadd_w(ni4, ni5), __lsx_vadd_w(ni6, ni7));
580
+ y[i].s = GGML_CPU_FP32_TO_FP16(d * hsum_i32_4(__lsx_vadd_w(s0, s1)));
581
+
582
+ // Convert int32 to int16
583
+ ni0 = lsx_packs_w( ni0, ni1 );
584
+ ni2 = lsx_packs_w( ni2, ni3 );
585
+ ni4 = lsx_packs_w( ni4, ni5 );
586
+ ni6 = lsx_packs_w( ni6, ni7 );
587
+ // Convert int16 to int8
588
+ ni0 = lsx_packs_h( ni0, ni2 );
589
+ ni4 = lsx_packs_h( ni4, ni6 );
590
+
591
+ __lsx_vst(ni0, (__m128i *)(y[i].qs + 0), 0);
592
+ __lsx_vst(ni4, (__m128i *)(y[i].qs + 16), 0);
593
+ }
594
+ #else
595
+ GGML_UNUSED(nb);
596
+ // scalar
597
+ quantize_row_q8_1_ref(x, y, k);
598
+ #endif
599
+ }
600
+
601
+
602
+ //===================================== Dot products =================================
603
+
604
+ //
605
+ // Helper functions
606
+ //
607
+
608
+ #if defined(__loongarch_asx)
609
+ // shuffles to pick the required scales in dot products
610
+ static inline __m256i get_scale_shuffle_q3k(int i) {
611
+ static const uint8_t k_shuffle[128] = {
612
+ 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
613
+ 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
614
+ 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,
615
+ 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13, 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,
616
+ };
617
+ return __lasx_xvld((const __m256i*)k_shuffle + i, 0);
618
+ }
619
+ static inline __m256i get_scale_shuffle_k4(int i) {
620
+ static const uint8_t k_shuffle[256] = {
621
+ 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
622
+ 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
623
+ 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5,
624
+ 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7,
625
+ 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 8, 9,
626
+ 10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11,
627
+ 12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,12,13,
628
+ 14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15
629
+ };
630
+ return __lasx_xvld((const __m256i*)k_shuffle + i, 0);
631
+ }
632
+ static inline __m128i get_scale_shuffle(int i) {
633
+ static const uint8_t k_shuffle[128] = {
634
+ 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
635
+ 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
636
+ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5,
637
+ 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7,
638
+ 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9,
639
+ 10,10,10,10,10,10,10,10, 11,11,11,11,11,11,11,11,
640
+ 12,12,12,12,12,12,12,12, 13,13,13,13,13,13,13,13,
641
+ 14,14,14,14,14,14,14,14, 15,15,15,15,15,15,15,15
642
+ };
643
+ return __lsx_vld((const __m128i*)k_shuffle + i, 0);
644
+ }
645
+ #endif
646
+
647
+ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
648
+ const int qk = QK8_0;
649
+ const int nb = n / qk;
650
+
651
+ assert(n % qk == 0);
652
+ assert(nrc == 1);
653
+ UNUSED(nrc);
654
+ UNUSED(bx);
655
+ UNUSED(by);
656
+ UNUSED(bs);
657
+
658
+ const block_q4_0 * GGML_RESTRICT x = vx;
659
+ const block_q8_0 * GGML_RESTRICT y = vy;
660
+
661
+ int ib = 0;
662
+ float sumf = 0;
663
+
664
+ #if defined(__loongarch_asx)
665
+ // Initialize accumulator with zeros
666
+ __m256 acc = (__m256)__lasx_xvldi(0);
667
+
668
+ // Main loop
669
+ for (; ib < nb; ++ib) {
670
+ /* Compute combined scale for the block */
671
+ const __m256 d = __lasx_xvreplfr2vr_s( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) );
672
+
673
+ __m256i qx = bytes_from_nibbles_32(x[ib].qs);
674
+
675
+ // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
676
+ const __m256i off = __lasx_xvreplgr2vr_b( 8 );
677
+ qx = __lasx_xvsub_b( qx, off );
678
+
679
+ __m256i qy = __lasx_xvld((const __m256i *)y[ib].qs, 0);
680
+
681
+ const __m256 q = mul_sum_i8_pairs_float(qx, qy);
682
+
683
+ /* Multiply q with scale and accumulate */
684
+ acc = __lasx_xvfmadd_s( d, q, acc );
685
+ }
686
+
687
+ sumf = hsum_float_8(acc);
688
+
689
+ #elif defined(__loongarch_sx)
690
+ // set constants
691
+ const __m128i low_mask = __lsx_vreplgr2vr_b(0xF);
692
+ const __m128i off = __lsx_vreplgr2vr_b(8);
693
+
694
+ // Initialize accumulator with zeros
695
+ __m128 acc_0 = (__m128)__lsx_vldi(0);
696
+ __m128 acc_1 = (__m128)__lsx_vldi(0);
697
+ __m128 acc_2 = (__m128)__lsx_vldi(0);
698
+ __m128 acc_3 = (__m128)__lsx_vldi(0);
699
+
700
+ for (; ib + 1 < nb; ib += 2) {
701
+
702
+ // Compute combined scale for the block 0 and 1
703
+ const __m128 d_0_1 = (__m128)__lsx_vreplgr2vr_w( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) );
704
+
705
+ const __m128i tmp_0_1 = __lsx_vld((const __m128i *)x[ib].qs, 0);
706
+
707
+ __m128i bx_0 = __lsx_vand_v(low_mask, tmp_0_1);
708
+ __m128i by_0 = __lsx_vld((const __m128i *)y[ib].qs, 0);
709
+ bx_0 = __lsx_vsub_b(bx_0, off);
710
+ const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0);
711
+
712
+ __m128i bx_1 = __lsx_vand_v(low_mask, __lsx_vsrli_d(tmp_0_1, 4));
713
+ __m128i by_1 = __lsx_vld((const __m128i *)(y[ib].qs + 16), 0);
714
+ bx_1 = __lsx_vsub_b(bx_1, off);
715
+ const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1);
716
+
717
+ //_mm_prefetch(&x[ib] + 2 * sizeof(block_q4_0), _MM_HINT_T0);
718
+ //_mm_prefetch(&y[ib] + 2 * sizeof(block_q8_0), _MM_HINT_T0);
719
+
720
+ // Compute combined scale for the block 2 and 3
721
+ const __m128 d_2_3 = (__m128)__lsx_vreplgr2vr_w( GGML_CPU_FP16_TO_FP32(x[ib + 1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) );
722
+
723
+ const __m128i tmp_2_3 = __lsx_vld((const __m128i *)x[ib + 1].qs, 0);
724
+
725
+ __m128i bx_2 = __lsx_vand_v(low_mask, tmp_2_3);
726
+ __m128i by_2 = __lsx_vld((const __m128i *)y[ib + 1].qs, 0);
727
+ bx_2 = __lsx_vsub_b(bx_2, off);
728
+ const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2);
729
+
730
+ __m128i bx_3 = __lsx_vand_v(low_mask, __lsx_vsrli_d(tmp_2_3, 4));
731
+ __m128i by_3 = __lsx_vld((const __m128i *)(y[ib + 1].qs + 16), 0);
732
+ bx_3 = __lsx_vsub_b(bx_3, off);
733
+ const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3);
734
+
735
+ // Convert int32_t to float
736
+ __m128 p0 = __lsx_vffint_s_w(i32_0);
737
+ __m128 p1 = __lsx_vffint_s_w(i32_1);
738
+ __m128 p2 = __lsx_vffint_s_w(i32_2);
739
+ __m128 p3 = __lsx_vffint_s_w(i32_3);
740
+
741
+ // Apply the scale
742
+ __m128 p0_d = __lsx_vfmul_s( d_0_1, p0 );
743
+ __m128 p1_d = __lsx_vfmul_s( d_0_1, p1 );
744
+ __m128 p2_d = __lsx_vfmul_s( d_2_3, p2 );
745
+ __m128 p3_d = __lsx_vfmul_s( d_2_3, p3 );
746
+
747
+ // Acummulate
748
+ acc_0 = __lsx_vfadd_s(p0_d, acc_0);
749
+ acc_1 = __lsx_vfadd_s(p1_d, acc_1);
750
+ acc_2 = __lsx_vfadd_s(p2_d, acc_2);
751
+ acc_3 = __lsx_vfadd_s(p3_d, acc_3);
752
+ }
753
+
754
+ sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
755
+
756
+ #endif
757
+ for (; ib < nb; ++ib) {
758
+ int sumi0 = 0;
759
+ int sumi1 = 0;
760
+
761
+ for (int j = 0; j < qk/2; ++j) {
762
+ const int v0 = (x[ib].qs[j] & 0x0F) - 8;
763
+ const int v1 = (x[ib].qs[j] >> 4) - 8;
764
+
765
+ sumi0 += (v0 * y[ib].qs[j]);
766
+ sumi1 += (v1 * y[ib].qs[j + qk/2]);
767
+ }
768
+
769
+ int sumi = sumi0 + sumi1;
770
+ sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
771
+ }
772
+
773
+ *s = sumf;
774
+ }
775
+
776
+ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
777
+ const int qk = QK8_1;
778
+ const int nb = n / qk;
779
+
780
+ assert(n % qk == 0);
781
+ assert(nrc == 1);
782
+ UNUSED(nrc);
783
+ UNUSED(bx);
784
+ UNUSED(by);
785
+ UNUSED(bs);
786
+
787
+ const block_q4_1 * GGML_RESTRICT x = vx;
788
+ const block_q8_1 * GGML_RESTRICT y = vy;
789
+
790
+ int ib = 0;
791
+ float sumf = 0;
792
+
793
+ #if defined(__loongarch_asx)
794
+ // Initialize accumulator with zeros
795
+ __m256 acc = (__m256)__lasx_xvldi(0);
796
+
797
+ float summs = 0;
798
+
799
+ // Main loop
800
+ for (; ib < nb; ++ib) {
801
+ const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
802
+ const float d1 = GGML_CPU_FP16_TO_FP32(y[ib].d);
803
+
804
+ summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s);
805
+
806
+ const __m256 d0v = __lasx_xvreplfr2vr_s( d0 );
807
+ const __m256 d1v = __lasx_xvreplfr2vr_s( d1 );
808
+
809
+ // Compute combined scales
810
+ const __m256 d0d1 = __lasx_xvfmul_s( d0v, d1v );
811
+
812
+ // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
813
+ const __m256i qx = bytes_from_nibbles_32(x[ib].qs);
814
+ const __m256i qy = __lasx_xvld( (const __m256i *)y[ib].qs, 0);
815
+
816
+ const __m256 xy = mul_sum_us8_pairs_float(qx, qy);
817
+
818
+ // Accumulate d0*d1*x*y
819
+ acc = __lasx_xvfmadd_s( d0d1, xy, acc );
820
+ }
821
+
822
+ sumf = hsum_float_8(acc) + summs;
823
+
824
+ #endif
825
+ for (; ib < nb; ++ib) {
826
+ int sumi0 = 0;
827
+ int sumi1 = 0;
828
+
829
+ for (int j = 0; j < qk/2; ++j) {
830
+ const int v0 = (x[ib].qs[j] & 0x0F);
831
+ const int v1 = (x[ib].qs[j] >> 4);
832
+
833
+ sumi0 += (v0 * y[ib].qs[j]);
834
+ sumi1 += (v1 * y[ib].qs[j + qk/2]);
835
+ }
836
+
837
+ int sumi = sumi0 + sumi1;
838
+ sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
839
+ }
840
+
841
+ *s = sumf;
842
+ }
843
+
844
+ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
845
+ const int qk = QK8_0;
846
+ const int nb = n / qk;
847
+
848
+ int ib = 0;
849
+ float sumf = 0;
850
+
851
+ assert(n % qk == 0);
852
+ assert(qk == QK5_0);
853
+ assert(nrc == 1);
854
+ UNUSED(nrc);
855
+ UNUSED(bx);
856
+ UNUSED(by);
857
+ UNUSED(bs);
858
+
859
+ const block_q5_0 * GGML_RESTRICT x = vx;
860
+ const block_q8_0 * GGML_RESTRICT y = vy;
861
+
862
+ #if defined(__loongarch_asx)
863
+ // Initialize accumulator with zeros
864
+ __m256 acc = (__m256)__lasx_xvldi(0);
865
+
866
+ // Main loop
867
+ for (; ib < nb; ++ib) {
868
+ /* Compute combined scale for the block */
869
+ const __m256 d = __lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); //FIXME
870
+
871
+ __m256i qx = bytes_from_nibbles_32(x[ib].qs);
872
+ __m256i bxhi = bytes_from_bits_32(x[ib].qh);
873
+ bxhi = __lasx_xvandn_v(bxhi, __lasx_xvreplgr2vr_b((char)0xF0));
874
+ qx = __lasx_xvor_v(qx, bxhi);
875
+
876
+ __m256i qy = __lasx_xvld((const __m256i *)y[ib].qs, 0);
877
+
878
+ const __m256 q = mul_sum_i8_pairs_float(qx, qy);
879
+
880
+ /* Multiply q with scale and accumulate */
881
+ acc = __lasx_xvfmadd_s(d, q, acc);
882
+ }
883
+
884
+ sumf = hsum_float_8(acc);
885
+
886
+ #endif
887
+ for (; ib < nb; ++ib) {
888
+ uint32_t qh;
889
+ memcpy(&qh, x[ib].qh, sizeof(qh));
890
+
891
+ int sumi0 = 0;
892
+ int sumi1 = 0;
893
+
894
+ for (int j = 0; j < qk/2; ++j) {
895
+ const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
896
+ const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));
897
+
898
+ const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
899
+ const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16);
900
+
901
+ sumi0 += (x0 * y[ib].qs[j]);
902
+ sumi1 += (x1 * y[ib].qs[j + qk/2]);
903
+ }
904
+
905
+ int sumi = sumi0 + sumi1;
906
+ sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
907
+ }
908
+
909
+ *s = sumf;
910
+ }
911
+
912
+ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
913
+ const int qk = QK8_1;
914
+ const int nb = n / qk;
915
+
916
+ int ib = 0;
917
+ float sumf = 0;
918
+
919
+ assert(n % qk == 0);
920
+ assert(qk == QK5_1);
921
+ assert(nrc == 1);
922
+ UNUSED(nrc);
923
+ UNUSED(bx);
924
+ UNUSED(by);
925
+ UNUSED(bs);
926
+
927
+ const block_q5_1 * GGML_RESTRICT x = vx;
928
+ const block_q8_1 * GGML_RESTRICT y = vy;
929
+
930
+ #if defined(__loongarch_asx)
931
+ // Initialize accumulator with zeros
932
+ __m256 acc = (__m256)__lasx_xvldi(0);
933
+
934
+ float summs = 0.0f;
935
+
936
+ // Main loop
937
+ for (; ib < nb; ++ib) {
938
+ const __m256 dx = __lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(x[ib].d));
939
+
940
+ summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s);
941
+
942
+ __m256i qx = bytes_from_nibbles_32(x[ib].qs);
943
+ __m256i bxhi = bytes_from_bits_32(x[ib].qh);
944
+ bxhi = __lasx_xvand_v(bxhi, __lasx_xvreplgr2vr_b(0x10));
945
+ qx = __lasx_xvor_v(qx, bxhi);
946
+
947
+ const __m256 dy = __lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(y[ib].d));
948
+ const __m256i qy = __lasx_xvld((const __m256i *)y[ib].qs, 0);
949
+
950
+ const __m256 q = mul_sum_us8_pairs_float(qx, qy);
951
+
952
+ acc = __lasx_xvfmadd_s(q, __lasx_xvfmul_s(dx, dy), acc);
953
+ }
954
+
955
+ sumf = hsum_float_8(acc) + summs;
956
+
957
+ #endif
958
+ for (; ib < nb; ++ib) {
959
+ uint32_t qh;
960
+ memcpy(&qh, x[ib].qh, sizeof(qh));
961
+
962
+ int sumi0 = 0;
963
+ int sumi1 = 0;
964
+
965
+ for (int j = 0; j < qk/2; ++j) {
966
+ const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
967
+ const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
968
+
969
+ const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0;
970
+ const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1;
971
+
972
+ sumi0 += (x0 * y[ib].qs[j]);
973
+ sumi1 += (x1 * y[ib].qs[j + qk/2]);
974
+ }
975
+
976
+ int sumi = sumi0 + sumi1;
977
+ sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
978
+ }
979
+
980
+ *s = sumf;
981
+ }
982
+
983
+ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
984
+ const int qk = QK8_0;
985
+ const int nb = n / qk;
986
+
987
+ assert(n % qk == 0);
988
+ assert(nrc == 1);
989
+ UNUSED(nrc);
990
+ UNUSED(bx);
991
+ UNUSED(by);
992
+ UNUSED(bs);
993
+
994
+ const block_q8_0 * GGML_RESTRICT x = vx;
995
+ const block_q8_0 * GGML_RESTRICT y = vy;
996
+
997
+ int ib = 0;
998
+ float sumf = 0;
999
+
1000
+ #if defined(__loongarch_asx)
1001
+ // Initialize accumulator with zeros
1002
+ __m256 acc = (__m256)__lasx_xvldi(0);
1003
+
1004
+ // Main loop
1005
+ for (; ib < nb; ++ib) {
1006
+ // Compute combined scale for the block
1007
+ const __m256 d = __lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
1008
+ __m256i qx = __lasx_xvld((const __m256i *)x[ib].qs, 0);
1009
+ __m256i qy = __lasx_xvld((const __m256i *)y[ib].qs, 0);
1010
+
1011
+ const __m256 q = mul_sum_i8_pairs_float(qx, qy);
1012
+
1013
+ // Multiply q with scale and accumulate
1014
+ acc = __lasx_xvfmadd_s( d, q, acc );
1015
+ }
1016
+
1017
+ sumf = hsum_float_8(acc);
1018
+
1019
+ #endif
1020
+ for (; ib < nb; ++ib) {
1021
+ int sumi = 0;
1022
+
1023
+ for (int j = 0; j < qk; j++) {
1024
+ sumi += x[ib].qs[j]*y[ib].qs[j];
1025
+ }
1026
+
1027
+ sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
1028
+ }
1029
+
1030
+ *s = sumf;
1031
+ }
1032
+
1033
+ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
1034
+ assert(nrc == 1);
1035
+ UNUSED(nrc);
1036
+ UNUSED(bx);
1037
+ UNUSED(by);
1038
+ UNUSED(bs);
1039
+
1040
+ const block_q2_K * GGML_RESTRICT x = vx;
1041
+ const block_q8_K * GGML_RESTRICT y = vy;
1042
+
1043
+ const int nb = n / QK_K;
1044
+
1045
+ #if defined __loongarch_asx
1046
+
1047
+ __m256 acc = (__m256)__lasx_xvldi(0);
1048
+
1049
+ for (int i = 0; i < nb; ++i) {
1050
+
1051
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
1052
+ const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
1053
+
1054
+ const uint8_t * GGML_RESTRICT q2 = x[i].qs;
1055
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
1056
+
1057
+ const __m128i mins_and_scales128 = __lsx_vld((const __m128i*)x[i].scales, 0);
1058
+ const __m128i scales128 = __lsx_vandi_b(mins_and_scales128, 0xf);
1059
+ const __m256i mins = lasx_ext8_16(__lsx_vsrli_b(mins_and_scales128, 4));
1060
+ const __m256i prod = lasx_madd_h(mins, __lasx_xvld((const __m256i*)y[i].bsums, 0));
1061
+
1062
+ acc = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(dmin), __lasx_xvffint_s_w(prod), acc);
1063
+
1064
+ const v16i8 shuffle_mask = {0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15};
1065
+ const __m256i scales_shuffled = lasx_ext8_16(__lsx_vshuf_b(scales128, scales128, (__m128i)shuffle_mask));
1066
+
1067
+ __m256i sumi = __lasx_xvldi(0);
1068
+
1069
+ for (int j = 0; j < QK_K/128; ++j) {
1070
+
1071
+ const __m256i q2bits = __lasx_xvld((const __m256i*)q2, 0); q2 += 32;
1072
+
1073
+ const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
1074
+ const __m256i q8_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
1075
+ const __m256i q8_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
1076
+ const __m256i q8_3 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
1077
+
1078
+ const __m256i q2_0 = __lasx_xvandi_b(q2bits, 3);
1079
+ const __m256i q2_1 = __lasx_xvandi_b(__lasx_xvsrli_b(q2bits, 2), 3);
1080
+ const __m256i q2_2 = __lasx_xvandi_b(__lasx_xvsrli_b(q2bits, 4), 3);
1081
+ const __m256i q2_3 = __lasx_xvsrli_b(q2bits, 6);
1082
+
1083
+ __m256i p0 = lasx_madd_h_b(q2_0, q8_0);
1084
+ __m256i p1 = lasx_madd_h_b(q2_1, q8_1);
1085
+ __m256i p2 = lasx_madd_h_b(q2_2, q8_2);
1086
+ __m256i p3 = lasx_madd_h_b(q2_3, q8_3);
1087
+
1088
+ p0 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 0), p0);
1089
+ p1 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 1), p1);
1090
+ p2 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 2), p2);
1091
+ p3 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 3), p3);
1092
+
1093
+ p0 = __lasx_xvadd_w(p0, p1);
1094
+ p2 = __lasx_xvadd_w(p2, p3);
1095
+
1096
+ sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p0, p2));
1097
+ }
1098
+
1099
+ acc = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), acc);
1100
+
1101
+ }
1102
+
1103
+ *s = hsum_float_8(acc);
1104
+
1105
+ #else
1106
+
1107
+ float sumf = 0;
1108
+
1109
+ for (int i = 0; i < nb; ++i) {
1110
+
1111
+ const uint8_t * q2 = x[i].qs;
1112
+ const int8_t * q8 = y[i].qs;
1113
+ const uint8_t * sc = x[i].scales;
1114
+
1115
+ int summs = 0;
1116
+ for (int j = 0; j < 16; ++j) {
1117
+ summs += y[i].bsums[j] * (sc[j] >> 4);
1118
+ }
1119
+
1120
+ const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
1121
+ const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
1122
+
1123
+ int isum = 0;
1124
+ int is = 0;
1125
+ int d;
1126
+ for (int k = 0; k < QK_K/128; ++k) {
1127
+ int shift = 0;
1128
+ for (int j = 0; j < 4; ++j) {
1129
+ d = sc[is++] & 0xF;
1130
+ int isuml = 0;
1131
+ for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
1132
+ isum += d * isuml;
1133
+ d = sc[is++] & 0xF;
1134
+ isuml = 0;
1135
+ for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
1136
+ isum += d * isuml;
1137
+ shift += 2;
1138
+ q8 += 32;
1139
+ }
1140
+ q2 += 32;
1141
+ }
1142
+ sumf += dall * isum - dmin * summs;
1143
+ }
1144
+ *s = sumf;
1145
+ #endif
1146
+ }
1147
+
1148
+ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
1149
+ assert(n % QK_K == 0);
1150
+ assert(nrc == 1);
1151
+ UNUSED(nrc);
1152
+ UNUSED(bx);
1153
+ UNUSED(by);
1154
+ UNUSED(bs);
1155
+
1156
+ const uint32_t kmask1 = 0x03030303;
1157
+ const uint32_t kmask2 = 0x0f0f0f0f;
1158
+
1159
+ const block_q3_K * GGML_RESTRICT x = vx;
1160
+ const block_q8_K * GGML_RESTRICT y = vy;
1161
+
1162
+ const int nb = n / QK_K;
1163
+
1164
+ #if defined __loongarch_asx
1165
+
1166
+ const __m128i m32 = __lsx_vreplgr2vr_b(32);
1167
+
1168
+ __m256 acc = (__m256)__lasx_xvldi(0);
1169
+
1170
+ uint32_t aux[3];
1171
+
1172
+ for (int i = 0; i < nb; ++i) {
1173
+
1174
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
1175
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
1176
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
1177
+ // Set up scales
1178
+ memcpy(aux, x[i].scales, 12);
1179
+ __m128i scales128 = lsx_set_w(
1180
+ ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4),
1181
+ ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4),
1182
+ (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4),
1183
+ (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4));
1184
+ scales128 = __lsx_vsub_b(scales128, m32);
1185
+
1186
+ const v16i8 shuffle_mask = {0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15};
1187
+ const __m256i scales_shuffled = lasx_ext8_16(__lsx_vshuf_b(scales128, scales128, (__m128i)shuffle_mask));
1188
+
1189
+ // high bit
1190
+ const __m256i hbits = __lasx_xvld((const __m256i*)x[i].hmask, 0);
1191
+
1192
+ // integer accumulator
1193
+ __m256i sumi = __lasx_xvldi(0);
1194
+
1195
+ for (int j = 0; j < QK_K/128; ++j) {
1196
+ // load low 2 bits
1197
+ const __m256i q3bits = __lasx_xvld((const __m256i*)q3, 0); q3 += 32;
1198
+
1199
+ // prepare low and high bits
1200
+ const __m256i q3l_0 = __lasx_xvandi_b(q3bits, 3);
1201
+ const __m256i q3l_1 = __lasx_xvandi_b(__lasx_xvsrli_b(q3bits, 2), 3);
1202
+ const __m256i q3l_2 = __lasx_xvandi_b(__lasx_xvsrli_b(q3bits, 4), 3);
1203
+ const __m256i q3l_3 = __lasx_xvsrli_b(q3bits, 6);
1204
+ const __m256i q3h_0 = __lasx_xvslli_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 4 * j + 0), 0), 2);
1205
+ const __m256i q3h_1 = __lasx_xvslli_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 4 * j + 1), 0), 2);
1206
+ const __m256i q3h_2 = __lasx_xvslli_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 4 * j + 2), 0), 2);
1207
+ const __m256i q3h_3 = __lasx_xvslli_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 4 * j + 3), 0), 2);
1208
+ const __m256i q3_0 = __lasx_xvor_v(q3h_0, q3l_0);
1209
+ const __m256i q3_1 = __lasx_xvor_v(q3h_1, q3l_1);
1210
+ const __m256i q3_2 = __lasx_xvor_v(q3h_2, q3l_2);
1211
+ const __m256i q3_3 = __lasx_xvor_v(q3h_3, q3l_3);
1212
+
1213
+ // load Q8 quants
1214
+ const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
1215
+ const __m256i q8_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
1216
+ const __m256i q8_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
1217
+ const __m256i q8_3 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
1218
+
1219
+ __m256i p16_0 = lasx_madd_h_b(q8_0, q3_0);
1220
+ __m256i p16_1 = lasx_madd_h_b(q8_1, q3_1);
1221
+ __m256i p16_2 = lasx_madd_h_b(q8_2, q3_2);
1222
+ __m256i p16_3 = lasx_madd_h_b(q8_3, q3_3);
1223
+
1224
+ // multiply with scales
1225
+ p16_0 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 0), p16_0);
1226
+ p16_1 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 1), p16_1);
1227
+ p16_2 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 2), p16_2);
1228
+ p16_3 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 3), p16_3);
1229
+
1230
+ // accumulate
1231
+ p16_0 = __lasx_xvadd_w(p16_0, p16_1);
1232
+ p16_2 = __lasx_xvadd_w(p16_2, p16_3);
1233
+ sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_2));
1234
+ }
1235
+ // multiply with block scale and accumulate
1236
+ acc = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), acc);
1237
+ }
1238
+
1239
+ *s = hsum_float_8(acc);
1240
+
1241
+ #else
1242
+ // scalar version
1243
+ // This function is written like this so the compiler can manage to vectorize most of it
1244
+ // Using -Ofast, GCC and clang manage to produce code that is within a factor of 2 or so from the
1245
+ // manually vectorized version above. Every other version I tried would run at least 4 times slower.
1246
+ // The ideal situation would be if we could just write the code once, and the compiler would
1247
+ // automatically produce the best possible set of machine instructions, instead of us having to manually
1248
+ // write vectorized versions for AVX, ARM_NEON, etc.
1249
+
1250
+ int8_t aux8[QK_K];
1251
+ int16_t aux16[8];
1252
+ float sums [8];
1253
+ int32_t aux32[8];
1254
+ memset(sums, 0, 8*sizeof(float));
1255
+
1256
+ uint32_t auxs[4];
1257
+ const int8_t * scales = (const int8_t*)auxs;
1258
+
1259
+ float sumf = 0;
1260
+ for (int i = 0; i < nb; ++i) {
1261
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
1262
+ const uint8_t * GGML_RESTRICT hm = x[i].hmask;
1263
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
1264
+ memset(aux32, 0, 8*sizeof(int32_t));
1265
+ int8_t * GGML_RESTRICT a = aux8;
1266
+ uint8_t m = 1;
1267
+ for (int j = 0; j < QK_K; j += 128) {
1268
+ for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3;
1269
+ for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
1270
+ a += 32; m <<= 1;
1271
+ for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3;
1272
+ for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
1273
+ a += 32; m <<= 1;
1274
+ for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3;
1275
+ for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
1276
+ a += 32; m <<= 1;
1277
+ for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3;
1278
+ for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4);
1279
+ a += 32; m <<= 1;
1280
+ q3 += 32;
1281
+ }
1282
+ a = aux8;
1283
+
1284
+ memcpy(auxs, x[i].scales, 12);
1285
+ uint32_t tmp = auxs[2];
1286
+ auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4);
1287
+ auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4);
1288
+ auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4);
1289
+ auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4);
1290
+ for (int j = 0; j < QK_K/16; ++j) {
1291
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1292
+ for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
1293
+ q8 += 8; a += 8;
1294
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1295
+ for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
1296
+ q8 += 8; a += 8;
1297
+ }
1298
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1299
+ for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
1300
+ }
1301
+ for (int l = 0; l < 8; ++l) sumf += sums[l];
1302
+ *s = sumf;
1303
+
1304
+ #endif
1305
+
1306
+ }
1307
+
1308
// Dot product of n values stored as block_q4_K blocks (vx) with n values
// stored as block_q8_K blocks (vy); the scalar result is written to *s.
//
//   n            number of values; must be a multiple of QK_K (asserted)
//   s            output, receives a single float
//   vx, vy       input block arrays, indexed as x[i] / y[i] for i < n / QK_K
//   bs, bx, by   unused here (marked UNUSED)
//   nrc          must be 1 (asserted)
//
// When __loongarch_asx is defined a LASX/LSX vector path is compiled,
// otherwise a portable scalar path.
+ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
1309
+ assert(n % QK_K == 0); // only whole blocks of QK_K values are handled
1310
+ assert(nrc == 1); // only the single-result case is implemented
1311
+ UNUSED(nrc);
1312
+ UNUSED(bx);
1313
+ UNUSED(by);
1314
+ UNUSED(bs);
1315
+
1316
+ const block_q4_K * GGML_RESTRICT x = vx;
1317
+ const block_q8_K * GGML_RESTRICT y = vy;
1318
+
1319
+ const int nb = n / QK_K; // number of (x[i], y[i]) block pairs
1320
+
1321
+ static const uint32_t kmask1 = 0x3f3f3f3f; // low 6 bits of every byte
1322
+ static const uint32_t kmask2 = 0x0f0f0f0f; // low 4 bits of every byte
1323
+ static const uint32_t kmask3 = 0x03030303; // low 2 bits of every byte
1324
+
1325
+ uint32_t utmp[4]; // 12 packed scale bytes are expanded here into 16 bytes
1326
+
1327
+ #if defined __loongarch_asx
1328
+
1329
+ __m256 acc = (__m256)__lasx_xvldi(0); // accumulates d * (scaled integer products)
1330
+ __m128 acc_m = (__m128)__lsx_vldi(0); // accumulates the mins correction term
1331
+
1332
+ for (int i = 0; i < nb; ++i) {
1333
+
1334
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
1335
+ const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); // negated: the mins term is subtracted (scalar path uses sumf -= dmin * sumi)
1336
+
1337
+ memcpy(utmp, x[i].scales, 12); // repacked below so every byte is a 6-bit value: utmp[0..1] = scales, utmp[2..3] = mins
1338
+ utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
1339
+ const uint32_t uaux = utmp[1] & kmask1;
1340
+ utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
1341
+ utmp[2] = uaux;
1342
+ utmp[0] &= kmask1;
1343
+
1344
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
1345
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
1346
+
1347
+ const __m128i mins_and_scales128 = lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0]); // NOTE(review): assumes lsx_set_w puts its last argument in the lowest lane - confirm against the helper
1348
+ const __m128i mins128 = __lsx_vexth_h_b(mins_and_scales128); // upper 8 bytes widened to 16 bit
1349
+ const __m128i scales128 = __lsx_vsllwil_h_b(mins_and_scales128, 0); // lower 8 bytes widened to 16 bit
1350
+
1351
+ const __m256i q8sums = __lasx_xvld((const __m256i*)y[i].bsums, 0);
1352
+ const __m128i q8s = lsx_hadd_h(lasx_extracti128(q8sums, 0), lasx_extracti128(q8sums, 1)); // presumably pairwise sums of bsums, matching bsums[j] * mins[j/2] in the scalar path
1353
+ const __m128i prod = lsx_madd_h(mins128, q8s);
1354
+ acc_m = __lsx_vfmadd_s(__lsx_vreplfr2vr_s(dmin), __lsx_vffint_s_w(prod), acc_m); // acc_m += dmin * (float)prod
1355
+
1356
+ const __m256i scales = lasx_insertf128(scales128, scales128); // same scales passed for both 128-bit halves
1357
+
1358
+ __m256i sumi = __lasx_xvldi(0);
1359
+
1360
+ for (int j = 0; j < QK_K/64; ++j) { // per iteration: 32 bytes of q4 (64 nibbles) and 64 q8 values
1361
+
1362
+ const __m256i scale_l = lasx_xvrepl128vei_h(scales, 2 * j + 0); // scale used with the low nibbles
1363
+ const __m256i scale_h = lasx_xvrepl128vei_h(scales, 2 * j + 1); // scale used with the high nibbles
1364
+
1365
+ const __m256i q4bits = __lasx_xvld((const __m256i*)q4, 0); q4 += 32;
1366
+ const __m256i q4l = __lasx_xvandi_b(q4bits, 0xf); // low nibbles
1367
+ const __m256i q4h = __lasx_xvsrli_b(q4bits, 4); // high nibbles
1368
+
1369
+ const __m256i q8l = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
1370
+ __m256i p16l = lasx_madd_h_b(q4l, q8l);
1371
+ p16l = lasx_madd_h(scale_l, p16l);
1372
+
1373
+ const __m256i q8h = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
1374
+ __m256i p16h = lasx_madd_h_b(q4h, q8h);
1375
+ p16h = lasx_madd_h(scale_h, p16h);
1376
+ const __m256i sumj = __lasx_xvadd_w(p16l, p16h);
1377
+
1378
+ sumi = __lasx_xvadd_w(sumi, sumj);
1379
+ }
1380
+
1381
+ __m256 vd = __lasx_xvreplfr2vr_s(d);
1382
+ acc = __lasx_xvfmadd_s(vd, __lasx_xvffint_s_w(sumi), acc); // acc += d * (float)sumi
1383
+
1384
+ }
1385
+ // fold the lanes of acc_m so that lane 0 holds the total; only lane 0 is read below
1386
+ acc_m = __lsx_vfadd_s(acc_m, (__m128)__lsx_vpermi_w((__m128i)acc_m, (__m128i)acc_m, 0xee));
1387
+ __m128i tmp1 = __lsx_vinsgr2vr_w(__lsx_vldi(0), __lsx_vpickve2gr_w((__m128i)acc_m, 1), 0);
1388
+ acc_m = __lsx_vfadd_s(acc_m, (__m128)tmp1);
1389
+
1390
+
1391
+ *s = hsum_float_8(acc) + ((v4f32)acc_m)[0];
1392
+
1393
+ #else
1394
+
1395
+ const uint8_t * scales = (const uint8_t*)&utmp[0]; // byte view of utmp[0..1]
1396
+ const uint8_t * mins = (const uint8_t*)&utmp[2]; // byte view of utmp[2..3]
1397
+
1398
+ int8_t aux8[QK_K]; // unpacked 4-bit values of one block
1399
+ int16_t aux16[8];
1400
+ float sums [8]; // 8 partial float sums, added together at the end
1401
+ int32_t aux32[8];
1402
+ memset(sums, 0, 8*sizeof(float));
1403
+
1404
+ float sumf = 0;
1405
+ for (int i = 0; i < nb; ++i) {
1406
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
1407
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
1408
+ memset(aux32, 0, 8*sizeof(int32_t));
1409
+ int8_t * GGML_RESTRICT a = aux8;
1410
+ for (int j = 0; j < QK_K/64; ++j) { // each 32 input bytes give 32 low nibbles followed by 32 high nibbles
1411
+ for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
1412
+ a += 32;
1413
+ for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
1414
+ a += 32; q4 += 32;
1415
+ }
1416
+ memcpy(utmp, x[i].scales, 12); // same 12 -> 16 byte repacking as in the vector path
1417
+ utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
1418
+ const uint32_t uaux = utmp[1] & kmask1;
1419
+ utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
1420
+ utmp[2] = uaux;
1421
+ utmp[0] &= kmask1;
1422
+
1423
+ int sumi = 0;
1424
+ for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; // two consecutive bsums share one min
1425
+ a = aux8;
1426
+ int is = 0;
1427
+ for (int j = 0; j < QK_K/32; ++j) { // one scale per 32 values, handled as 4 groups of 8
1428
+ int32_t scale = scales[is++];
1429
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1430
+ for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1431
+ q8 += 8; a += 8;
1432
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1433
+ for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1434
+ q8 += 8; a += 8;
1435
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1436
+ for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1437
+ q8 += 8; a += 8;
1438
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1439
+ for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1440
+ q8 += 8; a += 8;
1441
+ }
1442
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1443
+ for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
1444
+ const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
1445
+ sumf -= dmin * sumi; // subtract the mins contribution of this block
1446
+ }
1447
+ for (int l = 0; l < 8; ++l) sumf += sums[l];
1448
+ *s = sumf;
1449
+ #endif
1450
+ }
1451
+
1452
// Dot product of n values stored as block_q5_K blocks (vx) with n values
// stored as block_q8_K blocks (vy); the scalar result is written to *s.
//
//   n            number of values; must be a multiple of QK_K (asserted)
//   s            output, receives a single float
//   vx, vy       input block arrays, indexed as x[i] / y[i] for i < n / QK_K
//   bs, bx, by   unused here (marked UNUSED)
//   nrc          must be 1 (asserted)
//
// Each value is a 4-bit nibble from qs plus one extra bit from qh (worth 16).
// When __loongarch_asx is defined a LASX/LSX vector path is compiled,
// otherwise a portable scalar path.
+ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
1453
+ assert(n % QK_K == 0); // only whole blocks of QK_K values are handled
1454
+ assert(nrc == 1); // only the single-result case is implemented
1455
+ UNUSED(nrc);
1456
+ UNUSED(bx);
1457
+ UNUSED(by);
1458
+ UNUSED(bs);
1459
+
1460
+ const block_q5_K * GGML_RESTRICT x = vx;
1461
+ const block_q8_K * GGML_RESTRICT y = vy;
1462
+
1463
+ const int nb = n / QK_K; // number of (x[i], y[i]) block pairs
1464
+
1465
+ static const uint32_t kmask1 = 0x3f3f3f3f; // low 6 bits of every byte
1466
+ static const uint32_t kmask2 = 0x0f0f0f0f; // low 4 bits of every byte
1467
+ static const uint32_t kmask3 = 0x03030303; // low 2 bits of every byte
1468
+
1469
+ uint32_t utmp[4]; // 12 packed scale bytes are expanded here into 16 bytes
1470
+
1471
+ #if defined __loongarch_asx
1472
+
1473
+ __m256 acc = (__m256)__lasx_xvldi(0); // accumulates d * (scaled integer products)
1474
+ __m128 acc_m = (__m128)__lsx_vldi(0); // accumulates the mins correction term
1475
+
1476
+ for (int i = 0; i < nb; ++i) {
1477
+
1478
+ const uint8_t * GGML_RESTRICT q5 = x[i].qs;
1479
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
1480
+
1481
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
1482
+ const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); // negated: the mins term is subtracted (scalar path uses sumf -= dmin * sumi)
1483
+
1484
+ memcpy(utmp, x[i].scales, 12); // repacked below so every byte is a 6-bit value: utmp[0..1] = scales, utmp[2..3] = mins
1485
+ utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
1486
+ const uint32_t uaux = utmp[1] & kmask1;
1487
+ utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
1488
+ utmp[2] = uaux;
1489
+ utmp[0] &= kmask1;
1490
+
1491
+ const __m128i mins_and_scales128 = lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0]); // NOTE(review): assumes lsx_set_w puts its last argument in the lowest lane - confirm against the helper
1492
+ const __m128i mins128 = __lsx_vexth_h_b(mins_and_scales128); // upper 8 bytes widened to 16 bit
1493
+ const __m128i scales128 = __lsx_vsllwil_h_b(mins_and_scales128, 0); // lower 8 bytes widened to 16 bit
1494
+
1495
+ const __m256i q8sums = __lasx_xvld((const __m256i*)y[i].bsums, 0);
1496
+ const __m128i q8s = lsx_hadd_h(lasx_extracti128(q8sums, 0), lasx_extracti128(q8sums, 1)); // presumably pairwise sums of bsums, matching bsums[j] * mins[j/2] in the scalar path
1497
+ const __m128i prod = lsx_madd_h(mins128, q8s);
1498
+ acc_m = __lsx_vfmadd_s(__lsx_vreplfr2vr_s(dmin), __lsx_vffint_s_w(prod), acc_m); // acc_m += dmin * (float)prod
1499
+
1500
+ const __m256i scales = lasx_insertf128(scales128, scales128); // same scales passed for both 128-bit halves
1501
+
1502
+ const __m256i hbits = __lasx_xvld((const __m256i*)x[i].qh, 0); // 32 bytes of high bits; one bit position per group of 32 values (see mask m in the scalar path)
1503
+
1504
+ __m256i sumi = __lasx_xvldi(0);
1505
+
1506
+ for (int j = 0; j < QK_K/64; ++j) { // per iteration: 32 bytes of q5 (64 nibbles) and 64 q8 values
1507
+
1508
+ const __m256i scale_0 = lasx_xvrepl128vei_h(scales, 2 * j + 0);
1509
+ const __m256i scale_1 = lasx_xvrepl128vei_h(scales, 2 * j + 1);
1510
+
1511
+ const __m256i q5bits = __lasx_xvld((const __m256i*)q5, 0); q5 += 32;
1512
+
1513
+ const __m256i q5l_0 = __lasx_xvandi_b(q5bits, 0xf); // low nibbles
1514
+ const __m256i q5l_1 = __lasx_xvsrli_b(q5bits, 4); // high nibbles
1515
+ const __m256i q5h_0 = __lasx_xvnori_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 2 * j + 0), 0), 0xef); // 0x10 where the selected high bit is set, else 0 (presumably lasx_xvandi_b_bit isolates bit 2*j - confirm)
1516
+ const __m256i q5h_1 = __lasx_xvnori_b(__lasx_xvseqi_b(lasx_xvandi_b_bit(hbits, 2 * j + 1), 0), 0xef); // same for bit 2*j + 1
1517
+ const __m256i q5_0 = __lasx_xvor_v(q5l_0, q5h_0); // 5-bit values for the low-nibble group
1518
+ const __m256i q5_1 = __lasx_xvor_v(q5l_1, q5h_1); // 5-bit values for the high-nibble group
1519
+
1520
+ const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
1521
+ const __m256i q8_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
1522
+
1523
+ __m256i p16_0 = lasx_madd_h_b(q5_0, q8_0);
1524
+ __m256i p16_1 = lasx_madd_h_b(q5_1, q8_1);
1525
+
1526
+ p16_0 = lasx_madd_h(scale_0, p16_0);
1527
+ p16_1 = lasx_madd_h(scale_1, p16_1);
1528
+
1529
+ sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_1));
1530
+
1531
+ }
1532
+
1533
+ __m256 vd = __lasx_xvreplfr2vr_s(d);
1534
+ acc = __lasx_xvfmadd_s(vd, __lasx_xvffint_s_w(sumi), acc); // acc += d * (float)sumi
1535
+
1536
+ }
1537
+ // fold the lanes of acc_m by byte-shifting by 8 and then 4; only lane 0 is read below
1538
+ acc_m = __lsx_vfadd_s(acc_m, (__m128)__lsx_vbsrl_v(acc_m, 8));
1539
+ acc_m = __lsx_vfadd_s(acc_m, (__m128)__lsx_vbsrl_v(acc_m, 4));
1540
+
1541
+ *s = hsum_float_8(acc) + ((v4f32)acc_m)[0];
1542
+
1543
+ #else
1544
+
1545
+ const uint8_t * scales = (const uint8_t*)&utmp[0]; // byte view of utmp[0..1]
1546
+ const uint8_t * mins = (const uint8_t*)&utmp[2]; // byte view of utmp[2..3]
1547
+
1548
+ int8_t aux8[QK_K]; // unpacked 5-bit values of one block
1549
+ int16_t aux16[8];
1550
+ float sums [8]; // 8 partial float sums, added together at the end
1551
+ int32_t aux32[8];
1552
+ memset(sums, 0, 8*sizeof(float));
1553
+
1554
+ float sumf = 0;
1555
+ for (int i = 0; i < nb; ++i) {
1556
+ const uint8_t * GGML_RESTRICT q4 = x[i].qs;
1557
+ const uint8_t * GGML_RESTRICT hm = x[i].qh;
1558
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
1559
+ memset(aux32, 0, 8*sizeof(int32_t));
1560
+ int8_t * GGML_RESTRICT a = aux8;
1561
+ uint8_t m = 1; // selects the qh bit for the current group of 32 values; hm itself is never advanced
1562
+ for (int j = 0; j < QK_K/64; ++j) {
1563
+ for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
1564
+ for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0); // add the fifth bit
1565
+ a += 32; m <<= 1;
1566
+ for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
1567
+ for (int l = 0; l < 32; ++l) a[l] += (hm[l] & m ? 16 : 0);
1568
+ a += 32; m <<= 1;
1569
+ q4 += 32;
1570
+ }
1571
+ memcpy(utmp, x[i].scales, 12); // same 12 -> 16 byte repacking as in the vector path
1572
+ utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
1573
+ const uint32_t uaux = utmp[1] & kmask1;
1574
+ utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
1575
+ utmp[2] = uaux;
1576
+ utmp[0] &= kmask1;
1577
+
1578
+ int sumi = 0;
1579
+ for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2]; // two consecutive bsums share one min
1580
+ a = aux8;
1581
+ int is = 0;
1582
+ for (int j = 0; j < QK_K/32; ++j) { // one scale per 32 values, handled as 4 groups of 8
1583
+ int32_t scale = scales[is++];
1584
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1585
+ for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1586
+ q8 += 8; a += 8;
1587
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1588
+ for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1589
+ q8 += 8; a += 8;
1590
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1591
+ for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1592
+ q8 += 8; a += 8;
1593
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1594
+ for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1595
+ q8 += 8; a += 8;
1596
+ }
1597
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1598
+ for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
1599
+ const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
1600
+ sumf -= dmin * sumi; // subtract the mins contribution of this block
1601
+ }
1602
+ for (int l = 0; l < 8; ++l) sumf += sums[l];
1603
+ *s = sumf;
1604
+ #endif
1605
+ }
1606
+
1607
// Dot product of n values stored as block_q6_K blocks (vx) with n values
// stored as block_q8_K blocks (vy); the scalar result is written to *s.
//
//   n            number of values; must be a multiple of QK_K (asserted)
//   s            output, receives a single float
//   vx, vy       input block arrays, indexed as x[i] / y[i] for i < n / QK_K
//   bs, bx, by   unused here (marked UNUSED)
//   nrc          must be 1 (asserted)
//
// Each value is a 4-bit nibble from ql combined with 2 bits from qh, minus 32.
// When __loongarch_asx is defined a LASX/LSX vector path is compiled,
// otherwise a portable scalar path.
+ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
1608
+ assert(n % QK_K == 0); // only whole blocks of QK_K values are handled
1609
+ assert(nrc == 1); // only the single-result case is implemented
1610
+ UNUSED(nrc);
1611
+ UNUSED(bx);
1612
+ UNUSED(by);
1613
+ UNUSED(bs);
1614
+
1615
+ const block_q6_K * GGML_RESTRICT x = vx;
1616
+ const block_q8_K * GGML_RESTRICT y = vy;
1617
+
1618
+ const int nb = n / QK_K; // number of (x[i], y[i]) block pairs
1619
+
1620
+ #if defined __loongarch_asx
1621
+
1622
+ const __m256i m32s = __lasx_xvreplgr2vr_b(32); // the offset 32 that is subtracted from every 6-bit value
1623
+
1624
+ __m256 acc = (__m256)__lasx_xvldi(0);
1625
+
1626
+ for (int i = 0; i < nb; ++i) {
1627
+
1628
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
1629
+
1630
+ const uint8_t * GGML_RESTRICT q4 = x[i].ql;
1631
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
1632
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
1633
+
1634
+ const __m128i scales128 = __lsx_vld((const __m128i*)x[i].scales, 0);
1635
+ const v16i8 shuffle_mask = {0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15}; // even-indexed scales first, then odd-indexed
1636
+ const __m256i scales_shuffled = lasx_ext8_16(__lsx_vshuf_b(scales128, scales128, (__m128i)shuffle_mask)); // presumably widened to 16-bit lanes by lasx_ext8_16 - confirm against the helper
1637
+
1638
+ __m256i sumi = __lasx_xvldi(0);
1639
+
1640
+ for (int j = 0; j < QK_K/128; ++j) { // per iteration: 64 bytes of ql, 32 bytes of qh, 128 q8 values
1641
+
1642
+ const __m256i q4bits1 = __lasx_xvld((const __m256i*)q4, 0); q4 += 32;
1643
+ const __m256i q4bits2 = __lasx_xvld((const __m256i*)q4, 0); q4 += 32;
1644
+ const __m256i q4bitsH = __lasx_xvld((const __m256i*)qh, 0); qh += 32;
1645
+ // each qh byte carries four 2-bit fields; every field is moved to bit positions 4..5
1646
+ const __m256i q4h_0 = __lasx_xvslli_b(__lasx_xvandi_b(q4bitsH, 3), 4);
1647
+ const __m256i q4h_1 = __lasx_xvslli_b(__lasx_xvandi_b(q4bitsH, 3 << 2), 2);
1648
+ const __m256i q4h_2 = __lasx_xvandi_b(q4bitsH, 3 << 4);
1649
+ const __m256i q4h_3 = __lasx_xvsrli_b(__lasx_xvandi_b(q4bitsH, 3 << 6), 2);
1650
+ // combine nibbles with their 2 high bits, in the same order as a[l+0], a[l+32], a[l+64], a[l+96] in the scalar path
1651
+ const __m256i q4_0 = __lasx_xvor_v(__lasx_xvandi_b(q4bits1, 0xf), q4h_0);
1652
+ const __m256i q4_1 = __lasx_xvor_v(__lasx_xvandi_b(q4bits2, 0xf), q4h_1);
1653
+ const __m256i q4_2 = __lasx_xvor_v(__lasx_xvsrli_b(q4bits1, 4), q4h_2);
1654
+ const __m256i q4_3 = __lasx_xvor_v(__lasx_xvsrli_b(q4bits2, 4), q4h_3);
1655
+
1656
+ const __m256i q8_0 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
1657
+ const __m256i q8_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
1658
+ const __m256i q8_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
1659
+ const __m256i q8_3 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
1660
+
1661
+ __m256i p16_0 = lasx_madd_h_b(__lasx_xvsub_b(q4_0, m32s), q8_0);
1662
+ __m256i p16_1 = lasx_madd_h_b(__lasx_xvsub_b(q4_1, m32s), q8_1);
1663
+ __m256i p16_2 = lasx_madd_h_b(__lasx_xvsub_b(q4_2, m32s), q8_2);
1664
+ __m256i p16_3 = lasx_madd_h_b(__lasx_xvsub_b(q4_3, m32s), q8_3);
1665
+
1666
+ p16_0 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 0), p16_0);
1667
+ p16_1 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 1), p16_1);
1668
+ p16_2 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 2), p16_2);
1669
+ p16_3 = lasx_madd_h(lasx_xvrepl128vei_h(scales_shuffled, 4 * j + 3), p16_3);
1670
+
1671
+ sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_1));
1672
+ sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_2, p16_3));
1673
+ }
1674
+
1675
+ acc = __lasx_xvfmadd_s((__m256)__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), acc); // acc += d * (float)sumi
1676
+ }
1677
+
1678
+ *s = hsum_float_8(acc);
1679
+
1680
+ #else
1681
+
1682
+ int8_t aux8[QK_K]; // unpacked signed values of one block
1683
+ int16_t aux16[8];
1684
+ float sums [8]; // 8 partial float sums, added together at the end
1685
+ int32_t aux32[8];
1686
+ memset(sums, 0, 8*sizeof(float));
1687
+
1688
+ float sumf = 0;
1689
+ for (int i = 0; i < nb; ++i) {
1690
+ const uint8_t * GGML_RESTRICT q4 = x[i].ql;
1691
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
1692
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
1693
+ memset(aux32, 0, 8*sizeof(int32_t));
1694
+ int8_t * GGML_RESTRICT a = aux8;
1695
+ for (int j = 0; j < QK_K; j += 128) { // 128 values per pass: 64 bytes of ql and 32 bytes of qh
1696
+ for (int l = 0; l < 32; ++l) {
1697
+ a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
1698
+ a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
1699
+ a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
1700
+ a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
1701
+ }
1702
+ a += 128;
1703
+ q4 += 64;
1704
+ qh += 32;
1705
+ }
1706
+ a = aux8;
1707
+ int is = 0;
1708
+ for (int j = 0; j < QK_K/16; ++j) { // one scale per 16 values, handled as 2 groups of 8
1709
+ int scale = x[i].scales[is++];
1710
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1711
+ for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1712
+ q8 += 8; a += 8;
1713
+ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
1714
+ for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1715
+ q8 += 8; a += 8;
1716
+ }
1717
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1718
+ for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
1719
+ }
1720
+ for (int l = 0; l < 8; ++l) sumf += sums[l];
1721
+ *s = sumf;
1722
+ #endif
1723
+ }
1724
+
1725
// Sign lookup table, compiled only when __loongarch_asx is defined.
// 1024 bytes laid out as 128 entries of 8 bytes, each byte +1 or -1.
// The iq2_xxs LASX kernel reads it through a uint64_t pointer and indexes it
// with a 7-bit value, so one lookup yields the signs for 8 values.
// The eighth byte of each entry appears to make the count of -1 values even.
+ #if defined(__loongarch_asx)
1726
+ static const int8_t keven_signs_q2xs[1024] = { // each source row below holds four 8-byte entries
1727
+ 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1,
1728
+ 1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1,
1729
+ 1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, -1,
1730
+ 1, 1, -1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, 1,
1731
+ 1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, -1,
1732
+ 1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, 1,
1733
+ 1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, 1,
1734
+ 1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, -1,
1735
+ 1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, -1,
1736
+ 1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, 1,
1737
+ 1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, 1,
1738
+ 1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, -1,
1739
+ 1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, 1,
1740
+ 1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, -1,
1741
+ 1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, -1,
1742
+ 1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, 1,
1743
+ 1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, -1,
1744
+ 1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, 1,
1745
+ 1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, 1,
1746
+ 1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, -1,
1747
+ 1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, 1,
1748
+ 1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, -1,
1749
+ 1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, -1,
1750
+ 1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, 1,
1751
+ 1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, 1,
1752
+ 1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, -1,
1753
+ 1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, -1,
1754
+ 1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, 1,
1755
+ 1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, -1,
1756
+ 1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, 1,
1757
+ 1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 1,
1758
+ 1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1,
1759
+ };
1760
+ #endif
1761
+
1762
// Dot product of n values stored as block_iq2_xxs blocks (vx) with n values
// stored as block_q8_K blocks (vy); the scalar result is written to *s.
//
//   n            number of values; must be a multiple of QK_K (asserted)
//   s            output, receives a single float
//   vx, vy       input block arrays, indexed as x[i] / y[i] for i < n / QK_K
//   bs, bx, by   unused here (marked UNUSED)
//   nrc          must be 1 (asserted)
//
// Every group of 32 values is described by 8 bytes of qs: four 1-byte indices
// into iq2xxs_grid, followed by a 32-bit word holding four 7-bit sign indices
// and a 4-bit scale in its top bits. The final sum is multiplied by 0.125.
+ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
1763
+ assert(n % QK_K == 0); // only whole blocks of QK_K values are handled
1764
+ assert(nrc == 1); // only the single-result case is implemented
1765
+ UNUSED(nrc);
1766
+ UNUSED(bx);
1767
+ UNUSED(by);
1768
+ UNUSED(bs);
1769
+
1770
+ const block_iq2_xxs * GGML_RESTRICT x = vx;
1771
+ const block_q8_K * GGML_RESTRICT y = vy;
1772
+
1773
+ const int nb = n / QK_K; // number of (x[i], y[i]) block pairs
1774
+
1775
+ #if defined(__loongarch_asx)
1776
+
1777
+ const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; // NOTE(review): views an int8_t array as uint64_t; relies on the table being suitably aligned - confirm
1778
+
1779
+ uint32_t aux32[4]; // packed data for two groups of 32 values
1780
+ const uint8_t * aux8 = (const uint8_t *)aux32; // byte view used to read the grid indices
1781
+
1782
+ __m256 accumf = (__m256)__lasx_xvldi(0);
1783
+ for (int i = 0; i < nb; ++i) {
1784
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1785
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
1786
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
1787
+ __m256i sumi1 = __lasx_xvldi(0);
1788
+ __m256i sumi2 = __lasx_xvldi(0);
1789
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { // two groups of 32 values per iteration
1790
+ const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
1791
+ const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
1792
+ memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8;
1793
+ // aux8[0..3] / aux8[8..11]: grid indices; aux32[1] / aux32[3]: sign indices and scale
1794
+ const __m256i q2_1 = lasx_set_d(iq2xxs_grid[aux8[ 3]], iq2xxs_grid[aux8[ 2]], iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]);
1795
+ const __m256i q2_2 = lasx_set_d(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]], iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]);
1796
+ const __m256i s2_1 = lasx_set_d(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127],
1797
+ signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]);
1798
+ const __m256i s2_2 = lasx_set_d(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127],
1799
+ signs64[(aux32[3] >> 7) & 127], signs64[(aux32[3] >> 0) & 127]);
1800
+ const __m256i q8s_1 = __lasx_xvsigncov_b(s2_1, q8_1); // apply the +1/-1 signs to q8 rather than to the grid values
1801
+ const __m256i q8s_2 = __lasx_xvsigncov_b(s2_2, q8_2);
1802
+ const __m256i dot1 = lasx_maddubs_h(q2_1, q8s_1);
1803
+ const __m256i dot2 = lasx_maddubs_h(q2_2, q8s_2);
1804
+ const uint16_t ls1 = aux32[1] >> 28; // 4-bit scale from the top bits
1805
+ const uint16_t ls2 = aux32[3] >> 28;
1806
+ const __m256i p1 = lasx_madd_h(dot1, __lasx_xvreplgr2vr_h(2*ls1+1)); // effective scale is the odd number 2*ls + 1
1807
+ const __m256i p2 = lasx_madd_h(dot2, __lasx_xvreplgr2vr_h(2*ls2+1));
1808
+ sumi1 = __lasx_xvadd_w(sumi1, p1);
1809
+ sumi2 = __lasx_xvadd_w(sumi2, p2);
1810
+ }
1811
+
1812
+ accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf); // accumf += d * (float)(sumi1 + sumi2)
1813
+ }
1814
+
1815
+ *s = 0.125f * hsum_float_8(accumf);
1816
+
1817
+ #else
1818
+
1819
+ uint32_t aux32[2]; // packed data for one group of 32 values
1820
+ const uint8_t * aux8 = (const uint8_t *)aux32; // byte view used to read the grid indices
1821
+
1822
+ float sumf = 0.f;
1823
+ for (int i = 0; i < nb; ++i) {
1824
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1825
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
1826
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
1827
+ int32_t bsum = 0;
1828
+ for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
1829
+ memcpy(aux32, q2, 2*sizeof(uint32_t));
1830
+ q2 += 4;
1831
+ const uint32_t ls = 2*(aux32[1] >> 28) + 1; // odd scale derived from the top 4 bits
1832
+ int32_t sumi = 0;
1833
+ for (int l = 0; l < 4; ++l) { // four groups of 8 values
1834
+ const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]); // 8 grid bytes selected by index aux8[l]
1835
+ const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127]; // 8 sign bits looked up from a 7-bit index
1836
+ for (int j = 0; j < 8; ++j) {
1837
+ sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
1838
+ }
1839
+ q8 += 8;
1840
+ }
1841
+ bsum += sumi * ls;
1842
+ }
1843
+ sumf += d * bsum;
1844
+ }
1845
+ *s = 0.125f * sumf;
1846
+ #endif
1847
+ }
1848
+
1849
// Dot product of n values stored as block_iq2_xs blocks (vx) with n values
// stored as block_q8_K blocks (vy); the scalar result is written to *s.
//
//   n            number of values; must be a multiple of QK_K (asserted)
//   s            output, receives a single float
//   vx, vy       input block arrays, indexed as x[i] / y[i] for i < n / QK_K
//   bs, bx, by   unused here (marked UNUSED)
//   nrc          must be 1 (asserted)
//
// Each 16-bit entry of qs describes 8 values: the low 9 bits index
// iq2xs_grid and the upper 7 bits select the signs. Each scales byte holds
// two 4-bit scales, applied as 2*scale + 1. The final sum is multiplied by 0.125.
+ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
1850
+ assert(n % QK_K == 0); // only whole blocks of QK_K values are handled
1851
+ assert(nrc == 1); // only the single-result case is implemented
1852
+ UNUSED(nrc);
1853
+ UNUSED(bx);
1854
+ UNUSED(by);
1855
+ UNUSED(bs);
1856
+
1857
+ const block_iq2_xs * GGML_RESTRICT x = vx;
1858
+ const block_q8_K * GGML_RESTRICT y = vy;
1859
+
1860
+ const int nb = n / QK_K; // number of (x[i], y[i]) block pairs
1861
+
1862
+ #if defined(__loongarch_asx)
1863
+
1864
+ const __m256i mone = __lasx_xvreplgr2vr_b(1);
1865
+ static const char block_sign_shuffle_mask_1[32] = {
1866
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
1867
+ 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
1868
+ };
1869
+ static const char block_sign_shuffle_mask_2[32] = {
1870
+ 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a,
1871
+ 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e,
1872
+ };
1873
+ static const uint8_t bit_selector_mask_bytes[32] = {
1874
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
1875
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
1876
+ };
1877
+ // the two shuffle masks pick the even-numbered bytes of a 128-bit half eight times each; bit_selector_mask then tests one bit per output byte
1878
+ const __m256i bit_selector_mask = __lasx_xvld((const __m256i*)bit_selector_mask_bytes, 0);
1879
+ const __m256i block_sign_shuffle_1 = __lasx_xvld((const __m256i*)block_sign_shuffle_mask_1, 0);
1880
+ const __m256i block_sign_shuffle_2 = __lasx_xvld((const __m256i*)block_sign_shuffle_mask_2, 0);
1881
+ // lookup table: entry k is 0x80 when the 4-bit index k has an odd number of set bits, else 0x00
1882
+ static const uint8_t k_bit_helper[32] = {
1883
+ 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
1884
+ 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
1885
+ };
1886
+ const __m256i bit_helper = __lasx_xvld((const __m256i*)k_bit_helper, 0);
1887
+ const __m256i m511 = __lasx_xvreplgr2vr_h(511); // keeps the low 9 bits (grid index) of every 16-bit entry
1888
+ const __m128i m4 = __lsx_vreplgr2vr_b(0xf);
1889
+ const __m128i m1 = __lsx_vreplgr2vr_b(1);
1890
+
1891
+ uint64_t aux64;
1892
+
1893
+ // somewhat hacky, but gives a significant boost in performance
1894
+ __m256i aux_gindex;
1895
+ const uint16_t * gindex = (const uint16_t *)&aux_gindex; // the 16-bit lanes of aux_gindex are read back through this pointer
1896
+
1897
+ __m256 accumf = (__m256)__lasx_xvldi(0);
1898
+ for (int i = 0; i < nb; ++i) {
1899
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1900
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
1901
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
1902
+
1903
+ memcpy(&aux64, x[i].scales, 8);
1904
+ __m128i stmp = __lsx_vreplgr2vr_d(aux64);
1905
+ stmp = __lsx_vilvl_b( __lsx_vand_v(__lsx_vsrli_h(stmp, 4), m4), __lsx_vand_v(stmp, m4)); // interleave the low and high nibbles of the scale bytes
1906
+ const __m128i scales = __lsx_vadd_b(__lsx_vslli_h(stmp, 1), m1); // 2*nibble + 1, as ls1 / ls2 in the scalar path
1907
+
1908
+ __m256i sumi1 = __lasx_xvldi(0);
1909
+ __m256i sumi2 = __lasx_xvldi(0);
1910
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) { // four groups of 32 values per iteration
1911
+
1912
+ const __m256i q2_data = __lasx_xvld((const __m256i*)q2, 0); q2 += 16;
1913
+ aux_gindex = __lasx_xvand_v(q2_data, m511);
1914
+
1915
+ const __m256i partial_sign_bits = __lasx_xvsrli_h(q2_data, 9); // the 7 stored sign bits
1916
+ const __m256i partial_sign_bits_upper = __lasx_xvsrli_h(q2_data, 13);
1917
+ const __m256i partial_sign_bits_for_counting = __lasx_xvxor_v(partial_sign_bits, partial_sign_bits_upper); // folds the upper sign bits onto the lower ones so a 4-bit parity lookup suffices
1918
+
1919
+ const __m256i odd_bits = lasx_shuffle_b(bit_helper, partial_sign_bits_for_counting); // presumably a per-byte table lookup into bit_helper - confirm against the helper
1920
+ const __m256i full_sign_bits = __lasx_xvor_v(partial_sign_bits, odd_bits); // adds the eighth sign bit (0x80) from the parity lookup
1921
+
1922
+ const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
1923
+ const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
1924
+ const __m256i q8_3 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
1925
+ const __m256i q8_4 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
1926
+
1927
+ const __m256i q2_1 = lasx_set_d(iq2xs_grid[gindex[ 3]], iq2xs_grid[gindex[ 2]],
1928
+ iq2xs_grid[gindex[ 1]], iq2xs_grid[gindex[ 0]]);
1929
+ const __m256i q2_2 = lasx_set_d(iq2xs_grid[gindex[ 7]], iq2xs_grid[gindex[ 6]],
1930
+ iq2xs_grid[gindex[ 5]], iq2xs_grid[gindex[ 4]]);
1931
+ const __m256i q2_3 = lasx_set_d(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]],
1932
+ iq2xs_grid[gindex[ 9]], iq2xs_grid[gindex[ 8]]);
1933
+ const __m256i q2_4 = lasx_set_d(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]],
1934
+ iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]);
1935
+
1936
+ const __m128i full_signs_l = lasx_extracti128(full_sign_bits, 0);
1937
+ const __m128i full_signs_h = lasx_extracti128(full_sign_bits, 1);
1938
+ const __m256i full_signs_1 = lasx_insertf128(full_signs_l, full_signs_l);
1939
+ const __m256i full_signs_2 = lasx_insertf128(full_signs_h, full_signs_h);
1940
+ // for each of the four groups: broadcast its sign bytes, test one bit per value, then negate q8 where the bit is set
1941
+ __m256i signs;
1942
+ signs = lasx_shuffle_b(full_signs_1, block_sign_shuffle_1);
1943
+ signs = __lasx_xvseq_b(__lasx_xvand_v(signs, bit_selector_mask), bit_selector_mask); // 0xff where the sign bit is set, else 0
1944
+ const __m256i q8s_1 = __lasx_xvsigncov_b(__lasx_xvor_v(signs, mone), q8_1); // signs | 1 is -1 or +1 per byte
1945
+
1946
+ signs = lasx_shuffle_b(full_signs_1, block_sign_shuffle_2);
1947
+ signs = __lasx_xvseq_b(__lasx_xvand_v(signs, bit_selector_mask), bit_selector_mask);
1948
+ const __m256i q8s_2 = __lasx_xvsigncov_b(__lasx_xvor_v(signs, mone), q8_2);
1949
+
1950
+ signs = lasx_shuffle_b(full_signs_2, block_sign_shuffle_1);
1951
+ signs = __lasx_xvseq_b(__lasx_xvand_v(signs, bit_selector_mask), bit_selector_mask);
1952
+ const __m256i q8s_3 = __lasx_xvsigncov_b(__lasx_xvor_v(signs, mone), q8_3);
1953
+
1954
+ signs = lasx_shuffle_b(full_signs_2, block_sign_shuffle_2);
1955
+ signs = __lasx_xvseq_b(__lasx_xvand_v(signs, bit_selector_mask), bit_selector_mask);
1956
+ const __m256i q8s_4 = __lasx_xvsigncov_b(__lasx_xvor_v(signs, mone), q8_4);
1957
+
1958
+ const __m256i dot1 = lasx_maddubs_h(q2_1, q8s_1);
1959
+ const __m256i dot2 = lasx_maddubs_h(q2_2, q8s_2);
1960
+ const __m256i dot3 = lasx_maddubs_h(q2_3, q8s_3);
1961
+ const __m256i dot4 = lasx_maddubs_h(q2_4, q8s_4);
1962
+
1963
+ const __m256i sc1 = lasx_ext8_16(lsx_shuffle_b(scales, get_scale_shuffle(ib32+0))); // presumably selects the scales belonging to group ib32+0 - confirm against get_scale_shuffle
1964
+ const __m256i sc2 = lasx_ext8_16(lsx_shuffle_b(scales, get_scale_shuffle(ib32+1)));
1965
+ const __m256i sc3 = lasx_ext8_16(lsx_shuffle_b(scales, get_scale_shuffle(ib32+2)));
1966
+ const __m256i sc4 = lasx_ext8_16(lsx_shuffle_b(scales, get_scale_shuffle(ib32+3)));
1967
+
1968
+ sumi1 = __lasx_xvadd_w(sumi1, lasx_madd_h(dot1, sc1));
1969
+ sumi2 = __lasx_xvadd_w(sumi2, lasx_madd_h(dot2, sc2));
1970
+ sumi1 = __lasx_xvadd_w(sumi1, lasx_madd_h(dot3, sc3));
1971
+ sumi2 = __lasx_xvadd_w(sumi2, lasx_madd_h(dot4, sc4));
1972
+ }
1973
+
1974
+ accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf); // accumf += d * (float)(sumi1 + sumi2)
1975
+
1976
+ }
1977
+
1978
+ *s = 0.125f * hsum_float_8(accumf);
1979
+
1980
+ #else
1981
+
1982
+ float sumf = 0.f;
1983
+ for (int i = 0; i < nb; ++i) {
1984
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1985
+ const uint16_t * GGML_RESTRICT q2 = x[i].qs;
1986
+ const uint8_t * GGML_RESTRICT sc = x[i].scales;
1987
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
1988
+ int32_t bsum = 0;
1989
+ for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { // one scales byte per 32 values
1990
+ const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1; // scale for the first 16 values
1991
+ const uint16_t ls2 = 2*(sc[ib32] >> 4) + 1; // scale for the second 16 values
1992
+ int32_t sumi = 0;
1993
+ for (int l = 0; l < 2; ++l) {
1994
+ const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511)); // low 9 bits select the grid entry
1995
+ const uint8_t signs = ksigns_iq2xs[q2[l] >> 9]; // upper 7 bits select the sign pattern
1996
+ for (int j = 0; j < 8; ++j) {
1997
+ sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
1998
+ }
1999
+ q8 += 8;
2000
+ }
2001
+ bsum += sumi * ls1;
2002
+ sumi = 0;
2003
+ for (int l = 2; l < 4; ++l) {
2004
+ const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
2005
+ const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
2006
+ for (int j = 0; j < 8; ++j) {
2007
+ sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
2008
+ }
2009
+ q8 += 8;
2010
+ }
2011
+ bsum += sumi * ls2;
2012
+ q2 += 4;
2013
+ }
2014
+ sumf += d * bsum;
2015
+ }
2016
+ *s = 0.125f * sumf;
2017
+ #endif
2018
+ }
2019
+
2020
+ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
2021
+ assert(n % QK_K == 0);
2022
+ assert(nrc == 1);
2023
+ UNUSED(nrc);
2024
+ UNUSED(bx);
2025
+ UNUSED(by);
2026
+ UNUSED(bs);
2027
+
2028
+ const block_iq2_s * GGML_RESTRICT x = vx;
2029
+ const block_q8_K * GGML_RESTRICT y = vy;
2030
+
2031
+ const int nb = n / QK_K;
2032
+
2033
+ #if defined(__loongarch_asx)
2034
+
2035
+ static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
2036
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
2037
+ };
2038
+
2039
+ static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
2040
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
2041
+ };
2042
+
2043
+
2044
+ const __m128i m4 = __lsx_vreplgr2vr_b(0xf);
2045
+ const __m128i m1 = __lsx_vreplgr2vr_b(1);
2046
+
2047
+ const __m256i mask1 = __lasx_xvld((const __m256i*)k_mask1, 0);
2048
+ const __m256i mask2 = __lasx_xvld((const __m256i*)k_mask2, 0);
2049
+ uint64_t aux64;
2050
+
2051
+ __m256 accumf = (__m256)__lasx_xvldi(0);
2052
+ for (int i = 0; i < nb; ++i) {
2053
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
2054
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
2055
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
2056
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
2057
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
2058
+
2059
+ __m128i tmp1;
2060
+ memcpy(&aux64, x[i].scales, 8);
2061
+ tmp1 = __lsx_vinsgr2vr_d(tmp1, aux64, 0);
2062
+ tmp1 = __lsx_vinsgr2vr_d(tmp1, aux64 >> 4, 1);
2063
+ const __m128i scales8 = __lsx_vadd_b(__lsx_vslli_h(__lsx_vand_v(tmp1, m4), 1), m1);
2064
+ const __m256i scales16 = lasx_ext8_16(scales8); // 0 2 4 6 8 10 12 14 1 3 5 7 9 11 13 15
2065
+
2066
+ __m256i sumi1 = __lasx_xvldi(0);
2067
+ __m256i sumi2 = __lasx_xvldi(0);
2068
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
2069
+ const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
2070
+ const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
2071
+ const __m256i q2_1 = lasx_set_d(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)],
2072
+ iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)],
2073
+ iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)],
2074
+ iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]);
2075
+ const __m256i q2_2 = lasx_set_d(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)],
2076
+ iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)],
2077
+ iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)],
2078
+ iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]);
2079
+ qs += 8;
2080
+
2081
+ __m256i aux256 = __lasx_xvreplgr2vr_w(signs[0] | ((uint32_t) signs[1] << 16));
2082
+ aux256 = __lasx_xvand_v(lasx_shuffle_b(aux256,mask1), mask2);
2083
+ const __m256i s2_1 = __lasx_xvseq_b(aux256, mask2);
2084
+ const __m256i q8s_1 = __lasx_xvsub_b(__lasx_xvxor_v(s2_1, q8_1), s2_1);
2085
+
2086
+ aux256 = __lasx_xvreplgr2vr_w(signs[2] | ((uint32_t) signs[3] << 16));
2087
+ aux256 = __lasx_xvand_v(lasx_shuffle_b(aux256,mask1), mask2);
2088
+ const __m256i s2_2 = __lasx_xvseq_b(aux256, mask2);
2089
+ const __m256i q8s_2 = __lasx_xvsub_b(__lasx_xvxor_v(s2_2, q8_2), s2_2);
2090
+
2091
+ signs += 4;
2092
+
2093
+ const __m256i dot1 = lasx_maddubs_h(q2_1, q8s_1); // blocks 2*ib32+0, 2*ib32+1
2094
+ const __m256i dot2 = lasx_maddubs_h(q2_2, q8s_2); // blocks 2*ib32+2, 2*ib32+3
2095
+
2096
+ const __m256i p1 = lasx_madd_h(dot1, lasx_shuffle_b(scales16, get_scale_shuffle_k4(ib32+0)));
2097
+ const __m256i p2 = lasx_madd_h(dot2, lasx_shuffle_b(scales16, get_scale_shuffle_k4(ib32+1)));
2098
+ sumi1 = __lasx_xvadd_w(sumi1, p1);
2099
+ sumi2 = __lasx_xvadd_w(sumi2, p2);
2100
+ }
2101
+
2102
+ accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf);
2103
+ }
2104
+
2105
+ *s = 0.125f * hsum_float_8(accumf);
2106
+
2107
+ #else
2108
+
2109
+ float sumf = 0;
2110
+ for (int i = 0; i < nb; i++) {
2111
+
2112
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
2113
+ const int8_t * q8 = y[i].qs;
2114
+ const uint8_t * qs = x[i].qs;
2115
+ const uint8_t * qh = x[i].qh;
2116
+ const uint8_t * signs = qs + QK_K/8;
2117
+
2118
+ int bsum = 0;
2119
+ for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
2120
+ int ls1 = 1 + 2*(x[i].scales[ib32] & 0xf);
2121
+ int ls2 = 1 + 2*(x[i].scales[ib32] >> 4);
2122
+ int sumi1 = 0, sumi2 = 0;
2123
+ for (int l = 0; l < 2; ++l) {
2124
+ const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
2125
+ for (int j = 0; j < 8; ++j) {
2126
+ sumi1 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
2127
+ }
2128
+ q8 += 8;
2129
+ }
2130
+ for (int l = 2; l < 4; ++l) {
2131
+ const uint8_t * grid = (const uint8_t *)(iq2s_grid + (qs[l] | (qh[ib32] << (8-2*l) & 0x300)));
2132
+ for (int j = 0; j < 8; ++j) {
2133
+ sumi2 += q8[j] * grid[j] * (signs[l] & kmask_iq2xs[j] ? -1 : 1);
2134
+ }
2135
+ q8 += 8;
2136
+ }
2137
+ bsum += ls1 * sumi1 + ls2 * sumi2;
2138
+ qs += 4;
2139
+ signs += 4;
2140
+ }
2141
+
2142
+ sumf += d * bsum;
2143
+ }
2144
+
2145
+ *s = 0.125f * sumf;
2146
+
2147
+ #endif
2148
+
2149
+ }
2150
+
2151
+ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
2152
+ assert(n % QK_K == 0);
2153
+ assert(nrc == 1);
2154
+ UNUSED(nrc);
2155
+ UNUSED(bx);
2156
+ UNUSED(by);
2157
+ UNUSED(bs);
2158
+
2159
+ const block_iq3_xxs * GGML_RESTRICT x = vx;
2160
+ const block_q8_K * GGML_RESTRICT y = vy;
2161
+
2162
+ const int nb = n / QK_K;
2163
+
2164
+ #if defined(__loongarch_asx)
2165
+
2166
+ const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
2167
+
2168
+ uint32_t aux32[2];
2169
+
2170
+ __m256 accumf = (__m256)__lasx_xvldi(0);
2171
+ for (int i = 0; i < nb; ++i) {
2172
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
2173
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
2174
+ const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
2175
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
2176
+ __m256i sumi1 = __lasx_xvldi(0);
2177
+ __m256i sumi2 = __lasx_xvldi(0);
2178
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
2179
+ const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
2180
+ const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
2181
+ const __m256i q2_1 = lasx_set_w(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
2182
+ iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
2183
+ q3 += 8;
2184
+ const __m256i q2_2 = lasx_set_w(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]],
2185
+ iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
2186
+ q3 += 8;
2187
+ memcpy(aux32, gas, 8); gas += 8;
2188
+
2189
+ const __m256i s2_1 = lasx_set_d(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127],
2190
+ signs64[(aux32[0] >> 7) & 127], signs64[(aux32[0] >> 0) & 127]);
2191
+ const __m256i s2_2 = lasx_set_d(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127],
2192
+ signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]);
2193
+ const __m256i q8s_1 = __lasx_xvsigncov_b(s2_1, q8_1);
2194
+ const __m256i q8s_2 = __lasx_xvsigncov_b(s2_2, q8_2);
2195
+ const __m256i dot1 = lasx_maddubs_h(q2_1, q8s_1);
2196
+ const __m256i dot2 = lasx_maddubs_h(q2_2, q8s_2);
2197
+ const uint16_t ls1 = aux32[0] >> 28;
2198
+ const uint16_t ls2 = aux32[1] >> 28;
2199
+
2200
+ const __m256i p1 = lasx_madd_h(dot1, __lasx_xvreplgr2vr_h(2*ls1+1));
2201
+ const __m256i p2 = lasx_madd_h(dot2, __lasx_xvreplgr2vr_h(2*ls2+1));
2202
+ sumi1 = __lasx_xvadd_w(sumi1, p1);
2203
+ sumi2 = __lasx_xvadd_w(sumi2, p2);
2204
+ }
2205
+
2206
+ accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf);
2207
+ }
2208
+
2209
+ *s = 0.25f * hsum_float_8(accumf);
2210
+
2211
+ #else
2212
+
2213
+ uint32_t aux32;
2214
+
2215
+ float sumf = 0.f;
2216
+ for (int i = 0; i < nb; ++i) {
2217
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
2218
+ const uint8_t * GGML_RESTRICT q3 = x[i].qs;
2219
+ const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
2220
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
2221
+ int32_t bsum = 0;
2222
+ for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
2223
+ memcpy(&aux32, gas, sizeof(uint32_t)); gas += sizeof(uint32_t);
2224
+ const uint32_t ls = 2*(aux32 >> 28) + 1;
2225
+ int32_t sumi = 0;
2226
+ for (int l = 0; l < 4; ++l) {
2227
+ const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*l+0]);
2228
+ const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*l+1]);
2229
+ const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*l) & 127];
2230
+ for (int j = 0; j < 4; ++j) {
2231
+ sumi += grid1[j] * q8[j+0] * (signs & kmask_iq2xs[j+0] ? -1 : 1);
2232
+ sumi += grid2[j] * q8[j+4] * (signs & kmask_iq2xs[j+4] ? -1 : 1);
2233
+ }
2234
+ q8 += 8;
2235
+ }
2236
+ q3 += 8;
2237
+ bsum += sumi * ls;
2238
+ }
2239
+ sumf += d * bsum;
2240
+ }
2241
+ *s = 0.25f * sumf;
2242
+ #endif
2243
+ }
2244
+
2245
+ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
2246
+ assert(n % QK_K == 0);
2247
+ assert(nrc == 1);
2248
+ UNUSED(nrc);
2249
+ UNUSED(bx);
2250
+ UNUSED(by);
2251
+ UNUSED(bs);
2252
+
2253
+ const block_iq3_s * GGML_RESTRICT x = vx;
2254
+ const block_q8_K * GGML_RESTRICT y = vy;
2255
+
2256
+ const int nb = n / QK_K;
2257
+
2258
+ #if defined(__loongarch_asx)
2259
+
2260
+ static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
2261
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
2262
+ };
2263
+
2264
+ static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
2265
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
2266
+ };
2267
+
2268
+ const __m256i mask1 = __lasx_xvld((const __m256i*)k_mask1, 0);
2269
+ const __m256i mask2 = __lasx_xvld((const __m256i*)k_mask2, 0);
2270
+
2271
+ __m256i idx_shift = lasx_set_w(1, 2, 3, 4, 5, 6, 7, 8);
2272
+ const __m256i idx_mask = __lasx_xvreplgr2vr_w(256);
2273
+
2274
+ typedef union {
2275
+ __m256i vec[2];
2276
+ uint32_t index[16];
2277
+ } index_t;
2278
+
2279
+ index_t idx;
2280
+
2281
+ __m256 accumf = (__m256)__lasx_xvldi(0);
2282
+ for (int i = 0; i < nb; ++i) {
2283
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
2284
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
2285
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
2286
+ const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
2287
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
2288
+ __m256i sumi1 = __lasx_xvldi(0);
2289
+ __m256i sumi2 = __lasx_xvldi(0);
2290
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
2291
+ const __m256i q8_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
2292
+ const __m256i q8_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
2293
+ const __m256i idx_l = lasx_extu8_16(__lsx_vld(qs, 0)); qs += 16;
2294
+ idx.vec[0] = __lasx_xvreplgr2vr_w(qh[ib32+0]);
2295
+ idx.vec[1] = __lasx_xvreplgr2vr_w(qh[ib32+1]);
2296
+ idx.vec[0] = __lasx_xvand_v(__lasx_xvsll_w(idx.vec[0], idx_shift), idx_mask);
2297
+ idx.vec[1] = __lasx_xvand_v(__lasx_xvsll_w(idx.vec[1], idx_shift), idx_mask);
2298
+ idx.vec[0] = __lasx_xvor_v(idx.vec[0], lasx_ext16_32(lasx_extracti128(idx_l, 0)));
2299
+ idx.vec[1] = __lasx_xvor_v(idx.vec[1], lasx_ext16_32(lasx_extracti128(idx_l, 1)));
2300
+
2301
+ // At leat on my CPU (Ryzen 7950X), using _mm256_i32gather_epi32 is slower than _mm256_set_epi32. Strange.
2302
+ //const __m256i q2_1 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[0], 4);
2303
+ //const __m256i q2_2 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[1], 4);
2304
+ const __m256i q2_1 = lasx_set_w(
2305
+ iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]],
2306
+ iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]]
2307
+ );
2308
+ const __m256i q2_2 = lasx_set_w(
2309
+ iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]],
2310
+ iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[ 9]], iq3s_grid[idx.index[ 8]]
2311
+ );
2312
+
2313
+ __m256i aux256 = __lasx_xvreplgr2vr_w(signs[0] | (signs[1] << 16));
2314
+ aux256 = __lasx_xvand_v(lasx_shuffle_b(aux256,mask1), mask2);
2315
+ const __m256i s2_1 = __lasx_xvseq_b(aux256, mask2);
2316
+ const __m256i q8s_1 = __lasx_xvsub_b(__lasx_xvxor_v(s2_1, q8_1), s2_1);
2317
+
2318
+ aux256 = __lasx_xvreplgr2vr_w(signs[2] | (signs[3] << 16));
2319
+ aux256 = __lasx_xvand_v(lasx_shuffle_b(aux256,mask1), mask2);
2320
+ const __m256i s2_2 = __lasx_xvseq_b(aux256, mask2);
2321
+ const __m256i q8s_2 = __lasx_xvsub_b(__lasx_xvxor_v(s2_2, q8_2), s2_2);
2322
+
2323
+ signs += 4;
2324
+
2325
+ const __m256i dot1 = lasx_maddubs_h(q2_1, q8s_1);
2326
+ const __m256i dot2 = lasx_maddubs_h(q2_2, q8s_2);
2327
+ const uint16_t ls1 = x[i].scales[ib32/2] & 0xf;
2328
+ const uint16_t ls2 = x[i].scales[ib32/2] >> 4;
2329
+ const __m256i p1 = lasx_madd_h(dot1, __lasx_xvreplgr2vr_h(2*ls1+1));
2330
+ const __m256i p2 = lasx_madd_h(dot2, __lasx_xvreplgr2vr_h(2*ls2+1));
2331
+ sumi1 = __lasx_xvadd_w(sumi1, p1);
2332
+ sumi2 = __lasx_xvadd_w(sumi2, p2);
2333
+ }
2334
+
2335
+ accumf = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accumf);
2336
+ }
2337
+
2338
+ *s = hsum_float_8(accumf);
2339
+
2340
+ #else
2341
+
2342
+ float sumf = 0.f;
2343
+ for (int i = 0; i < nb; ++i) {
2344
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
2345
+ const uint8_t * GGML_RESTRICT qs = x[i].qs;
2346
+ const uint8_t * GGML_RESTRICT qh = x[i].qh;
2347
+ const uint8_t * GGML_RESTRICT signs = x[i].signs;
2348
+ const int8_t * GGML_RESTRICT q8 = y[i].qs;
2349
+ int32_t bsum = 0;
2350
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
2351
+ const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1;
2352
+ const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1;
2353
+ int32_t sumi = 0;
2354
+ for (int l = 0; l < 4; ++l) {
2355
+ const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256)));
2356
+ const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256)));
2357
+ for (int j = 0; j < 4; ++j) {
2358
+ sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
2359
+ sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
2360
+ }
2361
+ q8 += 8;
2362
+ }
2363
+ qs += 8;
2364
+ signs += 4;
2365
+ bsum += sumi * ls1;
2366
+ sumi = 0;
2367
+ for (int l = 0; l < 4; ++l) {
2368
+ const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256)));
2369
+ const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256)));
2370
+ for (int j = 0; j < 4; ++j) {
2371
+ sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1);
2372
+ sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1);
2373
+ }
2374
+ q8 += 8;
2375
+ }
2376
+ qs += 8;
2377
+ signs += 4;
2378
+ bsum += sumi * ls2;
2379
+ }
2380
+ sumf += d * bsum;
2381
+ }
2382
+ *s = sumf;
2383
+ #endif
2384
+ }
2385
+
2386
#if defined(__loongarch_asx)
// Returns, per 16-bit lane, the sum of the even-position and odd-position
// widening byte products of x and y (xvmulwev_h_b + xvmulwod_h_b).
static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
    return __lasx_xvadd_h(__lasx_xvmulwev_h_b(x, y), __lasx_xvmulwod_h_b(x, y));
}
#endif
2393
+
2394
+ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
2395
+ assert(n % QK_K == 0);
2396
+ assert(nrc == 1);
2397
+ UNUSED(nrc);
2398
+ UNUSED(bx);
2399
+ UNUSED(by);
2400
+ UNUSED(bs);
2401
+
2402
+ const block_iq1_s * GGML_RESTRICT x = vx;
2403
+ const block_q8_K * GGML_RESTRICT y = vy;
2404
+
2405
+ const int nb = n / QK_K;
2406
+
2407
+ #if defined(__loongarch_asx)
2408
+
2409
+ __m256 accum = (__m256)__lasx_xvldi(0);
2410
+ float accum1 = 0;
2411
+ for (int i = 0; i < nb; ++i) {
2412
+
2413
+ const int8_t * q8 = y[i].qs;
2414
+ const uint8_t * qs = x[i].qs;
2415
+ const uint16_t * qh = x[i].qh;
2416
+
2417
+ __m256i sumi = __lasx_xvldi(0);
2418
+ int sumi1 = 0;
2419
+ for (int ib = 0; ib < QK_K/32; ib += 2) {
2420
+ __m256i q1b_1 = __lasx_xvinsgr2vr_d(q1b_1, iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)], 0);
2421
+ q1b_1 = __lasx_xvinsgr2vr_d(q1b_1, iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], 1);
2422
+ q1b_1 = __lasx_xvinsgr2vr_d(q1b_1, iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)], 2);
2423
+ q1b_1 = __lasx_xvinsgr2vr_d(q1b_1, iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], 3);
2424
+
2425
+ __m256i q1b_2 = __lasx_xvinsgr2vr_d(q1b_2, iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)], 0);
2426
+ q1b_2 = __lasx_xvinsgr2vr_d(q1b_2, iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], 1);
2427
+ q1b_2 = __lasx_xvinsgr2vr_d(q1b_2, iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)], 2);
2428
+ q1b_2 = __lasx_xvinsgr2vr_d(q1b_2, iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], 3);
2429
+
2430
+ qs += 8;
2431
+ const __m256i q8b_1 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
2432
+ const __m256i q8b_2 = __lasx_xvld((const __m256i*)q8, 0); q8 += 32;
2433
+
2434
+ const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
2435
+ const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
2436
+ const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1;
2437
+ const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1;
2438
+
2439
+ __m256i tmp1, tmp5, tmp6;
2440
+ tmp1 = __lasx_xvreplgr2vr_h(ls1);
2441
+ tmp5 = __lasx_xvmulwev_w_h(dot1, tmp1);
2442
+ tmp6 = __lasx_xvmulwod_w_h(dot1, tmp1);
2443
+ const __m256i p1 = __lasx_xvadd_w(tmp5, tmp6);
2444
+
2445
+ tmp1 = __lasx_xvreplgr2vr_h(ls2);
2446
+ tmp5 = __lasx_xvmulwev_w_h(dot2, tmp1);
2447
+ tmp6 = __lasx_xvmulwod_w_h(dot2, tmp1);
2448
+ const __m256i p2 = __lasx_xvadd_w(tmp5, tmp6);
2449
+
2450
+ sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p1, p2));
2451
+ sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1
2452
+ + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2;
2453
+ }
2454
+
2455
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
2456
+ accum = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), accum);
2457
+ accum1 += d * sumi1;
2458
+ }
2459
+
2460
+ *s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
2461
+
2462
+ #else
2463
+
2464
+ float sumf = 0;
2465
+ for (int i = 0; i < nb; i++) {
2466
+
2467
+ const int8_t * q8 = y[i].qs;
2468
+ const uint8_t * qs = x[i].qs;
2469
+ const uint16_t * qh = x[i].qh;
2470
+
2471
+ int sumi = 0, sumi1 = 0;
2472
+ for (int ib = 0; ib < QK_K/32; ++ib) {
2473
+ const int ls = 2*((qh[ib] >> 12) & 7) + 1;
2474
+ const int delta = qh[ib] & 0x8000 ? -1 : 1;
2475
+ int lsum = 0;
2476
+ for (int l = 0; l < 4; ++l) {
2477
+ const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8)));
2478
+ for (int j = 0; j < 8; ++j) {
2479
+ lsum += q8[j] * grid[j];
2480
+ }
2481
+ q8 += 8;
2482
+ }
2483
+ sumi += ls * lsum;
2484
+ sumi1 += ls * delta * (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]);
2485
+ qs += 4;
2486
+ }
2487
+
2488
+ sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
2489
+ }
2490
+
2491
+ *s = sumf;
2492
+
2493
+ #endif
2494
+ }
2495
+
2496
+ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
2497
+ assert(nrc == 1);
2498
+ UNUSED(nrc);
2499
+ UNUSED(bx);
2500
+ UNUSED(by);
2501
+ UNUSED(bs);
2502
+ assert(n % QK4_NL == 0);
2503
+ static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same");
2504
+
2505
+ const block_iq4_nl * GGML_RESTRICT x = vx;
2506
+ const block_q8_0 * GGML_RESTRICT y = vy;
2507
+
2508
+ const int nb = n / QK4_NL;
2509
+
2510
+ int ib = 0;
2511
+ float sumf = 0;
2512
+
2513
+ #if defined (__loongarch_asx)
2514
+
2515
+ const __m128i values128 = __lsx_vld((const __m128i*)kvalues_iq4nl, 0);
2516
+ const __m128i m4b = __lsx_vreplgr2vr_b(0x0f);
2517
+ const __m256i mone = __lasx_xvreplgr2vr_h(1);
2518
+
2519
+ __m256 accum1 = (__m256)__lasx_xvldi(0);
2520
+ __m256 accum2 = (__m256)__lasx_xvldi(0);
2521
+ for (; ib + 1 < nb; ib += 2) {
2522
+ const __m128i q4bits_1 = __lsx_vld((const __m128i*)x[ib + 0].qs, 0);
2523
+ const __m128i q4bits_2 = __lsx_vld((const __m128i*)x[ib + 1].qs, 0);
2524
+ const __m256i q8b_1 = __lasx_xvld((const __m256i *)y[ib + 0].qs, 0);
2525
+ const __m256i q8b_2 = __lasx_xvld((const __m256i *)y[ib + 1].qs, 0);
2526
+ const __m256i q4b_1 = lasx_insertf128(lsx_shuffle_b(values128, __lsx_vand_v(__lsx_vsrli_h(q4bits_1, 4), m4b)),
2527
+ lsx_shuffle_b(values128, __lsx_vand_v(q4bits_1, m4b)));
2528
+ const __m256i q4b_2 = lasx_insertf128(lsx_shuffle_b(values128, __lsx_vand_v(__lsx_vsrli_h(q4bits_2, 4), m4b)),
2529
+ lsx_shuffle_b(values128, __lsx_vand_v(q4bits_2, m4b)));
2530
+ const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
2531
+ const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
2532
+ const __m256i p_1 = lasx_madd_h(p16_1, mone);
2533
+ const __m256i p_2 = lasx_madd_h(p16_2, mone);
2534
+ accum1 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(y[ib + 0].d)*GGML_CPU_FP16_TO_FP32(x[ib + 0].d)),
2535
+ __lasx_xvffint_s_w(p_1), accum1);
2536
+ accum2 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(y[ib + 1].d)*GGML_CPU_FP16_TO_FP32(x[ib + 1].d)),
2537
+ __lasx_xvffint_s_w(p_2), accum2);
2538
+ }
2539
+
2540
+ sumf = hsum_float_8(__lasx_xvfadd_s(accum1, accum2));
2541
+
2542
+ #endif
2543
+ for (; ib < nb; ++ib) {
2544
+ const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
2545
+ int sumi1 = 0, sumi2 = 0;
2546
+ for (int j = 0; j < QK4_NL/2; ++j) {
2547
+ sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
2548
+ sumi2 += y[ib].qs[j+QK4_NL/2] * kvalues_iq4nl[x[ib].qs[j] >> 4];
2549
+ }
2550
+ sumf += d * (sumi1 + sumi2);
2551
+ }
2552
+ *s = sumf;
2553
+ }
2554
+
2555
+ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
2556
+ assert(nrc == 1);
2557
+ UNUSED(nrc);
2558
+ UNUSED(bx);
2559
+ UNUSED(by);
2560
+ UNUSED(bs);
2561
+ assert(n % QK_K == 0);
2562
+
2563
+ const block_iq4_xs * GGML_RESTRICT x = vx;
2564
+ const block_q8_K * GGML_RESTRICT y = vy;
2565
+
2566
+ const int nb = n / QK_K;
2567
+
2568
+ #if defined(__loongarch_asx)
2569
+
2570
+ const __m128i values128 = __lsx_vld((const __m128i*)kvalues_iq4nl, 0);
2571
+
2572
+ __m256 accum = (__m256)__lasx_xvldi(0);
2573
+
2574
+ for (int ibl = 0; ibl < nb; ++ibl) {
2575
+ const uint8_t * qs = x[ibl].qs;
2576
+ const int8_t * q8 = y[ibl].qs;
2577
+ uint16_t sh = x[ibl].scales_h;
2578
+ __m256i sumi1 = __lasx_xvldi(0);
2579
+ __m256i sumi2 = __lasx_xvldi(0);
2580
+ for (int ib = 0; ib < QK_K/32; ib += 2) {
2581
+ const __m128i q4bits_1 = __lsx_vld((const __m128i*)qs, 0); qs += 16;
2582
+ const __m128i q4bits_2 = __lsx_vld((const __m128i*)qs, 0); qs += 16;
2583
+ const __m256i q8b_1 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
2584
+ const __m256i q8b_2 = __lasx_xvld((const __m256i *)q8, 0); q8 += 32;
2585
+ const __m256i q4b_1 = lasx_insertf128(__lsx_vshuf_b(values128, values128, __lsx_vsrli_b(q4bits_1, 4)),
2586
+ __lsx_vshuf_b(values128, values128, __lsx_vandi_b(q4bits_1, 0xf)));
2587
+ const __m256i q4b_2 = lasx_insertf128(__lsx_vshuf_b(values128, values128, __lsx_vsrli_b(q4bits_2, 4)),
2588
+ __lsx_vshuf_b(values128, values128, __lsx_vandi_b(q4bits_2, 0xf)));
2589
+ const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
2590
+ const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
2591
+ const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32;
2592
+ const int16_t ls2 = ((x[ibl].scales_l[ib/2] >> 4) | ((sh << 2) & 0x30)) - 32;
2593
+ sh >>= 4;
2594
+ const __m256i p_1 = lasx_madd_h(p16_1, __lasx_xvreplgr2vr_h(ls1));
2595
+ const __m256i p_2 = lasx_madd_h(p16_2, __lasx_xvreplgr2vr_h(ls2));
2596
+ sumi1 = __lasx_xvadd_w(p_1, sumi1);
2597
+ sumi2 = __lasx_xvadd_w(p_2, sumi2);
2598
+ }
2599
+ accum = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
2600
+ __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accum);
2601
+ }
2602
+
2603
+ *s = hsum_float_8(accum);
2604
+
2605
+ #else
2606
+ float sumf = 0;
2607
+ for (int ibl = 0; ibl < nb; ++ibl) {
2608
+ const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
2609
+ uint16_t h = x[ibl].scales_h;
2610
+ const uint8_t * qs = x[ibl].qs;
2611
+ const int8_t * q8 = y[ibl].qs;
2612
+ for (int ib = 0; ib < QK_K/32; ib += 2) {
2613
+ const uint8_t ls1 = (x[ibl].scales_l[ib/2] & 0xf) | ((h << 4) & 0x30);
2614
+ const uint8_t ls2 = (x[ibl].scales_l[ib/2] >> 4) | ((h << 2) & 0x30);
2615
+ h >>= 4;
2616
+ const float d1 = d4d8*(ls1 - 32);
2617
+ const float d2 = d4d8*(ls2 - 32);
2618
+ int sumi1 = 0, sumi2 = 0;
2619
+ for (int j = 0; j < 16; ++j) {
2620
+ sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
2621
+ sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
2622
+ }
2623
+ sumf += d1 * (sumi1 + sumi2);
2624
+ qs += 16;
2625
+ q8 += 32;
2626
+ sumi1 = sumi2 = 0;
2627
+ for (int j = 0; j < 16; ++j) {
2628
+ sumi1 += q8[j+ 0] * kvalues_iq4nl[qs[j] & 0xf];
2629
+ sumi2 += q8[j+16] * kvalues_iq4nl[qs[j] >> 4];
2630
+ }
2631
+ sumf += d2 * (sumi1 + sumi2);
2632
+ qs += 16;
2633
+ q8 += 32;
2634
+ }
2635
+ }
2636
+ *s = sumf;
2637
+ #endif
2638
+ }
2639
+