cui-llama.rn 1.1.6 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
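The bulk of this release reworks the aarch64 GEMV/GEMM kernels in ggml-quants: the old compile-time LM_GGML_ASSERT hints ("use the Q4_0_x_y quantization format for optimal performance") are replaced by runtime CPU-feature dispatch. Each optimized path is now guarded both by preprocessor checks and by lm_ggml_cpu_has_neon() / lm_ggml_cpu_has_sve(), returns early when taken, and otherwise falls through to the plain-C path; the new sve_lane_count() helper makes the SVE vector-width test an ordinary runtime condition (it returns 0 when built without SVE). Below is a minimal, self-contained sketch of that dispatch shape — gemv, gemv_scalar, and cpu_has_neon are hypothetical stand-ins, not the package's actual kernels:

    /* dispatch_sketch.c — illustrative only; cpu_has_neon() stands in for
       lm_ggml_cpu_has_neon(), and the real NEON kernel is elided. */
    #include <stdio.h>

    static int cpu_has_neon(void) { return 0; }   /* runtime CPU-feature probe (stub) */

    static void gemv_scalar(int n) { printf("scalar path, n=%d\n", n); }

    static void gemv(int n) {
    #if !((defined(_MSC_VER)) && !defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
        if (cpu_has_neon()) {            /* checked at runtime, not only at build time */
            /* optimized NEON kernel would run here */
            return;                      /* early return replaces the old assert-style hints */
        }
    #endif
        gemv_scalar(n);                  /* fallback is plain fall-through; no #else needed */
    }

    int main(void) { gemv(32); return 0; }

The same shape appears in the diff's lm_ggml_gemv_q4_0_8x8_q8_0, where the SVE branch is additionally gated on lm_ggml_cpu_has_sve() && sve_lane_count() == QK8_0.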
@@ -1,9 +1,13 @@
- // SPDX-FileCopyrightText: Copyright 2024 Arm Ltd.
+ // SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
+ // SPDX-License-Identifier: MIT
+ //
+
  #define LM_GGML_COMMON_IMPL_C
  #include "ggml-common.h"

  #include "ggml-quants.h"
  #include "ggml-impl.h"
+ #include "ggml-cpu-impl.h"

  #include <math.h>
  #include <string.h>
@@ -38,11 +42,44 @@
  //
  #if defined(__AVX__)
  #if defined(__F16C__)
+ #if defined(__AVX512F__)
+ #define LM_GGML_F32Cx8x2_LOAD(x, y) _mm512_cvtph_ps(_mm256_set_m128i(_mm_loadu_si128((const __m128i *)(y)), _mm_loadu_si128((const __m128i *)(x))))
+ #define LM_GGML_F32Cx16_REPEAT_LOAD(x) _mm512_cvtph_ps(_mm256_set_m128i(x, x))
+ #endif
  // the _mm256_cvt intrinsics require F16C
  #define LM_GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
  #define LM_GGML_F32Cx8_REPEAT_LOAD(x, loadMask) _mm256_cvtph_ps(_mm_shuffle_epi32(_mm_maskload_epi32((int const*)(x), loadMask), 68))
  #define LM_GGML_F32Cx8_REARRANGE_LOAD(x, arrangeMask) _mm256_cvtph_ps(_mm_shuffle_epi8(_mm_loadu_si128((const __m128i *) x), arrangeMask))
  #else
+ #if defined(__AVX512F__)
+ static inline __m512 __avx512_f32cx8x2_load(lm_ggml_fp16_t *x, lm_ggml_fp16_t *y) {
+ float tmp[16];
+
+ for (int i = 0; i < 8; i++) {
+ tmp[i] = LM_GGML_FP16_TO_FP32(x[i]);
+ }
+
+ for (int i = 0; i < 8; i++) {
+ tmp[i + 8] = LM_GGML_FP16_TO_FP32(y[i]);
+ }
+
+ return _mm512_loadu_ps(tmp);
+ }
+ static inline __m512 __avx512_repeat_f32cx16_load(__m128i x) {
+ float tmp[16];
+ uint16_t tmphalf[8];
+ _mm_storeu_si128((__m128i*)tmphalf, x);
+
+ for (int i = 0; i < 4; i++) {
+ tmp[i] = LM_GGML_FP16_TO_FP32(tmphalf[i]);
+ tmp[i + 4] = LM_GGML_FP16_TO_FP32(tmphalf[i]);
+ tmp[i + 8] = LM_GGML_FP16_TO_FP32(tmphalf[i]);
+ tmp[i + 12] = LM_GGML_FP16_TO_FP32(tmphalf[i]);
+ }
+
+ return _mm512_loadu_ps(tmp);
+ }
+ #endif
  static inline __m256 __avx_f32cx8_load(lm_ggml_fp16_t *x) {
  float tmp[8];

@@ -77,30 +114,65 @@ static inline __m256 __avx_rearranged_f32cx8_load(lm_ggml_fp16_t *x, __m128i arr
  #define LM_GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x)
  #define LM_GGML_F32Cx8_REPEAT_LOAD(x, loadMask) __avx_repeat_f32cx8_load(x)
  #define LM_GGML_F32Cx8_REARRANGE_LOAD(x, arrangeMask) __avx_rearranged_f32cx8_load(x, arrangeMask)
+ #if defined(__AVX512F__)
+ #define LM_GGML_F32Cx8x2_LOAD(x, y) __avx512_f32cx8x2_load(x, y)
+ #define LM_GGML_F32Cx16_REPEAT_LOAD(x) __avx512_repeat_f32cx16_load(x)
+ #endif
  #endif
  #endif


  #if defined(__AVX2__) || defined(__AVX512F__)
- static inline __m256i sum_i16_pairs_int(const __m256i x) {
+ #if defined(__AVX512F__)
+ // add int16_t pairwise and return as 512 bit int vector
+ static inline __m512i sum_i16_pairs_int_32x16(const __m512i x) {
+ const __m512i ones = _mm512_set1_epi16(1);
+ return _mm512_madd_epi16(ones, x);
+ }
+
+ static inline __m512i mul_sum_us8_pairs_int32x16(const __m512i ax, const __m512i sy) {
+ #if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
+ const __m512i zero = _mm512_setzero_si512();
+ return _mm512_dpbusd_epi32(zero, ax, sy);
+ #else
+ // Perform multiplication and create 16-bit values
+ const __m512i dot = _mm512_maddubs_epi16(ax, sy);
+ return sum_i16_pairs_int_32x16(dot);
+ #endif
+ }
+
+ // multiply int8_t, add results pairwise twice and return as 512 bit int vector
+ static inline __m512i mul_sum_i8_pairs_int32x16(const __m512i x, const __m512i y) {
+ const __m512i zero = _mm512_setzero_si512();
+ // Get absolute values of x vectors
+ const __m512i ax = _mm512_abs_epi8(x);
+ // Sign the values of the y vectors
+ __mmask64 blt0 = _mm512_movepi8_mask(x);
+ const __m512i sy = _mm512_mask_sub_epi8(y, blt0, zero, y);
+ return mul_sum_us8_pairs_int32x16(ax, sy);
+ }
+ #endif
+
+ // add int16_t pairwise and return as 256 bit int vector
+ static inline __m256i sum_i16_pairs_int32x8(const __m256i x) {
  const __m256i ones = _mm256_set1_epi16(1);
  return _mm256_madd_epi16(ones, x);
  }

- static inline __m256i mul_sum_us8_pairs_int(const __m256i ax, const __m256i sy) {
+ static inline __m256i mul_sum_us8_pairs_int32x8(const __m256i ax, const __m256i sy) {
  #if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
  const __m256i zero = _mm256_setzero_si256();
  return _mm256_dpbusd_epi32(zero, ax, sy);
  #else
  // Perform multiplication and create 16-bit values
  const __m256i dot = _mm256_maddubs_epi16(ax, sy);
- return sum_i16_pairs_int(dot);
+ return sum_i16_pairs_int32x8(dot);
  #endif
  }

  // Integer variant of the function defined in ggml-quants.c
- // multiply int8_t, add results pairwise twice and return as float vector
- static inline __m256i mul_sum_i8_pairs_int(const __m256i x, const __m256i y) {
+ // multiply int8_t, add results pairwise twice and return as 256 bit int vector
+ static inline __m256i mul_sum_i8_pairs_int32x8(const __m256i x, const __m256i y) {
  #if __AVXVNNIINT8__
  const __m256i zero = _mm256_setzero_si256();
  return _mm256_dpbssd_epi32(zero, x, y);
@@ -109,7 +181,7 @@ static inline __m256i mul_sum_i8_pairs_int(const __m256i x, const __m256i y) {
  const __m256i ax = _mm256_sign_epi8(x, x);
  // Sign the values of the y vectors
  const __m256i sy = _mm256_sign_epi8(y, x);
- return mul_sum_us8_pairs_int(ax, sy);
+ return mul_sum_us8_pairs_int32x8(ax, sy);
  #endif
  }
  #endif
@@ -526,6 +598,15 @@ size_t quantize_q4_0_8x8(const float * restrict src, void * restrict dst, int64_
  return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 8, 8);
  }

+ // Returns the number of byte lanes in the SVE vector if SVE is supported; otherwise, returns 0.
+ static int sve_lane_count(void) {
+ #if defined(__ARM_FEATURE_SVE)
+ return lm_ggml_sve_cnt_b;
+ #else
+ return 0;
+ #endif
+ }
+
  void lm_ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
  const int qk = QK8_0;
  const int nb = n / qk;
@@ -545,73 +626,67 @@ void lm_ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void
  UNUSED(ncols_interleaved);
  UNUSED(blocklen);

- #if defined(__ARM_FEATURE_SVE)
- if (lm_ggml_sve_cnt_b == QK8_0) {
- LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
- "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
+ #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
+ if (lm_ggml_cpu_has_neon()) {
+ const void * b_ptr = vx;
+ const void * a_ptr = vy;
+ float * res_ptr = s;
+
+ __asm__ __volatile__(
+ "movi v31.16b, #0x4\n"
+ "movi v30.16b, #0xf0\n"
+ "add %x[b_ptr], %x[b_ptr], #0x8\n"
+ "1:" // Column loop
+ "add x22, %x[a_ptr], #0x2\n"
+ "movi v29.16b, #0x0\n"
+ "mov x21, %x[nb]\n"
+ "2:" // Block loop
+ "ldr q28, [%x[b_ptr], #0x0]\n"
+ "ldr q27, [x22, #0x0]\n"
+ "movi v26.4s, #0x0\n"
+ "sub x20, x22, #0x2\n"
+ "ldr q25, [x22, #0x10]\n"
+ "ldr q24, [%x[b_ptr], #0x10]\n"
+ "sub x21, x21, #0x1\n"
+ "add x22, x22, #0x22\n"
+ "ldr q23, [%x[b_ptr], #0x20]\n"
+ "ldr q22, [%x[b_ptr], #0x30]\n"
+ "ld1r { v21.8h }, [x20]\n"
+ "ldr q20, [%x[b_ptr], #-0x8]\n"
+ "sshl v16.16b, v28.16b, v31.16b\n"
+ "and v28.16b, v28.16b, v30.16b\n"
+ "sshl v19.16b, v24.16b, v31.16b\n"
+ "and v24.16b, v24.16b, v30.16b\n"
+ "add %x[b_ptr], %x[b_ptr], #0x48\n"
+ "sshl v18.16b, v23.16b, v31.16b\n"
+ "and v23.16b, v23.16b, v30.16b\n"
+ ".inst 0x4f9be21a // sdot v26.4s, v16.16b, v27.4b[0]\n"
+ "sshl v17.16b, v22.16b, v31.16b\n"
+ "and v22.16b, v22.16b, v30.16b\n"
+ "fcvtl v21.4s, v21.4h\n"
+ "fcvtl v16.4s, v20.4h\n"
+ ".inst 0x4f99e39a // sdot v26.4s, v28.16b, v25.4b[0]\n"
+ "fmul v16.4s, v16.4s, v21.4s\n"
+ ".inst 0x4fbbe27a // sdot v26.4s, v19.16b, v27.4b[1]\n"
+ ".inst 0x4fb9e31a // sdot v26.4s, v24.16b, v25.4b[1]\n"
+ ".inst 0x4f9bea5a // sdot v26.4s, v18.16b, v27.4b[2]\n"
+ ".inst 0x4f99eafa // sdot v26.4s, v23.16b, v25.4b[2]\n"
+ ".inst 0x4fbbea3a // sdot v26.4s, v17.16b, v27.4b[3]\n"
+ ".inst 0x4fb9eada // sdot v26.4s, v22.16b, v25.4b[3]\n"
+ "scvtf v26.4s, v26.4s, #0x4\n"
+ "fmla v29.4s, v26.4s, v16.4s\n"
+ "cbnz x21, 2b\n"
+ "sub %x[nc], %x[nc], #0x4\n"
+ "str q29, [%x[res_ptr], #0x0]\n"
+ "add %x[res_ptr], %x[res_ptr], #0x10\n"
+ "cbnz %x[nc], 1b\n"
+ : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc)
+ : [a_ptr] "r" (a_ptr), [nb] "r" (nb)
+ : "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22"
+ );
+ return;
  }
- #endif
- #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
- LM_GGML_ASSERT(!(lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_matmul_int8()) &&
- "__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance");
- #elif defined(__ARM_NEON) && defined(__aarch64__) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
- const void * b_ptr = vx;
- const void * a_ptr = vy;
- float * res_ptr = s;
-
- __asm__ __volatile__(
- "movi v31.16b, #0x4\n"
- "movi v30.16b, #0xf0\n"
- "add %x[b_ptr], %x[b_ptr], #0x8\n"
- "1:" // Column loop
- "add x22, %x[a_ptr], #0x2\n"
- "movi v29.16b, #0x0\n"
- "mov x21, %x[nb]\n"
- "2:" // Block loop
- "ldr q28, [%x[b_ptr], #0x0]\n"
- "ldr q27, [x22, #0x0]\n"
- "movi v26.4s, #0x0\n"
- "sub x20, x22, #0x2\n"
- "ldr q25, [x22, #0x10]\n"
- "ldr q24, [%x[b_ptr], #0x10]\n"
- "sub x21, x21, #0x1\n"
- "add x22, x22, #0x22\n"
- "ldr q23, [%x[b_ptr], #0x20]\n"
- "ldr q22, [%x[b_ptr], #0x30]\n"
- "ld1r { v21.8h }, [x20]\n"
- "ldr q20, [%x[b_ptr], #-0x8]\n"
- "sshl v16.16b, v28.16b, v31.16b\n"
- "and v28.16b, v28.16b, v30.16b\n"
- "sshl v19.16b, v24.16b, v31.16b\n"
- "and v24.16b, v24.16b, v30.16b\n"
- "add %x[b_ptr], %x[b_ptr], #0x48\n"
- "sshl v18.16b, v23.16b, v31.16b\n"
- "and v23.16b, v23.16b, v30.16b\n"
- ".inst 0x4f9be21a // sdot v26.4s, v16.16b, v27.4b[0]\n"
- "sshl v17.16b, v22.16b, v31.16b\n"
- "and v22.16b, v22.16b, v30.16b\n"
- "fcvtl v21.4s, v21.4h\n"
- "fcvtl v16.4s, v20.4h\n"
- ".inst 0x4f99e39a // sdot v26.4s, v28.16b, v25.4b[0]\n"
- "fmul v16.4s, v16.4s, v21.4s\n"
- ".inst 0x4fbbe27a // sdot v26.4s, v19.16b, v27.4b[1]\n"
- ".inst 0x4fb9e31a // sdot v26.4s, v24.16b, v25.4b[1]\n"
- ".inst 0x4f9bea5a // sdot v26.4s, v18.16b, v27.4b[2]\n"
- ".inst 0x4f99eafa // sdot v26.4s, v23.16b, v25.4b[2]\n"
- ".inst 0x4fbbea3a // sdot v26.4s, v17.16b, v27.4b[3]\n"
- ".inst 0x4fb9eada // sdot v26.4s, v22.16b, v25.4b[3]\n"
- "scvtf v26.4s, v26.4s, #0x4\n"
- "fmla v29.4s, v26.4s, v16.4s\n"
- "cbnz x21, 2b\n"
- "sub %x[nc], %x[nc], #0x4\n"
- "str q29, [%x[res_ptr], #0x0]\n"
- "add %x[res_ptr], %x[res_ptr], #0x10\n"
- "cbnz %x[nc], 1b\n"
- : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc)
- : [a_ptr] "r" (a_ptr), [nb] "r" (nb)
- : "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22"
- );
- #else
+ #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
  float sumf[4];
  int sumi;

@@ -635,7 +710,6 @@ void lm_ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void
  }
  for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
  }
- #endif
  }

  void lm_ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
@@ -657,79 +731,72 @@ void lm_ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void
  UNUSED(ncols_interleaved);
  UNUSED(blocklen);

- #if defined(__ARM_FEATURE_SVE)
- if (lm_ggml_sve_cnt_b == QK8_0) {
- LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
- "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
+ #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
+ if (lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_matmul_int8()) {
+ const void * b_ptr = vx;
+ const void * a_ptr = vy;
+ float * res_ptr = s;
+
+ __asm__ __volatile__(
+ "movi v2.16b, #0x4\n"
+ "movi v1.16b, #0xf0\n"
+ "add %x[b_ptr], %x[b_ptr], #0x8\n"
+ "1:" // Column loop
+ "add x23, %x[a_ptr], #0x2\n"
+ "movi v0.16b, #0x0\n"
+ "mov x22, %x[nb]\n"
+ "2:" // Block loop
+ "ldr q31, [%x[b_ptr], #0x0]\n"
+ "ldr q30, [%x[b_ptr], #0x10]\n"
+ "mov x21, x23\n"
+ "movi v29.4s, #0x0\n"
+ "ldr q28, [%x[b_ptr], #0x20]\n"
+ "ldr q27, [%x[b_ptr], #0x30]\n"
+ "movi v26.4s, #0x0\n"
+ "sub x20, x23, #0x2\n"
+ "ld1r { v25.8h }, [x20]\n"
+ "ldr q24, [%x[b_ptr], #-0x8]\n"
+ "sub x22, x22, #0x1\n"
+ "add x23, x23, #0x22\n"
+ "ld1r { v23.2d }, [x21], #0x8\n"
+ "sshl v22.16b, v31.16b, v2.16b\n"
+ "sshl v16.16b, v30.16b, v2.16b\n"
+ "add %x[b_ptr], %x[b_ptr], #0x48\n"
+ "ld1r { v21.2d }, [x21], #0x8\n"
+ "sshl v20.16b, v28.16b, v2.16b\n"
+ "sshl v19.16b, v27.16b, v2.16b\n"
+ "ld1r { v18.2d }, [x21], #0x8\n"
+ "ld1r { v17.2d }, [x21], #0x8\n"
+ "and v31.16b, v31.16b, v1.16b\n"
+ "and v30.16b, v30.16b, v1.16b\n"
+ ".inst 0x4e9796dd // sdot v29.4s, v22.16b, v23.16b\n"
+ ".inst 0x4e97961a // sdot v26.4s, v16.16b, v23.16b\n"
+ "and v28.16b, v28.16b, v1.16b\n"
+ "and v27.16b, v27.16b, v1.16b\n"
+ "fcvtl v25.4s, v25.4h\n"
+ "fcvtl v16.4s, v24.4h\n"
+ ".inst 0x4e95969d // sdot v29.4s, v20.16b, v21.16b\n"
+ ".inst 0x4e95967a // sdot v26.4s, v19.16b, v21.16b\n"
+ "fmul v16.4s, v16.4s, v25.4s\n"
+ ".inst 0x4e9297fd // sdot v29.4s, v31.16b, v18.16b\n"
+ ".inst 0x4e9297da // sdot v26.4s, v30.16b, v18.16b\n"
+ ".inst 0x4e91979d // sdot v29.4s, v28.16b, v17.16b\n"
+ ".inst 0x4e91977a // sdot v26.4s, v27.16b, v17.16b\n"
+ "addp v29.4s, v29.4s, v26.4s\n"
+ "scvtf v29.4s, v29.4s, #0x4\n"
+ "fmla v0.4s, v29.4s, v16.4s\n"
+ "cbnz x22, 2b\n"
+ "sub %x[nc], %x[nc], #0x4\n"
+ "str q0, [%x[res_ptr], #0x0]\n"
+ "add %x[res_ptr], %x[res_ptr], #0x10\n"
+ "cbnz %x[nc], 1b\n"
+ : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc)
+ : [a_ptr] "r" (a_ptr), [nb] "r" (nb)
+ : "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23"
+ );
+ return;
  }
- #endif
- #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
- const void * b_ptr = vx;
- const void * a_ptr = vy;
- float * res_ptr = s;
-
- __asm__ __volatile__(
- "movi v2.16b, #0x4\n"
- "movi v1.16b, #0xf0\n"
- "add %x[b_ptr], %x[b_ptr], #0x8\n"
- "1:" // Column loop
- "add x23, %x[a_ptr], #0x2\n"
- "movi v0.16b, #0x0\n"
- "mov x22, %x[nb]\n"
- "2:" // Block loop
- "ldr q31, [%x[b_ptr], #0x0]\n"
- "ldr q30, [%x[b_ptr], #0x10]\n"
- "mov x21, x23\n"
- "movi v29.4s, #0x0\n"
- "ldr q28, [%x[b_ptr], #0x20]\n"
- "ldr q27, [%x[b_ptr], #0x30]\n"
- "movi v26.4s, #0x0\n"
- "sub x20, x23, #0x2\n"
- "ld1r { v25.8h }, [x20]\n"
- "ldr q24, [%x[b_ptr], #-0x8]\n"
- "sub x22, x22, #0x1\n"
- "add x23, x23, #0x22\n"
- "ld1r { v23.2d }, [x21], #0x8\n"
- "sshl v22.16b, v31.16b, v2.16b\n"
- "sshl v16.16b, v30.16b, v2.16b\n"
- "add %x[b_ptr], %x[b_ptr], #0x48\n"
- "ld1r { v21.2d }, [x21], #0x8\n"
- "sshl v20.16b, v28.16b, v2.16b\n"
- "sshl v19.16b, v27.16b, v2.16b\n"
- "ld1r { v18.2d }, [x21], #0x8\n"
- "ld1r { v17.2d }, [x21], #0x8\n"
- "and v31.16b, v31.16b, v1.16b\n"
- "and v30.16b, v30.16b, v1.16b\n"
- ".inst 0x4e9796dd // sdot v29.4s, v22.16b, v23.16b\n"
- ".inst 0x4e97961a // sdot v26.4s, v16.16b, v23.16b\n"
- "and v28.16b, v28.16b, v1.16b\n"
- "and v27.16b, v27.16b, v1.16b\n"
- "fcvtl v25.4s, v25.4h\n"
- "fcvtl v16.4s, v24.4h\n"
- ".inst 0x4e95969d // sdot v29.4s, v20.16b, v21.16b\n"
- ".inst 0x4e95967a // sdot v26.4s, v19.16b, v21.16b\n"
- "fmul v16.4s, v16.4s, v25.4s\n"
- ".inst 0x4e9297fd // sdot v29.4s, v31.16b, v18.16b\n"
- ".inst 0x4e9297da // sdot v26.4s, v30.16b, v18.16b\n"
- ".inst 0x4e91979d // sdot v29.4s, v28.16b, v17.16b\n"
- ".inst 0x4e91977a // sdot v26.4s, v27.16b, v17.16b\n"
- "addp v29.4s, v29.4s, v26.4s\n"
- "scvtf v29.4s, v29.4s, #0x4\n"
- "fmla v0.4s, v29.4s, v16.4s\n"
- "cbnz x22, 2b\n"
- "sub %x[nc], %x[nc], #0x4\n"
- "str q0, [%x[res_ptr], #0x0]\n"
- "add %x[res_ptr], %x[res_ptr], #0x10\n"
- "cbnz %x[nc], 1b\n"
- : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc)
- : [a_ptr] "r" (a_ptr), [nb] "r" (nb)
- : "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23"
- );
- #elif defined(__ARM_NEON) && defined(__aarch64__)
- LM_GGML_ASSERT((lm_ggml_cpu_has_sve() || lm_ggml_cpu_has_matmul_int8()) &&
- "__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal "
- "performance");
- #else
+ #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
  float sumf[4];
  int sumi;

@@ -753,7 +820,6 @@ void lm_ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void
  }
  for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
  }
- #endif
  }

  void lm_ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
@@ -775,8 +841,9 @@ void lm_ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
  UNUSED(ncols_interleaved);
  UNUSED(blocklen);

- #if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
- if (lm_ggml_sve_cnt_b == QK8_0) {
+ #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
+ #if defined(__ARM_FEATURE_SVE)
+ if (lm_ggml_cpu_has_sve() && sve_lane_count() == QK8_0) {
  const void * b_ptr = vx;
  const void * a_ptr = vy;
  float * res_ptr = s;
@@ -841,24 +908,7 @@ void lm_ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
  );
  return;
  }
- else if (lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_matmul_int8()) {
- LM_GGML_ASSERT((lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
- "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
- "performance");
- }
- else if (lm_ggml_cpu_has_neon()) {
- LM_GGML_ASSERT(((lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) || lm_ggml_cpu_has_matmul_int8()) &&
- "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
- "quantization format for optimal performance");
- }
- #endif
- #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
- LM_GGML_ASSERT(lm_ggml_cpu_has_sve() &&
- "__ARM_FEATURE_SVE not defined, use the Q4_0_4_8 quantization format for optimal performance");
- #elif defined(__ARM_NEON) && defined(__aarch64__)
- LM_GGML_ASSERT((lm_ggml_cpu_has_sve() || lm_ggml_cpu_has_matmul_int8()) &&
- "__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal "
- "performance");
+ #endif // #if defined(__ARM_FEATURE_SVE)
  #elif defined(__AVX2__)
  // Lookup table to convert signed nibbles to signed bytes
  __m256i signextendlut = _mm256_castsi128_si256(_mm_set_epi8(-1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0));
@@ -928,17 +978,17 @@ void lm_ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
  // ...........................................................................
  // B0(28-31) B4(28-31) B1(28-31) B5(28-31) B2(28-31) B6(28-31) B3(28-31) B7(28-31) with A0(28-31)

- iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int(_mm256_blend_epi32(rhs_vec_0123_0 ,_mm256_shuffle_epi32(rhs_vec_4567_0, 177), 170), _mm256_shuffle_epi32(lhs_vec_0, 0)));
- iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_0, 177) ,rhs_vec_4567_0, 170), _mm256_shuffle_epi32(lhs_vec_0, 85)));
+ iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(rhs_vec_0123_0 ,_mm256_shuffle_epi32(rhs_vec_4567_0, 177), 170), _mm256_shuffle_epi32(lhs_vec_0, 0)));
+ iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_0, 177) ,rhs_vec_4567_0, 170), _mm256_shuffle_epi32(lhs_vec_0, 85)));

- iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int(_mm256_blend_epi32(rhs_vec_0123_1 ,_mm256_shuffle_epi32(rhs_vec_4567_1, 177), 170), _mm256_shuffle_epi32(lhs_vec_0, 170)));
- iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_1, 177) ,rhs_vec_4567_1, 170), _mm256_shuffle_epi32(lhs_vec_0, 255)));
+ iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(rhs_vec_0123_1 ,_mm256_shuffle_epi32(rhs_vec_4567_1, 177), 170), _mm256_shuffle_epi32(lhs_vec_0, 170)));
+ iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_1, 177) ,rhs_vec_4567_1, 170), _mm256_shuffle_epi32(lhs_vec_0, 255)));

- iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int(_mm256_blend_epi32(rhs_vec_0123_2 ,_mm256_shuffle_epi32(rhs_vec_4567_2, 177), 170), _mm256_shuffle_epi32(lhs_vec_1, 0)));
- iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_2, 177) ,rhs_vec_4567_2, 170), _mm256_shuffle_epi32(lhs_vec_1, 85)));
+ iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(rhs_vec_0123_2 ,_mm256_shuffle_epi32(rhs_vec_4567_2, 177), 170), _mm256_shuffle_epi32(lhs_vec_1, 0)));
+ iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_2, 177) ,rhs_vec_4567_2, 170), _mm256_shuffle_epi32(lhs_vec_1, 85)));

- iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int(_mm256_blend_epi32(rhs_vec_0123_3 ,_mm256_shuffle_epi32(rhs_vec_4567_3, 177), 170), _mm256_shuffle_epi32(lhs_vec_1, 170)));
- iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_3, 177) ,rhs_vec_4567_3, 170), _mm256_shuffle_epi32(lhs_vec_1, 255)));
+ iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(rhs_vec_0123_3 ,_mm256_shuffle_epi32(rhs_vec_4567_3, 177), 170), _mm256_shuffle_epi32(lhs_vec_1, 170)));
+ iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int32x8(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_3, 177) ,rhs_vec_4567_3, 170), _mm256_shuffle_epi32(lhs_vec_1, 255)));

  // Accumulated values multiplied with appropriate scales
  acc_row = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc), _mm256_mul_ps(col_scale_f32, row_scale_f32), acc_row);
@@ -949,31 +999,33 @@ void lm_ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
  _mm256_storeu_ps(s + (y * nr + x * 8), acc_row);
  }
  }
- #else
- float sumf[8];
- int sumi;
+ return;
+ #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
+ {
+ float sumf[8];
+ int sumi;

- const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
- for (int x = 0; x < nc / ncols_interleaved; x++) {
- const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
+ const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);

- for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
- for (int l = 0; l < nb; l++) {
- for (int k = 0; k < (qk / (2 * blocklen)); k++) {
- for (int j = 0; j < ncols_interleaved; j++) {
- sumi = 0;
- for (int i = 0; i < blocklen; ++i) {
- const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
- const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
- sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
+ for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
+ for (int l = 0; l < nb; l++) {
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+ for (int j = 0; j < ncols_interleaved; j++) {
+ sumi = 0;
+ for (int i = 0; i < blocklen; ++i) {
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
+ sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
+ }
+ sumf[j] += sumi * LM_GGML_FP16_TO_FP32(b_ptr[l].d[j]) * LM_GGML_FP16_TO_FP32(a_ptr[l].d);
  }
- sumf[j] += sumi * LM_GGML_FP16_TO_FP32(b_ptr[l].d[j]) * LM_GGML_FP16_TO_FP32(a_ptr[l].d);
  }
  }
+ for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
  }
- for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
  }
- #endif
  }

  void lm_ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
@@ -996,505 +1048,500 @@ void lm_ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void
  UNUSED(ncols_interleaved);
  UNUSED(blocklen);

- #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
- if (lm_ggml_sve_cnt_b == QK8_0) {
- LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
- "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
- }
- #endif
- #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
- LM_GGML_ASSERT(!(lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_matmul_int8()) &&
- "__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance");
- #elif defined(__ARM_NEON) && defined(__aarch64__) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
- const void * b_ptr = vx;
- const void * a_ptr = vy;
- float * res_ptr = s;
- size_t res_stride = bs * sizeof(float);
-
- __asm__ __volatile__(
- "mov x10, %x[nr]\n"
- "mov x9, #0x88\n"
- "cmp x10, #0x10\n"
- "mul x9, %x[nb], x9\n"
- "blt 4f\n"
- "1:" // Row loop
- "add x28, %x[b_ptr], #0x8\n"
- "mov x27, %x[nc]\n"
- "add x26, %x[res_ptr], %x[res_stride], LSL #4\n"
- "2:" // Column loop
- "add x25, %x[a_ptr], #0x8\n"
- "movi v15.16b, #0x0\n"
- "movi v19.16b, #0x0\n"
- "mov x24, %x[nb]\n"
- "add x23, x25, x9\n"
- "movi v18.16b, #0x0\n"
- "movi v14.16b, #0x0\n"
- "add x22, x23, x9\n"
- "movi v11.16b, #0x0\n"
- "movi v13.16b, #0x0\n"
- "add x21, x22, x9\n"
- "movi v23.16b, #0x0\n"
- "movi v16.16b, #0x0\n"
- "movi v25.16b, #0x0\n"
- "movi v7.16b, #0x0\n"
- "movi v0.16b, #0x0\n"
- "movi v4.16b, #0x0\n"
- "movi v5.16b, #0x0\n"
- "movi v21.16b, #0x0\n"
- "movi v8.16b, #0x0\n"
- "movi v1.16b, #0x0\n"
- "3:" // Block loop
- "ldr q3, [x28, #0x0]\n"
- "ldr q31, [x25, #0x0]\n"
- "movi v28.16b, #0x4\n"
- "movi v10.4s, #0x0\n"
- "ldr q22, [x28, #0x10]\n"
- "ldr q6, [x25, #0x10]\n"
- "movi v29.4s, #0x0\n"
- "movi v9.4s, #0x0\n"
- "ldr q27, [x28, #0x20]\n"
- "ldr q30, [x28, #0x30]\n"
- "movi v20.4s, #0x0\n"
- "movi v24.16b, #0xf0\n"
- "ldr d2, [x25, #-0x8]\n"
- "ldr d26, [x23, #-0x8]\n"
- "sshl v12.16b, v3.16b, v28.16b\n"
- "sub x20, x28, #0x8\n"
- "ldr d17, [x20, #0x0]\n"
- "and v3.16b, v3.16b, v24.16b\n"
- "subs x24, x24, #0x1\n"
- "add x28, x28, #0x48\n"
- ".inst 0x4f9fe18a // sdot v10.4s, v12.16b, v31.4b[0]\n"
- ".inst 0x4fbfe19d // sdot v29.4s, v12.16b, v31.4b[1]\n"
- ".inst 0x4f9fe989 // sdot v9.4s, v12.16b, v31.4b[2]\n"
- ".inst 0x4fbfe994 // sdot v20.4s, v12.16b, v31.4b[3]\n"
- "sshl v31.16b, v22.16b, v28.16b\n"
- "and v22.16b, v22.16b, v24.16b\n"
- "fcvtl v17.4s, v17.4h\n"
- "fcvtl v2.4s, v2.4h\n"
- "fcvtl v26.4s, v26.4h\n"
- ".inst 0x4f86e3ea // sdot v10.4s, v31.16b, v6.4b[0]\n"
- ".inst 0x4fa6e3fd // sdot v29.4s, v31.16b, v6.4b[1]\n"
- ".inst 0x4f86ebe9 // sdot v9.4s, v31.16b, v6.4b[2]\n"
- ".inst 0x4fa6ebf4 // sdot v20.4s, v31.16b, v6.4b[3]\n"
- "sshl v6.16b, v27.16b, v28.16b\n"
- "sshl v28.16b, v30.16b, v28.16b\n"
- "and v27.16b, v27.16b, v24.16b\n"
- "and v30.16b, v30.16b, v24.16b\n"
- "ldr q24, [x25, #0x20]\n"
- ".inst 0x4f98e0ca // sdot v10.4s, v6.16b, v24.4b[0]\n"
- ".inst 0x4fb8e0dd // sdot v29.4s, v6.16b, v24.4b[1]\n"
- ".inst 0x4f98e8c9 // sdot v9.4s, v6.16b, v24.4b[2]\n"
- ".inst 0x4fb8e8d4 // sdot v20.4s, v6.16b, v24.4b[3]\n"
- "ldr q24, [x25, #0x30]\n"
- ".inst 0x4f98e38a // sdot v10.4s, v28.16b, v24.4b[0]\n"
- ".inst 0x4fb8e39d // sdot v29.4s, v28.16b, v24.4b[1]\n"
- ".inst 0x4f98eb89 // sdot v9.4s, v28.16b, v24.4b[2]\n"
- ".inst 0x4fb8eb94 // sdot v20.4s, v28.16b, v24.4b[3]\n"
- "ldr q24, [x25, #0x40]\n"
- ".inst 0x4f98e06a // sdot v10.4s, v3.16b, v24.4b[0]\n"
- ".inst 0x4fb8e07d // sdot v29.4s, v3.16b, v24.4b[1]\n"
- ".inst 0x4f98e869 // sdot v9.4s, v3.16b, v24.4b[2]\n"
- ".inst 0x4fb8e874 // sdot v20.4s, v3.16b, v24.4b[3]\n"
- "ldr q24, [x25, #0x50]\n"
- ".inst 0x4f98e2ca // sdot v10.4s, v22.16b, v24.4b[0]\n"
- ".inst 0x4fb8e2dd // sdot v29.4s, v22.16b, v24.4b[1]\n"
- ".inst 0x4f98eac9 // sdot v9.4s, v22.16b, v24.4b[2]\n"
- ".inst 0x4fb8ead4 // sdot v20.4s, v22.16b, v24.4b[3]\n"
- "ldr q24, [x25, #0x60]\n"
- ".inst 0x4f98e36a // sdot v10.4s, v27.16b, v24.4b[0]\n"
- ".inst 0x4fb8e37d // sdot v29.4s, v27.16b, v24.4b[1]\n"
- ".inst 0x4f98eb69 // sdot v9.4s, v27.16b, v24.4b[2]\n"
- ".inst 0x4fb8eb74 // sdot v20.4s, v27.16b, v24.4b[3]\n"
- "ldr q24, [x25, #0x70]\n"
- "add x25, x25, #0x88\n"
- ".inst 0x4f98e3ca // sdot v10.4s, v30.16b, v24.4b[0]\n"
- ".inst 0x4fb8e3dd // sdot v29.4s, v30.16b, v24.4b[1]\n"
- ".inst 0x4f98ebc9 // sdot v9.4s, v30.16b, v24.4b[2]\n"
- ".inst 0x4fb8ebd4 // sdot v20.4s, v30.16b, v24.4b[3]\n"
- "fmul v24.4s, v17.4s, v2.s[0]\n"
- "scvtf v10.4s, v10.4s, #0x4\n"
- "scvtf v29.4s, v29.4s, #0x4\n"
- "scvtf v9.4s, v9.4s, #0x4\n"
- "scvtf v20.4s, v20.4s, #0x4\n"
- "fmla v15.4s, v10.4s, v24.4s\n"
- "ldr q24, [x23, #0x0]\n"
- "fmul v10.4s, v17.4s, v2.s[1]\n"
- "fmla v19.4s, v29.4s, v10.4s\n"
- "ldr q10, [x23, #0x10]\n"
- "fmul v29.4s, v17.4s, v2.s[2]\n"
- "fmul v2.4s, v17.4s, v2.s[3]\n"
- "fmla v18.4s, v9.4s, v29.4s\n"
- "movi v9.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- ".inst 0x4f98e189 // sdot v9.4s, v12.16b, v24.4b[0]\n"
- ".inst 0x4fb8e19d // sdot v29.4s, v12.16b, v24.4b[1]\n"
- "fmla v14.4s, v20.4s, v2.4s\n"
- "movi v20.4s, #0x0\n"
- "movi v2.4s, #0x0\n"
- ".inst 0x4f98e994 // sdot v20.4s, v12.16b, v24.4b[2]\n"
- ".inst 0x4fb8e982 // sdot v2.4s, v12.16b, v24.4b[3]\n"
- "ldr q24, [x23, #0x20]\n"
- ".inst 0x4f8ae3e9 // sdot v9.4s, v31.16b, v10.4b[0]\n"
- ".inst 0x4faae3fd // sdot v29.4s, v31.16b, v10.4b[1]\n"
- ".inst 0x4f8aebf4 // sdot v20.4s, v31.16b, v10.4b[2]\n"
- ".inst 0x4faaebe2 // sdot v2.4s, v31.16b, v10.4b[3]\n"
- "ldr q10, [x23, #0x30]\n"
- ".inst 0x4f98e0c9 // sdot v9.4s, v6.16b, v24.4b[0]\n"
- ".inst 0x4fb8e0dd // sdot v29.4s, v6.16b, v24.4b[1]\n"
- ".inst 0x4f98e8d4 // sdot v20.4s, v6.16b, v24.4b[2]\n"
- ".inst 0x4fb8e8c2 // sdot v2.4s, v6.16b, v24.4b[3]\n"
- "ldr q24, [x23, #0x40]\n"
- ".inst 0x4f8ae389 // sdot v9.4s, v28.16b, v10.4b[0]\n"
- ".inst 0x4faae39d // sdot v29.4s, v28.16b, v10.4b[1]\n"
- ".inst 0x4f8aeb94 // sdot v20.4s, v28.16b, v10.4b[2]\n"
- ".inst 0x4faaeb82 // sdot v2.4s, v28.16b, v10.4b[3]\n"
- "ldr q10, [x23, #0x50]\n"
- ".inst 0x4f98e069 // sdot v9.4s, v3.16b, v24.4b[0]\n"
- ".inst 0x4fb8e07d // sdot v29.4s, v3.16b, v24.4b[1]\n"
- ".inst 0x4f98e874 // sdot v20.4s, v3.16b, v24.4b[2]\n"
- ".inst 0x4fb8e862 // sdot v2.4s, v3.16b, v24.4b[3]\n"
- "ldr q24, [x23, #0x60]\n"
- ".inst 0x4f8ae2c9 // sdot v9.4s, v22.16b, v10.4b[0]\n"
- ".inst 0x4faae2dd // sdot v29.4s, v22.16b, v10.4b[1]\n"
- ".inst 0x4f8aead4 // sdot v20.4s, v22.16b, v10.4b[2]\n"
- ".inst 0x4faaeac2 // sdot v2.4s, v22.16b, v10.4b[3]\n"
- "ldr q10, [x23, #0x70]\n"
- "add x23, x23, #0x88\n"
- ".inst 0x4f98e369 // sdot v9.4s, v27.16b, v24.4b[0]\n"
- ".inst 0x4fb8e37d // sdot v29.4s, v27.16b, v24.4b[1]\n"
- ".inst 0x4f98eb74 // sdot v20.4s, v27.16b, v24.4b[2]\n"
- ".inst 0x4fb8eb62 // sdot v2.4s, v27.16b, v24.4b[3]\n"
- "ldr q24, [x22, #0x0]\n"
- ".inst 0x4f8ae3c9 // sdot v9.4s, v30.16b, v10.4b[0]\n"
- ".inst 0x4faae3dd // sdot v29.4s, v30.16b, v10.4b[1]\n"
- ".inst 0x4f8aebd4 // sdot v20.4s, v30.16b, v10.4b[2]\n"
- ".inst 0x4faaebc2 // sdot v2.4s, v30.16b, v10.4b[3]\n"
- "fmul v10.4s, v17.4s, v26.s[0]\n"
- "scvtf v9.4s, v9.4s, #0x4\n"
- "scvtf v29.4s, v29.4s, #0x4\n"
- "scvtf v20.4s, v20.4s, #0x4\n"
- "scvtf v2.4s, v2.4s, #0x4\n"
- "fmla v11.4s, v9.4s, v10.4s\n"
- "ldr q9, [x22, #0x10]\n"
- "fmul v10.4s, v17.4s, v26.s[1]\n"
- "fmla v13.4s, v29.4s, v10.4s\n"
- "ldr d29, [x22, #-0x8]\n"
- "fmul v10.4s, v17.4s, v26.s[2]\n"
- "fmul v26.4s, v17.4s, v26.s[3]\n"
- "fcvtl v29.4s, v29.4h\n"
- "fmla v23.4s, v20.4s, v10.4s\n"
- "movi v20.4s, #0x0\n"
- "movi v10.4s, #0x0\n"
- "fmla v16.4s, v2.4s, v26.4s\n"
- "movi v26.4s, #0x0\n"
- "movi v2.4s, #0x0\n"
- ".inst 0x4f98e194 // sdot v20.4s, v12.16b, v24.4b[0]\n"
- ".inst 0x4fb8e18a // sdot v10.4s, v12.16b, v24.4b[1]\n"
- ".inst 0x4f98e99a // sdot v26.4s, v12.16b, v24.4b[2]\n"
- ".inst 0x4fb8e982 // sdot v2.4s, v12.16b, v24.4b[3]\n"
- "ldr q24, [x22, #0x20]\n"
- ".inst 0x4f89e3f4 // sdot v20.4s, v31.16b, v9.4b[0]\n"
- ".inst 0x4fa9e3ea // sdot v10.4s, v31.16b, v9.4b[1]\n"
- ".inst 0x4f89ebfa // sdot v26.4s, v31.16b, v9.4b[2]\n"
- ".inst 0x4fa9ebe2 // sdot v2.4s, v31.16b, v9.4b[3]\n"
- "ldr q9, [x22, #0x30]\n"
- ".inst 0x4f98e0d4 // sdot v20.4s, v6.16b, v24.4b[0]\n"
- ".inst 0x4fb8e0ca // sdot v10.4s, v6.16b, v24.4b[1]\n"
- ".inst 0x4f98e8da // sdot v26.4s, v6.16b, v24.4b[2]\n"
- ".inst 0x4fb8e8c2 // sdot v2.4s, v6.16b, v24.4b[3]\n"
- "ldr q24, [x22, #0x40]\n"
- ".inst 0x4f89e394 // sdot v20.4s, v28.16b, v9.4b[0]\n"
- ".inst 0x4fa9e38a // sdot v10.4s, v28.16b, v9.4b[1]\n"
- ".inst 0x4f89eb9a // sdot v26.4s, v28.16b, v9.4b[2]\n"
- ".inst 0x4fa9eb82 // sdot v2.4s, v28.16b, v9.4b[3]\n"
- "ldr q9, [x22, #0x50]\n"
- ".inst 0x4f98e074 // sdot v20.4s, v3.16b, v24.4b[0]\n"
- ".inst 0x4fb8e06a // sdot v10.4s, v3.16b, v24.4b[1]\n"
- ".inst 0x4f98e87a // sdot v26.4s, v3.16b, v24.4b[2]\n"
- ".inst 0x4fb8e862 // sdot v2.4s, v3.16b, v24.4b[3]\n"
- "ldr q24, [x22, #0x60]\n"
- ".inst 0x4f89e2d4 // sdot v20.4s, v22.16b, v9.4b[0]\n"
- ".inst 0x4fa9e2ca // sdot v10.4s, v22.16b, v9.4b[1]\n"
- ".inst 0x4f89eada // sdot v26.4s, v22.16b, v9.4b[2]\n"
- ".inst 0x4fa9eac2 // sdot v2.4s, v22.16b, v9.4b[3]\n"
- "ldr q9, [x22, #0x70]\n"
- "add x22, x22, #0x88\n"
- ".inst 0x4f98e374 // sdot v20.4s, v27.16b, v24.4b[0]\n"
- ".inst 0x4fb8e36a // sdot v10.4s, v27.16b, v24.4b[1]\n"
- ".inst 0x4f98eb7a // sdot v26.4s, v27.16b, v24.4b[2]\n"
- ".inst 0x4fb8eb62 // sdot v2.4s, v27.16b, v24.4b[3]\n"
- "ldr q24, [x21, #0x0]\n"
- ".inst 0x4f89e3d4 // sdot v20.4s, v30.16b, v9.4b[0]\n"
- ".inst 0x4fa9e3ca // sdot v10.4s, v30.16b, v9.4b[1]\n"
- ".inst 0x4f89ebda // sdot v26.4s, v30.16b, v9.4b[2]\n"
- ".inst 0x4fa9ebc2 // sdot v2.4s, v30.16b, v9.4b[3]\n"
- "fmul v9.4s, v17.4s, v29.s[0]\n"
- "scvtf v20.4s, v20.4s, #0x4\n"
- "scvtf v10.4s, v10.4s, #0x4\n"
- "scvtf v26.4s, v26.4s, #0x4\n"
- "scvtf v2.4s, v2.4s, #0x4\n"
- "fmla v25.4s, v20.4s, v9.4s\n"
- "ldr q9, [x21, #0x10]\n"
- "fmul v20.4s, v17.4s, v29.s[1]\n"
- "fmla v7.4s, v10.4s, v20.4s\n"
- "ldr d20, [x21, #-0x8]\n"
- "fmul v10.4s, v17.4s, v29.s[2]\n"
- "fmul v29.4s, v17.4s, v29.s[3]\n"
- "fcvtl v20.4s, v20.4h\n"
- "fmla v0.4s, v26.4s, v10.4s\n"
- "movi v26.4s, #0x0\n"
- "movi v10.4s, #0x0\n"
- "fmla v4.4s, v2.4s, v29.4s\n"
- "movi v2.4s, #0x0\n"
- "movi v29.4s, #0x0\n"
- ".inst 0x4f98e19a // sdot v26.4s, v12.16b, v24.4b[0]\n"
- ".inst 0x4fb8e18a // sdot v10.4s, v12.16b, v24.4b[1]\n"
- ".inst 0x4f98e982 // sdot v2.4s, v12.16b, v24.4b[2]\n"
- ".inst 0x4fb8e99d // sdot v29.4s, v12.16b, v24.4b[3]\n"
- "ldr q12, [x21, #0x20]\n"
- "fmul v24.4s, v17.4s, v20.s[0]\n"
- ".inst 0x4f89e3fa // sdot v26.4s, v31.16b, v9.4b[0]\n"
- ".inst 0x4fa9e3ea // sdot v10.4s, v31.16b, v9.4b[1]\n"
- ".inst 0x4f89ebe2 // sdot v2.4s, v31.16b, v9.4b[2]\n"
- ".inst 0x4fa9ebfd // sdot v29.4s, v31.16b, v9.4b[3]\n"
- "ldr q9, [x21, #0x30]\n"
- "fmul v31.4s, v17.4s, v20.s[1]\n"
- ".inst 0x4f8ce0da // sdot v26.4s, v6.16b, v12.4b[0]\n"
- ".inst 0x4face0ca // sdot v10.4s, v6.16b, v12.4b[1]\n"
- ".inst 0x4f8ce8c2 // sdot v2.4s, v6.16b, v12.4b[2]\n"
- ".inst 0x4face8dd // sdot v29.4s, v6.16b, v12.4b[3]\n"
- "ldr q12, [x21, #0x40]\n"
- "fmul v6.4s, v17.4s, v20.s[2]\n"
- "fmul v20.4s, v17.4s, v20.s[3]\n"
- ".inst 0x4f89e39a // sdot v26.4s, v28.16b, v9.4b[0]\n"
- ".inst 0x4fa9e38a // sdot v10.4s, v28.16b, v9.4b[1]\n"
- ".inst 0x4f89eb82 // sdot v2.4s, v28.16b, v9.4b[2]\n"
- ".inst 0x4fa9eb9d // sdot v29.4s, v28.16b, v9.4b[3]\n"
- "ldr q9, [x21, #0x50]\n"
- ".inst 0x4f8ce07a // sdot v26.4s, v3.16b, v12.4b[0]\n"
- ".inst 0x4face06a // sdot v10.4s, v3.16b, v12.4b[1]\n"
- ".inst 0x4f8ce862 // sdot v2.4s, v3.16b, v12.4b[2]\n"
- ".inst 0x4face87d // sdot v29.4s, v3.16b, v12.4b[3]\n"
- "ldr q12, [x21, #0x60]\n"
- ".inst 0x4f89e2da // sdot v26.4s, v22.16b, v9.4b[0]\n"
- ".inst 0x4fa9e2ca // sdot v10.4s, v22.16b, v9.4b[1]\n"
- ".inst 0x4f89eac2 // sdot v2.4s, v22.16b, v9.4b[2]\n"
- ".inst 0x4fa9eadd // sdot v29.4s, v22.16b, v9.4b[3]\n"
- "ldr q17, [x21, #0x70]\n"
- "add x21, x21, #0x88\n"
- ".inst 0x4f8ce37a // sdot v26.4s, v27.16b, v12.4b[0]\n"
- ".inst 0x4face36a // sdot v10.4s, v27.16b, v12.4b[1]\n"
- ".inst 0x4f8ceb62 // sdot v2.4s, v27.16b, v12.4b[2]\n"
- ".inst 0x4faceb7d // sdot v29.4s, v27.16b, v12.4b[3]\n"
- ".inst 0x4f91e3da // sdot v26.4s, v30.16b, v17.4b[0]\n"
- ".inst 0x4fb1e3ca // sdot v10.4s, v30.16b, v17.4b[1]\n"
- ".inst 0x4f91ebc2 // sdot v2.4s, v30.16b, v17.4b[2]\n"
- ".inst 0x4fb1ebdd // sdot v29.4s, v30.16b, v17.4b[3]\n"
- "scvtf v26.4s, v26.4s, #0x4\n"
- "scvtf v10.4s, v10.4s, #0x4\n"
- "fmla v5.4s, v26.4s, v24.4s\n"
- "scvtf v2.4s, v2.4s, #0x4\n"
- "scvtf v29.4s, v29.4s, #0x4\n"
- "fmla v21.4s, v10.4s, v31.4s\n"
- "fmla v8.4s, v2.4s, v6.4s\n"
- "fmla v1.4s, v29.4s, v20.4s\n"
- "bgt 3b\n"
- "mov x20, %x[res_ptr]\n"
- "subs x27, x27, #0x4\n"
- "add %x[res_ptr], %x[res_ptr], #0x10\n"
- "str q15, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q19, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q18, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q14, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q11, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q13, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q23, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q16, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q25, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q7, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q0, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q4, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q5, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q21, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q8, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "str q1, [x20, #0x0]\n"
- "bne 2b\n"
- "mov x20, #0x4\n"
- "sub x10, x10, #0x10\n"
- "cmp x10, #0x10\n"
- "mov %x[res_ptr], x26\n"
- "madd %x[a_ptr], x20, x9, %x[a_ptr]\n"
- "bge 1b\n"
- "4:" // Row loop skip
- "cbz x10, 9f\n"
- "5:" // Row tail: Row loop
- "add x24, %x[b_ptr], #0x8\n"
- "mov x23, %x[nc]\n"
- "add x22, %x[res_ptr], %x[res_stride], LSL #2\n"
- "6:" // Row tail: Column loop
- "movi v15.16b, #0x0\n"
- "movi v19.16b, #0x0\n"
- "add x25, %x[a_ptr], #0x8\n"
- "mov x21, %x[nb]\n"
- "movi v18.16b, #0x0\n"
- "movi v14.16b, #0x0\n"
- "7:" // Row tail: Block loop
- "ldr q7, [x24, #0x0]\n"
- "ldr q5, [x25, #0x0]\n"
- "movi v9.16b, #0x4\n"
- "movi v4.4s, #0x0\n"
- "ldr q3, [x24, #0x10]\n"
- "ldr q2, [x25, #0x10]\n"
- "movi v1.4s, #0x0\n"
- "movi v0.4s, #0x0\n"
- "ldr q13, [x24, #0x20]\n"
- "ldr q31, [x25, #0x20]\n"
- "movi v30.4s, #0x0\n"
- "movi v29.16b, #0xf0\n"
- "ldr q28, [x24, #0x30]\n"
- "ldr q27, [x25, #0x30]\n"
- "sshl v20.16b, v7.16b, v9.16b\n"
- "sub x20, x24, #0x8\n"
- "ldr q26, [x25, #0x40]\n"
- "ldr q25, [x25, #0x50]\n"
- "sshl v17.16b, v3.16b, v9.16b\n"
- "and v7.16b, v7.16b, v29.16b\n"
- "ldr q24, [x25, #0x60]\n"
- "ldr q16, [x25, #0x70]\n"
- "sshl v22.16b, v13.16b, v9.16b\n"
- "and v3.16b, v3.16b, v29.16b\n"
- "ldr d21, [x20, #0x0]\n"
- "ldr d12, [x25, #-0x8]\n"
- ".inst 0x4f85e284 // sdot v4.4s, v20.16b, v5.4b[0]\n"
- ".inst 0x4fa5e281 // sdot v1.4s, v20.16b, v5.4b[1]\n"
- ".inst 0x4f85ea80 // sdot v0.4s, v20.16b, v5.4b[2]\n"
- ".inst 0x4fa5ea9e // sdot v30.4s, v20.16b, v5.4b[3]\n"
- "sshl v9.16b, v28.16b, v9.16b\n"
- "subs x21, x21, #0x1\n"
- "and v13.16b, v13.16b, v29.16b\n"
- "and v28.16b, v28.16b, v29.16b\n"
- "add x25, x25, #0x88\n"
- "add x24, x24, #0x48\n"
- "fcvtl v21.4s, v21.4h\n"
- "fcvtl v12.4s, v12.4h\n"
- ".inst 0x4f82e224 // sdot v4.4s, v17.16b, v2.4b[0]\n"
- ".inst 0x4fa2e221 // sdot v1.4s, v17.16b, v2.4b[1]\n"
- ".inst 0x4f82ea20 // sdot v0.4s, v17.16b, v2.4b[2]\n"
- ".inst 0x4fa2ea3e // sdot v30.4s, v17.16b, v2.4b[3]\n"
- "fmul v11.4s, v21.4s, v12.s[0]\n"
- "fmul v23.4s, v21.4s, v12.s[1]\n"
- "fmul v17.4s, v21.4s, v12.s[2]\n"
- ".inst 0x4f9fe2c4 // sdot v4.4s, v22.16b, v31.4b[0]\n"
- "fmul v6.4s, v21.4s, v12.s[3]\n"
- ".inst 0x4fbfe2c1 // sdot v1.4s, v22.16b, v31.4b[1]\n"
- ".inst 0x4f9feac0 // sdot v0.4s, v22.16b, v31.4b[2]\n"
- ".inst 0x4fbfeade // sdot v30.4s, v22.16b, v31.4b[3]\n"
- ".inst 0x4f9be124 // sdot v4.4s, v9.16b, v27.4b[0]\n"
- ".inst 0x4fbbe121 // sdot v1.4s, v9.16b, v27.4b[1]\n"
- ".inst 0x4f9be920 // sdot v0.4s, v9.16b, v27.4b[2]\n"
- ".inst 0x4fbbe93e // sdot v30.4s, v9.16b, v27.4b[3]\n"
- ".inst 0x4f9ae0e4 // sdot v4.4s, v7.16b, v26.4b[0]\n"
- ".inst 0x4fbae0e1 // sdot v1.4s, v7.16b, v26.4b[1]\n"
- ".inst 0x4f9ae8e0 // sdot v0.4s, v7.16b, v26.4b[2]\n"
- ".inst 0x4fbae8fe // sdot v30.4s, v7.16b, v26.4b[3]\n"
- ".inst 0x4f99e064 // sdot v4.4s, v3.16b, v25.4b[0]\n"
- ".inst 0x4fb9e061 // sdot v1.4s, v3.16b, v25.4b[1]\n"
- ".inst 0x4f99e860 // sdot v0.4s, v3.16b, v25.4b[2]\n"
- ".inst 0x4fb9e87e // sdot v30.4s, v3.16b, v25.4b[3]\n"
- ".inst 0x4f98e1a4 // sdot v4.4s, v13.16b, v24.4b[0]\n"
- ".inst 0x4fb8e1a1 // sdot v1.4s, v13.16b, v24.4b[1]\n"
- ".inst 0x4f98e9a0 // sdot v0.4s, v13.16b, v24.4b[2]\n"
- ".inst 0x4fb8e9be // sdot v30.4s, v13.16b, v24.4b[3]\n"
- ".inst 0x4f90e384 // sdot v4.4s, v28.16b, v16.4b[0]\n"
- ".inst 0x4fb0e381 // sdot v1.4s, v28.16b, v16.4b[1]\n"
- ".inst 0x4f90eb80 // sdot v0.4s, v28.16b, v16.4b[2]\n"
- ".inst 0x4fb0eb9e // sdot v30.4s, v28.16b, v16.4b[3]\n"
- "scvtf v4.4s, v4.4s, #0x4\n"
- "scvtf v1.4s, v1.4s, #0x4\n"
- "scvtf v0.4s, v0.4s, #0x4\n"
- "fmla v15.4s, v4.4s, v11.4s\n"
- "scvtf v30.4s, v30.4s, #0x4\n"
- "fmla v19.4s, v1.4s, v23.4s\n"
- "fmla v18.4s, v0.4s, v17.4s\n"
- "fmla v14.4s, v30.4s, v6.4s\n"
- "bgt 7b\n"
- "mov x20, %x[res_ptr]\n"
- "cmp x10, #0x1\n"
- "str q15, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "ble 8f\n"
- "cmp x10, #0x2\n"
- "str q19, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "ble 8f\n"
- "cmp x10, #0x3\n"
- "str q18, [x20, #0x0]\n"
- "add x20, x20, %x[res_stride]\n"
- "ble 8f\n"
- "str q14, [x20, #0x0]\n"
- "8:" // Row tail: Accumulator store skip
- "subs x23, x23, #0x4\n"
- "add %x[res_ptr], %x[res_ptr], #0x10\n"
- "bne 6b\n"
- "subs x10, x10, #0x4\n"
- "add %x[a_ptr], %x[a_ptr], x9\n"
- "mov %x[res_ptr], x22\n"
- "bgt 5b\n"
- "9:" // Row tail: Row loop skip
- : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr)
- : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
- );
- #else
- float sumf[4][4];
- int sumi;
+ #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
+ if (lm_ggml_cpu_has_neon()) {
+ const void * b_ptr = vx;
+ const void * a_ptr = vy;
+ float * res_ptr = s;
+ size_t res_stride = bs * sizeof(float);

- for (int y = 0; y < nr / 4; y++) {
- const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
- for (int x = 0; x < nc / ncols_interleaved; x++) {
- const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
- for (int m = 0; m < 4; m++) {
- for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
- }
- for (int l = 0; l < nb; l++) {
- for (int k = 0; k < (qk / (2 * blocklen)); k++) {
- for (int m = 0; m < 4; m++) {
- for (int j = 0; j < ncols_interleaved; j++) {
- sumi = 0;
- for (int i = 0; i < blocklen; ++i) {
- const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
- const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
- sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
- (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
+ __asm__ __volatile__(
+ "mov x10, %x[nr]\n"
+ "mov x9, #0x88\n"
+ "cmp x10, #0x10\n"
+ "mul x9, %x[nb], x9\n"
+ "blt 4f\n"
+ "1:" // Row loop
+ "add x28, %x[b_ptr], #0x8\n"
+ "mov x27, %x[nc]\n"
+ "add x26, %x[res_ptr], %x[res_stride], LSL #4\n"
+ "2:" // Column loop
+ "add x25, %x[a_ptr], #0x8\n"
+ "movi v15.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "mov x24, %x[nb]\n"
+ "add x23, x25, x9\n"
+ "movi v18.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "add x22, x23, x9\n"
+ "movi v11.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "add x21, x22, x9\n"
+ "movi v23.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v7.16b, #0x0\n"
+ "movi v0.16b, #0x0\n"
+ "movi v4.16b, #0x0\n"
+ "movi v5.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v8.16b, #0x0\n"
+ "movi v1.16b, #0x0\n"
+ "3:" // Block loop
+ "ldr q3, [x28, #0x0]\n"
+ "ldr q31, [x25, #0x0]\n"
+ "movi v28.16b, #0x4\n"
+ "movi v10.4s, #0x0\n"
+ "ldr q22, [x28, #0x10]\n"
+ "ldr q6, [x25, #0x10]\n"
+ "movi v29.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "ldr q27, [x28, #0x20]\n"
+ "ldr q30, [x28, #0x30]\n"
+ "movi v20.4s, #0x0\n"
+ "movi v24.16b, #0xf0\n"
+ "ldr d2, [x25, #-0x8]\n"
+ "ldr d26, [x23, #-0x8]\n"
+ "sshl v12.16b, v3.16b, v28.16b\n"
+ "sub x20, x28, #0x8\n"
+ "ldr d17, [x20, #0x0]\n"
+ "and v3.16b, v3.16b, v24.16b\n"
+ "subs x24, x24, #0x1\n"
+ "add x28, x28, #0x48\n"
+ ".inst 0x4f9fe18a // sdot v10.4s, v12.16b, v31.4b[0]\n"
+ ".inst 0x4fbfe19d // sdot v29.4s, v12.16b, v31.4b[1]\n"
+ ".inst 0x4f9fe989 // sdot v9.4s, v12.16b, v31.4b[2]\n"
+ ".inst 0x4fbfe994 // sdot v20.4s, v12.16b, v31.4b[3]\n"
+ "sshl v31.16b, v22.16b, v28.16b\n"
+ "and v22.16b, v22.16b, v24.16b\n"
+ "fcvtl v17.4s, v17.4h\n"
+ "fcvtl v2.4s, v2.4h\n"
+ "fcvtl v26.4s, v26.4h\n"
+ ".inst 0x4f86e3ea // sdot v10.4s, v31.16b, v6.4b[0]\n"
+ ".inst 0x4fa6e3fd // sdot v29.4s, v31.16b, v6.4b[1]\n"
+ ".inst 0x4f86ebe9 // sdot v9.4s, v31.16b, v6.4b[2]\n"
+ ".inst 0x4fa6ebf4 // sdot v20.4s, v31.16b, v6.4b[3]\n"
+ "sshl v6.16b, v27.16b, v28.16b\n"
+ "sshl v28.16b, v30.16b, v28.16b\n"
+ "and v27.16b, v27.16b, v24.16b\n"
+ "and v30.16b, v30.16b, v24.16b\n"
+ "ldr q24, [x25, #0x20]\n"
+ ".inst 0x4f98e0ca // sdot v10.4s, v6.16b, v24.4b[0]\n"
+ ".inst 0x4fb8e0dd // sdot v29.4s, v6.16b, v24.4b[1]\n"
+ ".inst 0x4f98e8c9 // sdot v9.4s, v6.16b, v24.4b[2]\n"
+ ".inst 0x4fb8e8d4 // sdot v20.4s, v6.16b, v24.4b[3]\n"
+ "ldr q24, [x25, #0x30]\n"
+ ".inst 0x4f98e38a // sdot v10.4s, v28.16b, v24.4b[0]\n"
+ ".inst 0x4fb8e39d // sdot v29.4s, v28.16b, v24.4b[1]\n"
+ ".inst 0x4f98eb89 // sdot v9.4s, v28.16b, v24.4b[2]\n"
+ ".inst 0x4fb8eb94 // sdot v20.4s, v28.16b, v24.4b[3]\n"
+ "ldr q24, [x25, #0x40]\n"
+ ".inst 0x4f98e06a // sdot v10.4s, v3.16b, v24.4b[0]\n"
+ ".inst 0x4fb8e07d // sdot v29.4s, v3.16b, v24.4b[1]\n"
+ ".inst 0x4f98e869 // sdot v9.4s, v3.16b, v24.4b[2]\n"
+ ".inst 0x4fb8e874 // sdot v20.4s, v3.16b, v24.4b[3]\n"
+ "ldr q24, [x25, #0x50]\n"
+ ".inst 0x4f98e2ca // sdot v10.4s, v22.16b, v24.4b[0]\n"
+ ".inst 0x4fb8e2dd // sdot v29.4s, v22.16b, v24.4b[1]\n"
+ ".inst 0x4f98eac9 // sdot v9.4s, v22.16b, v24.4b[2]\n"
+ ".inst 0x4fb8ead4 // sdot v20.4s, v22.16b, v24.4b[3]\n"
+ "ldr q24, [x25, #0x60]\n"
+ ".inst 0x4f98e36a // sdot v10.4s, v27.16b, v24.4b[0]\n"
+ ".inst 0x4fb8e37d // sdot v29.4s, v27.16b, v24.4b[1]\n"
+ ".inst 0x4f98eb69 // sdot v9.4s, v27.16b, v24.4b[2]\n"
+ ".inst 0x4fb8eb74 // sdot v20.4s, v27.16b, v24.4b[3]\n"
+ "ldr q24, [x25, #0x70]\n"
+ "add x25, x25, #0x88\n"
+ ".inst 0x4f98e3ca // sdot v10.4s, v30.16b, v24.4b[0]\n"
+ ".inst 0x4fb8e3dd // sdot v29.4s, v30.16b, v24.4b[1]\n"
+ ".inst 0x4f98ebc9 // sdot v9.4s, v30.16b, v24.4b[2]\n"
+ ".inst 0x4fb8ebd4 // sdot v20.4s, v30.16b, v24.4b[3]\n"
+ "fmul v24.4s, v17.4s, v2.s[0]\n"
+ "scvtf v10.4s, v10.4s, #0x4\n"
+ "scvtf v29.4s, v29.4s, #0x4\n"
+ "scvtf v9.4s, v9.4s, #0x4\n"
+ "scvtf v20.4s, v20.4s, #0x4\n"
+ "fmla v15.4s, v10.4s, v24.4s\n"
+ "ldr q24, [x23, #0x0]\n"
+ "fmul v10.4s, v17.4s, v2.s[1]\n"
+ "fmla v19.4s, v29.4s, v10.4s\n"
+ "ldr q10, [x23, #0x10]\n"
+ "fmul v29.4s, v17.4s, v2.s[2]\n"
+ "fmul v2.4s, v17.4s, v2.s[3]\n"
+ "fmla v18.4s, v9.4s, v29.4s\n"
+ "movi v9.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ ".inst 0x4f98e189 // sdot v9.4s, v12.16b, v24.4b[0]\n"
+ ".inst 0x4fb8e19d // sdot v29.4s, v12.16b, v24.4b[1]\n"
+ "fmla v14.4s, v20.4s, v2.4s\n"
+ "movi v20.4s, #0x0\n"
+ "movi v2.4s, #0x0\n"
+ ".inst 0x4f98e994 // sdot v20.4s, v12.16b, v24.4b[2]\n"
+ ".inst 0x4fb8e982 // sdot v2.4s, v12.16b, v24.4b[3]\n"
+ "ldr q24, [x23, #0x20]\n"
+ ".inst 0x4f8ae3e9 // sdot v9.4s, v31.16b, v10.4b[0]\n"
+ ".inst 0x4faae3fd // sdot v29.4s, v31.16b, v10.4b[1]\n"
+ ".inst 0x4f8aebf4 // sdot v20.4s, v31.16b, v10.4b[2]\n"
+ ".inst 0x4faaebe2 // sdot v2.4s, v31.16b, v10.4b[3]\n"
+ "ldr q10, [x23, #0x30]\n"
+ ".inst 0x4f98e0c9 // sdot v9.4s, v6.16b, v24.4b[0]\n"
+ ".inst 0x4fb8e0dd // sdot v29.4s, v6.16b, v24.4b[1]\n"
+ ".inst 0x4f98e8d4 // sdot v20.4s, v6.16b, v24.4b[2]\n"
+ ".inst 0x4fb8e8c2 // sdot v2.4s, v6.16b, v24.4b[3]\n"
+ "ldr q24, [x23, #0x40]\n"
+ ".inst 0x4f8ae389 // sdot v9.4s, v28.16b, v10.4b[0]\n"
+ ".inst 0x4faae39d // sdot v29.4s, v28.16b, v10.4b[1]\n"
+ ".inst 0x4f8aeb94 // sdot v20.4s, v28.16b, v10.4b[2]\n"
+ ".inst 0x4faaeb82 // sdot v2.4s, v28.16b, v10.4b[3]\n"
+ "ldr q10, [x23, #0x50]\n"
+ ".inst 0x4f98e069 // sdot v9.4s, v3.16b, v24.4b[0]\n"
+ ".inst 0x4fb8e07d // sdot v29.4s, v3.16b, v24.4b[1]\n"
+ ".inst 0x4f98e874 // sdot v20.4s, v3.16b, v24.4b[2]\n"
+ ".inst 0x4fb8e862 // sdot v2.4s, v3.16b, v24.4b[3]\n"
+ "ldr q24, [x23, #0x60]\n"
+ ".inst 0x4f8ae2c9 // sdot v9.4s, v22.16b, v10.4b[0]\n"
+ ".inst 0x4faae2dd // sdot v29.4s, v22.16b, v10.4b[1]\n"
+ ".inst 0x4f8aead4 // sdot v20.4s, v22.16b, v10.4b[2]\n"
+ ".inst 0x4faaeac2 // sdot v2.4s, v22.16b, v10.4b[3]\n"
+ "ldr q10, [x23, #0x70]\n"
+ "add x23, x23, #0x88\n"
+ ".inst 0x4f98e369 // sdot v9.4s, v27.16b, v24.4b[0]\n"
+ ".inst 0x4fb8e37d // sdot v29.4s, v27.16b, v24.4b[1]\n"
+ ".inst 0x4f98eb74 // sdot v20.4s, v27.16b, v24.4b[2]\n"
+ ".inst 0x4fb8eb62 // sdot v2.4s, v27.16b, v24.4b[3]\n"
+ "ldr q24, [x22, #0x0]\n"
+ ".inst 0x4f8ae3c9 // sdot v9.4s, v30.16b, v10.4b[0]\n"
+ ".inst 0x4faae3dd // sdot v29.4s, v30.16b, v10.4b[1]\n"
+ ".inst 0x4f8aebd4 // sdot v20.4s, v30.16b, v10.4b[2]\n"
+ ".inst 0x4faaebc2 // sdot v2.4s, v30.16b, v10.4b[3]\n"
+ "fmul v10.4s, v17.4s, v26.s[0]\n"
+ "scvtf v9.4s, v9.4s, #0x4\n"
+ "scvtf v29.4s, v29.4s, #0x4\n"
+ "scvtf v20.4s, v20.4s, #0x4\n"
+ "scvtf v2.4s, v2.4s, #0x4\n"
+ "fmla v11.4s, v9.4s, v10.4s\n"
+ "ldr q9, [x22, #0x10]\n"
+ "fmul v10.4s, v17.4s, v26.s[1]\n"
+ "fmla v13.4s, v29.4s, v10.4s\n"
+ "ldr d29, [x22, #-0x8]\n"
+ "fmul v10.4s, v17.4s, v26.s[2]\n"
+ "fmul v26.4s, v17.4s, v26.s[3]\n"
+ "fcvtl v29.4s, v29.4h\n"
+ "fmla v23.4s, v20.4s, v10.4s\n"
+ "movi v20.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "fmla v16.4s, v2.4s, v26.4s\n"
+ "movi v26.4s, #0x0\n"
+ "movi v2.4s, #0x0\n"
+ ".inst 0x4f98e194 // sdot v20.4s, v12.16b, v24.4b[0]\n"
+ ".inst 0x4fb8e18a // sdot v10.4s, v12.16b, v24.4b[1]\n"
+ ".inst 0x4f98e99a // sdot v26.4s, v12.16b, v24.4b[2]\n"
+ ".inst 0x4fb8e982 // sdot v2.4s, v12.16b, v24.4b[3]\n"
+ "ldr q24, [x22, #0x20]\n"
+ ".inst 0x4f89e3f4 // sdot v20.4s, v31.16b, v9.4b[0]\n"
+ ".inst 0x4fa9e3ea // sdot v10.4s, v31.16b, v9.4b[1]\n"
+ ".inst 0x4f89ebfa // sdot v26.4s, v31.16b, v9.4b[2]\n"
+ ".inst 0x4fa9ebe2 // sdot v2.4s, v31.16b, v9.4b[3]\n"
+ "ldr q9, [x22, #0x30]\n"
+ ".inst 0x4f98e0d4 // sdot v20.4s, v6.16b, v24.4b[0]\n"
+ ".inst 0x4fb8e0ca // sdot v10.4s, v6.16b, v24.4b[1]\n"
+ ".inst 0x4f98e8da // sdot v26.4s, v6.16b, v24.4b[2]\n"
+ ".inst 0x4fb8e8c2 // sdot v2.4s, v6.16b, v24.4b[3]\n"
+ "ldr q24, [x22, #0x40]\n"
+ ".inst 0x4f89e394 // sdot v20.4s, v28.16b, v9.4b[0]\n"
+ ".inst 0x4fa9e38a // sdot v10.4s, v28.16b, v9.4b[1]\n"
+ ".inst 0x4f89eb9a // sdot v26.4s, v28.16b, v9.4b[2]\n"
+ ".inst 0x4fa9eb82 // sdot v2.4s, v28.16b, v9.4b[3]\n"
+ "ldr q9, [x22, #0x50]\n"
+ ".inst 0x4f98e074 // sdot v20.4s, v3.16b, v24.4b[0]\n"
+ ".inst 0x4fb8e06a // sdot v10.4s, v3.16b, v24.4b[1]\n"
+ ".inst 0x4f98e87a // sdot v26.4s, v3.16b, v24.4b[2]\n"
+ ".inst 0x4fb8e862 // sdot v2.4s, v3.16b, v24.4b[3]\n"
+ "ldr q24, [x22, #0x60]\n"
+ ".inst 0x4f89e2d4 // sdot v20.4s, v22.16b, v9.4b[0]\n"
+ ".inst 0x4fa9e2ca // sdot v10.4s, v22.16b, v9.4b[1]\n"
+ ".inst 0x4f89eada // sdot v26.4s, v22.16b, v9.4b[2]\n"
+ ".inst 0x4fa9eac2 // sdot v2.4s, v22.16b, v9.4b[3]\n"
+ "ldr q9, [x22, #0x70]\n"
+ "add x22, x22, #0x88\n"
+ ".inst 0x4f98e374 // sdot v20.4s, v27.16b, v24.4b[0]\n"
+ ".inst 0x4fb8e36a // sdot v10.4s, v27.16b, v24.4b[1]\n"
+ ".inst 0x4f98eb7a // sdot v26.4s, v27.16b, v24.4b[2]\n"
+ ".inst 0x4fb8eb62 // sdot v2.4s, v27.16b, v24.4b[3]\n"
+ "ldr q24, [x21, #0x0]\n"
+ ".inst 0x4f89e3d4 // sdot v20.4s, v30.16b, v9.4b[0]\n"
+ ".inst 0x4fa9e3ca // sdot v10.4s, v30.16b, v9.4b[1]\n"
+ ".inst 0x4f89ebda // sdot v26.4s, v30.16b, v9.4b[2]\n"
+ ".inst 0x4fa9ebc2 // sdot v2.4s, v30.16b, v9.4b[3]\n"
+ "fmul v9.4s, v17.4s, v29.s[0]\n"
+ "scvtf v20.4s, v20.4s, #0x4\n"
+ "scvtf v10.4s, v10.4s, #0x4\n"
+ "scvtf v26.4s, v26.4s, #0x4\n"
+ "scvtf v2.4s, v2.4s, #0x4\n"
+ "fmla v25.4s, v20.4s, v9.4s\n"
1282
+ "ldr q9, [x21, #0x10]\n"
1283
+ "fmul v20.4s, v17.4s, v29.s[1]\n"
1284
+ "fmla v7.4s, v10.4s, v20.4s\n"
1285
+ "ldr d20, [x21, #-0x8]\n"
1286
+ "fmul v10.4s, v17.4s, v29.s[2]\n"
1287
+ "fmul v29.4s, v17.4s, v29.s[3]\n"
1288
+ "fcvtl v20.4s, v20.4h\n"
1289
+ "fmla v0.4s, v26.4s, v10.4s\n"
1290
+ "movi v26.4s, #0x0\n"
1291
+ "movi v10.4s, #0x0\n"
1292
+ "fmla v4.4s, v2.4s, v29.4s\n"
1293
+ "movi v2.4s, #0x0\n"
1294
+ "movi v29.4s, #0x0\n"
1295
+ ".inst 0x4f98e19a // sdot v26.4s, v12.16b, v24.4b[0]\n"
1296
+ ".inst 0x4fb8e18a // sdot v10.4s, v12.16b, v24.4b[1]\n"
1297
+ ".inst 0x4f98e982 // sdot v2.4s, v12.16b, v24.4b[2]\n"
1298
+ ".inst 0x4fb8e99d // sdot v29.4s, v12.16b, v24.4b[3]\n"
1299
+ "ldr q12, [x21, #0x20]\n"
1300
+ "fmul v24.4s, v17.4s, v20.s[0]\n"
1301
+ ".inst 0x4f89e3fa // sdot v26.4s, v31.16b, v9.4b[0]\n"
1302
+ ".inst 0x4fa9e3ea // sdot v10.4s, v31.16b, v9.4b[1]\n"
1303
+ ".inst 0x4f89ebe2 // sdot v2.4s, v31.16b, v9.4b[2]\n"
1304
+ ".inst 0x4fa9ebfd // sdot v29.4s, v31.16b, v9.4b[3]\n"
1305
+ "ldr q9, [x21, #0x30]\n"
1306
+ "fmul v31.4s, v17.4s, v20.s[1]\n"
1307
+ ".inst 0x4f8ce0da // sdot v26.4s, v6.16b, v12.4b[0]\n"
1308
+ ".inst 0x4face0ca // sdot v10.4s, v6.16b, v12.4b[1]\n"
1309
+ ".inst 0x4f8ce8c2 // sdot v2.4s, v6.16b, v12.4b[2]\n"
1310
+ ".inst 0x4face8dd // sdot v29.4s, v6.16b, v12.4b[3]\n"
1311
+ "ldr q12, [x21, #0x40]\n"
1312
+ "fmul v6.4s, v17.4s, v20.s[2]\n"
1313
+ "fmul v20.4s, v17.4s, v20.s[3]\n"
1314
+ ".inst 0x4f89e39a // sdot v26.4s, v28.16b, v9.4b[0]\n"
1315
+ ".inst 0x4fa9e38a // sdot v10.4s, v28.16b, v9.4b[1]\n"
1316
+ ".inst 0x4f89eb82 // sdot v2.4s, v28.16b, v9.4b[2]\n"
1317
+ ".inst 0x4fa9eb9d // sdot v29.4s, v28.16b, v9.4b[3]\n"
1318
+ "ldr q9, [x21, #0x50]\n"
1319
+ ".inst 0x4f8ce07a // sdot v26.4s, v3.16b, v12.4b[0]\n"
1320
+ ".inst 0x4face06a // sdot v10.4s, v3.16b, v12.4b[1]\n"
1321
+ ".inst 0x4f8ce862 // sdot v2.4s, v3.16b, v12.4b[2]\n"
1322
+ ".inst 0x4face87d // sdot v29.4s, v3.16b, v12.4b[3]\n"
1323
+ "ldr q12, [x21, #0x60]\n"
1324
+ ".inst 0x4f89e2da // sdot v26.4s, v22.16b, v9.4b[0]\n"
1325
+ ".inst 0x4fa9e2ca // sdot v10.4s, v22.16b, v9.4b[1]\n"
1326
+ ".inst 0x4f89eac2 // sdot v2.4s, v22.16b, v9.4b[2]\n"
1327
+ ".inst 0x4fa9eadd // sdot v29.4s, v22.16b, v9.4b[3]\n"
1328
+ "ldr q17, [x21, #0x70]\n"
1329
+ "add x21, x21, #0x88\n"
1330
+ ".inst 0x4f8ce37a // sdot v26.4s, v27.16b, v12.4b[0]\n"
1331
+ ".inst 0x4face36a // sdot v10.4s, v27.16b, v12.4b[1]\n"
1332
+ ".inst 0x4f8ceb62 // sdot v2.4s, v27.16b, v12.4b[2]\n"
1333
+ ".inst 0x4faceb7d // sdot v29.4s, v27.16b, v12.4b[3]\n"
1334
+ ".inst 0x4f91e3da // sdot v26.4s, v30.16b, v17.4b[0]\n"
1335
+ ".inst 0x4fb1e3ca // sdot v10.4s, v30.16b, v17.4b[1]\n"
1336
+ ".inst 0x4f91ebc2 // sdot v2.4s, v30.16b, v17.4b[2]\n"
1337
+ ".inst 0x4fb1ebdd // sdot v29.4s, v30.16b, v17.4b[3]\n"
1338
+ "scvtf v26.4s, v26.4s, #0x4\n"
1339
+ "scvtf v10.4s, v10.4s, #0x4\n"
1340
+ "fmla v5.4s, v26.4s, v24.4s\n"
1341
+ "scvtf v2.4s, v2.4s, #0x4\n"
1342
+ "scvtf v29.4s, v29.4s, #0x4\n"
1343
+ "fmla v21.4s, v10.4s, v31.4s\n"
1344
+ "fmla v8.4s, v2.4s, v6.4s\n"
1345
+ "fmla v1.4s, v29.4s, v20.4s\n"
1346
+ "bgt 3b\n"
1347
+ "mov x20, %x[res_ptr]\n"
1348
+ "subs x27, x27, #0x4\n"
1349
+ "add %x[res_ptr], %x[res_ptr], #0x10\n"
1350
+ "str q15, [x20, #0x0]\n"
1351
+ "add x20, x20, %x[res_stride]\n"
1352
+ "str q19, [x20, #0x0]\n"
1353
+ "add x20, x20, %x[res_stride]\n"
1354
+ "str q18, [x20, #0x0]\n"
1355
+ "add x20, x20, %x[res_stride]\n"
1356
+ "str q14, [x20, #0x0]\n"
1357
+ "add x20, x20, %x[res_stride]\n"
1358
+ "str q11, [x20, #0x0]\n"
1359
+ "add x20, x20, %x[res_stride]\n"
1360
+ "str q13, [x20, #0x0]\n"
1361
+ "add x20, x20, %x[res_stride]\n"
1362
+ "str q23, [x20, #0x0]\n"
1363
+ "add x20, x20, %x[res_stride]\n"
1364
+ "str q16, [x20, #0x0]\n"
1365
+ "add x20, x20, %x[res_stride]\n"
1366
+ "str q25, [x20, #0x0]\n"
1367
+ "add x20, x20, %x[res_stride]\n"
1368
+ "str q7, [x20, #0x0]\n"
1369
+ "add x20, x20, %x[res_stride]\n"
1370
+ "str q0, [x20, #0x0]\n"
1371
+ "add x20, x20, %x[res_stride]\n"
1372
+ "str q4, [x20, #0x0]\n"
1373
+ "add x20, x20, %x[res_stride]\n"
1374
+ "str q5, [x20, #0x0]\n"
1375
+ "add x20, x20, %x[res_stride]\n"
1376
+ "str q21, [x20, #0x0]\n"
1377
+ "add x20, x20, %x[res_stride]\n"
1378
+ "str q8, [x20, #0x0]\n"
1379
+ "add x20, x20, %x[res_stride]\n"
1380
+ "str q1, [x20, #0x0]\n"
1381
+ "bne 2b\n"
1382
+ "mov x20, #0x4\n"
1383
+ "sub x10, x10, #0x10\n"
1384
+ "cmp x10, #0x10\n"
1385
+ "mov %x[res_ptr], x26\n"
1386
+ "madd %x[a_ptr], x20, x9, %x[a_ptr]\n"
1387
+ "bge 1b\n"
1388
+ "4:" // Row loop skip
1389
+ "cbz x10, 9f\n"
1390
+ "5:" // Row tail: Row loop
1391
+ "add x24, %x[b_ptr], #0x8\n"
1392
+ "mov x23, %x[nc]\n"
1393
+ "add x22, %x[res_ptr], %x[res_stride], LSL #2\n"
1394
+ "6:" // Row tail: Column loop
1395
+ "movi v15.16b, #0x0\n"
1396
+ "movi v19.16b, #0x0\n"
1397
+ "add x25, %x[a_ptr], #0x8\n"
1398
+ "mov x21, %x[nb]\n"
1399
+ "movi v18.16b, #0x0\n"
1400
+ "movi v14.16b, #0x0\n"
1401
+ "7:" // Row tail: Block loop
1402
+ "ldr q7, [x24, #0x0]\n"
1403
+ "ldr q5, [x25, #0x0]\n"
1404
+ "movi v9.16b, #0x4\n"
1405
+ "movi v4.4s, #0x0\n"
1406
+ "ldr q3, [x24, #0x10]\n"
1407
+ "ldr q2, [x25, #0x10]\n"
1408
+ "movi v1.4s, #0x0\n"
1409
+ "movi v0.4s, #0x0\n"
1410
+ "ldr q13, [x24, #0x20]\n"
1411
+ "ldr q31, [x25, #0x20]\n"
1412
+ "movi v30.4s, #0x0\n"
1413
+ "movi v29.16b, #0xf0\n"
1414
+ "ldr q28, [x24, #0x30]\n"
1415
+ "ldr q27, [x25, #0x30]\n"
1416
+ "sshl v20.16b, v7.16b, v9.16b\n"
1417
+ "sub x20, x24, #0x8\n"
1418
+ "ldr q26, [x25, #0x40]\n"
1419
+ "ldr q25, [x25, #0x50]\n"
1420
+ "sshl v17.16b, v3.16b, v9.16b\n"
1421
+ "and v7.16b, v7.16b, v29.16b\n"
1422
+ "ldr q24, [x25, #0x60]\n"
1423
+ "ldr q16, [x25, #0x70]\n"
1424
+ "sshl v22.16b, v13.16b, v9.16b\n"
1425
+ "and v3.16b, v3.16b, v29.16b\n"
1426
+ "ldr d21, [x20, #0x0]\n"
1427
+ "ldr d12, [x25, #-0x8]\n"
1428
+ ".inst 0x4f85e284 // sdot v4.4s, v20.16b, v5.4b[0]\n"
1429
+ ".inst 0x4fa5e281 // sdot v1.4s, v20.16b, v5.4b[1]\n"
1430
+ ".inst 0x4f85ea80 // sdot v0.4s, v20.16b, v5.4b[2]\n"
1431
+ ".inst 0x4fa5ea9e // sdot v30.4s, v20.16b, v5.4b[3]\n"
1432
+ "sshl v9.16b, v28.16b, v9.16b\n"
1433
+ "subs x21, x21, #0x1\n"
1434
+ "and v13.16b, v13.16b, v29.16b\n"
1435
+ "and v28.16b, v28.16b, v29.16b\n"
1436
+ "add x25, x25, #0x88\n"
1437
+ "add x24, x24, #0x48\n"
1438
+ "fcvtl v21.4s, v21.4h\n"
1439
+ "fcvtl v12.4s, v12.4h\n"
1440
+ ".inst 0x4f82e224 // sdot v4.4s, v17.16b, v2.4b[0]\n"
1441
+ ".inst 0x4fa2e221 // sdot v1.4s, v17.16b, v2.4b[1]\n"
1442
+ ".inst 0x4f82ea20 // sdot v0.4s, v17.16b, v2.4b[2]\n"
1443
+ ".inst 0x4fa2ea3e // sdot v30.4s, v17.16b, v2.4b[3]\n"
1444
+ "fmul v11.4s, v21.4s, v12.s[0]\n"
1445
+ "fmul v23.4s, v21.4s, v12.s[1]\n"
1446
+ "fmul v17.4s, v21.4s, v12.s[2]\n"
1447
+ ".inst 0x4f9fe2c4 // sdot v4.4s, v22.16b, v31.4b[0]\n"
1448
+ "fmul v6.4s, v21.4s, v12.s[3]\n"
1449
+ ".inst 0x4fbfe2c1 // sdot v1.4s, v22.16b, v31.4b[1]\n"
1450
+ ".inst 0x4f9feac0 // sdot v0.4s, v22.16b, v31.4b[2]\n"
1451
+ ".inst 0x4fbfeade // sdot v30.4s, v22.16b, v31.4b[3]\n"
1452
+ ".inst 0x4f9be124 // sdot v4.4s, v9.16b, v27.4b[0]\n"
1453
+ ".inst 0x4fbbe121 // sdot v1.4s, v9.16b, v27.4b[1]\n"
1454
+ ".inst 0x4f9be920 // sdot v0.4s, v9.16b, v27.4b[2]\n"
1455
+ ".inst 0x4fbbe93e // sdot v30.4s, v9.16b, v27.4b[3]\n"
1456
+ ".inst 0x4f9ae0e4 // sdot v4.4s, v7.16b, v26.4b[0]\n"
1457
+ ".inst 0x4fbae0e1 // sdot v1.4s, v7.16b, v26.4b[1]\n"
1458
+ ".inst 0x4f9ae8e0 // sdot v0.4s, v7.16b, v26.4b[2]\n"
1459
+ ".inst 0x4fbae8fe // sdot v30.4s, v7.16b, v26.4b[3]\n"
1460
+ ".inst 0x4f99e064 // sdot v4.4s, v3.16b, v25.4b[0]\n"
1461
+ ".inst 0x4fb9e061 // sdot v1.4s, v3.16b, v25.4b[1]\n"
1462
+ ".inst 0x4f99e860 // sdot v0.4s, v3.16b, v25.4b[2]\n"
1463
+ ".inst 0x4fb9e87e // sdot v30.4s, v3.16b, v25.4b[3]\n"
1464
+ ".inst 0x4f98e1a4 // sdot v4.4s, v13.16b, v24.4b[0]\n"
1465
+ ".inst 0x4fb8e1a1 // sdot v1.4s, v13.16b, v24.4b[1]\n"
1466
+ ".inst 0x4f98e9a0 // sdot v0.4s, v13.16b, v24.4b[2]\n"
1467
+ ".inst 0x4fb8e9be // sdot v30.4s, v13.16b, v24.4b[3]\n"
1468
+ ".inst 0x4f90e384 // sdot v4.4s, v28.16b, v16.4b[0]\n"
1469
+ ".inst 0x4fb0e381 // sdot v1.4s, v28.16b, v16.4b[1]\n"
1470
+ ".inst 0x4f90eb80 // sdot v0.4s, v28.16b, v16.4b[2]\n"
1471
+ ".inst 0x4fb0eb9e // sdot v30.4s, v28.16b, v16.4b[3]\n"
1472
+ "scvtf v4.4s, v4.4s, #0x4\n"
1473
+ "scvtf v1.4s, v1.4s, #0x4\n"
1474
+ "scvtf v0.4s, v0.4s, #0x4\n"
1475
+ "fmla v15.4s, v4.4s, v11.4s\n"
1476
+ "scvtf v30.4s, v30.4s, #0x4\n"
1477
+ "fmla v19.4s, v1.4s, v23.4s\n"
1478
+ "fmla v18.4s, v0.4s, v17.4s\n"
1479
+ "fmla v14.4s, v30.4s, v6.4s\n"
1480
+ "bgt 7b\n"
1481
+ "mov x20, %x[res_ptr]\n"
1482
+ "cmp x10, #0x1\n"
1483
+ "str q15, [x20, #0x0]\n"
1484
+ "add x20, x20, %x[res_stride]\n"
1485
+ "ble 8f\n"
1486
+ "cmp x10, #0x2\n"
1487
+ "str q19, [x20, #0x0]\n"
1488
+ "add x20, x20, %x[res_stride]\n"
1489
+ "ble 8f\n"
1490
+ "cmp x10, #0x3\n"
1491
+ "str q18, [x20, #0x0]\n"
1492
+ "add x20, x20, %x[res_stride]\n"
1493
+ "ble 8f\n"
1494
+ "str q14, [x20, #0x0]\n"
1495
+ "8:" // Row tail: Accumulator store skip
1496
+ "subs x23, x23, #0x4\n"
1497
+ "add %x[res_ptr], %x[res_ptr], #0x10\n"
1498
+ "bne 6b\n"
1499
+ "subs x10, x10, #0x4\n"
1500
+ "add %x[a_ptr], %x[a_ptr], x9\n"
1501
+ "mov %x[res_ptr], x22\n"
1502
+ "bgt 5b\n"
1503
+ "9:" // Row tail: Row loop skip
1504
+ : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr)
1505
+ : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc)
1506
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
1507
+ );
1508
+ return;
1509
+ }
1510
+ #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON)
1511
+ {
1512
+ float sumf[4][4];
1513
+ int sumi;
1514
+
1515
+ for (int y = 0; y < nr / 4; y++) {
1516
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
1517
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
1518
+ const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
1519
+ for (int m = 0; m < 4; m++) {
1520
+ for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
1521
+ }
1522
+ for (int l = 0; l < nb; l++) {
1523
+ for (int k = 0; k < (qk / (2 * blocklen)); k++) {
1524
+ for (int m = 0; m < 4; m++) {
1525
+ for (int j = 0; j < ncols_interleaved; j++) {
1526
+ sumi = 0;
1527
+ for (int i = 0; i < blocklen; ++i) {
1528
+ const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
1529
+ const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
1530
+ sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
1531
+ (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
1532
+ }
1533
+ sumf[m][j] += sumi * LM_GGML_FP16_TO_FP32(b_ptr[l].d[j]) * LM_GGML_FP16_TO_FP32(a_ptr[l].d[m]);
1485
1534
  }
1486
- sumf[m][j] += sumi * LM_GGML_FP16_TO_FP32(b_ptr[l].d[j]) * LM_GGML_FP16_TO_FP32(a_ptr[l].d[m]);
1487
1535
  }
1488
1536
  }
1489
1537
  }
1490
- }
1491
- for (int m = 0; m < 4; m++) {
1492
- for (int j = 0; j < ncols_interleaved; j++)
1493
- s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
1538
+ for (int m = 0; m < 4; m++) {
1539
+ for (int j = 0; j < ncols_interleaved; j++)
1540
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
1541
+ }
1494
1542
  }
1495
1543
  }
1496
1544
  }
1497
- #endif
1498
1545
  }
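
The scalar fallback above turns on a compact nibble trick that is easy to miss. A minimal sketch of it, using a hypothetical helper name (unpack_q4_pair is ours, not the library's):

#include <stdint.h>

/* Each byte of qs packs two signed 4-bit weights. Shifting left by 4 discards
 * the high nibble and moves the low nibble, sign bit included, to the top of
 * the byte; masking with 0xF0 keeps the high nibble in place. Both results
 * are therefore the true weight scaled by 16. */
static inline void unpack_q4_pair(uint8_t packed, int *lo_x16, int *hi_x16) {
    *lo_x16 = (int8_t)(packed << 4);   /* low nibble * 16, sign-extended   */
    *hi_x16 = (int8_t)(packed & 0xF0); /* high nibble * 16, sign preserved */
}

With it, the inner accumulation reads sumi += (lo_x16 * a0 + hi_x16 * a1) >> 4, as in the loop above; the shift is exact because every product is a multiple of 16.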
1499
1546
 
1500
1547
  void lm_ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
@@ -1517,413 +1564,406 @@ void lm_ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void
1517
1564
  UNUSED(ncols_interleaved);
1518
1565
  UNUSED(blocklen);
1519
1566
 
1520
- #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
1521
- if (lm_ggml_sve_cnt_b == QK8_0) {
1522
- LM_GGML_ASSERT(!(lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
1523
- "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
1567
+ #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
1568
+ if (lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_matmul_int8()) {
1569
+ const void * b_ptr = vx;
1570
+ const void * a_ptr = vy;
1571
+ float * res_ptr = s;
1572
+ size_t res_stride = bs * sizeof(float);
1573
+
1574
+ __asm__ __volatile__(
1575
+ "mov x10, %x[nr]\n"
1576
+ "mov x9, #0x88\n"
1577
+ "cmp x10, #0x10\n"
1578
+ "mul x9, %x[nb], x9\n"
1579
+ "blt 4f\n"
1580
+ "1:" // Row loop
1581
+ "add x28, %x[b_ptr], #0x8\n"
1582
+ "mov x27, %x[nc]\n"
1583
+ "add x26, %x[res_ptr], %x[res_stride], LSL #4\n"
1584
+ "2:" // Column loop
1585
+ "add x25, %x[a_ptr], #0x8\n"
1586
+ "movi v2.16b, #0x0\n"
1587
+ "movi v10.16b, #0x0\n"
1588
+ "mov x24, %x[nb]\n"
1589
+ "add x23, x25, x9\n"
1590
+ "movi v12.16b, #0x0\n"
1591
+ "movi v28.16b, #0x0\n"
1592
+ "add x22, x23, x9\n"
1593
+ "movi v11.16b, #0x0\n"
1594
+ "movi v13.16b, #0x0\n"
1595
+ "add x21, x22, x9\n"
1596
+ "movi v22.16b, #0x0\n"
1597
+ "movi v23.16b, #0x0\n"
1598
+ "movi v25.16b, #0x0\n"
1599
+ "movi v5.16b, #0x0\n"
1600
+ "movi v7.16b, #0x0\n"
1601
+ "movi v4.16b, #0x0\n"
1602
+ "movi v6.16b, #0x0\n"
1603
+ "movi v30.16b, #0x0\n"
1604
+ "movi v24.16b, #0x0\n"
1605
+ "movi v14.16b, #0x0\n"
1606
+ "3:" // Block loop
1607
+ "ldr q21, [x28, #0x0]\n"
1608
+ "ldr q16, [x28, #0x10]\n"
1609
+ "movi v1.16b, #0x4\n"
1610
+ "movi v19.4s, #0x0\n"
1611
+ "ldr q27, [x25, #0x0]\n"
1612
+ "ldr q15, [x25, #0x10]\n"
1613
+ "movi v26.4s, #0x0\n"
1614
+ "movi v18.4s, #0x0\n"
1615
+ "ldr q29, [x28, #0x20]\n"
1616
+ "ldr q3, [x28, #0x30]\n"
1617
+ "movi v17.4s, #0x0\n"
1618
+ "movi v0.16b, #0xf0\n"
1619
+ "ldr d20, [x25, #-0x8]\n"
1620
+ "ldr d9, [x23, #-0x8]\n"
1621
+ "sshl v8.16b, v21.16b, v1.16b\n"
1622
+ "sshl v31.16b, v16.16b, v1.16b\n"
1623
+ "and v21.16b, v21.16b, v0.16b\n"
1624
+ "and v16.16b, v16.16b, v0.16b\n"
1625
+ "sub x20, x28, #0x8\n"
1626
+ "subs x24, x24, #0x1\n"
1627
+ "add x28, x28, #0x48\n"
1628
+ ".inst 0x4e88a773 // smmla v19.4s, v27.16b, v8.16b\n"
1629
+ ".inst 0x4e9fa77a // smmla v26.4s, v27.16b, v31.16b\n"
1630
+ "ldr q27, [x25, #0x20]\n"
1631
+ ".inst 0x4e88a5f2 // smmla v18.4s, v15.16b, v8.16b\n"
1632
+ ".inst 0x4e9fa5f1 // smmla v17.4s, v15.16b, v31.16b\n"
1633
+ "sshl v15.16b, v29.16b, v1.16b\n"
1634
+ "sshl v1.16b, v3.16b, v1.16b\n"
1635
+ "and v29.16b, v29.16b, v0.16b\n"
1636
+ "and v3.16b, v3.16b, v0.16b\n"
1637
+ "ldr q0, [x25, #0x30]\n"
1638
+ "fcvtl v20.4s, v20.4h\n"
1639
+ ".inst 0x4e8fa773 // smmla v19.4s, v27.16b, v15.16b\n"
1640
+ "fcvtl v9.4s, v9.4h\n"
1641
+ ".inst 0x4e81a77a // smmla v26.4s, v27.16b, v1.16b\n"
1642
+ "ldr q27, [x25, #0x40]\n"
1643
+ ".inst 0x4e8fa412 // smmla v18.4s, v0.16b, v15.16b\n"
1644
+ ".inst 0x4e81a411 // smmla v17.4s, v0.16b, v1.16b\n"
1645
+ "ldr q0, [x25, #0x50]\n"
1646
+ ".inst 0x4e95a773 // smmla v19.4s, v27.16b, v21.16b\n"
1647
+ ".inst 0x4e90a77a // smmla v26.4s, v27.16b, v16.16b\n"
1648
+ "ldr q27, [x25, #0x60]\n"
1649
+ ".inst 0x4e95a412 // smmla v18.4s, v0.16b, v21.16b\n"
1650
+ ".inst 0x4e90a411 // smmla v17.4s, v0.16b, v16.16b\n"
1651
+ "ldr q0, [x25, #0x70]\n"
1652
+ "add x25, x25, #0x88\n"
1653
+ ".inst 0x4e9da773 // smmla v19.4s, v27.16b, v29.16b\n"
1654
+ ".inst 0x4e83a77a // smmla v26.4s, v27.16b, v3.16b\n"
1655
+ "ldr d27, [x20, #0x0]\n"
1656
+ ".inst 0x4e9da412 // smmla v18.4s, v0.16b, v29.16b\n"
1657
+ ".inst 0x4e83a411 // smmla v17.4s, v0.16b, v3.16b\n"
1658
+ "fcvtl v27.4s, v27.4h\n"
1659
+ "uzp1 v0.2d, v19.2d, v26.2d\n"
1660
+ "uzp2 v26.2d, v19.2d, v26.2d\n"
1661
+ "fmul v19.4s, v27.4s, v20.s[0]\n"
1662
+ "scvtf v0.4s, v0.4s, #0x4\n"
1663
+ "scvtf v26.4s, v26.4s, #0x4\n"
1664
+ "fmla v2.4s, v0.4s, v19.4s\n"
1665
+ "ldr q19, [x23, #0x0]\n"
1666
+ "uzp1 v0.2d, v18.2d, v17.2d\n"
1667
+ "uzp2 v18.2d, v18.2d, v17.2d\n"
1668
+ "fmul v17.4s, v27.4s, v20.s[1]\n"
1669
+ "scvtf v0.4s, v0.4s, #0x4\n"
1670
+ "scvtf v18.4s, v18.4s, #0x4\n"
1671
+ "fmla v10.4s, v26.4s, v17.4s\n"
1672
+ "ldr q17, [x23, #0x10]\n"
1673
+ "fmul v26.4s, v27.4s, v20.s[2]\n"
1674
+ "fmul v20.4s, v27.4s, v20.s[3]\n"
1675
+ "fmla v12.4s, v0.4s, v26.4s\n"
1676
+ "ldr d0, [x22, #-0x8]\n"
1677
+ "ldr d26, [x21, #-0x8]\n"
1678
+ "fcvtl v0.4s, v0.4h\n"
1679
+ "fmla v28.4s, v18.4s, v20.4s\n"
1680
+ "movi v20.4s, #0x0\n"
1681
+ "movi v18.4s, #0x0\n"
1682
+ ".inst 0x4e88a674 // smmla v20.4s, v19.16b, v8.16b\n"
1683
+ ".inst 0x4e9fa672 // smmla v18.4s, v19.16b, v31.16b\n"
1684
+ "ldr q19, [x23, #0x20]\n"
1685
+ "fcvtl v26.4s, v26.4h\n"
1686
+ ".inst 0x4e8fa674 // smmla v20.4s, v19.16b, v15.16b\n"
1687
+ ".inst 0x4e81a672 // smmla v18.4s, v19.16b, v1.16b\n"
1688
+ "ldr q19, [x23, #0x40]\n"
1689
+ ".inst 0x4e95a674 // smmla v20.4s, v19.16b, v21.16b\n"
1690
+ ".inst 0x4e90a672 // smmla v18.4s, v19.16b, v16.16b\n"
1691
+ "ldr q19, [x23, #0x60]\n"
1692
+ ".inst 0x4e9da674 // smmla v20.4s, v19.16b, v29.16b\n"
1693
+ ".inst 0x4e83a672 // smmla v18.4s, v19.16b, v3.16b\n"
1694
+ "uzp1 v19.2d, v20.2d, v18.2d\n"
1695
+ "scvtf v19.4s, v19.4s, #0x4\n"
1696
+ "uzp2 v20.2d, v20.2d, v18.2d\n"
1697
+ "fmul v18.4s, v27.4s, v9.s[0]\n"
1698
+ "scvtf v20.4s, v20.4s, #0x4\n"
1699
+ "fmla v11.4s, v19.4s, v18.4s\n"
1700
+ "ldr q18, [x22, #0x0]\n"
1701
+ "fmul v19.4s, v27.4s, v9.s[1]\n"
1702
+ "fmla v13.4s, v20.4s, v19.4s\n"
1703
+ "movi v19.4s, #0x0\n"
1704
+ "movi v20.4s, #0x0\n"
1705
+ ".inst 0x4e88a633 // smmla v19.4s, v17.16b, v8.16b\n"
1706
+ ".inst 0x4e9fa634 // smmla v20.4s, v17.16b, v31.16b\n"
1707
+ "ldr q17, [x23, #0x30]\n"
1708
+ ".inst 0x4e8fa633 // smmla v19.4s, v17.16b, v15.16b\n"
1709
+ ".inst 0x4e81a634 // smmla v20.4s, v17.16b, v1.16b\n"
1710
+ "ldr q17, [x23, #0x50]\n"
1711
+ ".inst 0x4e95a633 // smmla v19.4s, v17.16b, v21.16b\n"
1712
+ ".inst 0x4e90a634 // smmla v20.4s, v17.16b, v16.16b\n"
1713
+ "ldr q17, [x23, #0x70]\n"
1714
+ "add x23, x23, #0x88\n"
1715
+ ".inst 0x4e9da633 // smmla v19.4s, v17.16b, v29.16b\n"
1716
+ ".inst 0x4e83a634 // smmla v20.4s, v17.16b, v3.16b\n"
1717
+ "uzp1 v17.2d, v19.2d, v20.2d\n"
1718
+ "scvtf v17.4s, v17.4s, #0x4\n"
1719
+ "uzp2 v20.2d, v19.2d, v20.2d\n"
1720
+ "fmul v19.4s, v27.4s, v9.s[2]\n"
1721
+ "fmul v9.4s, v27.4s, v9.s[3]\n"
1722
+ "scvtf v20.4s, v20.4s, #0x4\n"
1723
+ "fmla v22.4s, v17.4s, v19.4s\n"
1724
+ "ldr q17, [x22, #0x10]\n"
1725
+ "movi v19.4s, #0x0\n"
1726
+ ".inst 0x4e88a653 // smmla v19.4s, v18.16b, v8.16b\n"
1727
+ "fmla v23.4s, v20.4s, v9.4s\n"
1728
+ "movi v20.4s, #0x0\n"
1729
+ "movi v9.4s, #0x0\n"
1730
+ ".inst 0x4e9fa654 // smmla v20.4s, v18.16b, v31.16b\n"
1731
+ "ldr q18, [x22, #0x20]\n"
1732
+ ".inst 0x4e88a629 // smmla v9.4s, v17.16b, v8.16b\n"
1733
+ ".inst 0x4e8fa653 // smmla v19.4s, v18.16b, v15.16b\n"
1734
+ ".inst 0x4e81a654 // smmla v20.4s, v18.16b, v1.16b\n"
1735
+ "ldr q18, [x22, #0x40]\n"
1736
+ ".inst 0x4e95a653 // smmla v19.4s, v18.16b, v21.16b\n"
1737
+ ".inst 0x4e90a654 // smmla v20.4s, v18.16b, v16.16b\n"
1738
+ "ldr q18, [x22, #0x60]\n"
1739
+ ".inst 0x4e9da653 // smmla v19.4s, v18.16b, v29.16b\n"
1740
+ ".inst 0x4e83a654 // smmla v20.4s, v18.16b, v3.16b\n"
1741
+ "movi v18.4s, #0x0\n"
1742
+ ".inst 0x4e9fa632 // smmla v18.4s, v17.16b, v31.16b\n"
1743
+ "ldr q17, [x22, #0x30]\n"
1744
+ ".inst 0x4e8fa629 // smmla v9.4s, v17.16b, v15.16b\n"
1745
+ ".inst 0x4e81a632 // smmla v18.4s, v17.16b, v1.16b\n"
1746
+ "ldr q17, [x22, #0x50]\n"
1747
+ ".inst 0x4e95a629 // smmla v9.4s, v17.16b, v21.16b\n"
1748
+ ".inst 0x4e90a632 // smmla v18.4s, v17.16b, v16.16b\n"
1749
+ "ldr q17, [x22, #0x70]\n"
1750
+ "add x22, x22, #0x88\n"
1751
+ ".inst 0x4e9da629 // smmla v9.4s, v17.16b, v29.16b\n"
1752
+ ".inst 0x4e83a632 // smmla v18.4s, v17.16b, v3.16b\n"
1753
+ "uzp1 v17.2d, v19.2d, v20.2d\n"
1754
+ "uzp2 v20.2d, v19.2d, v20.2d\n"
1755
+ "fmul v19.4s, v27.4s, v0.s[0]\n"
1756
+ "scvtf v17.4s, v17.4s, #0x4\n"
1757
+ "scvtf v20.4s, v20.4s, #0x4\n"
1758
+ "fmla v25.4s, v17.4s, v19.4s\n"
1759
+ "ldr q19, [x21, #0x0]\n"
1760
+ "fmul v17.4s, v27.4s, v0.s[1]\n"
1761
+ "fmla v5.4s, v20.4s, v17.4s\n"
1762
+ "ldr q17, [x21, #0x10]\n"
1763
+ "uzp1 v20.2d, v9.2d, v18.2d\n"
1764
+ "uzp2 v9.2d, v9.2d, v18.2d\n"
1765
+ "fmul v18.4s, v27.4s, v0.s[2]\n"
1766
+ "fmul v0.4s, v27.4s, v0.s[3]\n"
1767
+ "scvtf v20.4s, v20.4s, #0x4\n"
1768
+ "scvtf v9.4s, v9.4s, #0x4\n"
1769
+ "fmla v7.4s, v20.4s, v18.4s\n"
1770
+ "movi v20.4s, #0x0\n"
1771
+ "movi v18.4s, #0x0\n"
1772
+ ".inst 0x4e88a674 // smmla v20.4s, v19.16b, v8.16b\n"
1773
+ ".inst 0x4e9fa672 // smmla v18.4s, v19.16b, v31.16b\n"
1774
+ "ldr q19, [x21, #0x20]\n"
1775
+ "fmla v4.4s, v9.4s, v0.4s\n"
1776
+ "movi v9.4s, #0x0\n"
1777
+ "movi v0.4s, #0x0\n"
1778
+ ".inst 0x4e88a629 // smmla v9.4s, v17.16b, v8.16b\n"
1779
+ "fmul v8.4s, v27.4s, v26.s[0]\n"
1780
+ ".inst 0x4e9fa620 // smmla v0.4s, v17.16b, v31.16b\n"
1781
+ "ldr q17, [x21, #0x30]\n"
1782
+ ".inst 0x4e8fa674 // smmla v20.4s, v19.16b, v15.16b\n"
1783
+ "fmul v31.4s, v27.4s, v26.s[1]\n"
1784
+ ".inst 0x4e81a672 // smmla v18.4s, v19.16b, v1.16b\n"
1785
+ "ldr q19, [x21, #0x40]\n"
1786
+ ".inst 0x4e8fa629 // smmla v9.4s, v17.16b, v15.16b\n"
1787
+ "fmul v15.4s, v27.4s, v26.s[2]\n"
1788
+ "fmul v27.4s, v27.4s, v26.s[3]\n"
1789
+ ".inst 0x4e81a620 // smmla v0.4s, v17.16b, v1.16b\n"
1790
+ "ldr q1, [x21, #0x50]\n"
1791
+ ".inst 0x4e95a674 // smmla v20.4s, v19.16b, v21.16b\n"
1792
+ ".inst 0x4e90a672 // smmla v18.4s, v19.16b, v16.16b\n"
1793
+ "ldr q26, [x21, #0x60]\n"
1794
+ ".inst 0x4e95a429 // smmla v9.4s, v1.16b, v21.16b\n"
1795
+ ".inst 0x4e90a420 // smmla v0.4s, v1.16b, v16.16b\n"
1796
+ "ldr q21, [x21, #0x70]\n"
1797
+ "add x21, x21, #0x88\n"
1798
+ ".inst 0x4e9da754 // smmla v20.4s, v26.16b, v29.16b\n"
1799
+ ".inst 0x4e83a752 // smmla v18.4s, v26.16b, v3.16b\n"
1800
+ ".inst 0x4e9da6a9 // smmla v9.4s, v21.16b, v29.16b\n"
1801
+ ".inst 0x4e83a6a0 // smmla v0.4s, v21.16b, v3.16b\n"
1802
+ "uzp1 v29.2d, v20.2d, v18.2d\n"
1803
+ "uzp2 v21.2d, v20.2d, v18.2d\n"
1804
+ "scvtf v29.4s, v29.4s, #0x4\n"
1805
+ "uzp1 v18.2d, v9.2d, v0.2d\n"
1806
+ "uzp2 v16.2d, v9.2d, v0.2d\n"
1807
+ "scvtf v21.4s, v21.4s, #0x4\n"
1808
+ "fmla v6.4s, v29.4s, v8.4s\n"
1809
+ "scvtf v18.4s, v18.4s, #0x4\n"
1810
+ "scvtf v16.4s, v16.4s, #0x4\n"
1811
+ "fmla v30.4s, v21.4s, v31.4s\n"
1812
+ "fmla v24.4s, v18.4s, v15.4s\n"
1813
+ "fmla v14.4s, v16.4s, v27.4s\n"
1814
+ "bgt 3b\n"
1815
+ "mov x20, %x[res_ptr]\n"
1816
+ "subs x27, x27, #0x4\n"
1817
+ "add %x[res_ptr], %x[res_ptr], #0x10\n"
1818
+ "str q2, [x20, #0x0]\n"
1819
+ "add x20, x20, %x[res_stride]\n"
1820
+ "str q10, [x20, #0x0]\n"
1821
+ "add x20, x20, %x[res_stride]\n"
1822
+ "str q12, [x20, #0x0]\n"
1823
+ "add x20, x20, %x[res_stride]\n"
1824
+ "str q28, [x20, #0x0]\n"
1825
+ "add x20, x20, %x[res_stride]\n"
1826
+ "str q11, [x20, #0x0]\n"
1827
+ "add x20, x20, %x[res_stride]\n"
1828
+ "str q13, [x20, #0x0]\n"
1829
+ "add x20, x20, %x[res_stride]\n"
1830
+ "str q22, [x20, #0x0]\n"
1831
+ "add x20, x20, %x[res_stride]\n"
1832
+ "str q23, [x20, #0x0]\n"
1833
+ "add x20, x20, %x[res_stride]\n"
1834
+ "str q25, [x20, #0x0]\n"
1835
+ "add x20, x20, %x[res_stride]\n"
1836
+ "str q5, [x20, #0x0]\n"
1837
+ "add x20, x20, %x[res_stride]\n"
1838
+ "str q7, [x20, #0x0]\n"
1839
+ "add x20, x20, %x[res_stride]\n"
1840
+ "str q4, [x20, #0x0]\n"
1841
+ "add x20, x20, %x[res_stride]\n"
1842
+ "str q6, [x20, #0x0]\n"
1843
+ "add x20, x20, %x[res_stride]\n"
1844
+ "str q30, [x20, #0x0]\n"
1845
+ "add x20, x20, %x[res_stride]\n"
1846
+ "str q24, [x20, #0x0]\n"
1847
+ "add x20, x20, %x[res_stride]\n"
1848
+ "str q14, [x20, #0x0]\n"
1849
+ "bne 2b\n"
1850
+ "mov x20, #0x4\n"
1851
+ "sub x10, x10, #0x10\n"
1852
+ "cmp x10, #0x10\n"
1853
+ "mov %x[res_ptr], x26\n"
1854
+ "madd %x[a_ptr], x20, x9, %x[a_ptr]\n"
1855
+ "bge 1b\n"
1856
+ "4:" // Row loop skip
1857
+ "cbz x10, 9f\n"
1858
+ "5:" // Row tail: Row loop
1859
+ "add x24, %x[b_ptr], #0x8\n"
1860
+ "mov x23, %x[nc]\n"
1861
+ "add x22, %x[res_ptr], %x[res_stride], LSL #2\n"
1862
+ "6:" // Row tail: Column loop
1863
+ "movi v2.16b, #0x0\n"
1864
+ "movi v10.16b, #0x0\n"
1865
+ "add x25, %x[a_ptr], #0x8\n"
1866
+ "mov x21, %x[nb]\n"
1867
+ "movi v12.16b, #0x0\n"
1868
+ "movi v28.16b, #0x0\n"
1869
+ "7:" // Row tail: Block loop
1870
+ "ldr q6, [x24, #0x0]\n"
1871
+ "ldr q5, [x24, #0x10]\n"
1872
+ "movi v17.16b, #0x4\n"
1873
+ "movi v8.4s, #0x0\n"
1874
+ "ldr q4, [x25, #0x0]\n"
1875
+ "ldr q13, [x25, #0x10]\n"
1876
+ "movi v27.4s, #0x0\n"
1877
+ "movi v0.4s, #0x0\n"
1878
+ "ldr q31, [x24, #0x20]\n"
1879
+ "ldr q14, [x24, #0x30]\n"
1880
+ "movi v29.4s, #0x0\n"
1881
+ "movi v22.16b, #0xf0\n"
1882
+ "ldr q11, [x25, #0x20]\n"
1883
+ "ldr q23, [x25, #0x30]\n"
1884
+ "sshl v21.16b, v6.16b, v17.16b\n"
1885
+ "sshl v16.16b, v5.16b, v17.16b\n"
1886
+ "ldr q20, [x25, #0x40]\n"
1887
+ "ldr q26, [x25, #0x50]\n"
1888
+ "and v6.16b, v6.16b, v22.16b\n"
1889
+ "and v5.16b, v5.16b, v22.16b\n"
1890
+ "ldr q25, [x25, #0x60]\n"
1891
+ "ldr q3, [x25, #0x70]\n"
1892
+ "sshl v19.16b, v31.16b, v17.16b\n"
1893
+ "sshl v18.16b, v14.16b, v17.16b\n"
1894
+ "ldr d17, [x25, #-0x8]\n"
1895
+ ".inst 0x4e95a488 // smmla v8.4s, v4.16b, v21.16b\n"
1896
+ ".inst 0x4e90a49b // smmla v27.4s, v4.16b, v16.16b\n"
1897
+ "and v31.16b, v31.16b, v22.16b\n"
1898
+ ".inst 0x4e95a5a0 // smmla v0.4s, v13.16b, v21.16b\n"
1899
+ ".inst 0x4e90a5bd // smmla v29.4s, v13.16b, v16.16b\n"
1900
+ "and v14.16b, v14.16b, v22.16b\n"
1901
+ "sub x20, x24, #0x8\n"
1902
+ "ldr d16, [x20, #0x0]\n"
1903
+ "subs x21, x21, #0x1\n"
1904
+ "add x25, x25, #0x88\n"
1905
+ "fcvtl v17.4s, v17.4h\n"
1906
+ "add x24, x24, #0x48\n"
1907
+ ".inst 0x4e93a568 // smmla v8.4s, v11.16b, v19.16b\n"
1908
+ ".inst 0x4e92a57b // smmla v27.4s, v11.16b, v18.16b\n"
1909
+ ".inst 0x4e93a6e0 // smmla v0.4s, v23.16b, v19.16b\n"
1910
+ ".inst 0x4e92a6fd // smmla v29.4s, v23.16b, v18.16b\n"
1911
+ "fcvtl v16.4s, v16.4h\n"
1912
+ ".inst 0x4e86a688 // smmla v8.4s, v20.16b, v6.16b\n"
1913
+ ".inst 0x4e85a69b // smmla v27.4s, v20.16b, v5.16b\n"
1914
+ "fmul v23.4s, v16.4s, v17.s[0]\n"
1915
+ "fmul v21.4s, v16.4s, v17.s[1]\n"
1916
+ "fmul v1.4s, v16.4s, v17.s[2]\n"
1917
+ "fmul v20.4s, v16.4s, v17.s[3]\n"
1918
+ ".inst 0x4e86a740 // smmla v0.4s, v26.16b, v6.16b\n"
1919
+ ".inst 0x4e85a75d // smmla v29.4s, v26.16b, v5.16b\n"
1920
+ ".inst 0x4e9fa728 // smmla v8.4s, v25.16b, v31.16b\n"
1921
+ ".inst 0x4e8ea73b // smmla v27.4s, v25.16b, v14.16b\n"
1922
+ ".inst 0x4e9fa460 // smmla v0.4s, v3.16b, v31.16b\n"
1923
+ ".inst 0x4e8ea47d // smmla v29.4s, v3.16b, v14.16b\n"
1924
+ "uzp1 v19.2d, v8.2d, v27.2d\n"
1925
+ "uzp2 v18.2d, v8.2d, v27.2d\n"
1926
+ "scvtf v19.4s, v19.4s, #0x4\n"
1927
+ "uzp1 v17.2d, v0.2d, v29.2d\n"
1928
+ "uzp2 v16.2d, v0.2d, v29.2d\n"
1929
+ "scvtf v18.4s, v18.4s, #0x4\n"
1930
+ "fmla v2.4s, v19.4s, v23.4s\n"
1931
+ "scvtf v17.4s, v17.4s, #0x4\n"
1932
+ "scvtf v16.4s, v16.4s, #0x4\n"
1933
+ "fmla v10.4s, v18.4s, v21.4s\n"
1934
+ "fmla v12.4s, v17.4s, v1.4s\n"
1935
+ "fmla v28.4s, v16.4s, v20.4s\n"
1936
+ "bgt 7b\n"
1937
+ "mov x20, %x[res_ptr]\n"
1938
+ "cmp x10, #0x1\n"
1939
+ "str q2, [x20, #0x0]\n"
1940
+ "add x20, x20, %x[res_stride]\n"
1941
+ "ble 8f\n"
1942
+ "cmp x10, #0x2\n"
1943
+ "str q10, [x20, #0x0]\n"
1944
+ "add x20, x20, %x[res_stride]\n"
1945
+ "ble 8f\n"
1946
+ "cmp x10, #0x3\n"
1947
+ "str q12, [x20, #0x0]\n"
1948
+ "add x20, x20, %x[res_stride]\n"
1949
+ "ble 8f\n"
1950
+ "str q28, [x20, #0x0]\n"
1951
+ "8:" // Row tail: Accumulator store skip
1952
+ "subs x23, x23, #0x4\n"
1953
+ "add %x[res_ptr], %x[res_ptr], #0x10\n"
1954
+ "bne 6b\n"
1955
+ "subs x10, x10, #0x4\n"
1956
+ "add %x[a_ptr], %x[a_ptr], x9\n"
1957
+ "mov %x[res_ptr], x22\n"
1958
+ "bgt 5b\n"
1959
+ "9:" // Row tail: Row loop skip
1960
+ : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr)
1961
+ : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc)
1962
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
1963
+ );
1964
+ return;
1524
1965
  }
1525
- #endif
1526
- #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
1527
- const void * b_ptr = vx;
1528
- const void * a_ptr = vy;
1529
- float * res_ptr = s;
1530
- size_t res_stride = bs * sizeof(float);
1531
-
1532
- __asm__ __volatile__(
1533
- "mov x10, %x[nr]\n"
1534
- "mov x9, #0x88\n"
1535
- "cmp x10, #0x10\n"
1536
- "mul x9, %x[nb], x9\n"
1537
- "blt 4f\n"
1538
- "1:" // Row loop
1539
- "add x28, %x[b_ptr], #0x8\n"
1540
- "mov x27, %x[nc]\n"
1541
- "add x26, %x[res_ptr], %x[res_stride], LSL #4\n"
1542
- "2:" // Column loop
1543
- "add x25, %x[a_ptr], #0x8\n"
1544
- "movi v2.16b, #0x0\n"
1545
- "movi v10.16b, #0x0\n"
1546
- "mov x24, %x[nb]\n"
1547
- "add x23, x25, x9\n"
1548
- "movi v12.16b, #0x0\n"
1549
- "movi v28.16b, #0x0\n"
1550
- "add x22, x23, x9\n"
1551
- "movi v11.16b, #0x0\n"
1552
- "movi v13.16b, #0x0\n"
1553
- "add x21, x22, x9\n"
1554
- "movi v22.16b, #0x0\n"
1555
- "movi v23.16b, #0x0\n"
1556
- "movi v25.16b, #0x0\n"
1557
- "movi v5.16b, #0x0\n"
1558
- "movi v7.16b, #0x0\n"
1559
- "movi v4.16b, #0x0\n"
1560
- "movi v6.16b, #0x0\n"
1561
- "movi v30.16b, #0x0\n"
1562
- "movi v24.16b, #0x0\n"
1563
- "movi v14.16b, #0x0\n"
1564
- "3:" // Block loop
1565
- "ldr q21, [x28, #0x0]\n"
1566
- "ldr q16, [x28, #0x10]\n"
1567
- "movi v1.16b, #0x4\n"
1568
- "movi v19.4s, #0x0\n"
1569
- "ldr q27, [x25, #0x0]\n"
1570
- "ldr q15, [x25, #0x10]\n"
1571
- "movi v26.4s, #0x0\n"
1572
- "movi v18.4s, #0x0\n"
1573
- "ldr q29, [x28, #0x20]\n"
1574
- "ldr q3, [x28, #0x30]\n"
1575
- "movi v17.4s, #0x0\n"
1576
- "movi v0.16b, #0xf0\n"
1577
- "ldr d20, [x25, #-0x8]\n"
1578
- "ldr d9, [x23, #-0x8]\n"
1579
- "sshl v8.16b, v21.16b, v1.16b\n"
1580
- "sshl v31.16b, v16.16b, v1.16b\n"
1581
- "and v21.16b, v21.16b, v0.16b\n"
1582
- "and v16.16b, v16.16b, v0.16b\n"
1583
- "sub x20, x28, #0x8\n"
1584
- "subs x24, x24, #0x1\n"
1585
- "add x28, x28, #0x48\n"
1586
- ".inst 0x4e88a773 // smmla v19.4s, v27.16b, v8.16b\n"
1587
- ".inst 0x4e9fa77a // smmla v26.4s, v27.16b, v31.16b\n"
1588
- "ldr q27, [x25, #0x20]\n"
1589
- ".inst 0x4e88a5f2 // smmla v18.4s, v15.16b, v8.16b\n"
1590
- ".inst 0x4e9fa5f1 // smmla v17.4s, v15.16b, v31.16b\n"
1591
- "sshl v15.16b, v29.16b, v1.16b\n"
1592
- "sshl v1.16b, v3.16b, v1.16b\n"
1593
- "and v29.16b, v29.16b, v0.16b\n"
1594
- "and v3.16b, v3.16b, v0.16b\n"
1595
- "ldr q0, [x25, #0x30]\n"
1596
- "fcvtl v20.4s, v20.4h\n"
1597
- ".inst 0x4e8fa773 // smmla v19.4s, v27.16b, v15.16b\n"
1598
- "fcvtl v9.4s, v9.4h\n"
1599
- ".inst 0x4e81a77a // smmla v26.4s, v27.16b, v1.16b\n"
1600
- "ldr q27, [x25, #0x40]\n"
1601
- ".inst 0x4e8fa412 // smmla v18.4s, v0.16b, v15.16b\n"
1602
- ".inst 0x4e81a411 // smmla v17.4s, v0.16b, v1.16b\n"
1603
- "ldr q0, [x25, #0x50]\n"
1604
- ".inst 0x4e95a773 // smmla v19.4s, v27.16b, v21.16b\n"
1605
- ".inst 0x4e90a77a // smmla v26.4s, v27.16b, v16.16b\n"
1606
- "ldr q27, [x25, #0x60]\n"
1607
- ".inst 0x4e95a412 // smmla v18.4s, v0.16b, v21.16b\n"
1608
- ".inst 0x4e90a411 // smmla v17.4s, v0.16b, v16.16b\n"
1609
- "ldr q0, [x25, #0x70]\n"
1610
- "add x25, x25, #0x88\n"
1611
- ".inst 0x4e9da773 // smmla v19.4s, v27.16b, v29.16b\n"
1612
- ".inst 0x4e83a77a // smmla v26.4s, v27.16b, v3.16b\n"
1613
- "ldr d27, [x20, #0x0]\n"
1614
- ".inst 0x4e9da412 // smmla v18.4s, v0.16b, v29.16b\n"
1615
- ".inst 0x4e83a411 // smmla v17.4s, v0.16b, v3.16b\n"
1616
- "fcvtl v27.4s, v27.4h\n"
1617
- "uzp1 v0.2d, v19.2d, v26.2d\n"
1618
- "uzp2 v26.2d, v19.2d, v26.2d\n"
1619
- "fmul v19.4s, v27.4s, v20.s[0]\n"
1620
- "scvtf v0.4s, v0.4s, #0x4\n"
1621
- "scvtf v26.4s, v26.4s, #0x4\n"
1622
- "fmla v2.4s, v0.4s, v19.4s\n"
1623
- "ldr q19, [x23, #0x0]\n"
1624
- "uzp1 v0.2d, v18.2d, v17.2d\n"
1625
- "uzp2 v18.2d, v18.2d, v17.2d\n"
1626
- "fmul v17.4s, v27.4s, v20.s[1]\n"
1627
- "scvtf v0.4s, v0.4s, #0x4\n"
1628
- "scvtf v18.4s, v18.4s, #0x4\n"
1629
- "fmla v10.4s, v26.4s, v17.4s\n"
1630
- "ldr q17, [x23, #0x10]\n"
1631
- "fmul v26.4s, v27.4s, v20.s[2]\n"
1632
- "fmul v20.4s, v27.4s, v20.s[3]\n"
1633
- "fmla v12.4s, v0.4s, v26.4s\n"
1634
- "ldr d0, [x22, #-0x8]\n"
1635
- "ldr d26, [x21, #-0x8]\n"
1636
- "fcvtl v0.4s, v0.4h\n"
1637
- "fmla v28.4s, v18.4s, v20.4s\n"
1638
- "movi v20.4s, #0x0\n"
1639
- "movi v18.4s, #0x0\n"
1640
- ".inst 0x4e88a674 // smmla v20.4s, v19.16b, v8.16b\n"
1641
- ".inst 0x4e9fa672 // smmla v18.4s, v19.16b, v31.16b\n"
1642
- "ldr q19, [x23, #0x20]\n"
1643
- "fcvtl v26.4s, v26.4h\n"
1644
- ".inst 0x4e8fa674 // smmla v20.4s, v19.16b, v15.16b\n"
1645
- ".inst 0x4e81a672 // smmla v18.4s, v19.16b, v1.16b\n"
1646
- "ldr q19, [x23, #0x40]\n"
1647
- ".inst 0x4e95a674 // smmla v20.4s, v19.16b, v21.16b\n"
1648
- ".inst 0x4e90a672 // smmla v18.4s, v19.16b, v16.16b\n"
1649
- "ldr q19, [x23, #0x60]\n"
1650
- ".inst 0x4e9da674 // smmla v20.4s, v19.16b, v29.16b\n"
1651
- ".inst 0x4e83a672 // smmla v18.4s, v19.16b, v3.16b\n"
1652
- "uzp1 v19.2d, v20.2d, v18.2d\n"
1653
- "scvtf v19.4s, v19.4s, #0x4\n"
1654
- "uzp2 v20.2d, v20.2d, v18.2d\n"
1655
- "fmul v18.4s, v27.4s, v9.s[0]\n"
1656
- "scvtf v20.4s, v20.4s, #0x4\n"
1657
- "fmla v11.4s, v19.4s, v18.4s\n"
1658
- "ldr q18, [x22, #0x0]\n"
1659
- "fmul v19.4s, v27.4s, v9.s[1]\n"
1660
- "fmla v13.4s, v20.4s, v19.4s\n"
1661
- "movi v19.4s, #0x0\n"
1662
- "movi v20.4s, #0x0\n"
1663
- ".inst 0x4e88a633 // smmla v19.4s, v17.16b, v8.16b\n"
1664
- ".inst 0x4e9fa634 // smmla v20.4s, v17.16b, v31.16b\n"
1665
- "ldr q17, [x23, #0x30]\n"
1666
- ".inst 0x4e8fa633 // smmla v19.4s, v17.16b, v15.16b\n"
1667
- ".inst 0x4e81a634 // smmla v20.4s, v17.16b, v1.16b\n"
1668
- "ldr q17, [x23, #0x50]\n"
1669
- ".inst 0x4e95a633 // smmla v19.4s, v17.16b, v21.16b\n"
1670
- ".inst 0x4e90a634 // smmla v20.4s, v17.16b, v16.16b\n"
1671
- "ldr q17, [x23, #0x70]\n"
1672
- "add x23, x23, #0x88\n"
1673
- ".inst 0x4e9da633 // smmla v19.4s, v17.16b, v29.16b\n"
1674
- ".inst 0x4e83a634 // smmla v20.4s, v17.16b, v3.16b\n"
1675
- "uzp1 v17.2d, v19.2d, v20.2d\n"
1676
- "scvtf v17.4s, v17.4s, #0x4\n"
1677
- "uzp2 v20.2d, v19.2d, v20.2d\n"
1678
- "fmul v19.4s, v27.4s, v9.s[2]\n"
1679
- "fmul v9.4s, v27.4s, v9.s[3]\n"
1680
- "scvtf v20.4s, v20.4s, #0x4\n"
1681
- "fmla v22.4s, v17.4s, v19.4s\n"
1682
- "ldr q17, [x22, #0x10]\n"
1683
- "movi v19.4s, #0x0\n"
1684
- ".inst 0x4e88a653 // smmla v19.4s, v18.16b, v8.16b\n"
1685
- "fmla v23.4s, v20.4s, v9.4s\n"
1686
- "movi v20.4s, #0x0\n"
1687
- "movi v9.4s, #0x0\n"
1688
- ".inst 0x4e9fa654 // smmla v20.4s, v18.16b, v31.16b\n"
1689
- "ldr q18, [x22, #0x20]\n"
1690
- ".inst 0x4e88a629 // smmla v9.4s, v17.16b, v8.16b\n"
1691
- ".inst 0x4e8fa653 // smmla v19.4s, v18.16b, v15.16b\n"
1692
- ".inst 0x4e81a654 // smmla v20.4s, v18.16b, v1.16b\n"
1693
- "ldr q18, [x22, #0x40]\n"
1694
- ".inst 0x4e95a653 // smmla v19.4s, v18.16b, v21.16b\n"
1695
- ".inst 0x4e90a654 // smmla v20.4s, v18.16b, v16.16b\n"
1696
- "ldr q18, [x22, #0x60]\n"
1697
- ".inst 0x4e9da653 // smmla v19.4s, v18.16b, v29.16b\n"
1698
- ".inst 0x4e83a654 // smmla v20.4s, v18.16b, v3.16b\n"
1699
- "movi v18.4s, #0x0\n"
1700
- ".inst 0x4e9fa632 // smmla v18.4s, v17.16b, v31.16b\n"
1701
- "ldr q17, [x22, #0x30]\n"
1702
- ".inst 0x4e8fa629 // smmla v9.4s, v17.16b, v15.16b\n"
1703
- ".inst 0x4e81a632 // smmla v18.4s, v17.16b, v1.16b\n"
1704
- "ldr q17, [x22, #0x50]\n"
1705
- ".inst 0x4e95a629 // smmla v9.4s, v17.16b, v21.16b\n"
1706
- ".inst 0x4e90a632 // smmla v18.4s, v17.16b, v16.16b\n"
1707
- "ldr q17, [x22, #0x70]\n"
1708
- "add x22, x22, #0x88\n"
1709
- ".inst 0x4e9da629 // smmla v9.4s, v17.16b, v29.16b\n"
1710
- ".inst 0x4e83a632 // smmla v18.4s, v17.16b, v3.16b\n"
1711
- "uzp1 v17.2d, v19.2d, v20.2d\n"
1712
- "uzp2 v20.2d, v19.2d, v20.2d\n"
1713
- "fmul v19.4s, v27.4s, v0.s[0]\n"
1714
- "scvtf v17.4s, v17.4s, #0x4\n"
1715
- "scvtf v20.4s, v20.4s, #0x4\n"
1716
- "fmla v25.4s, v17.4s, v19.4s\n"
1717
- "ldr q19, [x21, #0x0]\n"
1718
- "fmul v17.4s, v27.4s, v0.s[1]\n"
1719
- "fmla v5.4s, v20.4s, v17.4s\n"
1720
- "ldr q17, [x21, #0x10]\n"
1721
- "uzp1 v20.2d, v9.2d, v18.2d\n"
1722
- "uzp2 v9.2d, v9.2d, v18.2d\n"
1723
- "fmul v18.4s, v27.4s, v0.s[2]\n"
1724
- "fmul v0.4s, v27.4s, v0.s[3]\n"
1725
- "scvtf v20.4s, v20.4s, #0x4\n"
1726
- "scvtf v9.4s, v9.4s, #0x4\n"
1727
- "fmla v7.4s, v20.4s, v18.4s\n"
1728
- "movi v20.4s, #0x0\n"
1729
- "movi v18.4s, #0x0\n"
1730
- ".inst 0x4e88a674 // smmla v20.4s, v19.16b, v8.16b\n"
1731
- ".inst 0x4e9fa672 // smmla v18.4s, v19.16b, v31.16b\n"
1732
- "ldr q19, [x21, #0x20]\n"
1733
- "fmla v4.4s, v9.4s, v0.4s\n"
1734
- "movi v9.4s, #0x0\n"
1735
- "movi v0.4s, #0x0\n"
1736
- ".inst 0x4e88a629 // smmla v9.4s, v17.16b, v8.16b\n"
1737
- "fmul v8.4s, v27.4s, v26.s[0]\n"
1738
- ".inst 0x4e9fa620 // smmla v0.4s, v17.16b, v31.16b\n"
1739
- "ldr q17, [x21, #0x30]\n"
1740
- ".inst 0x4e8fa674 // smmla v20.4s, v19.16b, v15.16b\n"
1741
- "fmul v31.4s, v27.4s, v26.s[1]\n"
1742
- ".inst 0x4e81a672 // smmla v18.4s, v19.16b, v1.16b\n"
1743
- "ldr q19, [x21, #0x40]\n"
1744
- ".inst 0x4e8fa629 // smmla v9.4s, v17.16b, v15.16b\n"
1745
- "fmul v15.4s, v27.4s, v26.s[2]\n"
1746
- "fmul v27.4s, v27.4s, v26.s[3]\n"
1747
- ".inst 0x4e81a620 // smmla v0.4s, v17.16b, v1.16b\n"
1748
- "ldr q1, [x21, #0x50]\n"
1749
- ".inst 0x4e95a674 // smmla v20.4s, v19.16b, v21.16b\n"
1750
- ".inst 0x4e90a672 // smmla v18.4s, v19.16b, v16.16b\n"
1751
- "ldr q26, [x21, #0x60]\n"
1752
- ".inst 0x4e95a429 // smmla v9.4s, v1.16b, v21.16b\n"
1753
- ".inst 0x4e90a420 // smmla v0.4s, v1.16b, v16.16b\n"
1754
- "ldr q21, [x21, #0x70]\n"
1755
- "add x21, x21, #0x88\n"
1756
- ".inst 0x4e9da754 // smmla v20.4s, v26.16b, v29.16b\n"
1757
- ".inst 0x4e83a752 // smmla v18.4s, v26.16b, v3.16b\n"
1758
- ".inst 0x4e9da6a9 // smmla v9.4s, v21.16b, v29.16b\n"
1759
- ".inst 0x4e83a6a0 // smmla v0.4s, v21.16b, v3.16b\n"
1760
- "uzp1 v29.2d, v20.2d, v18.2d\n"
1761
- "uzp2 v21.2d, v20.2d, v18.2d\n"
1762
- "scvtf v29.4s, v29.4s, #0x4\n"
1763
- "uzp1 v18.2d, v9.2d, v0.2d\n"
1764
- "uzp2 v16.2d, v9.2d, v0.2d\n"
1765
- "scvtf v21.4s, v21.4s, #0x4\n"
1766
- "fmla v6.4s, v29.4s, v8.4s\n"
1767
- "scvtf v18.4s, v18.4s, #0x4\n"
1768
- "scvtf v16.4s, v16.4s, #0x4\n"
1769
- "fmla v30.4s, v21.4s, v31.4s\n"
1770
- "fmla v24.4s, v18.4s, v15.4s\n"
1771
- "fmla v14.4s, v16.4s, v27.4s\n"
1772
- "bgt 3b\n"
1773
- "mov x20, %x[res_ptr]\n"
1774
- "subs x27, x27, #0x4\n"
1775
- "add %x[res_ptr], %x[res_ptr], #0x10\n"
1776
- "str q2, [x20, #0x0]\n"
1777
- "add x20, x20, %x[res_stride]\n"
1778
- "str q10, [x20, #0x0]\n"
1779
- "add x20, x20, %x[res_stride]\n"
1780
- "str q12, [x20, #0x0]\n"
1781
- "add x20, x20, %x[res_stride]\n"
1782
- "str q28, [x20, #0x0]\n"
1783
- "add x20, x20, %x[res_stride]\n"
1784
- "str q11, [x20, #0x0]\n"
1785
- "add x20, x20, %x[res_stride]\n"
1786
- "str q13, [x20, #0x0]\n"
1787
- "add x20, x20, %x[res_stride]\n"
1788
- "str q22, [x20, #0x0]\n"
1789
- "add x20, x20, %x[res_stride]\n"
1790
- "str q23, [x20, #0x0]\n"
1791
- "add x20, x20, %x[res_stride]\n"
1792
- "str q25, [x20, #0x0]\n"
1793
- "add x20, x20, %x[res_stride]\n"
1794
- "str q5, [x20, #0x0]\n"
1795
- "add x20, x20, %x[res_stride]\n"
1796
- "str q7, [x20, #0x0]\n"
1797
- "add x20, x20, %x[res_stride]\n"
1798
- "str q4, [x20, #0x0]\n"
1799
- "add x20, x20, %x[res_stride]\n"
1800
- "str q6, [x20, #0x0]\n"
1801
- "add x20, x20, %x[res_stride]\n"
1802
- "str q30, [x20, #0x0]\n"
1803
- "add x20, x20, %x[res_stride]\n"
1804
- "str q24, [x20, #0x0]\n"
1805
- "add x20, x20, %x[res_stride]\n"
1806
- "str q14, [x20, #0x0]\n"
1807
- "bne 2b\n"
1808
- "mov x20, #0x4\n"
1809
- "sub x10, x10, #0x10\n"
1810
- "cmp x10, #0x10\n"
1811
- "mov %x[res_ptr], x26\n"
1812
- "madd %x[a_ptr], x20, x9, %x[a_ptr]\n"
1813
- "bge 1b\n"
1814
- "4:" // Row loop skip
1815
- "cbz x10, 9f\n"
1816
- "5:" // Row tail: Row loop
1817
- "add x24, %x[b_ptr], #0x8\n"
1818
- "mov x23, %x[nc]\n"
1819
- "add x22, %x[res_ptr], %x[res_stride], LSL #2\n"
1820
- "6:" // Row tail: Column loop
1821
- "movi v2.16b, #0x0\n"
1822
- "movi v10.16b, #0x0\n"
1823
- "add x25, %x[a_ptr], #0x8\n"
1824
- "mov x21, %x[nb]\n"
1825
- "movi v12.16b, #0x0\n"
1826
- "movi v28.16b, #0x0\n"
1827
- "7:" // Row tail: Block loop
1828
- "ldr q6, [x24, #0x0]\n"
1829
- "ldr q5, [x24, #0x10]\n"
1830
- "movi v17.16b, #0x4\n"
1831
- "movi v8.4s, #0x0\n"
1832
- "ldr q4, [x25, #0x0]\n"
1833
- "ldr q13, [x25, #0x10]\n"
1834
- "movi v27.4s, #0x0\n"
1835
- "movi v0.4s, #0x0\n"
1836
- "ldr q31, [x24, #0x20]\n"
1837
- "ldr q14, [x24, #0x30]\n"
1838
- "movi v29.4s, #0x0\n"
1839
- "movi v22.16b, #0xf0\n"
1840
- "ldr q11, [x25, #0x20]\n"
1841
- "ldr q23, [x25, #0x30]\n"
1842
- "sshl v21.16b, v6.16b, v17.16b\n"
1843
- "sshl v16.16b, v5.16b, v17.16b\n"
1844
- "ldr q20, [x25, #0x40]\n"
1845
- "ldr q26, [x25, #0x50]\n"
1846
- "and v6.16b, v6.16b, v22.16b\n"
1847
- "and v5.16b, v5.16b, v22.16b\n"
1848
- "ldr q25, [x25, #0x60]\n"
1849
- "ldr q3, [x25, #0x70]\n"
1850
- "sshl v19.16b, v31.16b, v17.16b\n"
1851
- "sshl v18.16b, v14.16b, v17.16b\n"
1852
- "ldr d17, [x25, #-0x8]\n"
1853
- ".inst 0x4e95a488 // smmla v8.4s, v4.16b, v21.16b\n"
1854
- ".inst 0x4e90a49b // smmla v27.4s, v4.16b, v16.16b\n"
1855
- "and v31.16b, v31.16b, v22.16b\n"
1856
- ".inst 0x4e95a5a0 // smmla v0.4s, v13.16b, v21.16b\n"
1857
- ".inst 0x4e90a5bd // smmla v29.4s, v13.16b, v16.16b\n"
1858
- "and v14.16b, v14.16b, v22.16b\n"
1859
- "sub x20, x24, #0x8\n"
1860
- "ldr d16, [x20, #0x0]\n"
1861
- "subs x21, x21, #0x1\n"
1862
- "add x25, x25, #0x88\n"
1863
- "fcvtl v17.4s, v17.4h\n"
1864
- "add x24, x24, #0x48\n"
1865
- ".inst 0x4e93a568 // smmla v8.4s, v11.16b, v19.16b\n"
1866
- ".inst 0x4e92a57b // smmla v27.4s, v11.16b, v18.16b\n"
1867
- ".inst 0x4e93a6e0 // smmla v0.4s, v23.16b, v19.16b\n"
1868
- ".inst 0x4e92a6fd // smmla v29.4s, v23.16b, v18.16b\n"
1869
- "fcvtl v16.4s, v16.4h\n"
1870
- ".inst 0x4e86a688 // smmla v8.4s, v20.16b, v6.16b\n"
1871
- ".inst 0x4e85a69b // smmla v27.4s, v20.16b, v5.16b\n"
1872
- "fmul v23.4s, v16.4s, v17.s[0]\n"
1873
- "fmul v21.4s, v16.4s, v17.s[1]\n"
1874
- "fmul v1.4s, v16.4s, v17.s[2]\n"
1875
- "fmul v20.4s, v16.4s, v17.s[3]\n"
1876
- ".inst 0x4e86a740 // smmla v0.4s, v26.16b, v6.16b\n"
1877
- ".inst 0x4e85a75d // smmla v29.4s, v26.16b, v5.16b\n"
1878
- ".inst 0x4e9fa728 // smmla v8.4s, v25.16b, v31.16b\n"
1879
- ".inst 0x4e8ea73b // smmla v27.4s, v25.16b, v14.16b\n"
1880
- ".inst 0x4e9fa460 // smmla v0.4s, v3.16b, v31.16b\n"
1881
- ".inst 0x4e8ea47d // smmla v29.4s, v3.16b, v14.16b\n"
1882
- "uzp1 v19.2d, v8.2d, v27.2d\n"
1883
- "uzp2 v18.2d, v8.2d, v27.2d\n"
1884
- "scvtf v19.4s, v19.4s, #0x4\n"
1885
- "uzp1 v17.2d, v0.2d, v29.2d\n"
1886
- "uzp2 v16.2d, v0.2d, v29.2d\n"
1887
- "scvtf v18.4s, v18.4s, #0x4\n"
1888
- "fmla v2.4s, v19.4s, v23.4s\n"
1889
- "scvtf v17.4s, v17.4s, #0x4\n"
1890
- "scvtf v16.4s, v16.4s, #0x4\n"
1891
- "fmla v10.4s, v18.4s, v21.4s\n"
1892
- "fmla v12.4s, v17.4s, v1.4s\n"
1893
- "fmla v28.4s, v16.4s, v20.4s\n"
1894
- "bgt 7b\n"
1895
- "mov x20, %x[res_ptr]\n"
1896
- "cmp x10, #0x1\n"
1897
- "str q2, [x20, #0x0]\n"
1898
- "add x20, x20, %x[res_stride]\n"
1899
- "ble 8f\n"
1900
- "cmp x10, #0x2\n"
1901
- "str q10, [x20, #0x0]\n"
1902
- "add x20, x20, %x[res_stride]\n"
1903
- "ble 8f\n"
1904
- "cmp x10, #0x3\n"
1905
- "str q12, [x20, #0x0]\n"
1906
- "add x20, x20, %x[res_stride]\n"
1907
- "ble 8f\n"
1908
- "str q28, [x20, #0x0]\n"
1909
- "8:" // Row tail: Accumulator store skip
1910
- "subs x23, x23, #0x4\n"
1911
- "add %x[res_ptr], %x[res_ptr], #0x10\n"
1912
- "bne 6b\n"
1913
- "subs x10, x10, #0x4\n"
1914
- "add %x[a_ptr], %x[a_ptr], x9\n"
1915
- "mov %x[res_ptr], x22\n"
1916
- "bgt 5b\n"
1917
- "9:" // Row tail: Row loop skip
1918
- : [a_ptr] "+&r" (a_ptr), [res_ptr] "+&r" (res_ptr)
1919
- : [b_ptr] "r" (b_ptr), [nr] "r" (nr), [nb] "r" (nb), [res_stride] "r" (res_stride), [nc] "r" (nc)
1920
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
1921
- );
1922
- #elif defined(__ARM_NEON) && defined(__aarch64__)
1923
- LM_GGML_ASSERT((lm_ggml_cpu_has_sve() || lm_ggml_cpu_has_matmul_int8()) &&
1924
- "__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal "
1925
- "performance");
1926
- #else
1966
+ #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
1927
1967
  float sumf[4][4];
1928
1968
  int sumi;
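
Note the shape of this refactor, visible in the #endif just above: each optimized kernel now lives in its own braces and ends with "return;", while the portable scalar loops that follow are compiled unconditionally instead of being hidden behind an #else. A hedged sketch of the pattern (all names below are illustrative, none are library symbols):

#include <stdbool.h>
#include <stdio.h>

#define FAST_PATH_COMPILED 1                         /* stand-in for e.g. __ARM_NEON + i8mm */
static bool fast_path_usable(void) { return false; } /* stand-in runtime probe              */

static void gemm_like(void) {
#if FAST_PATH_COMPILED                      /* compile-time gate                     */
    if (fast_path_usable()) {               /* runtime gate                          */
        puts("optimized kernel");
        return;                             /* skip the fallback once the kernel ran */
    }
#endif
    puts("scalar reference kernel");        /* always compiled, always correct       */
}

The old structure raised an LM_GGML_ASSERT on unsupported ARM configurations (see the removed blocks further down); the new one simply falls through to the scalar code.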
1929
1969
 
@@ -1943,7 +1983,7 @@ void lm_ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void
1943
1983
  const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
1944
1984
  const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
1945
1985
  sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
1946
- (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
1986
+ (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
1947
1987
  }
1948
1988
  sumf[m][j] += sumi * LM_GGML_FP16_TO_FP32(b_ptr[l].d[j]) * LM_GGML_FP16_TO_FP32(a_ptr[l].d[m]);
1949
1989
  }
@@ -1956,7 +1996,6 @@ void lm_ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void
1956
1996
  }
1957
1997
  }
1958
1998
  }
1959
- #endif
1960
1999
  }
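
One detail shared by the assembly kernels in this file: the int32 accumulators produced by sdot/smmla are converted with "scvtf v.4s, v.4s, #0x4", a signed fixed-point-to-float convert with 4 fractional bits. Because the nibbles enter the dot products pre-scaled by 16 (sshl #4 for the low halves, and 0xf0 for the high halves), this one instruction both converts to float and divides the surplus factor of 16 back out. A scalar equivalent, as a sketch:

static inline float scvtf_fixed4(int acc_times_16) {
    return (float) acc_times_16 / 16.0f;   /* what scvtf #0x4 computes per lane */
}

The scalar fallbacks instead shift with ">> 4" inside the integer loop; both forms recover the same dot product.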
1961
2000
 
1962
2001
  void lm_ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
@@ -1979,8 +2018,9 @@ void lm_ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
1979
2018
  UNUSED(ncols_interleaved);
1980
2019
  UNUSED(blocklen);
1981
2020
 
1982
- #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
1983
- if (lm_ggml_sve_cnt_b == QK8_0) {
2021
+ #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
2022
+ #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
2023
+ if (lm_ggml_cpu_has_sve() && lm_ggml_cpu_has_matmul_int8() && sve_lane_count() == QK8_0) {
1984
2024
  const void * b_ptr = vx;
1985
2025
  const void * a_ptr = vy;
1986
2026
  float * res_ptr = s;
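
The rewritten guard above pairs the compile-time SVE/i8mm check with runtime probes, including sve_lane_count() == QK8_0, so the kernel only runs when the hardware vector length is 256 bits (QK8_0 is 32 bytes, i.e. one q8_0 block per SVE register). The diff does not show sve_lane_count() itself; a plausible definition, offered purely as an assumption, would wrap the ACLE counting intrinsic:

#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>

/* Assumed implementation, not taken from the package: svcntb() reports the
 * runtime SVE vector length in bytes, so 32 means 256-bit vectors. */
static inline int sve_lane_count(void) {
    return (int) svcntb();
}
#endif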
@@ -2390,134 +2430,682 @@ void lm_ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
2390
2430
  );
2391
2431
  return;
2392
2432
  }
2393
- else if (lm_ggml_cpu_has_neon() && lm_ggml_cpu_has_matmul_int8()) {
2394
- LM_GGML_ASSERT((lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) &&
2395
- "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
2396
- "performance");
2397
- }
2398
- else if (lm_ggml_cpu_has_neon()) {
2399
- LM_GGML_ASSERT(((lm_ggml_cpu_has_sve() && (lm_ggml_sve_cnt_b == QK8_0)) || lm_ggml_cpu_has_matmul_int8()) &&
2400
- "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
2401
- "quantization format for optimal performance");
2402
- }
2403
- #endif
2404
- #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
2405
- LM_GGML_ASSERT(lm_ggml_cpu_has_sve() &&
2406
- "__ARM_FEATURE_SVE not defined, use the Q4_0_4_8 quantization format for optimal performance");
2407
- #elif defined(__ARM_NEON) && defined(__aarch64__)
2408
- LM_GGML_ASSERT((lm_ggml_cpu_has_sve() || lm_ggml_cpu_has_matmul_int8()) &&
2409
- "__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal "
2410
- "performance");
2433
+ #endif // #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
2411
2434
  #elif defined(__AVX2__) || defined(__AVX512F__)
2412
- const block_q4_0x8 * b_ptr_start = (const block_q4_0x8 *)vx;
2413
- const block_q8_0x4 * a_ptr_start = (const block_q8_0x4 *)vy;
2414
- int64_t b_nb = n / QK4_0;
2415
- int64_t y = 0;
2416
- // Mask to mask out nibbles from packed bytes
2417
- const __m256i m4b = _mm256_set1_epi8(0x0F);
2418
- const __m128i loadMask = _mm_blend_epi32(_mm_setzero_si128(), _mm_set1_epi32(0xFFFFFFFF), 3);
2419
- // Lookup table to convert signed nibbles to signed bytes
2420
- __m256i signextendlut = _mm256_castsi128_si256(_mm_set_epi8(-1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0));
2421
- signextendlut = _mm256_permute2f128_si256(signextendlut, signextendlut, 0);
2422
- // Permute mask used for easier vector processing at later stages
2423
- __m256i requiredOrder = _mm256_set_epi32(3 ,2 ,1 ,0, 7 ,6, 5, 4);
2435
+ {
2436
+ const block_q4_0x8 * b_ptr_start = (const block_q4_0x8 *)vx;
2437
+ const block_q8_0x4 * a_ptr_start = (const block_q8_0x4 *)vy;
2438
+ int64_t b_nb = n / QK4_0;
2439
+ int64_t y = 0;
2440
+ // Mask to mask out nibbles from packed bytes
2441
+ const __m256i m4b = _mm256_set1_epi8(0x0F);
2442
+ const __m128i loadMask = _mm_blend_epi32(_mm_setzero_si128(), _mm_set1_epi32(0xFFFFFFFF), 3);
2443
+ // Lookup table to convert signed nibbles to signed bytes
2444
+ __m256i signextendlut = _mm256_castsi128_si256(_mm_set_epi8(-1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0));
2445
+ signextendlut = _mm256_permute2f128_si256(signextendlut, signextendlut, 0);
2446
+ // Permute mask used for easier vector processing at later stages
2447
+ __m256i requiredOrder = _mm256_set_epi32(3, 2, 1, 0, 7, 6, 5, 4);
2448
+ int64_t xstart = 0;
2449
+ int anr = nr - nr%16; // Used to align nr with boundary of 16
2450
+ #ifdef __AVX512F__
2451
+ int anc = nc - nc%16; // Used to align nc with boundary of 16
2452
+ // Mask to mask out nibbles from packed bytes expanded to 512 bit length
2453
+ const __m512i m4bexpanded = _mm512_set1_epi8(0x0F);
2454
+ // Lookup table to convert signed nibbles to signed bytes expanded to 512 bit length
2455
+ __m512i signextendlutexpanded = _mm512_inserti32x8(_mm512_castsi256_si512(signextendlut), signextendlut, 1);
2456
+
2457
+ // Take group of four block_q8_0x4 structures at each pass of the loop and perform dot product operation
2458
+ for (; y < anr / 4; y += 4) {
2459
+
2460
+ const block_q8_0x4 * a_ptrs[4];
2461
+
2462
+ a_ptrs[0] = a_ptr_start + (y * nb);
2463
+ for (int i = 0; i < 3; ++i) {
2464
+ a_ptrs[i + 1] = a_ptrs[i] + nb;
2465
+ }
2466
+
2467
+ // Take group of two block_q4_0x8 structures at each pass of the loop and perform dot product operation
2468
+ for (int64_t x = 0; x < anc / 8; x += 2) {
2469
+
2470
+ const block_q4_0x8 * b_ptr_0 = b_ptr_start + ((x) * b_nb);
2471
+ const block_q4_0x8 * b_ptr_1 = b_ptr_start + ((x + 1) * b_nb);
2472
+
2473
+ // Master FP accumulators
2474
+ __m512 acc_rows[16];
2475
+ for (int i = 0; i < 16; i++) {
2476
+ acc_rows[i] = _mm512_setzero_ps();
2477
+ }
2478
+
2479
+ for (int64_t b = 0; b < nb; b++) {
2480
+ // Load the sixteen block_q4_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....BE,BF
2481
+ const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs));
2482
+ const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 32));
2483
+ const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 64));
2484
+ const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 96));
2485
+
2486
+ const __m256i rhs_raw_mat_89AB_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs));
2487
+ const __m256i rhs_raw_mat_CDEF_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 32));
2488
+ const __m256i rhs_raw_mat_89AB_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 64));
2489
+ const __m256i rhs_raw_mat_CDEF_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 96));
2490
+
2491
+ // Save the values in the following vectors in the formats B0B1B4B5B8B9BCBD, B2B3B6B7BABBBEBF for further processing and storing of values
2492
+ const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
2493
+ const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
2494
+ const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
2495
+ const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
2424
2496
 
2425
- // Take group of four block_q8_0x4 structures at each pass of the loop and perform dot product operation
2426
- int anr = nr - nr %16; // Used to align nr with boundary of 16
2497
+ const __m256i rhs_raw_mat_89CD_0 = _mm256_blend_epi32(rhs_raw_mat_89AB_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_0, requiredOrder), 240);
2498
+ const __m256i rhs_raw_mat_ABEF_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_0, requiredOrder), rhs_raw_mat_CDEF_0, 240);
2499
+ const __m256i rhs_raw_mat_89CD_1 = _mm256_blend_epi32(rhs_raw_mat_89AB_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_1, requiredOrder), 240);
2500
+ const __m256i rhs_raw_mat_ABEF_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_1, requiredOrder), rhs_raw_mat_CDEF_1, 240);
2427
2501
 
2428
- for (; y < anr / 4; y += 4) {
2429
- const block_q8_0x4 * a_ptrs[4];
2502
+ const __m512i rhs_raw_mat_014589CD_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_0), rhs_raw_mat_89CD_0, 1);
2503
+ const __m512i rhs_raw_mat_2367ABEF_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_0), rhs_raw_mat_ABEF_0, 1);
2504
+ const __m512i rhs_raw_mat_014589CD_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_1), rhs_raw_mat_89CD_1, 1);
2505
+ const __m512i rhs_raw_mat_2367ABEF_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_1), rhs_raw_mat_ABEF_1, 1);
2506
+
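For reference, the immediate 240 (0xF0) in the `_mm256_blend_epi32` calls above selects a source per 32-bit element: control bit i set picks the second operand. A scalar sketch of that selection (helper name illustrative):

    #include <stdint.h>

    // Scalar model of _mm256_blend_epi32(a, b, 0xF0): bit i of the control
    // chooses b for dword i, so 0xF0 keeps a's low four dwords and takes
    // b's high four - producing the B0B1B4B5-style pairings noted above.
    static void blend_epi32_240(const int32_t a[8], const int32_t b[8], int32_t out[8]) {
        for (int i = 0; i < 8; i++) {
            out[i] = ((0xF0 >> i) & 1) ? b[i] : a[i];
        }
    }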
2507
+ // 4-bit -> 8-bit - Sign is maintained
2508
+ const __m512i rhs_mat_014589CD_0 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_014589CD_0, m4bexpanded)); //B0(0-7) B1(0-7) B4(0-7) B5(0-7) B8(0-7) B9(0-7) BC(0-7) BD(0-7)
2509
+ const __m512i rhs_mat_2367ABEF_0 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_2367ABEF_0, m4bexpanded)); //B2(0-7) B3(0-7) B6(0-7) B7(0-7) BA(0-7) BB(0-7) BE(0-7) BF(0-7)
2510
+
2511
+ const __m512i rhs_mat_014589CD_1 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_014589CD_1, m4bexpanded)); //B0(8-15) B1(8-15) B4(8-15) B5(8-15) B8(8-15) B9(8-15) BC(8-15) BD(8-15)
2512
+ const __m512i rhs_mat_2367ABEF_1 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_2367ABEF_1, m4bexpanded)); //B2(8-15) B3(8-15) B6(8-15) B7(8-15) BA(8-15) BB(8-15) BE(8-15) BF(8-15)
2513
+
2514
+ const __m512i rhs_mat_014589CD_2 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_0, 4), m4bexpanded)); //B0(16-23) B1(16-23) B4(16-23) B5(16-23) B8(16-23) B9(16-23) BC(16-23) BD(16-23)
2515
+ const __m512i rhs_mat_2367ABEF_2 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_0, 4), m4bexpanded)); //B2(16-23) B3(16-23) B6(16-23) B7(16-23) BA(16-23) BB(16-23) BE(16-23) BF(16-23)
2516
+
2517
+ const __m512i rhs_mat_014589CD_3 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_1, 4), m4bexpanded)); //B0(24-31) B1(24-31) B4(24-31) B5(24-31) B8(24-31) B9(24-31) BC(24-31) BD(24-31)
2518
+ const __m512i rhs_mat_2367ABEF_3 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_1, 4), m4bexpanded)); //B2(24-31) B3(24-31) B6(24-31) B7(24-31) BA(24-31) BB(24-31) BE(24-31) BF(24-31)
2519
+
2520
+ // Shuffle pattern one - right side input
2521
+ const __m512i rhs_mat_014589CD_0_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, 136); //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3) B8(0-3) B9(0-3) B8(0-3) B9(0-3) BC(0-3) BD(0-3) BC(0-3) BD(0-3)
2522
+ const __m512i rhs_mat_2367ABEF_0_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, 136); //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3) BA(0-3) BB(0-3) BA(0-3) BB(0-3) BE(0-3) BF(0-3) BE(0-3) BF(0-3)
2523
+
2524
+ const __m512i rhs_mat_014589CD_1_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, 136); //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11) B8(8-11) B9(8-11) B8(8-11) B9(8-11) BC(8-11) BD(8-11) BC(8-11) BD(8-11)
2525
+ const __m512i rhs_mat_2367ABEF_1_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, 136); //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11) BA(8-11) BB(8-11) BA(8-11) BB(8-11) BE(8-11) BF(8-11) BE(8-11) BF(8-11)
2526
+
2527
+ const __m512i rhs_mat_014589CD_2_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, 136); //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19) B8(16-19) B9(16-19) B8(16-19) B9(16-19) BC(16-19) BD(16-19) BC(16-19) BD(16-19)
2528
+ const __m512i rhs_mat_2367ABEF_2_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, 136); //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19) BA(16-19) BB(16-19) BA(16-19) BB(16-19) BE(16-19) BF(16-19) BE(16-19) BF(16-19)
2529
+
2530
+ const __m512i rhs_mat_014589CD_3_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, 136); //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27) B8(24-27) B9(24-27) B8(24-27) B9(24-27) BC(24-27) BD(24-27) BC(24-27) BD(24-27)
2531
+ const __m512i rhs_mat_2367ABEF_3_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, 136); //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27) BA(24-27) BB(24-27) BA(24-27) BB(24-27) BE(24-27) BF(24-27) BE(24-27) BF(24-27)
2532
+
2533
+ // Shuffle pattern two - right side input
2534
+
2535
+ const __m512i rhs_mat_014589CD_0_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, 221); //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7) B8(4-7) B9(4-7) B8(4-7) B9(4-7) BC(4-7) BD(4-7) BC(4-7) BD(4-7)
2536
+ const __m512i rhs_mat_2367ABEF_0_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, 221); //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7) BA(4-7) BB(4-7) BA(4-7) BB(4-7) BE(4-7) BF(4-7) BE(4-7) BF(4-7)
2537
+
2538
+ const __m512i rhs_mat_014589CD_1_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, 221); //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15) B8(12-15) B9(12-15) B8(12-15) B9(12-15) BC(12-15) BD(12-15) BC(12-15) BD(12-15)
2539
+ const __m512i rhs_mat_2367ABEF_1_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, 221); //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15) BA(12-15) BB(12-15) BA(12-15) BB(12-15) BE(12-15) BF(12-15) BE(12-15) BF(12-15)
2540
+
2541
+ const __m512i rhs_mat_014589CD_2_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, 221); //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23) B8(20-23) B9(20-23) B8(20-23) B9(20-23) BC(20-23) BD(20-23) BC(20-23) BD(20-23)
2542
+ const __m512i rhs_mat_2367ABEF_2_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, 221); //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23) BA(20-23) BB(20-23) BA(20-23) BB(20-23) BE(20-23) BF(20-23) BE(20-23) BF(20-23)
2543
+
2544
+ const __m512i rhs_mat_014589CD_3_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, 221); //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31) B8(28-31) B9(28-31) B8(28-31) B9(28-31) BC(28-31) BD(28-31) BC(28-31) BD(28-31)
2545
+ const __m512i rhs_mat_2367ABEF_3_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, 221); //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31) BA(28-31) BB(28-31) BA(28-31) BB(28-31) BE(28-31) BF(28-31) BE(28-31) BF(28-31)
2546
+
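The `_mm512_shuffle_epi32` immediates decode as four 2-bit dword selectors applied within each 128-bit lane: 136 (0b10001000) picks {0,2,0,2} and 221 (0b11011101) picks {1,3,1,3}, which is how the registers split into the even/odd 4-byte groups annotated above (the LHS immediates 160 and 245 further down pick {0,0,2,2} and {1,1,3,3} by the same rule). A scalar sketch of one lane (helper name illustrative):

    #include <stdint.h>

    // Scalar model of a per-128-bit-lane dword shuffle: the selector for
    // output dword i lives in bits [2i+1:2i] of the immediate.
    static void shuffle_epi32_lane(const int32_t lane[4], unsigned imm, int32_t out[4]) {
        for (int i = 0; i < 4; i++) {
            out[i] = lane[(imm >> (2 * i)) & 3];
        }
    }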
2547
+ // Scale values - Load the weight scale values of two block_q4_0x8
2548
+ const __m512 col_scale_f32 = LM_GGML_F32Cx8x2_LOAD(b_ptr_0[b].d, b_ptr_1[b].d);
2549
+
2550
+ // Process LHS in groups of four rows, one block_q8_0x4 at a time, handled as two row pairs per register
2551
+ for (int rp = 0; rp < 4; rp++) {
2552
+
2553
+ // Load the four block_q8_0 quantized values interleaved with each other in chunks of eight - A0,A1,A2,A3
2554
+ // Loaded as a set of 128 bit vectors, repeated into a 256 bit vector, and then repeated again into a 512 bit vector
2555
+ __m256i lhs_mat_ymm_0123_0 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs)));
2556
+ __m256i lhs_mat_ymm_01_0 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_0, lhs_mat_ymm_0123_0, 0);
2557
+ __m256i lhs_mat_ymm_23_0 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_0, lhs_mat_ymm_0123_0, 17);
2558
+ __m256i lhs_mat_ymm_0123_1 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 32)));
2559
+ __m256i lhs_mat_ymm_01_1 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_1, lhs_mat_ymm_0123_1, 0);
2560
+ __m256i lhs_mat_ymm_23_1 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_1, lhs_mat_ymm_0123_1, 17);
2561
+ __m256i lhs_mat_ymm_0123_2 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 64)));
2562
+ __m256i lhs_mat_ymm_01_2 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_2, lhs_mat_ymm_0123_2, 0);
2563
+ __m256i lhs_mat_ymm_23_2 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_2, lhs_mat_ymm_0123_2, 17);
2564
+ __m256i lhs_mat_ymm_0123_3 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 96)));
2565
+ __m256i lhs_mat_ymm_01_3 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_3, lhs_mat_ymm_0123_3, 0);
2566
+ __m256i lhs_mat_ymm_23_3 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_3, lhs_mat_ymm_0123_3, 17);
2567
+
2568
+ __m512i lhs_mat_01_0 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_0), lhs_mat_ymm_01_0, 1);
2569
+ __m512i lhs_mat_23_0 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_0), lhs_mat_ymm_23_0, 1);
2570
+ __m512i lhs_mat_01_1 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_1), lhs_mat_ymm_01_1, 1);
2571
+ __m512i lhs_mat_23_1 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_1), lhs_mat_ymm_23_1, 1);
2572
+ __m512i lhs_mat_01_2 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_2), lhs_mat_ymm_01_2, 1);
2573
+ __m512i lhs_mat_23_2 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_2), lhs_mat_ymm_23_2, 1);
2574
+ __m512i lhs_mat_01_3 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_3), lhs_mat_ymm_01_3, 1);
2575
+ __m512i lhs_mat_23_3 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_3), lhs_mat_ymm_23_3, 1);
2576
+
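Here `_mm256_permute2f128_si256(x, x, 0)` broadcasts the low 128-bit half of `x`, immediate 17 (0x11) broadcasts the high half, and `_mm512_inserti32x8` then repeats the 256-bit result once more. A sketch covering just these two immediates (general immediates can select from either source operand):

    #include <stdint.h>

    // Scalar model of _mm256_permute2f128_si256(x, x, imm) for imm 0 and 17:
    // 0 duplicates the low 128-bit lane, 17 duplicates the high one.
    static void dup_128bit_lane(const int32_t x[8], unsigned imm, int32_t out[8]) {
        const int32_t *half = (imm & 1) ? x + 4 : x; // pick low or high lane
        for (int i = 0; i < 4; i++) {
            out[i]     = half[i];
            out[i + 4] = half[i];
        }
    }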
2577
+ // Shuffle pattern one - left side input
2578
+
2579
+ const __m512i lhs_mat_01_0_sp1 = _mm512_shuffle_epi32(lhs_mat_01_0, 160); //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3)
2580
+ const __m512i lhs_mat_23_0_sp1 = _mm512_shuffle_epi32(lhs_mat_23_0, 160); //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3)
2581
+
2582
+ const __m512i lhs_mat_01_1_sp1 = _mm512_shuffle_epi32(lhs_mat_01_1, 160); //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11)
2583
+ const __m512i lhs_mat_23_1_sp1 = _mm512_shuffle_epi32(lhs_mat_23_1, 160); //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11)
2584
+
2585
+ const __m512i lhs_mat_01_2_sp1 = _mm512_shuffle_epi32(lhs_mat_01_2, 160); //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19)
2586
+ const __m512i lhs_mat_23_2_sp1 = _mm512_shuffle_epi32(lhs_mat_23_2, 160); //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19)
2587
+
2588
+ const __m512i lhs_mat_01_3_sp1 = _mm512_shuffle_epi32(lhs_mat_01_3, 160); //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27)
2589
+ const __m512i lhs_mat_23_3_sp1 = _mm512_shuffle_epi32(lhs_mat_23_3, 160); //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27)
2590
+
2591
+ // Shuffle pattern two - left side input
2592
+
2593
+ const __m512i lhs_mat_01_0_sp2 = _mm512_shuffle_epi32(lhs_mat_01_0, 245); //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7)
2594
+ const __m512i lhs_mat_23_0_sp2 = _mm512_shuffle_epi32(lhs_mat_23_0, 245); //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7)
2595
+
2596
+ const __m512i lhs_mat_01_1_sp2 = _mm512_shuffle_epi32(lhs_mat_01_1, 245); //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15)
2597
+ const __m512i lhs_mat_23_1_sp2 = _mm512_shuffle_epi32(lhs_mat_23_1, 245); //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15)
2598
+
2599
+ const __m512i lhs_mat_01_2_sp2 = _mm512_shuffle_epi32(lhs_mat_01_2, 245); //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23)
2600
+ const __m512i lhs_mat_23_2_sp2 = _mm512_shuffle_epi32(lhs_mat_23_2, 245); //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23)
2601
+
2602
+ const __m512i lhs_mat_01_3_sp2 = _mm512_shuffle_epi32(lhs_mat_01_3, 245); //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31)
2603
+ const __m512i lhs_mat_23_3_sp2 = _mm512_shuffle_epi32(lhs_mat_23_3, 245); //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31)
2604
+
2605
+ // The values arranged in the shuffle patterns are combined with a dot product within each 32 bit lane, i.e. corresponding bytes are multiplied and the products added into a 32 bit integer within that lane
2606
+ // Resembles the MMLA instructions that produce 2x2 tiles in the Arm version
2607
+ __m512i iacc_mat_00_sp1 =
2608
+ _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp1, rhs_mat_014589CD_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp1, rhs_mat_014589CD_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp1, rhs_mat_014589CD_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp1, rhs_mat_014589CD_0_sp1));
2609
+ __m512i iacc_mat_01_sp1 =
2610
+ _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp1, rhs_mat_2367ABEF_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp1, rhs_mat_2367ABEF_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp1, rhs_mat_2367ABEF_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp1, rhs_mat_2367ABEF_0_sp1));
2611
+ __m512i iacc_mat_10_sp1 =
2612
+ _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp1, rhs_mat_014589CD_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp1, rhs_mat_014589CD_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp1, rhs_mat_014589CD_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp1, rhs_mat_014589CD_0_sp1));
2613
+ __m512i iacc_mat_11_sp1 =
2614
+ _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp1, rhs_mat_2367ABEF_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp1, rhs_mat_2367ABEF_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp1, rhs_mat_2367ABEF_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp1, rhs_mat_2367ABEF_0_sp1));
2615
+ __m512i iacc_mat_00_sp2 =
2616
+ _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp2, rhs_mat_014589CD_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp2, rhs_mat_014589CD_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp2, rhs_mat_014589CD_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp2, rhs_mat_014589CD_0_sp2));
2617
+ __m512i iacc_mat_01_sp2 =
2618
+ _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp2, rhs_mat_2367ABEF_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp2, rhs_mat_2367ABEF_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp2, rhs_mat_2367ABEF_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp2, rhs_mat_2367ABEF_0_sp2));
2619
+ __m512i iacc_mat_10_sp2 =
2620
+ _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp2, rhs_mat_014589CD_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp2, rhs_mat_014589CD_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp2, rhs_mat_014589CD_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp2, rhs_mat_014589CD_0_sp2));
2621
+ __m512i iacc_mat_11_sp2 =
2622
+ _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp2, rhs_mat_2367ABEF_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp2, rhs_mat_2367ABEF_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp2, rhs_mat_2367ABEF_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp2, rhs_mat_2367ABEF_0_sp2));
2623
+
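Each `mul_sum_i8_pairs_int32x16` call above reduces four corresponding signed bytes per 32-bit lane to one int32; the four chained additions then cover 16 of the 32 bytes of a block, and the sp1/sp2 halves are summed below to cover the rest. A scalar sketch of a single lane (helper name illustrative):

    #include <stdint.h>

    // One 32-bit lane of mul_sum_i8_pairs_int32x16: multiply four signed
    // byte pairs and sum the products into a single int32.
    static int32_t mul_sum_i8_lane(const int8_t a[4], const int8_t b[4]) {
        int32_t acc = 0;
        for (int i = 0; i < 4; i++) {
            acc += (int32_t)a[i] * (int32_t)b[i];
        }
        return acc;
    }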
2624
+ // Outputs of both shuffle patterns are added in order to sum the dot product outputs of all 32 values in the block
2625
+ __m512i iacc_mat_00 = _mm512_add_epi32(iacc_mat_00_sp1, iacc_mat_00_sp2);
2626
+ __m512i iacc_mat_01 = _mm512_add_epi32(iacc_mat_01_sp1, iacc_mat_01_sp2);
2627
+ __m512i iacc_mat_10 = _mm512_add_epi32(iacc_mat_10_sp1, iacc_mat_10_sp2);
2628
+ __m512i iacc_mat_11 = _mm512_add_epi32(iacc_mat_11_sp1, iacc_mat_11_sp2);
2629
+
2630
+
2631
+ // Straighten out to make 4 row vectors
2632
+ __m512i iacc_row_0 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_00, _mm512_shuffle_epi32(iacc_mat_01, 78));
2633
+ __m512i iacc_row_1 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_00, 78), iacc_mat_01);
2634
+ __m512i iacc_row_2 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_10, _mm512_shuffle_epi32(iacc_mat_11, 78));
2635
+ __m512i iacc_row_3 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_10, 78), iacc_mat_11);
2636
+
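The straightening step re-interleaves the 2x2 tile outputs into row order: shuffle immediate 78 (0b01001110) swaps dword pairs within each 128-bit lane, and the 0xCCCC mask (binary 1100 per group of four) takes dwords 2 and 3 from the second source. A scalar sketch of the masked blend (helper name illustrative):

    #include <stdint.h>

    // Scalar model of _mm512_mask_blend_epi32(0xCCCC, a, b): mask bit i set
    // selects b for dword i, so each dword quad becomes a0 a1 b2 b3.
    static void mask_blend_cccc(const int32_t a[16], const int32_t b[16], int32_t out[16]) {
        for (int i = 0; i < 16; i++) {
            out[i] = ((0xCCCC >> i) & 1) ? b[i] : a[i];
        }
    }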
2637
+ // Load the scale values for all 4 Q8_0 blocks and repeat them across lanes
2638
+ const __m128i row_scale_f16 = _mm_shuffle_epi32(_mm_maskload_epi32((int const*)(a_ptrs[rp][b].d), loadMask), 68);
2639
+ const __m512 row_scale_f32 = LM_GGML_F32Cx16_REPEAT_LOAD(row_scale_f16);
2640
+
2641
+ // Multiply with the appropriate scales and accumulate
2642
+ acc_rows[rp * 4] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]);
2643
+ acc_rows[rp * 4 + 1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]);
2644
+ acc_rows[rp * 4 + 2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
2645
+ acc_rows[rp * 4 + 3] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_3), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_rows[rp * 4 + 3]);
2646
+ }
2647
+ }
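The tail of each block iteration converts the int32 tile to float and fuses in the product of the column (Q4_0) and row (Q8_0) block scales via `_mm512_fmadd_ps`. One scalar update, assuming the fp16 `d` fields have already been converted to float:

    #include <stdint.h>

    // Scalar form of one fmadd lane above: acc += idot * (d_col * d_row),
    // where d_col/d_row are the dequantized per-block scales (assumed
    // pre-converted from lm_ggml_fp16_t).
    static inline float accum_scaled(float acc, int32_t idot, float d_col, float d_row) {
        return acc + (float)idot * (d_col * d_row);
    }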
2430
2648
 
2431
- a_ptrs[0] = a_ptr_start + (y * nb);
2432
- for (int i = 0; i < 3; ++i) {
2433
- a_ptrs[i + 1] = a_ptrs[i] + nb;
2649
+ // Store the accumulated values
2650
+ for (int i = 0; i < 16; i++) {
2651
+ _mm512_storeu_ps((float *)(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]);
2652
+ }
2653
+ }
2434
2654
  }
2655
+ // Take one block_q8_0x4 structure at each pass of the loop and perform the dot product operation
2656
+ for (; y < nr / 4; y ++) {
2435
2657
 
2436
- // Take group of eight block_q4_0x8 structures at each pass of the loop and perform dot product operation
2437
- for (int64_t x = 0; x < nc / 8; x++) {
2658
+ const block_q8_0x4 * a_ptr = a_ptr_start + (y * nb);
2438
2659
 
2439
- const block_q4_0x8 * b_ptr = b_ptr_start + (x * b_nb);
2660
+ // Take a group of two block_q4_0x8 structures at each pass of the loop and perform the dot product operation
2661
+ for (int64_t x = 0; x < anc / 8; x += 2) {
2662
+
2663
+ const block_q4_0x8 * b_ptr_0 = b_ptr_start + ((x) * b_nb);
2664
+ const block_q4_0x8 * b_ptr_1 = b_ptr_start + ((x + 1) * b_nb);
2665
+
2666
+ // Master FP accumulators
2667
+ __m512 acc_rows[4];
2668
+ for (int i = 0; i < 4; i++) {
2669
+ acc_rows[i] = _mm512_setzero_ps();
2670
+ }
2671
+
2672
+ for (int64_t b = 0; b < nb; b++) {
2673
+ // Load the sixteen block_q4_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....BE,BF
2674
+ const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs));
2675
+ const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 32));
2676
+ const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 64));
2677
+ const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_0[b].qs + 96));
2678
+
2679
+ const __m256i rhs_raw_mat_89AB_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs));
2680
+ const __m256i rhs_raw_mat_CDEF_0 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 32));
2681
+ const __m256i rhs_raw_mat_89AB_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 64));
2682
+ const __m256i rhs_raw_mat_CDEF_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 96));
2683
+
2684
+ // Save the values in the following vectors in the formats B0B1B4B5B8B9BCBD, B2B3B6B7BABBBEBF for further processing and storing of values
2685
+ const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
2686
+ const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
2687
+ const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
2688
+ const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
2689
+
2690
+ const __m256i rhs_raw_mat_89CD_0 = _mm256_blend_epi32(rhs_raw_mat_89AB_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_0, requiredOrder), 240);
2691
+ const __m256i rhs_raw_mat_ABEF_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_0, requiredOrder), rhs_raw_mat_CDEF_0, 240);
2692
+ const __m256i rhs_raw_mat_89CD_1 = _mm256_blend_epi32(rhs_raw_mat_89AB_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_CDEF_1, requiredOrder), 240);
2693
+ const __m256i rhs_raw_mat_ABEF_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_89AB_1, requiredOrder), rhs_raw_mat_CDEF_1, 240);
2694
+
2695
+ const __m512i rhs_raw_mat_014589CD_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_0), rhs_raw_mat_89CD_0, 1);
2696
+ const __m512i rhs_raw_mat_2367ABEF_0 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_0), rhs_raw_mat_ABEF_0, 1);
2697
+ const __m512i rhs_raw_mat_014589CD_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_0145_1), rhs_raw_mat_89CD_1, 1);
2698
+ const __m512i rhs_raw_mat_2367ABEF_1 = _mm512_inserti32x8(_mm512_castsi256_si512(rhs_raw_mat_2367_1), rhs_raw_mat_ABEF_1, 1);
2699
+
2700
+ // 4-bit -> 8-bit - Sign is maintained
2701
+ const __m512i rhs_mat_014589CD_0 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_014589CD_0, m4bexpanded)); //B0(0-7) B1(0-7) B4(0-7) B5(0-7) B8(0-7) B9(0-7) BC(0-7) BD(0-7)
2702
+ const __m512i rhs_mat_2367ABEF_0 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_2367ABEF_0, m4bexpanded)); //B2(0-7) B3(0-7) B6(0-7) B7(0-7) BA(0-7) BB(0-7) BE(0-7) BF(0-7)
2703
+
2704
+ const __m512i rhs_mat_014589CD_1 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_014589CD_1, m4bexpanded)); //B0(8-15) B1(8-15) B4(8-15) B5(8-15) B8(8-15) B9(8-15) BC(8-15) BD(8-15)
2705
+ const __m512i rhs_mat_2367ABEF_1 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(rhs_raw_mat_2367ABEF_1, m4bexpanded)); //B2(8-15) B3(8-15) B6(8-15) B7(8-15) BA(8-15) BB(8-15) BE(8-15) BF(8-15)
2706
+
2707
+ const __m512i rhs_mat_014589CD_2 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_0, 4), m4bexpanded)); //B0(16-23) B1(16-23) B4(16-23) B5(16-23) B8(16-23) B9(16-23) BC(16-23) BD(16-23)
2708
+ const __m512i rhs_mat_2367ABEF_2 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_0, 4), m4bexpanded)); //B2(16-23) B3(16-23) B6(16-23) B7(16-23) BA(16-23) BB(16-23) BE(16-23) BF(16-23)
2709
+
2710
+ const __m512i rhs_mat_014589CD_3 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_014589CD_1, 4), m4bexpanded)); //B0(24-31) B1(24-31) B4(24-31) B5(24-31) B8(24-31) B9(24-31) BC(24-31) BD(24-31)
2711
+ const __m512i rhs_mat_2367ABEF_3 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_1, 4), m4bexpanded)); //B2(24-31) B3(24-31) B6(24-31) B7(24-31) BA(24-31) BB(24-31) BE(24-31) BF(24-31)
2440
2712
 
2441
- // Master FP accumulators
2442
- __m256 acc_rows[16];
2443
- for (int i = 0; i < 16; i++) {
2444
- acc_rows[i] = _mm256_setzero_ps();
2713
+ // Shuffle pattern one - right side input
2714
+ const __m512i rhs_mat_014589CD_0_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, 136); //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3) B8(0-3) B9(0-3) B8(0-3) B9(0-3) BC(0-3) BD(0-3) BC(0-3) BD(0-3)
2715
+ const __m512i rhs_mat_2367ABEF_0_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, 136); //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3) BA(0-3) BB(0-3) BA(0-3) BB(0-3) BE(0-3) BF(0-3) BE(0-3) BF(0-3)
2716
+
2717
+ const __m512i rhs_mat_014589CD_1_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, 136); //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11) B8(8-11) B9(8-11) B8(8-11) B9(8-11) BC(8-11) BD(8-11) BC(8-11) BD(8-11)
2718
+ const __m512i rhs_mat_2367ABEF_1_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, 136); //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11) BA(8-11) BB(8-11) BA(8-11) BB(8-11) BE(8-11) BF(8-11) BE(8-11) BF(8-11)
2719
+
2720
+ const __m512i rhs_mat_014589CD_2_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, 136); //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19) B8(16-19) B9(16-19) B8(16-19) B9(16-19) BC(16-19) BD(16-19) BC(16-19) BD(16-19)
2721
+ const __m512i rhs_mat_2367ABEF_2_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, 136); //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19) BA(16-19) BB(16-19) BA(16-19) BB(16-19) BE(16-19) BF(16-19) BE(16-19) BF(16-19)
2722
+
2723
+ const __m512i rhs_mat_014589CD_3_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, 136); //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27) B8(24-27) B9(24-27) B8(24-27) B9(24-27) BC(24-27) BD(24-27) BC(24-27) BD(24-27)
2724
+ const __m512i rhs_mat_2367ABEF_3_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, 136); //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27) BA(24-27) BB(24-27) BA(24-27) BB(24-27) BE(24-27) BF(24-27) BE(24-27) BF(24-27)
2725
+
2726
+ // Shuffle pattern two - right side input
2727
+
2728
+ const __m512i rhs_mat_014589CD_0_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, 221); //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7) B8(4-7) B9(4-7) B8(4-7) B9(4-7) BC(4-7) BD(4-7) BC(4-7) BD(4-7)
2729
+ const __m512i rhs_mat_2367ABEF_0_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, 221); //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7) BA(4-7) BB(4-7) BA(4-7) BB(4-7) BE(4-7) BF(4-7) BE(4-7) BF(4-7)
2730
+
2731
+ const __m512i rhs_mat_014589CD_1_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, 221); //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15) B8(12-15) B9(12-15) B8(12-15) B9(12-15) BC(12-15) BD(12-15) BC(12-15) BD(12-15)
2732
+ const __m512i rhs_mat_2367ABEF_1_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, 221); //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15) BA(12-15) BB(12-15) BA(12-15) BB(12-15) BE(12-15) BF(12-15) BE(12-15) BF(12-15)
2733
+
2734
+ const __m512i rhs_mat_014589CD_2_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, 221); //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23) B8(20-23) B9(20-23) B8(20-23) B9(20-23) BC(20-23) BD(20-23) BC(20-23) BD(20-23)
2735
+ const __m512i rhs_mat_2367ABEF_2_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, 221); //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23) BA(20-23) BB(20-23) BA(20-23) BB(20-23) BE(20-23) BF(20-23) BE(20-23) BF(20-23)
2736
+
2737
+ const __m512i rhs_mat_014589CD_3_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, 221); //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31) B8(28-31) B9(28-31) B8(28-31) B9(28-31) BC(28-31) BD(28-31) BC(28-31) BD(28-31)
2738
+ const __m512i rhs_mat_2367ABEF_3_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, 221); //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31) BA(28-31) BB(28-31) BA(28-31) BB(28-31) BE(28-31) BF(28-31) BE(28-31) BF(28-31)
2739
+
2740
+
2741
+ // Scale values - Load the weight scale values of two block_q4_0x8
2742
+ const __m512 col_scale_f32 = LM_GGML_F32Cx8x2_LOAD(b_ptr_0[b].d, b_ptr_1[b].d);
2743
+
2744
+ // Load the four block_q8_0 quantized values interleaved with each other in chunks of eight - A0,A1,A2,A3
2745
+ // Loaded as a set of 128 bit vectors, repeated into a 256 bit vector, and then repeated again into a 512 bit vector
2746
+ __m256i lhs_mat_ymm_0123_0 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs)));
2747
+ __m256i lhs_mat_ymm_01_0 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_0, lhs_mat_ymm_0123_0, 0);
2748
+ __m256i lhs_mat_ymm_23_0 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_0, lhs_mat_ymm_0123_0, 17);
2749
+ __m256i lhs_mat_ymm_0123_1 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 32)));
2750
+ __m256i lhs_mat_ymm_01_1 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_1, lhs_mat_ymm_0123_1, 0);
2751
+ __m256i lhs_mat_ymm_23_1 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_1, lhs_mat_ymm_0123_1, 17);
2752
+ __m256i lhs_mat_ymm_0123_2 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 64)));
2753
+ __m256i lhs_mat_ymm_01_2 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_2, lhs_mat_ymm_0123_2, 0);
2754
+ __m256i lhs_mat_ymm_23_2 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_2, lhs_mat_ymm_0123_2, 17);
2755
+ __m256i lhs_mat_ymm_0123_3 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 96)));
2756
+ __m256i lhs_mat_ymm_01_3 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_3, lhs_mat_ymm_0123_3, 0);
2757
+ __m256i lhs_mat_ymm_23_3 = _mm256_permute2f128_si256(lhs_mat_ymm_0123_3, lhs_mat_ymm_0123_3, 17);
2758
+
2759
+ __m512i lhs_mat_01_0 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_0), lhs_mat_ymm_01_0, 1);
2760
+ __m512i lhs_mat_23_0 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_0), lhs_mat_ymm_23_0, 1);
2761
+ __m512i lhs_mat_01_1 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_1), lhs_mat_ymm_01_1, 1);
2762
+ __m512i lhs_mat_23_1 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_1), lhs_mat_ymm_23_1, 1);
2763
+ __m512i lhs_mat_01_2 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_2), lhs_mat_ymm_01_2, 1);
2764
+ __m512i lhs_mat_23_2 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_2), lhs_mat_ymm_23_2, 1);
2765
+ __m512i lhs_mat_01_3 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_01_3), lhs_mat_ymm_01_3, 1);
2766
+ __m512i lhs_mat_23_3 = _mm512_inserti32x8(_mm512_castsi256_si512(lhs_mat_ymm_23_3), lhs_mat_ymm_23_3, 1);
2767
+
2768
+ // Shuffle pattern one - left side input
2769
+
2770
+ const __m512i lhs_mat_01_0_sp1 = _mm512_shuffle_epi32(lhs_mat_01_0, 160); //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3)
2771
+ const __m512i lhs_mat_23_0_sp1 = _mm512_shuffle_epi32(lhs_mat_23_0, 160); //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3)
2772
+
2773
+ const __m512i lhs_mat_01_1_sp1 = _mm512_shuffle_epi32(lhs_mat_01_1, 160); //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11)
2774
+ const __m512i lhs_mat_23_1_sp1 = _mm512_shuffle_epi32(lhs_mat_23_1, 160); //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11)
2775
+
2776
+ const __m512i lhs_mat_01_2_sp1 = _mm512_shuffle_epi32(lhs_mat_01_2, 160); //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19)
2777
+ const __m512i lhs_mat_23_2_sp1 = _mm512_shuffle_epi32(lhs_mat_23_2, 160); //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19)
2778
+
2779
+ const __m512i lhs_mat_01_3_sp1 = _mm512_shuffle_epi32(lhs_mat_01_3, 160); //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27)
2780
+ const __m512i lhs_mat_23_3_sp1 = _mm512_shuffle_epi32(lhs_mat_23_3, 160); //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27)
2781
+
2782
+ // Shuffle pattern two - left side input
2783
+
2784
+ const __m512i lhs_mat_01_0_sp2 = _mm512_shuffle_epi32(lhs_mat_01_0, 245); //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7)
2785
+ const __m512i lhs_mat_23_0_sp2 = _mm512_shuffle_epi32(lhs_mat_23_0, 245); //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7)
2786
+
2787
+ const __m512i lhs_mat_01_1_sp2 = _mm512_shuffle_epi32(lhs_mat_01_1, 245); //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15)
2788
+ const __m512i lhs_mat_23_1_sp2 = _mm512_shuffle_epi32(lhs_mat_23_1, 245); //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15)
2789
+
2790
+ const __m512i lhs_mat_01_2_sp2 = _mm512_shuffle_epi32(lhs_mat_01_2, 245); //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23)
2791
+ const __m512i lhs_mat_23_2_sp2 = _mm512_shuffle_epi32(lhs_mat_23_2, 245); //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23)
2792
+
2793
+ const __m512i lhs_mat_01_3_sp2 = _mm512_shuffle_epi32(lhs_mat_01_3, 245); //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31)
2794
+ const __m512i lhs_mat_23_3_sp2 = _mm512_shuffle_epi32(lhs_mat_23_3, 245); //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31)
2795
+
2796
+ // The values arranged in the shuffle patterns are combined with a dot product within each 32 bit lane, i.e. corresponding bytes are multiplied and the products added into a 32 bit integer within that lane
2797
+ // Resembles the MMLA instructions that produce 2x2 tiles in the Arm version
2798
+ __m512i iacc_mat_00_sp1 =
2799
+ _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp1, rhs_mat_014589CD_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp1, rhs_mat_014589CD_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp1, rhs_mat_014589CD_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp1, rhs_mat_014589CD_0_sp1));
2800
+ __m512i iacc_mat_01_sp1 =
2801
+ _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp1, rhs_mat_2367ABEF_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp1, rhs_mat_2367ABEF_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp1, rhs_mat_2367ABEF_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp1, rhs_mat_2367ABEF_0_sp1));
2802
+ __m512i iacc_mat_10_sp1 =
2803
+ _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp1, rhs_mat_014589CD_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp1, rhs_mat_014589CD_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp1, rhs_mat_014589CD_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp1, rhs_mat_014589CD_0_sp1));
2804
+ __m512i iacc_mat_11_sp1 =
2805
+ _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp1, rhs_mat_2367ABEF_3_sp1), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp1, rhs_mat_2367ABEF_2_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp1, rhs_mat_2367ABEF_1_sp1)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp1, rhs_mat_2367ABEF_0_sp1));
2806
+ __m512i iacc_mat_00_sp2 =
2807
+ _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp2, rhs_mat_014589CD_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp2, rhs_mat_014589CD_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp2, rhs_mat_014589CD_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp2, rhs_mat_014589CD_0_sp2));
2808
+ __m512i iacc_mat_01_sp2 =
2809
+ _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_01_3_sp2, rhs_mat_2367ABEF_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_01_2_sp2, rhs_mat_2367ABEF_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_1_sp2, rhs_mat_2367ABEF_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_01_0_sp2, rhs_mat_2367ABEF_0_sp2));
2810
+ __m512i iacc_mat_10_sp2 =
2811
+ _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp2, rhs_mat_014589CD_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp2, rhs_mat_014589CD_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp2, rhs_mat_014589CD_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp2, rhs_mat_014589CD_0_sp2));
2812
+ __m512i iacc_mat_11_sp2 =
2813
+ _mm512_add_epi32(_mm512_add_epi32(_mm512_add_epi32(mul_sum_i8_pairs_int32x16(lhs_mat_23_3_sp2, rhs_mat_2367ABEF_3_sp2), mul_sum_i8_pairs_int32x16(lhs_mat_23_2_sp2, rhs_mat_2367ABEF_2_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_1_sp2, rhs_mat_2367ABEF_1_sp2)), mul_sum_i8_pairs_int32x16(lhs_mat_23_0_sp2, rhs_mat_2367ABEF_0_sp2));
2814
+
2815
+ // Outputs of both shuffle patterns are added in order to sum the dot product outputs of all 32 values in the block
2816
+ __m512i iacc_mat_00 = _mm512_add_epi32(iacc_mat_00_sp1, iacc_mat_00_sp2);
2817
+ __m512i iacc_mat_01 = _mm512_add_epi32(iacc_mat_01_sp1, iacc_mat_01_sp2);
2818
+ __m512i iacc_mat_10 = _mm512_add_epi32(iacc_mat_10_sp1, iacc_mat_10_sp2);
2819
+ __m512i iacc_mat_11 = _mm512_add_epi32(iacc_mat_11_sp1, iacc_mat_11_sp2);
2820
+
2821
+
2822
+ // Straighten out to make 4 row vectors
2823
+ __m512i iacc_row_0 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_00, _mm512_shuffle_epi32(iacc_mat_01, 78));
2824
+ __m512i iacc_row_1 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_00, 78), iacc_mat_01);
2825
+ __m512i iacc_row_2 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_10, _mm512_shuffle_epi32(iacc_mat_11, 78));
2826
+ __m512i iacc_row_3 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_10, 78), iacc_mat_11);
2827
+
2828
+ // Load the scale values for all 4 Q8_0 blocks and repeat them across lanes
2829
+ const __m128i row_scale_f16 = _mm_shuffle_epi32(_mm_maskload_epi32((int const*)(a_ptr[b].d), loadMask), 68);
2830
+ const __m512 row_scale_f32 = LM_GGML_F32Cx16_REPEAT_LOAD(row_scale_f16);
2831
+
2832
+ // Multiply with the appropriate scales and accumulate
2833
+ acc_rows[0] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[0]);
2834
+ acc_rows[1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[1]);
2835
+ acc_rows[2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
2836
+ acc_rows[3] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_3), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_rows[3]);
2837
+ }
2838
+
2839
+ // Store the accumulated values
2840
+ for (int i = 0; i < 4; i++) {
2841
+ _mm512_storeu_ps((float *)(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]);
2842
+ }
2445
2843
  }
2844
+ }
2845
+ if (anc != nc) {
2846
+ xstart = anc/8;
2847
+ y = 0;
2848
+ }
2849
+ #endif // __AVX512F__
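To recap the control flow guarded by `__AVX512F__`: columns are processed sixteen at a time up to `anc`, and when `nc` is not a multiple of 16 the 256-bit loops below restart at column block `xstart = anc / 8` with `y = 0`, so only the leftover columns are re-walked. A minimal sketch of that split, under the same `nc` meaning as above:

    #include <stdint.h>

    // Sketch of the main/tail split: the 512-bit path covers column blocks
    // [0, anc/8); if nc is not a multiple of 16, the 256-bit path resumes
    // at column block anc/8 and re-walks all rows from y = 0.
    static int64_t tail_xstart(int64_t nc) {
        int64_t anc = nc - nc % 16;       // columns coverable 16 at a time
        return (anc != nc) ? anc / 8 : 0; // restart block for the 256-bit path
    }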
2446
2850
 
2447
- for (int64_t b = 0; b < nb; b++) {
2448
- // Load the eight block_q4_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....B6,B7
2449
- const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs));
2450
- const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 32));
2451
- const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 64));
2452
- const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 96));
2453
-
2454
- // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values
2455
- const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
2456
- const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
2457
- const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
2458
- const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
2851
+ // Take a group of four block_q8_0x4 structures at each pass of the loop and perform the dot product operation
2459
2852
 
2460
- // 4-bit -> 8-bit - Sign is maintained
2461
- const __m256i rhs_mat_0145_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_0145_0, m4b)); //B0(0-7) B1(0-7) B4(0-7) B5(0-7)
2462
- const __m256i rhs_mat_2367_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_2367_0, m4b)); //B2(0-7) B3(0-7) B6(0-7) B7(0-7)
2853
+ for (; y < anr / 4; y += 4) {
2854
+ const block_q8_0x4 * a_ptrs[4];
2463
2855
 
2464
- const __m256i rhs_mat_0145_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_0145_1, m4b)); //B0(8-15) B1(8-15) B4(8-15) B5(8-15)
2465
- const __m256i rhs_mat_2367_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_2367_1, m4b)); //B2(8-15) B3(8-15) B6(8-15) B7(8-15)
2856
+ a_ptrs[0] = a_ptr_start + (y * nb);
2857
+ for (int i = 0; i < 3; ++i) {
2858
+ a_ptrs[i + 1] = a_ptrs[i] + nb;
2859
+ }
2466
2860
 
2467
- const __m256i rhs_mat_0145_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_0, 4), m4b)); //B0(16-23) B1(16-23) B4(16-23) B5(16-23)
2468
- const __m256i rhs_mat_2367_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_0, 4), m4b)); //B2(16-23) B3(16-23) B6(16-23) B7(16-23)
2861
+ // Take a group of eight block_q4_0x8 structures at each pass of the loop and perform the dot product operation
2862
+ for (int64_t x = xstart; x < nc / 8; x++) {
2469
2863
 
2470
- const __m256i rhs_mat_0145_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_1, 4), m4b)); //B0(24-31) B1(24-31) B4(24-31) B5(24-31)
2471
- const __m256i rhs_mat_2367_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_1, 4), m4b)); //B2(24-31) B3(24-31) B6(24-31) B7(24-31)
2864
+ const block_q4_0x8 * b_ptr = b_ptr_start + (x * b_nb);
2472
2865
 
2473
- // Shuffle pattern one - right side input
2474
- const __m256i rhs_mat_0145_0_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_0, 136); //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3)
2475
- const __m256i rhs_mat_2367_0_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_0, 136); //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3)
2866
+ // Master FP accumulators
2867
+ __m256 acc_rows[16];
2868
+ for (int i = 0; i < 16; i++) {
2869
+ acc_rows[i] = _mm256_setzero_ps();
2870
+ }
2476
2871
 
2477
- const __m256i rhs_mat_0145_1_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_1, 136); //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11)
2478
- const __m256i rhs_mat_2367_1_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_1, 136); //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11)
2872
+ for (int64_t b = 0; b < nb; b++) {
2873
+ // Load the eight block_q4_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....B6,B7
2874
+ const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs));
2875
+ const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 32));
2876
+ const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 64));
2877
+ const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 96));
2878
+
2879
+ // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values
2880
+ const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
2881
+ const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
2882
+ const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
2883
+ const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
2884
+
2885
+ // 4-bit -> 8-bit - Sign is maintained
2886
+ const __m256i rhs_mat_0145_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_0145_0, m4b)); //B0(0-7) B1(0-7) B4(0-7) B5(0-7)
2887
+ const __m256i rhs_mat_2367_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_2367_0, m4b)); //B2(0-7) B3(0-7) B6(0-7) B7(0-7)
2888
+
2889
+ const __m256i rhs_mat_0145_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_0145_1, m4b)); //B0(8-15) B1(8-15) B4(8-15) B5(8-15)
2890
+ const __m256i rhs_mat_2367_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_2367_1, m4b)); //B2(8-15) B3(8-15) B6(8-15) B7(8-15)
2891
+
2892
+ const __m256i rhs_mat_0145_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_0, 4), m4b)); //B0(16-23) B1(16-23) B4(16-23) B5(16-23)
2893
+ const __m256i rhs_mat_2367_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_0, 4), m4b)); //B2(16-23) B3(16-23) B6(16-23) B7(16-23)
2894
+
2895
+ const __m256i rhs_mat_0145_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_1, 4), m4b)); //B0(24-31) B1(24-31) B4(24-31) B5(24-31)
2896
+ const __m256i rhs_mat_2367_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_1, 4), m4b)); //B2(24-31) B3(24-31) B6(24-31) B7(24-31)
2897
+
2898
+ // Shuffle pattern one - right side input
2899
+ const __m256i rhs_mat_0145_0_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_0, 136); //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3)
2900
+ const __m256i rhs_mat_2367_0_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_0, 136); //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3)
2901
+
2902
+ const __m256i rhs_mat_0145_1_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_1, 136); //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11)
2903
+ const __m256i rhs_mat_2367_1_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_1, 136); //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11)
2904
+
2905
+ const __m256i rhs_mat_0145_2_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_2, 136); //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19)
2906
+ const __m256i rhs_mat_2367_2_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_2, 136); //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19)
2907
+
2908
+ const __m256i rhs_mat_0145_3_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_3, 136); //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27)
2909
+ const __m256i rhs_mat_2367_3_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_3, 136); //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27)
2910
+
2911
+ // Shuffle pattern two - right side input
2912
+
2913
+ const __m256i rhs_mat_0145_0_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_0, 221); //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7)
2914
+ const __m256i rhs_mat_2367_0_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_0, 221); //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7)
2915
+
2916
+ const __m256i rhs_mat_0145_1_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_1, 221); //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15)
2917
+ const __m256i rhs_mat_2367_1_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_1, 221); //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15)
2918
+
2919
+ const __m256i rhs_mat_0145_2_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_2, 221); //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23)
2920
+ const __m256i rhs_mat_2367_2_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_2, 221); //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23)
2921
+
2922
+ const __m256i rhs_mat_0145_3_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_3, 221); //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31)
2923
+ const __m256i rhs_mat_2367_3_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_3, 221); //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31)
2924
+
2925
+ // Scale values - Load the weight scale values of block_q4_0x8
2926
+ const __m256 col_scale_f32 = LM_GGML_F32Cx8_LOAD(b_ptr[b].d);
2927
+
2928
+ // Process LHS in groups of four
2929
+ for (int rp = 0; rp < 4; rp++) {
2930
+ // Load the four block_q8_0 quantized values interleaved with each other in chunks of eight - A0,A1,A2,A3
2931
+ // Loaded as a set of 128 bit vectors and repeated into a 256 bit vector
2932
+ __m256i lhs_mat_0123_0 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs)));
2933
+ __m256i lhs_mat_01_0 = _mm256_permute2f128_si256(lhs_mat_0123_0, lhs_mat_0123_0, 0);
2934
+ __m256i lhs_mat_23_0 = _mm256_permute2f128_si256(lhs_mat_0123_0, lhs_mat_0123_0, 17);
2935
+ __m256i lhs_mat_0123_1 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 32)));
2936
+ __m256i lhs_mat_01_1 = _mm256_permute2f128_si256(lhs_mat_0123_1, lhs_mat_0123_1, 0);
2937
+ __m256i lhs_mat_23_1 = _mm256_permute2f128_si256(lhs_mat_0123_1, lhs_mat_0123_1, 17);
2938
+ __m256i lhs_mat_0123_2 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 64)));
2939
+ __m256i lhs_mat_01_2 = _mm256_permute2f128_si256(lhs_mat_0123_2, lhs_mat_0123_2, 0);
2940
+ __m256i lhs_mat_23_2 = _mm256_permute2f128_si256(lhs_mat_0123_2, lhs_mat_0123_2, 17);
2941
+ __m256i lhs_mat_0123_3 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 96)));
2942
+ __m256i lhs_mat_01_3 = _mm256_permute2f128_si256(lhs_mat_0123_3, lhs_mat_0123_3, 0);
2943
+ __m256i lhs_mat_23_3 = _mm256_permute2f128_si256(lhs_mat_0123_3, lhs_mat_0123_3, 17);
2944
+
2945
+ // Shuffle pattern one - left side input
2946
+ const __m256i lhs_mat_01_0_sp1 = _mm256_shuffle_epi32(lhs_mat_01_0, 160); //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3)
2947
+ const __m256i lhs_mat_23_0_sp1 = _mm256_shuffle_epi32(lhs_mat_23_0, 160); //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3)
2948
+
2949
+ const __m256i lhs_mat_01_1_sp1 = _mm256_shuffle_epi32(lhs_mat_01_1, 160); //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11)
2950
+ const __m256i lhs_mat_23_1_sp1 = _mm256_shuffle_epi32(lhs_mat_23_1, 160); //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11)
2951
+
2952
+ const __m256i lhs_mat_01_2_sp1 = _mm256_shuffle_epi32(lhs_mat_01_2, 160); //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19)
2953
+ const __m256i lhs_mat_23_2_sp1 = _mm256_shuffle_epi32(lhs_mat_23_2, 160); //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19)
2954
+
2955
+ const __m256i lhs_mat_01_3_sp1 = _mm256_shuffle_epi32(lhs_mat_01_3, 160); //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27)
2956
+ const __m256i lhs_mat_23_3_sp1 = _mm256_shuffle_epi32(lhs_mat_23_3, 160); //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27)
2957
+
2958
+ // Shuffle pattern two - left side input
2959
+ const __m256i lhs_mat_01_0_sp2 = _mm256_shuffle_epi32(lhs_mat_01_0, 245); //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7)
2960
+ const __m256i lhs_mat_23_0_sp2 = _mm256_shuffle_epi32(lhs_mat_23_0, 245); //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7)
2961
+
2962
+ const __m256i lhs_mat_01_1_sp2 = _mm256_shuffle_epi32(lhs_mat_01_1, 245); //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15)
2963
+ const __m256i lhs_mat_23_1_sp2 = _mm256_shuffle_epi32(lhs_mat_23_1, 245); //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15)
2964
+
2965
+ const __m256i lhs_mat_01_2_sp2 = _mm256_shuffle_epi32(lhs_mat_01_2, 245); //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23)
2966
+ const __m256i lhs_mat_23_2_sp2 = _mm256_shuffle_epi32(lhs_mat_23_2, 245); //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23)
2967
+
2968
+ const __m256i lhs_mat_01_3_sp2 = _mm256_shuffle_epi32(lhs_mat_01_3, 245); //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31)
2969
+ const __m256i lhs_mat_23_3_sp2 = _mm256_shuffle_epi32(lhs_mat_23_3, 245); //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31)
2970
+
2971
+ // The values arranged in the shuffle patterns are combined with a dot product within each 32 bit lane, i.e. corresponding bytes are multiplied and the products added into a 32 bit integer within that lane
2972
+ // Resembles the MMLA instructions that produce 2x2 tiles in the Arm version
2973
+ __m256i iacc_mat_00_sp1 =
2974
+ _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp1, rhs_mat_0145_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp1, rhs_mat_0145_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp1, rhs_mat_0145_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp1, rhs_mat_0145_0_sp1));
2975
+ __m256i iacc_mat_01_sp1 =
2976
+ _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp1, rhs_mat_2367_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp1, rhs_mat_2367_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp1, rhs_mat_2367_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp1, rhs_mat_2367_0_sp1));
2977
+ __m256i iacc_mat_10_sp1 =
2978
+ _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp1, rhs_mat_0145_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp1, rhs_mat_0145_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp1, rhs_mat_0145_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp1, rhs_mat_0145_0_sp1));
2979
+ __m256i iacc_mat_11_sp1 =
2980
+ _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp1, rhs_mat_2367_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp1, rhs_mat_2367_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp1, rhs_mat_2367_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp1, rhs_mat_2367_0_sp1));
2981
+ __m256i iacc_mat_00_sp2 =
2982
+ _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp2, rhs_mat_0145_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp2, rhs_mat_0145_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp2, rhs_mat_0145_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp2, rhs_mat_0145_0_sp2));
2983
+ __m256i iacc_mat_01_sp2 =
2984
+ _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp2, rhs_mat_2367_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp2, rhs_mat_2367_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp2, rhs_mat_2367_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp2, rhs_mat_2367_0_sp2));
2985
+ __m256i iacc_mat_10_sp2 =
2986
+ _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp2, rhs_mat_0145_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp2, rhs_mat_0145_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp2, rhs_mat_0145_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp2, rhs_mat_0145_0_sp2));
2987
+ __m256i iacc_mat_11_sp2 =
2988
+ _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp2, rhs_mat_2367_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp2, rhs_mat_2367_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp2, rhs_mat_2367_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp2, rhs_mat_2367_0_sp2));
2989
+
2990
+ // Output of both shuffle patterns are added in order to sum dot product outputs of all 32 values in block
2991
+ __m256i iacc_mat_00 = _mm256_add_epi32(iacc_mat_00_sp1, iacc_mat_00_sp2);
2992
+ __m256i iacc_mat_01 = _mm256_add_epi32(iacc_mat_01_sp1, iacc_mat_01_sp2);
2993
+ __m256i iacc_mat_10 = _mm256_add_epi32(iacc_mat_10_sp1, iacc_mat_10_sp2);
2994
+ __m256i iacc_mat_11 = _mm256_add_epi32(iacc_mat_11_sp1, iacc_mat_11_sp2);
2995
+
2996
+ // Straighten out to make 4 row vectors
2997
+ __m256i iacc_row_0 = _mm256_blend_epi32(iacc_mat_00, _mm256_shuffle_epi32(iacc_mat_01, 78), 204);
2998
+ __m256i iacc_row_1 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_00, 78), iacc_mat_01, 204);
2999
+ __m256i iacc_row_2 = _mm256_blend_epi32(iacc_mat_10, _mm256_shuffle_epi32(iacc_mat_11, 78), 204);
3000
+ __m256i iacc_row_3 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_10, 78), iacc_mat_11, 204);
3001
+
3002
+ // Load the scale(d) values for all the 4 Q8_0 blocks and repeat it across lanes
3003
+ const __m256 row_scale_f32 = LM_GGML_F32Cx8_REPEAT_LOAD(a_ptrs[rp][b].d, loadMask);
3004
+
3005
+ // Multiply with appropiate scales and accumulate
3006
+ acc_rows[rp * 4] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]);
3007
+ acc_rows[rp * 4 + 1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]);
3008
+ acc_rows[rp * 4 + 2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
3009
+ acc_rows[rp * 4 + 3] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_3), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_rows[rp * 4 + 3]);
3010
+ }
3011
+ }
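The sp1/sp2 pairing above is easier to see in scalar form: pattern one aligns the even 4-byte groups of the A and B bytes, pattern two the odd groups, so adding the two partial results reproduces the full 32-element dot product of a block. A minimal scalar model of that invariant (illustrative only; plain arrays stand in for the interleaved vectors):

    #include <stdint.h>
    #include <stdio.h>

    /* Dot product of one 4-byte group, mirroring what mul_sum_i8_pairs_int32x8
       contributes within a single 32-bit lane. */
    static int32_t dot4(const int8_t *a, const int8_t *b) {
        return a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3];
    }

    int main(void) {
        int8_t a[32], b[32];
        for (int i = 0; i < 32; i++) { a[i] = (int8_t)(i - 16); b[i] = (int8_t)(3 - i); }

        int32_t full = 0;                        /* reference: plain 32-element dot product */
        for (int i = 0; i < 32; i++) full += a[i]*b[i];

        int32_t sp1 = 0, sp2 = 0;                /* even 4-byte groups (sp1), odd groups (sp2) */
        for (int g = 0; g < 8; g += 2) sp1 += dot4(a + 4*g, b + 4*g);
        for (int g = 1; g < 8; g += 2) sp2 += dot4(a + 4*g, b + 4*g);

        printf("full=%d  sp1+sp2=%d\n", full, sp1 + sp2);   /* the two must match */
        return 0;
    }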
 
- const __m256i rhs_mat_0145_2_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_2, 136); //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19)
- const __m256i rhs_mat_2367_2_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_2, 136); //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19)
+ // Store the accumulated values
+ for (int i = 0; i < 16; i++) {
+ _mm256_storeu_ps((float *)(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]);
+ }
+ }
+ }
+
+ // Take a block_q8_0x4 structure at each pass of the loop and perform the dot product operation
+ for (; y < nr / 4; y ++) {
+
+ const block_q8_0x4 * a_ptr = a_ptr_start + (y * nb);
+
+ // Load the eight block_q4_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....B6,B7
+ for (int64_t x = xstart; x < nc / 8; x++) {
+
+ const block_q4_0x8 * b_ptr = b_ptr_start + (x * b_nb);
+
+ // Master FP accumulators
+ __m256 acc_rows[4];
+ for (int i = 0; i < 4; i++) {
+ acc_rows[i] = _mm256_setzero_ps();
+ }
+
+ for (int64_t b = 0; b < nb; b++) {
+ // Load the eight block_q4_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....B6,B7
+ const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs));
+ const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 32));
+ const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 64));
+ const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 96));
+
+ // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values
+ const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
+ const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
+ const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
+ const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
+
+ // 4-bit -> 8-bit - Sign is maintained
+ const __m256i rhs_mat_0145_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_0145_0, m4b)); //B0(0-7) B1(0-7) B4(0-7) B5(0-7)
+ const __m256i rhs_mat_2367_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_2367_0, m4b)); //B2(0-7) B3(0-7) B6(0-7) B7(0-7)
+
+ const __m256i rhs_mat_0145_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_0145_1, m4b)); //B0(8-15) B1(8-15) B4(8-15) B5(8-15)
+ const __m256i rhs_mat_2367_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_2367_1, m4b)); //B2(8-15) B3(8-15) B6(8-15) B7(8-15)
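The blend/permute pair above builds the B0B1B4B5 and B2B3B6B7 orderings. A scalar sketch of the net effect for the 0145 case (assuming requiredOrder swaps the two 128-bit halves, i.e. a permute index order of 4,5,6,7,0,1,2,3, and that blend imm 240 = 0b11110000 takes dwords 4-7 from its second operand):

    #include <stdint.h>

    /* Model of _mm256_blend_epi32(b0123, _mm256_permutevar8x32_epi32(b4567, requiredOrder), 240):
       the low half keeps rows B0,B1 (two dwords each) and the high half receives rows B4,B5,
       brought up from the low half of the permuted vector. */
    static void interleave_b0145(const int32_t b0123[8], const int32_t b4567[8],
                                 int32_t out[8]) {
        for (int i = 0; i < 4; i++) {
            out[i]     = b0123[i];      /* dwords 0-3 taken from the first operand  */
            out[i + 4] = b4567[i];      /* dwords 4-7 taken from the permuted input */
        }
    }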
 
- const __m256i rhs_mat_0145_3_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_3, 136); //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27)
- const __m256i rhs_mat_2367_3_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_3, 136); //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27)
+ const __m256i rhs_mat_0145_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_0, 4), m4b)); //B0(16-23) B1(16-23) B4(16-23) B5(16-23)
+ const __m256i rhs_mat_2367_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_0, 4), m4b)); //B2(16-23) B3(16-23) B6(16-23) B7(16-23)
 
- // Shuffle pattern two - right side input
+ const __m256i rhs_mat_0145_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_1, 4), m4b)); //B0(24-31) B1(24-31) B4(24-31) B5(24-31)
+ const __m256i rhs_mat_2367_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_1, 4), m4b)); //B2(24-31) B3(24-31) B6(24-31) B7(24-31)
 
- const __m256i rhs_mat_0145_0_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_0, 221); //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7)
- const __m256i rhs_mat_2367_0_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_0, 221); //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7)
+ // Shuffle pattern one - right side input
+ const __m256i rhs_mat_0145_0_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_0, 136); //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3)
+ const __m256i rhs_mat_2367_0_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_0, 136); //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3)
 
- const __m256i rhs_mat_0145_1_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_1, 221); //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15)
- const __m256i rhs_mat_2367_1_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_1, 221); //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15)
+ const __m256i rhs_mat_0145_1_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_1, 136); //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11)
+ const __m256i rhs_mat_2367_1_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_1, 136); //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11)
 
- const __m256i rhs_mat_0145_2_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_2, 221); //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23)
- const __m256i rhs_mat_2367_2_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_2, 221); //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23)
+ const __m256i rhs_mat_0145_2_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_2, 136); //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19)
+ const __m256i rhs_mat_2367_2_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_2, 136); //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19)
 
- const __m256i rhs_mat_0145_3_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_3, 221); //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31)
- const __m256i rhs_mat_2367_3_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_3, 221); //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31)
+ const __m256i rhs_mat_0145_3_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_3, 136); //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27)
+ const __m256i rhs_mat_2367_3_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_3, 136); //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27)
 
- // Scale values - Load the weight scale values of block_q4_0x8
- const __m256 col_scale_f32 = LM_GGML_F32Cx8_LOAD(b_ptr[b].d);
+ // Shuffle pattern two - right side input
+
+ const __m256i rhs_mat_0145_0_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_0, 221); //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7)
+ const __m256i rhs_mat_2367_0_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_0, 221); //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7)
+
+ const __m256i rhs_mat_0145_1_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_1, 221); //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15)
+ const __m256i rhs_mat_2367_1_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_1, 221); //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15)
+
+ const __m256i rhs_mat_0145_2_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_2, 221); //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23)
+ const __m256i rhs_mat_2367_2_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_2, 221); //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23)
+
+ const __m256i rhs_mat_0145_3_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_3, 221); //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31)
+ const __m256i rhs_mat_2367_3_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_3, 221); //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31)
+
+ // Scale values - Load the weight scale values of block_q4_0x8
+ const __m256 col_scale_f32 = LM_GGML_F32Cx8_LOAD(b_ptr[b].d);
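The "4-bit -> 8-bit - Sign is maintained" step above expands nibbles through a lookup table. A scalar model for a single, non-interleaved q4_0 row (a sketch; it assumes signextendlut implements two's-complement sign extension of the 4-bit values, mapping 0..7 to themselves and 8..15 to -8..-1):

    #include <stdint.h>

    /* Assumes the lookup table maps n -> n for n < 8 and n -> n - 16 for n >= 8,
       i.e. sign extension of a 4-bit two's-complement value. */
    static inline int8_t nibble_to_int8(uint8_t nib) {
        return (int8_t)(nib << 4) >> 4;
    }

    static void unpack_q4_row(const uint8_t *qs, int8_t out[32]) {
        for (int i = 0; i < 16; i++) {
            out[i]      = nibble_to_int8(qs[i] & 0x0F);  /* low nibbles: elements 0-15   */
            out[i + 16] = nibble_to_int8(qs[i] >> 4);    /* high nibbles: elements 16-31 */
        }
    }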
 
- // Process LHS in groups of four
- for (int rp = 0; rp < 4; rp++) {
  // Load the four block_q8_0 quantized values interleaved with each other in chunks of eight - A0,A1,A2,A3
  // Loaded as set of 128 bit vectors and repeated into a 256 bit vector
- __m256i lhs_mat_0123_0 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs)));
+ __m256i lhs_mat_0123_0 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs)));
  __m256i lhs_mat_01_0 = _mm256_permute2f128_si256(lhs_mat_0123_0, lhs_mat_0123_0, 0);
  __m256i lhs_mat_23_0 = _mm256_permute2f128_si256(lhs_mat_0123_0, lhs_mat_0123_0, 17);
- __m256i lhs_mat_0123_1 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 32)));
+ __m256i lhs_mat_0123_1 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 32)));
  __m256i lhs_mat_01_1 = _mm256_permute2f128_si256(lhs_mat_0123_1, lhs_mat_0123_1, 0);
  __m256i lhs_mat_23_1 = _mm256_permute2f128_si256(lhs_mat_0123_1, lhs_mat_0123_1, 17);
- __m256i lhs_mat_0123_2 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 64)));
+ __m256i lhs_mat_0123_2 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 64)));
  __m256i lhs_mat_01_2 = _mm256_permute2f128_si256(lhs_mat_0123_2, lhs_mat_0123_2, 0);
  __m256i lhs_mat_23_2 = _mm256_permute2f128_si256(lhs_mat_0123_2, lhs_mat_0123_2, 17);
- __m256i lhs_mat_0123_3 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 96)));
+ __m256i lhs_mat_0123_3 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 96)));
  __m256i lhs_mat_01_3 = _mm256_permute2f128_si256(lhs_mat_0123_3, lhs_mat_0123_3, 0);
  __m256i lhs_mat_23_3 = _mm256_permute2f128_si256(lhs_mat_0123_3, lhs_mat_0123_3, 17);
 
  // Shuffle pattern one - left side input
+
  const __m256i lhs_mat_01_0_sp1 = _mm256_shuffle_epi32(lhs_mat_01_0, 160); //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3)
  const __m256i lhs_mat_23_0_sp1 = _mm256_shuffle_epi32(lhs_mat_23_0, 160); //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3)
 
@@ -2531,6 +3119,7 @@ void lm_ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
  const __m256i lhs_mat_23_3_sp1 = _mm256_shuffle_epi32(lhs_mat_23_3, 160); //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27)
 
  // Shuffle pattern two - left side input
+
  const __m256i lhs_mat_01_0_sp2 = _mm256_shuffle_epi32(lhs_mat_01_0, 245); //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7)
  const __m256i lhs_mat_23_0_sp2 = _mm256_shuffle_epi32(lhs_mat_23_0, 245); //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7)
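The sp1/sp2 constants used throughout decode as 2-bit dword selectors. A scalar model of _mm256_shuffle_epi32 showing how 136, 221, 160 and 245 pick {0,2,0,2}, {1,3,1,3}, {0,0,2,2} and {1,1,3,3} per 128-bit lane:

    #include <stdint.h>

    /* Each 2-bit field of imm selects a source dword, independently in each
       128-bit lane: dst[i] = src[(imm >> 2*i) & 3] for i in 0..3. */
    static void shuffle_epi32_model(const int32_t src[8], int imm, int32_t dst[8]) {
        for (int lane = 0; lane < 2; lane++)          /* two independent 128-bit lanes */
            for (int i = 0; i < 4; i++)
                dst[lane * 4 + i] = src[lane * 4 + ((imm >> (2 * i)) & 3)];
    }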
 
@@ -2546,21 +3135,21 @@ void lm_ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
  // The values arranged in shuffle patterns are operated on with a dot product within each 32-bit lane, i.e. corresponding bytes are multiplied and the products added into 32-bit integers within the lane
  // Resembles MMLAs into 2x2 matrices in the ARM version
  __m256i iacc_mat_00_sp1 =
- _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_01_3_sp1, rhs_mat_0145_3_sp1), mul_sum_i8_pairs_int(lhs_mat_01_2_sp1, rhs_mat_0145_2_sp1)), mul_sum_i8_pairs_int(lhs_mat_01_1_sp1, rhs_mat_0145_1_sp1)), mul_sum_i8_pairs_int(lhs_mat_01_0_sp1, rhs_mat_0145_0_sp1));
+ _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp1, rhs_mat_0145_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp1, rhs_mat_0145_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp1, rhs_mat_0145_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp1, rhs_mat_0145_0_sp1));
  __m256i iacc_mat_01_sp1 =
- _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_01_3_sp1, rhs_mat_2367_3_sp1), mul_sum_i8_pairs_int(lhs_mat_01_2_sp1, rhs_mat_2367_2_sp1)), mul_sum_i8_pairs_int(lhs_mat_01_1_sp1, rhs_mat_2367_1_sp1)), mul_sum_i8_pairs_int(lhs_mat_01_0_sp1, rhs_mat_2367_0_sp1));
+ _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp1, rhs_mat_2367_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp1, rhs_mat_2367_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp1, rhs_mat_2367_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp1, rhs_mat_2367_0_sp1));
  __m256i iacc_mat_10_sp1 =
- _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_23_3_sp1, rhs_mat_0145_3_sp1), mul_sum_i8_pairs_int(lhs_mat_23_2_sp1, rhs_mat_0145_2_sp1)), mul_sum_i8_pairs_int(lhs_mat_23_1_sp1, rhs_mat_0145_1_sp1)), mul_sum_i8_pairs_int(lhs_mat_23_0_sp1, rhs_mat_0145_0_sp1));
+ _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp1, rhs_mat_0145_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp1, rhs_mat_0145_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp1, rhs_mat_0145_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp1, rhs_mat_0145_0_sp1));
  __m256i iacc_mat_11_sp1 =
- _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_23_3_sp1, rhs_mat_2367_3_sp1), mul_sum_i8_pairs_int(lhs_mat_23_2_sp1, rhs_mat_2367_2_sp1)), mul_sum_i8_pairs_int(lhs_mat_23_1_sp1, rhs_mat_2367_1_sp1)), mul_sum_i8_pairs_int(lhs_mat_23_0_sp1, rhs_mat_2367_0_sp1));
+ _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp1, rhs_mat_2367_3_sp1), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp1, rhs_mat_2367_2_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp1, rhs_mat_2367_1_sp1)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp1, rhs_mat_2367_0_sp1));
  __m256i iacc_mat_00_sp2 =
- _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_01_3_sp2, rhs_mat_0145_3_sp2), mul_sum_i8_pairs_int(lhs_mat_01_2_sp2, rhs_mat_0145_2_sp2)), mul_sum_i8_pairs_int(lhs_mat_01_1_sp2, rhs_mat_0145_1_sp2)), mul_sum_i8_pairs_int(lhs_mat_01_0_sp2, rhs_mat_0145_0_sp2));
+ _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp2, rhs_mat_0145_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp2, rhs_mat_0145_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp2, rhs_mat_0145_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp2, rhs_mat_0145_0_sp2));
  __m256i iacc_mat_01_sp2 =
- _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_01_3_sp2, rhs_mat_2367_3_sp2), mul_sum_i8_pairs_int(lhs_mat_01_2_sp2, rhs_mat_2367_2_sp2)), mul_sum_i8_pairs_int(lhs_mat_01_1_sp2, rhs_mat_2367_1_sp2)), mul_sum_i8_pairs_int(lhs_mat_01_0_sp2, rhs_mat_2367_0_sp2));
+ _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_01_3_sp2, rhs_mat_2367_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_01_2_sp2, rhs_mat_2367_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_1_sp2, rhs_mat_2367_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_01_0_sp2, rhs_mat_2367_0_sp2));
  __m256i iacc_mat_10_sp2 =
- _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_23_3_sp2, rhs_mat_0145_3_sp2), mul_sum_i8_pairs_int(lhs_mat_23_2_sp2, rhs_mat_0145_2_sp2)), mul_sum_i8_pairs_int(lhs_mat_23_1_sp2, rhs_mat_0145_1_sp2)), mul_sum_i8_pairs_int(lhs_mat_23_0_sp2, rhs_mat_0145_0_sp2));
+ _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp2, rhs_mat_0145_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp2, rhs_mat_0145_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp2, rhs_mat_0145_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp2, rhs_mat_0145_0_sp2));
  __m256i iacc_mat_11_sp2 =
- _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_23_3_sp2, rhs_mat_2367_3_sp2), mul_sum_i8_pairs_int(lhs_mat_23_2_sp2, rhs_mat_2367_2_sp2)), mul_sum_i8_pairs_int(lhs_mat_23_1_sp2, rhs_mat_2367_1_sp2)), mul_sum_i8_pairs_int(lhs_mat_23_0_sp2, rhs_mat_2367_0_sp2));
+ _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int32x8(lhs_mat_23_3_sp2, rhs_mat_2367_3_sp2), mul_sum_i8_pairs_int32x8(lhs_mat_23_2_sp2, rhs_mat_2367_2_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_1_sp2, rhs_mat_2367_1_sp2)), mul_sum_i8_pairs_int32x8(lhs_mat_23_0_sp2, rhs_mat_2367_0_sp2));
 
  // Output of both shuffle patterns is added in order to sum the dot product outputs of all 32 values in the block
  __m256i iacc_mat_00 = _mm256_add_epi32(iacc_mat_00_sp1, iacc_mat_00_sp2);
@@ -2568,6 +3157,7 @@ void lm_ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
  __m256i iacc_mat_10 = _mm256_add_epi32(iacc_mat_10_sp1, iacc_mat_10_sp2);
  __m256i iacc_mat_11 = _mm256_add_epi32(iacc_mat_11_sp1, iacc_mat_11_sp2);
 
+
  // Straighten out to make 4 row vectors
  __m256i iacc_row_0 = _mm256_blend_epi32(iacc_mat_00, _mm256_shuffle_epi32(iacc_mat_01, 78), 204);
  __m256i iacc_row_1 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_00, 78), iacc_mat_01, 204);
@@ -2575,187 +3165,24 @@ void lm_ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
  __m256i iacc_row_3 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_10, 78), iacc_mat_11, 204);
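The "straighten out" step just above combines two primitives: _mm256_shuffle_epi32 with imm 78 (0b01001110) swaps the two 64-bit halves inside each 128-bit lane, and _mm256_blend_epi32 with imm 204 (0b11001100) takes dwords 2, 3, 6 and 7 from its second operand. Scalar models of both (illustrative only):

    #include <stdint.h>

    /* imm 78 decodes to the per-lane dword order {2,3,0,1}: a 64-bit half swap. */
    static void shuffle78(const int32_t a[8], int32_t r[8]) {
        static const int idx[4] = { 2, 3, 0, 1 };
        for (int lane = 0; lane < 2; lane++)
            for (int i = 0; i < 4; i++)
                r[lane * 4 + i] = a[lane * 4 + idx[i]];
    }

    /* Bit i of imm 204 selects operand b over a for dword i (positions 2,3,6,7). */
    static void blend204(const int32_t a[8], const int32_t b[8], int32_t r[8]) {
        for (int i = 0; i < 8; i++)
            r[i] = ((204 >> i) & 1) ? b[i] : a[i];
    }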
 
  // Load the scale values for all 4 Q8_0 blocks and repeat them across lanes
- const __m256 row_scale_f32 = LM_GGML_F32Cx8_REPEAT_LOAD(a_ptrs[rp][b].d, loadMask);
+ const __m256 row_scale_f32 = LM_GGML_F32Cx8_REPEAT_LOAD(a_ptr[b].d, loadMask);
 
  // Multiply with appropriate scales and accumulate
- acc_rows[rp * 4] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]);
- acc_rows[rp * 4 + 1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]);
- acc_rows[rp * 4 + 2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
- acc_rows[rp * 4 + 3] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_3), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_rows[rp * 4 + 3]);
+ acc_rows[0] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[0]);
+ acc_rows[1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[1]);
+ acc_rows[2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
+ acc_rows[3] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_3), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_rows[3]);
  }
- }
 
- // Store the accumulated values
- for (int i = 0; i < 16; i++) {
- _mm256_storeu_ps((float *)(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]);
- }
- }
- }
-
- // Take a block_q8_0x4 structure at each pass of the loop and perform the dot product operation
- for (; y < nr / 4; y ++) {
-
- const block_q8_0x4 * a_ptr = a_ptr_start + (y * nb);
-
- // Load the eight block_q4_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....B6,B7
- for (int64_t x = 0; x < nc / 8; x++) {
-
- const block_q4_0x8 * b_ptr = b_ptr_start + (x * b_nb);
-
- // Master FP accumulators
- __m256 acc_rows[4];
- for (int i = 0; i < 4; i++) {
- acc_rows[i] = _mm256_setzero_ps();
- }
-
- for (int64_t b = 0; b < nb; b++) {
- // Load the eight block_q4_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....B6,B7
- const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs));
- const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 32));
- const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 64));
- const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 96));
-
- // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values
- const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
- const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
- const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
- const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
-
- // 4-bit -> 8-bit - Sign is maintained
- const __m256i rhs_mat_0145_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_0145_0, m4b)); //B0(0-7) B1(0-7) B4(0-7) B5(0-7)
- const __m256i rhs_mat_2367_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_2367_0, m4b)); //B2(0-7) B3(0-7) B6(0-7) B7(0-7)
-
- const __m256i rhs_mat_0145_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_0145_1, m4b)); //B0(8-15) B1(8-15) B4(8-15) B5(8-15)
- const __m256i rhs_mat_2367_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_2367_1, m4b)); //B2(8-15) B3(8-15) B6(8-15) B7(8-15)
-
- const __m256i rhs_mat_0145_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_0, 4), m4b)); //B0(16-23) B1(16-23) B4(16-23) B5(16-23)
- const __m256i rhs_mat_2367_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_0, 4), m4b)); //B2(16-23) B3(16-23) B6(16-23) B7(16-23)
-
- const __m256i rhs_mat_0145_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_1, 4), m4b)); //B0(24-31) B1(24-31) B4(24-31) B5(24-31)
- const __m256i rhs_mat_2367_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_1, 4), m4b)); //B2(24-31) B3(24-31) B6(24-31) B7(24-31)
-
- // Shuffle pattern one - right side input
- const __m256i rhs_mat_0145_0_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_0, 136); //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3)
- const __m256i rhs_mat_2367_0_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_0, 136); //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3)
-
- const __m256i rhs_mat_0145_1_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_1, 136); //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11)
- const __m256i rhs_mat_2367_1_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_1, 136); //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11)
-
- const __m256i rhs_mat_0145_2_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_2, 136); //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19)
- const __m256i rhs_mat_2367_2_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_2, 136); //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19)
-
- const __m256i rhs_mat_0145_3_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_3, 136); //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27)
- const __m256i rhs_mat_2367_3_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_3, 136); //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27)
-
- // Shuffle pattern two - right side input
-
- const __m256i rhs_mat_0145_0_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_0, 221); //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7)
- const __m256i rhs_mat_2367_0_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_0, 221); //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7)
-
- const __m256i rhs_mat_0145_1_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_1, 221); //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15)
- const __m256i rhs_mat_2367_1_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_1, 221); //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15)
-
- const __m256i rhs_mat_0145_2_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_2, 221); //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23)
- const __m256i rhs_mat_2367_2_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_2, 221); //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23)
-
- const __m256i rhs_mat_0145_3_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_3, 221); //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31)
- const __m256i rhs_mat_2367_3_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_3, 221); //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31)
-
- // Scale values - Load the weight scale values of block_q4_0x8
- const __m256 col_scale_f32 = LM_GGML_F32Cx8_LOAD(b_ptr[b].d);
-
- // Load the four block_q8_0 quantized values interleaved with each other in chunks of eight - A0,A1,A2,A3
- // Loaded as set of 128 bit vectors and repeated into a 256 bit vector
- __m256i lhs_mat_0123_0 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs)));
- __m256i lhs_mat_01_0 = _mm256_permute2f128_si256(lhs_mat_0123_0, lhs_mat_0123_0, 0);
- __m256i lhs_mat_23_0 = _mm256_permute2f128_si256(lhs_mat_0123_0, lhs_mat_0123_0, 17);
- __m256i lhs_mat_0123_1 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 32)));
- __m256i lhs_mat_01_1 = _mm256_permute2f128_si256(lhs_mat_0123_1, lhs_mat_0123_1, 0);
- __m256i lhs_mat_23_1 = _mm256_permute2f128_si256(lhs_mat_0123_1, lhs_mat_0123_1, 17);
- __m256i lhs_mat_0123_2 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 64)));
- __m256i lhs_mat_01_2 = _mm256_permute2f128_si256(lhs_mat_0123_2, lhs_mat_0123_2, 0);
- __m256i lhs_mat_23_2 = _mm256_permute2f128_si256(lhs_mat_0123_2, lhs_mat_0123_2, 17);
- __m256i lhs_mat_0123_3 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 96)));
- __m256i lhs_mat_01_3 = _mm256_permute2f128_si256(lhs_mat_0123_3, lhs_mat_0123_3, 0);
- __m256i lhs_mat_23_3 = _mm256_permute2f128_si256(lhs_mat_0123_3, lhs_mat_0123_3, 17);
-
- // Shuffle pattern one - left side input
-
- const __m256i lhs_mat_01_0_sp1 = _mm256_shuffle_epi32(lhs_mat_01_0, 160); //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3)
- const __m256i lhs_mat_23_0_sp1 = _mm256_shuffle_epi32(lhs_mat_23_0, 160); //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3)
-
- const __m256i lhs_mat_01_1_sp1 = _mm256_shuffle_epi32(lhs_mat_01_1, 160); //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11)
- const __m256i lhs_mat_23_1_sp1 = _mm256_shuffle_epi32(lhs_mat_23_1, 160); //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11)
-
- const __m256i lhs_mat_01_2_sp1 = _mm256_shuffle_epi32(lhs_mat_01_2, 160); //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19)
- const __m256i lhs_mat_23_2_sp1 = _mm256_shuffle_epi32(lhs_mat_23_2, 160); //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19)
-
- const __m256i lhs_mat_01_3_sp1 = _mm256_shuffle_epi32(lhs_mat_01_3, 160); //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27)
- const __m256i lhs_mat_23_3_sp1 = _mm256_shuffle_epi32(lhs_mat_23_3, 160); //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27)
-
- // Shuffle pattern two - left side input
-
- const __m256i lhs_mat_01_0_sp2 = _mm256_shuffle_epi32(lhs_mat_01_0, 245); //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7)
- const __m256i lhs_mat_23_0_sp2 = _mm256_shuffle_epi32(lhs_mat_23_0, 245); //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7)
-
- const __m256i lhs_mat_01_1_sp2 = _mm256_shuffle_epi32(lhs_mat_01_1, 245); //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15)
- const __m256i lhs_mat_23_1_sp2 = _mm256_shuffle_epi32(lhs_mat_23_1, 245); //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15)
-
- const __m256i lhs_mat_01_2_sp2 = _mm256_shuffle_epi32(lhs_mat_01_2, 245); //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23)
- const __m256i lhs_mat_23_2_sp2 = _mm256_shuffle_epi32(lhs_mat_23_2, 245); //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23)
-
- const __m256i lhs_mat_01_3_sp2 = _mm256_shuffle_epi32(lhs_mat_01_3, 245); //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31)
- const __m256i lhs_mat_23_3_sp2 = _mm256_shuffle_epi32(lhs_mat_23_3, 245); //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31)
-
- // The values arranged in shuffle patterns are operated on with a dot product within each 32-bit lane, i.e. corresponding bytes are multiplied and the products added into 32-bit integers within the lane
- // Resembles MMLAs into 2x2 matrices in the ARM version
- __m256i iacc_mat_00_sp1 =
- _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_01_3_sp1, rhs_mat_0145_3_sp1), mul_sum_i8_pairs_int(lhs_mat_01_2_sp1, rhs_mat_0145_2_sp1)), mul_sum_i8_pairs_int(lhs_mat_01_1_sp1, rhs_mat_0145_1_sp1)), mul_sum_i8_pairs_int(lhs_mat_01_0_sp1, rhs_mat_0145_0_sp1));
- __m256i iacc_mat_01_sp1 =
- _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_01_3_sp1, rhs_mat_2367_3_sp1), mul_sum_i8_pairs_int(lhs_mat_01_2_sp1, rhs_mat_2367_2_sp1)), mul_sum_i8_pairs_int(lhs_mat_01_1_sp1, rhs_mat_2367_1_sp1)), mul_sum_i8_pairs_int(lhs_mat_01_0_sp1, rhs_mat_2367_0_sp1));
- __m256i iacc_mat_10_sp1 =
- _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_23_3_sp1, rhs_mat_0145_3_sp1), mul_sum_i8_pairs_int(lhs_mat_23_2_sp1, rhs_mat_0145_2_sp1)), mul_sum_i8_pairs_int(lhs_mat_23_1_sp1, rhs_mat_0145_1_sp1)), mul_sum_i8_pairs_int(lhs_mat_23_0_sp1, rhs_mat_0145_0_sp1));
- __m256i iacc_mat_11_sp1 =
- _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_23_3_sp1, rhs_mat_2367_3_sp1), mul_sum_i8_pairs_int(lhs_mat_23_2_sp1, rhs_mat_2367_2_sp1)), mul_sum_i8_pairs_int(lhs_mat_23_1_sp1, rhs_mat_2367_1_sp1)), mul_sum_i8_pairs_int(lhs_mat_23_0_sp1, rhs_mat_2367_0_sp1));
- __m256i iacc_mat_00_sp2 =
- _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_01_3_sp2, rhs_mat_0145_3_sp2), mul_sum_i8_pairs_int(lhs_mat_01_2_sp2, rhs_mat_0145_2_sp2)), mul_sum_i8_pairs_int(lhs_mat_01_1_sp2, rhs_mat_0145_1_sp2)), mul_sum_i8_pairs_int(lhs_mat_01_0_sp2, rhs_mat_0145_0_sp2));
- __m256i iacc_mat_01_sp2 =
- _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_01_3_sp2, rhs_mat_2367_3_sp2), mul_sum_i8_pairs_int(lhs_mat_01_2_sp2, rhs_mat_2367_2_sp2)), mul_sum_i8_pairs_int(lhs_mat_01_1_sp2, rhs_mat_2367_1_sp2)), mul_sum_i8_pairs_int(lhs_mat_01_0_sp2, rhs_mat_2367_0_sp2));
- __m256i iacc_mat_10_sp2 =
- _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_23_3_sp2, rhs_mat_0145_3_sp2), mul_sum_i8_pairs_int(lhs_mat_23_2_sp2, rhs_mat_0145_2_sp2)), mul_sum_i8_pairs_int(lhs_mat_23_1_sp2, rhs_mat_0145_1_sp2)), mul_sum_i8_pairs_int(lhs_mat_23_0_sp2, rhs_mat_0145_0_sp2));
- __m256i iacc_mat_11_sp2 =
- _mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_23_3_sp2, rhs_mat_2367_3_sp2), mul_sum_i8_pairs_int(lhs_mat_23_2_sp2, rhs_mat_2367_2_sp2)), mul_sum_i8_pairs_int(lhs_mat_23_1_sp2, rhs_mat_2367_1_sp2)), mul_sum_i8_pairs_int(lhs_mat_23_0_sp2, rhs_mat_2367_0_sp2));
-
- // Output of both shuffle patterns is added in order to sum the dot product outputs of all 32 values in the block
- __m256i iacc_mat_00 = _mm256_add_epi32(iacc_mat_00_sp1, iacc_mat_00_sp2);
- __m256i iacc_mat_01 = _mm256_add_epi32(iacc_mat_01_sp1, iacc_mat_01_sp2);
- __m256i iacc_mat_10 = _mm256_add_epi32(iacc_mat_10_sp1, iacc_mat_10_sp2);
- __m256i iacc_mat_11 = _mm256_add_epi32(iacc_mat_11_sp1, iacc_mat_11_sp2);
-
-
- // Straighten out to make 4 row vectors
- __m256i iacc_row_0 = _mm256_blend_epi32(iacc_mat_00, _mm256_shuffle_epi32(iacc_mat_01, 78), 204);
- __m256i iacc_row_1 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_00, 78), iacc_mat_01, 204);
- __m256i iacc_row_2 = _mm256_blend_epi32(iacc_mat_10, _mm256_shuffle_epi32(iacc_mat_11, 78), 204);
- __m256i iacc_row_3 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_10, 78), iacc_mat_11, 204);
-
- // Load the scale values for all 4 Q8_0 blocks and repeat them across lanes
- const __m256 row_scale_f32 = LM_GGML_F32Cx8_REPEAT_LOAD(a_ptr[b].d, loadMask);
-
- // Multiply with appropriate scales and accumulate
- acc_rows[0] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[0]);
- acc_rows[1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[1]);
- acc_rows[2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
- acc_rows[3] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_3), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_rows[3]);
- }
-
- // Store the accumulated values
- for (int i = 0; i < 4; i++) {
- _mm256_storeu_ps((float *)(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]);
+ // Store the accumulated values
+ for (int i = 0; i < 4; i++) {
+ _mm256_storeu_ps((float *)(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]);
+ }
  }
  }
+ return;
  }
- #else
+ #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
  float sumf[4][8];
  int sumi;
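The code below the #endif is the generic scalar fallback that runs when no SIMD path applies. A simplified sketch of the per-block arithmetic it performs (hypothetical flat layout: one plain q4_0 block against one q8_0 block rather than the interleaved x4/x8 packing the real loop walks, with scales passed as float instead of lm_ggml_fp16_t for brevity):

    #include <stdint.h>

    static float q4_q8_block_dot(const uint8_t qs4[16], float d4,
                                 const int8_t qs8[32], float d8) {
        int32_t sumi = 0;
        for (int i = 0; i < 16; i++) {
            const int8_t lo = (int8_t)((qs4[i] & 0x0F) << 4) >> 4; /* elements 0-15, sign-extended  */
            const int8_t hi = (int8_t)(qs4[i] & 0xF0) >> 4;        /* elements 16-31, sign-extended */
            sumi += lo * qs8[i] + hi * qs8[i + 16];
        }
        return (float)sumi * d4 * d8;  /* scale the integer dot product by both block scales */
    }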
 
@@ -2788,5 +3215,4 @@ void lm_ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
  }
  }
  }
- #endif
  }